├── src
│   ├── marlin
│   │   ├── basic
│   │   │   ├── stl-basic.cc
│   │   │   ├── stl-utils.cc
│   │   │   ├── city.o
│   │   │   ├── opt.o
│   │   │   ├── std.o
│   │   │   ├── str.o
│   │   │   ├── stl-basic.o
│   │   │   ├── stl-utils.o
│   │   │   ├── str.h
│   │   │   ├── COPYRIGHT
│   │   │   ├── str.cc
│   │   │   ├── std.cc
│   │   │   ├── opt.h
│   │   │   ├── stl-basic.h
│   │   │   ├── city.h
│   │   │   ├── std.h
│   │   │   ├── opt.cc
│   │   │   ├── stl-utils.h
│   │   │   └── city.cc
│   │   ├── marlin_cluster
│   │   ├── marlin_cluster.o
│   │   ├── example
│   │   │   ├── example.txt.bz2
│   │   │   └── example.sh
│   │   ├── Makefile
│   │   ├── README
│   │   ├── log
│   │   ├── marlin_count
│   │   └── marlin_cluster.cc
│   ├── brown
│   └── code
│       ├── create_fst.sh
│       ├── rom_conllu.sh
│       ├── clean_map_decode.py
│       ├── pos2char.py
│       ├── label_dictionary.py
│       ├── tag_with_clusters.sh
│       ├── replace-unicode-punctuation.perl
│       ├── preprocess.sh
│       ├── conllu2txt.py
│       ├── arpa2wfst.sh
│       ├── run_clustering.sh
│       ├── decode.sh
│       ├── train_combined_lm.sh
│       ├── train_channel.sh
│       ├── combine_channels.py
│       ├── tag_text.py
│       ├── makelmfsa_x.cpp
│       ├── setup_ud-treebank_data.sh
│       ├── makelmfsa.cpp
│       ├── elisa2flat.py
│       ├── filter_lowfreq.py
│       ├── train_cipher.py
│       └── utils.py
├── .gitignore
├── .gitmodules
├── train_srilm_langmodel.sh
├── README.md
├── train_format_lm_ud.sh
└── utagger
--------------------------------------------------------------------------------
/src/marlin/basic/stl-basic.cc:
--------------------------------------------------------------------------------
1 | #include "stl-basic.h"
2 |
--------------------------------------------------------------------------------
/src/marlin/basic/stl-utils.cc:
--------------------------------------------------------------------------------
1 | #include "stl-utils.h"
2 |
--------------------------------------------------------------------------------
/src/brown:
--------------------------------------------------------------------------------
1 | /home/ronald/universal-cipher-pos-tagging/src/brown-cluster
--------------------------------------------------------------------------------
/src/marlin/basic/city.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/basic/city.o
--------------------------------------------------------------------------------
/src/marlin/basic/opt.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/basic/opt.o
--------------------------------------------------------------------------------
/src/marlin/basic/std.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/basic/std.o
--------------------------------------------------------------------------------
/src/marlin/basic/str.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/basic/str.o
--------------------------------------------------------------------------------
/src/marlin/marlin_cluster:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/marlin_cluster
--------------------------------------------------------------------------------
/src/marlin/basic/stl-basic.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/basic/stl-basic.o
--------------------------------------------------------------------------------
/src/marlin/basic/stl-utils.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/basic/stl-utils.o
--------------------------------------------------------------------------------
/src/marlin/marlin_cluster.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/marlin_cluster.o
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/*
2 | src/code/__pycache__/*
3 | lm_data/*
4 | lms/*
5 | exp-cipher/*
6 | *.pyc
7 | *.o
8 | utagger_hpc
9 | gen_exp.sh
--------------------------------------------------------------------------------
/src/marlin/example/example.txt.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/example/example.txt.bz2
--------------------------------------------------------------------------------
/src/marlin/example/example.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | set -ue
4 |
5 | ../marlin_count --text example.txt.bz2 --bigrams bigrams --words words
6 | ../marlin_cluster --words words --bigrams bigrams --output classes --c 100 --steps 5
7 | rm words bigrams
8 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "src/uroman"]
2 | 	path = src/uroman
3 | 	url = git@github.com:isi-nlp/uroman.git
4 | [submodule "src/brown-cluster"]
5 | 	path = src/brown-cluster
6 | 	url = git@github.com:percyliang/brown-cluster.git
7 | [submodule "src/carmel"]
8 | 	path = src/carmel
9 | 	url = git@github.com:isi-nlp/carmel.git
10 |
--------------------------------------------------------------------------------
/src/code/create_fst.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | nclusters=$1
4 | tagset="N O A R I B C J D T M F P E Y V X"
5 | nc1=$(($nclusters - 1))
6 |
7 | echo "S"
8 | echo '(S (S "<s>" "<s>" 1))'
9 | echo '(S (S "</s>" "</s>" 1))'
10 | for tag in $tagset
11 | do
12 |   for nc in `seq 0 $nc1`
13 |   do
14 |     echo '(S (S "'$tag'" "'$nc'" 1))'
15 |   done
16 | done
--------------------------------------------------------------------------------
/src/code/rom_conllu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | lang=$1
4 | input=$2
5 | output=$3
6 | UR_DIR=$4
7 |
8 | cut -f 2 $input | $UR_DIR/bin/uroman.pl -l $lang | \
9 | awk -F'\t' '{OFS = FS} FNR==NR{a[NR]=$1;next}{$2=a[FNR]}1' \
10 | /dev/stdin $input > temp
11 |
12 | cut -f 3 temp | $UR_DIR/bin/uroman.pl -l $lang | \
13 | awk -F'\t' '{OFS = FS} FNR==NR{a[NR]=$1;next}{$3=a[FNR]}1' \
14 | /dev/stdin temp > $output
15 |
16 | rm temp
--------------------------------------------------------------------------------
/src/code/clean_map_decode.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from utils import char2upos
3 |
4 | inputfn = sys.argv[1]
5 | outputfn = sys.argv[2]
6 | outfile = open(outputfn,'w')
7 | for line in open(inputfn,'r'):
8 |     line=line.strip('\n')
9 |     if line=='': continue
10 |     dec = [char2upos[c.strip('"')] for c in line.split()[1:-1] ]
11 |     # assuming line always like this: <s> ... </s> (#eos already strips while cl_tagging)
12 |     print(" ".join(dec),file=outfile)
--------------------------------------------------------------------------------
/src/marlin/Makefile:
--------------------------------------------------------------------------------
1 | # 1.2: need to make sure opt.o goes in the right order to get the right scope on the command-line arguments
2 | # Use this for Linux
3 | files=$(subst .cc,.o,$(shell /bin/ls basic/*.cc))
4 |
5 | all: marlin_cluster
6 |
7 | marlin_cluster: marlin_cluster.o $(files)
8 | 	g++ -std=c++11 -Wall -g -O3 -o marlin_cluster marlin_cluster.o $(files)
9 |
10 | %.o: %.cc
11 | 	g++ -Wunused -std=c++11 -Wall -g -O3 -o $@ -c $<
12 |
13 | clean:
14 | 	rm -rf marlin_cluster basic/*.o *.o
15 |
--------------------------------------------------------------------------------
/src/code/pos2char.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from utils import *
3 | import pdb
4 | import argparse
5 |
6 |
7 | if __name__ == "__main__":
8 |     parser = argparse.ArgumentParser()
9 |     parser.add_argument("--ts" ,"-ts", type=str, default="ud", help="Tagset name [ud,ut]")
10 |
11 |     args = parser.parse_args()
12 |
13 |     mapper = upos2char if args.ts=="ud" else ut2char
14 |
15 |     for line in sys.stdin:
16 |         line = line.strip("\n")
17 |         if line=='': continue
18 |         mapped = [mapper[tag] for tag in line.split(" ") ]
19 |         print(" ".join(mapped))
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/src/marlin/basic/str.h:
--------------------------------------------------------------------------------
1 | #ifndef __STR_H__
2 | #define __STR_H__
3 |
4 | #include "stl-basic.h"
5 |
6 | string substr(const string &s, int i, int j);
7 | string substr(const string &s, int i);
8 |
9 | string str_printf(const char *fmt, ...);
10 | char *copy_str(const char *s);
11 | string int2str(int x);
12 | string double2str(double x);
13 |
14 | StringVec split(const char *str, const char *delims, bool keep_empty);
15 | StrVec mutate_split(char *str, const char *delims);
16 |
17 | char *trim(char *s);
18 | string tolower(const char *s);
19 |
20 | int index_of(const char *s, const char *t);
21 |
22 | #endif
23 |
--------------------------------------------------------------------------------
/src/code/label_dictionary.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import warnings
3 |
4 | class LabelDictionary(dict):
5 |     '''This class implements a dictionary of labels. Labels are mapped to
6 |     integers, and it is efficient to retrieve the label name from its
7 |     integer representation, and vice-versa.'''
8 |     def __init__(self, label_names=[]):
9 |         self.names = []
10 |         for name in label_names:
11 |             self.add(name)
12 |
13 |     def add(self, name):
14 |         if name in self:
15 |             return self[name]
16 |             #warnings.warn('Ignoring duplicated label ' + name)
17 |         label_id = len(self.names)
18 |         self[name] = label_id
19 |         self.names.append(name)
20 |         return label_id
21 |
22 |     def get_label_name(self, label_id):
23 |         return self.names[label_id]
24 |
25 |     def get_label_id(self, name):
26 |         return self[name]
27 |
--------------------------------------------------------------------------------
/src/marlin/basic/COPYRIGHT:
--------------------------------------------------------------------------------
1 | The code in basic/ is taken from:
2 | https://github.com/percyliang/brown-cluster
3 |
4 | The following copyright / usage agreement applies:
5 |
6 | (C) Copyright 2007-2012, Percy Liang
7 |
8 | Implementation of the Brown hierarchical word clustering algorithm.
9 | Percy Liang
10 | Release 1.3
11 | 2012.07.24
12 |
13 | http://cs.stanford.edu/~pliang
14 |
15 | Permission is granted for anyone to copy, use, or modify these programs and
16 | accompanying documents for purposes of research or education, provided this
17 | copyright notice is retained, and note is made of any changes that have been
18 | made.
19 |
20 | These programs and documents are distributed without any warranty, express or
21 | implied. As the programs were written for research purposes only, they have
22 | not been tested to the degree that would be advisable in any important
23 | application. All use of these programs is entirely at the user's own risk.
24 |
25 |
--------------------------------------------------------------------------------
/src/code/tag_with_clusters.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=50:00:00
3 | #SBATCH --partition=isi
4 |
5 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../.. >/dev/null 2>&1 && pwd )"
6 | BASELINE="brown"
7 | NCLUSTERS=500
8 | INPUT=""
9 | EXP_DIR="$BASEDIR/exp-cipher"
10 |
11 | while getopts "h?b:n:i:e:" opt; do
12 |     case "$opt" in
13 |     h|\?)
14 |         show_help
15 |         exit 0
16 |         ;;
17 |     b)  BASELINE=$OPTARG
18 |         ;;
19 |     n)  NCLUSTERS=$OPTARG
20 |         ;;
21 |     i)  INPUT=$OPTARG
22 |         ;;
23 |     e)
24 |         EXP_DIR=$OPTARG
25 |         ;;
26 |     esac
27 | done
28 |
29 | clt_out="$EXP_DIR/$BASELINE-$NCLUSTERS"
30 |
31 | if [ ! -f $clt_out/clt.mapper.pickle ]; then
32 |   python3 $BASEDIR/src/code/tag_text.py -i $INPUT \
33 |   -b $BASELINE -m train -c $clt_out/clusters.$BASELINE -op output -nc $NCLUSTERS
34 | else
35 |   python3 $BASEDIR/src/code/tag_text.py -i $INPUT \
36 |   -b $BASELINE -m eval -v $clt_out/clt.mapper.pickle -op output -nc $NCLUSTERS
37 | fi
38 |
--------------------------------------------------------------------------------
/src/marlin/README:
--------------------------------------------------------------------------------
1 | For more information visit http://cistern.cis.lmu.de/.
2 |
3 | Documentation can be found at https://github.com/muelletm/cistern/blob/wiki/marlin.md.
4 |
5 | (C) Copyright 2013-2015, Thomas Müller
6 |
7 | The code in basic/ is taken from:
8 | https://github.com/percyliang/brown-cluster
9 |
10 | The following copyright / usage agreement applies:
11 |
12 | (C) Copyright 2007-2012, Percy Liang
13 |
14 | Implementation of the Brown hierarchical word clustering algorithm.
15 | Percy Liang
16 | Release 1.3
17 | 2012.07.24
18 |
19 | http://cs.stanford.edu/~pliang
20 |
21 | Permission is granted for anyone to copy, use, or modify these programs and
22 | accompanying documents for purposes of research or education, provided this
23 | copyright notice is retained, and note is made of any changes that have been
24 | made.
25 |
26 | These programs and documents are distributed without any warranty, express or
27 | implied. As the programs were written for research purposes only, they have
28 | not been tested to the degree that would be advisable in any important
29 | application. All use of these programs is entirely at the user's own risk.
30 |
--------------------------------------------------------------------------------
/src/code/replace-unicode-punctuation.perl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | #
3 | # This file is part of moses. Its use is licensed under the GNU Lesser General
4 | # Public License version 2.1 or, at your option, any later version.
5 |
6 | use warnings;
7 | use strict;
8 | use utf8;
9 |
10 | binmode(STDIN, ":utf8");
11 | binmode(STDOUT, ":utf8");
12 |
13 | while(<STDIN>) {
14 |   s/，/,/g;
15 |   s/。 */. /g;
16 |   s/、/,/g;
17 |   s/”/"/g;
18 |   s/“/"/g;
19 |   s/¨/"/g;
20 |   s/[∶׃：]/:/g;
21 |   s/？/\?/g;
22 |   s/《/"/g;
23 |   s/》/"/g;
24 |   s/）/\)/g;
25 |   s/！/\!/g;
26 |   s/（/\(/g;
27 |   s/；/;/g;
28 |   s/１/"/g;
29 |   s/」/"/g;
30 |   s/「/"/g;
31 |   s/０/0/g;
32 |   s/３/3/g;
33 |   s/２/2/g;
34 |   s/５/5/g;
35 |   s/６/6/g;
36 |   s/９/9/g;
37 |   s/７/7/g;
38 |   s/８/8/g;
39 |   s/４/4/g;
40 |   s/． */. /g;
41 |   s/～/\~/g;
42 |   s/[’ʼ′`]/\'/g;
43 |   s/…/\.\.\./g;
44 |   s/━/\-/g;
45 |   s/─/-/g;
46 |   s/〈/\</g;
47 |   s/〉/\>/g;
49 |   s/【/\[/g;
50 |   s/】/\]/g;
51 |   s/％/\%/g;
52 |   s/¬/-/g;
53 |   s/×+/x/g;
54 |   s/≤/<=/g;
55 |   s/≥/>=/g;
56 |   s/≠/\!=/g;
57 |   s/→/-/g;
58 |   # s/[↔💪😍╥؟ヽ↓▯◘∞►◄♦°░✔▓⚛☻↑¤╰╮➖★♪♫™🏻👇†😭😥😎😢️│·‧·ェ•●▽❤♥💕♡☉¶§📌✌®╟╢╩╔╗╣╠╝╚═¸┈┉✽♈̷̴̶⌣̊┼╫♉▒▒┌┘└┐┘┌┴‾̲☐÷☆┬✰☰]+/\*/g;
59 |
60 |   print $_;
61 | }
62 |
--------------------------------------------------------------------------------
/src/marlin/log:
--------------------------------------------------------------------------------
1 | W: Number of words processed / total
2 | LL: Current log-likelihood
3 | Swaps: Number of words that changed their class
4 |
5 | iter: 0
6 | W: 1688 / 6755 LL: -389785 Swaps: 1495
7 | W: 3376 / 6755 LL: -385828 Swaps: 2981
8 | W: 5064 / 6755 LL: -382624 Swaps: 4499
9 | W: 6752 / 6755 LL: -379357 Swaps: 5997
10 | W: 6755 / 6755 LL: -379352 Swaps: 5999
11 | iter: 1
12 | W: 1688 / 6755 LL: -376172 Swaps: 449
13 | W: 3376 / 6755 LL: -375490 Swaps: 984
14 | W: 5064 / 6755 LL: -375067 Swaps: 1393
15 | W: 6752 / 6755 LL: -374700 Swaps: 1758
16 | W: 6755 / 6755 LL: -374700 Swaps: 1758
17 | iter: 2
18 | W: 1688 / 6755 LL: -374131 Swaps: 209
19 | W: 3376 / 6755 LL: -373951 Swaps: 479
20 | W: 5064 / 6755 LL: -373831 Swaps: 685
21 | W: 6752 / 6755 LL: -373683 Swaps: 866
22 | W: 6755 / 6755 LL: -373683 Swaps: 866
23 | iter: 3
24 | W: 1688 / 6755 LL: -373387 Swaps: 126
25 | W: 3376 / 6755 LL: -373284 Swaps: 286
26 | W: 5064 / 6755 LL: -373191 Swaps: 412
27 | W: 6752 / 6755 LL: -373104 Swaps: 514
28 | W: 6755 / 6755 LL: -373104 Swaps: 514
29 | iter: 4
30 | W: 1688 / 6755 LL: -373060 Swaps: 50
31 | W: 3376 / 6755 LL: -373031 Swaps: 101
32 | W: 5064 / 6755 LL: -373011 Swaps: 128
33 | W: 6752 / 6755 LL: -372996 Swaps: 148
34 | W: 6755 / 6755 LL: -372996 Swaps: 148
35 |
--------------------------------------------------------------------------------
/src/code/preprocess.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # preprocessing data
4 |
5 | set -e
6 |
7 | INPUT=""
8 | ROM="false"
9 | MODE="test"
10 | IL="en"
11 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../.. >/dev/null 2>&1 && pwd )"
12 | EXP_DIR="$BASEDIR/exp-cipher"
13 |
14 | while [ $# -gt 1 ]
15 | do
16 | key="$1"
17 | case $key in
18 |     -i|--input)
19 |     INPUT="$2"
20 |     shift # past argument
21 |     ;;
22 |     -rom|--rom)
23 |     ROM="$2"
24 |     shift # past argument
25 |     ;;
26 |     -m|--mode)
27 |     MODE="$2"
28 |     shift # past argument
29 |     ;;
30 |     -l|--lang)
31 |     IL="$2"
32 |     shift # past argument
33 |     ;;
34 |     -exp|--exp_dir)
35 |     EXP_DIR="$2"
36 |     shift # past argument
37 |     ;;
38 |     *)
39 |     # unknown option
40 |     ;;
41 | esac
42 | shift
43 | done
44 |
45 | datadir="$EXP_DIR/data"
46 |
47 | # romanize if needed
48 | if [ $ROM = "true" ]; then
49 |   echo "romanizing"
50 |   src/uroman/bin/uroman.pl < $INPUT > "$INPUT".roman
51 | else
52 |   cp $INPUT "$INPUT".roman
53 | fi
54 |
55 | echo "cleaning/filtering..."
56 | #normalize / filter noise
57 |
58 | src/code/replace-unicode-punctuation.perl < "$INPUT".roman > $datadir/$IL.clean
59 |
60 | if [ $MODE = "train" ]; then
61 |   python3 src/code/filter_lowfreq.py -i $datadir/$IL.clean -m train -ig
62 |   mv $datadir/vocab $datadir/vocab.$IL
63 | else
64 |   python3 src/code/filter_lowfreq.py -i $datadir/$IL.clean -m eval -t 1 -v $datadir/vocab.$IL
65 | fi
66 |
67 |
--------------------------------------------------------------------------------
/src/code/conllu2txt.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from utils import *
3 | import pdb
4 | import argparse
5 |
6 |
7 | if __name__ == "__main__":
8 |     parser = argparse.ArgumentParser()
9 |     parser.add_argument("--input","-i", type=str, default=None, help="input conllu file")
10 |     parser.add_argument("--mode" ,"-m", type=str, default="ch", help="Tag mode [ch,tag]")
11 |     parser.add_argument("--col" ,"-c", type=int, default=1, help="Column to extract [0-9]")
12 |     parser.add_argument("--tb" ,"-tb", type=str, default="ud", help="Treebank name [ud,ut]")
13 |     parser.add_argument("--lid","-lid", action='store_true', help="Keep lang_id from text")
14 |
15 |     args = parser.parse_args()
16 |
17 |     text=""
18 |     idx = args.col
19 |     mode = args.mode
20 |
21 |     mapper = upos2char if args.tb=="ud" else ut2char
22 |
23 |     count = 1
24 |
25 |     for line in open(args.input,'r'):
26 |         line = line.strip("\n")
27 |         if line=='': continue
28 |         cols = line.split('\t')
29 |         if cols[0]=="1" and text!='':
30 |             print(text.strip(' '))
31 |             text=''
32 |             count = 1
33 |         token = ''
34 |         datum = cols[idx]
35 |         if idx==1 and not args.lid:
36 |             datum = datum[:-3]
37 |
38 |         if mode=='ch' and idx==3:
39 |             datum = mapper[datum]
40 |
41 |         token = datum.strip(' ')
42 |
43 |         all_dig = True
44 |         for sw in token.split(' '):
45 |             if not sw.isdigit():
46 |                 all_dig=False
47 |                 break
48 |         token = token.replace(" ","") if all_dig else token.replace(" ","_")
49 |         text += " "+token
50 |
51 |         # print("|",token,"|")
52 |         count += 1
53 |
54 |     print(text.strip(' '))
55 |
56 |
57 |
--------------------------------------------------------------------------------
/src/code/arpa2wfst.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ## Transforms ARPA lm file into WFST format using Carmel
4 |
5 | set -e
6 |
7 | INPUT="arpa.lang"  # arpa formatted input file
8 | LAN_CODE="en"      # language code for name-formatting purposes
9 | ORDER=2            # LM order for name-formatting purposes
CARMEL_DIR="/usr/local" 11 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../.. >/dev/null 2>&1 && pwd )" 12 | DATADIR=$BASEDIR/data 13 | CODE_DIR="$BASEDIR/src/code" 14 | 15 | while [ $# -gt 1 ] 16 | do 17 | key="$1" 18 | case $key in 19 | -i|--input) 20 | INPUT="$2" 21 | shift # past argument 22 | ;; 23 | -l|--lang) 24 | LAN_CODE="$2" 25 | shift # past argument 26 | ;; 27 | -o|--order) 28 | ORDER="$2" 29 | shift # past argument 30 | ;; 31 | -c|--carmel) 32 | CARMEL_DIR="$2" 33 | shift # past argument 34 | ;; 35 | *) 36 | # unknown option 37 | ;; 38 | esac 39 | shift 40 | done 41 | 42 | 43 | INPUT=$(readlink -f $INPUT) 44 | 45 | cd $CODE_DIR 46 | g++ makelmfsa.cpp -o makelmfsa 47 | g++ makelmfsa_x.cpp -o makelmfsa_x 48 | 49 | 50 | # create fsa/fst 51 | ./makelmfsa $INPUT 52 | # ./makelmfsa_x $basedir/lms/$lang.$order.lm 53 | 54 | $CARMEL_DIR/bin/carmel -n $INPUT.wfsa \ 55 | > $INPUT.norm 56 | 57 | # prepare Viterbi decoding 58 | $CARMEL_DIR/bin/carmel --project-right --project-identity-fsa -HJ $INPUT.wfsa \ 59 | > $INPUT.fsa.noe 60 | 61 | if [ "$INPUT.wfsa" != "$BASEDIR/lms/$LAN_CODE.$ORDER.lm.wfsa" ]; then 62 | mv $INPUT.wfsa $BASEDIR/lms/$LAN_CODE.$ORDER.lm.wfsa 63 | mv $INPUT.norm $BASEDIR/lms/$LAN_CODE.$ORDER.lm.norm 64 | mv $INPUT.fsa.noe $BASEDIR/lms/$LAN_CODE.$ORDER.fsa.noe 65 | fi 66 | 67 | rm makelmfsa makelmfsa_x -------------------------------------------------------------------------------- /src/code/run_clustering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # POSIX variable 6 | OPTIND=1 7 | 8 | NCPUS=4 9 | BASELINE="brown" # clark, anchor, emb-loc-mon 10 | NCLUSTERS=500 11 | 12 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../.. >/dev/null 2>&1 && pwd )" 13 | EXP_DIR="$BASEDIR/exp-cipher" 14 | 15 | while [ $# -gt 1 ] 16 | do 17 | key="$1" 18 | case $key in 19 | -i|--input) 20 | INPUT="$2" 21 | shift # past argument 22 | ;; 23 | -b|--b|--baseline) 24 | BASELINE="$2" 25 | shift # past argument 26 | ;; 27 | -nj|--njobs) 28 | NCPUS="$2" 29 | shift # past argument 30 | ;; 31 | -nc|--nclusters) 32 | NCLUSTERS="$2" 33 | shift # past argument 34 | ;; 35 | -exp|--exp_dir) 36 | EXP_DIR="$2" 37 | shift # past argument 38 | ;; 39 | *) 40 | # unknown option 41 | ;; 42 | esac 43 | shift 44 | done 45 | 46 | 47 | clt_out="$EXP_DIR/$BASELINE-$NCLUSTERS" 48 | mkdir -p $clt_out 49 | 50 | 51 | ## run clustering algorithm 52 | 53 | if [ $BASELINE = "brown" ]; then 54 | if [ ! -d "$BASEDIR/src/brown" ]; then 55 | ln -s "$BASEDIR/src/brown-cluster" "$BASEDIR/src/brown" 56 | fi 57 | 58 | src/brown/wcluster --text $INPUT \ 59 | --threads $NCPUS --c $NCLUSTERS --rand 42 \ 60 | --output_dir $clt_out 61 | mv $clt_out/paths $clt_out/clusters.brown 62 | fi 63 | 64 | 65 | if [ $BASELINE = "marlin" ]; then 66 | src/marlin/marlin_count --text $INPUT \ 67 | --bigrams $clt_out/bigrams --words $clt_out/words --rank-limit -1 68 | 69 | $model_dir/marlin_cluster --bigrams $clt_out/bigrams --words $clt_out/words \ 70 | --output $clt_out/clusters --rand 42 --c $NCLUSTERS --alpha 0.0 2> $clt_out/log 71 | fi 72 | 73 | -------------------------------------------------------------------------------- /src/code/decode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | LM="lm_fst_file" 5 | CHANNEL="" 6 | INPUT="" 7 | MAX_LINES=20000 8 | DEC_CH="False" 9 | W_LM=1 10 | W_CM=1 11 | 12 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../.. 
>/dev/null 2>&1 && pwd )" 13 | CODE_DIR="$BASEDIR/src/code" 14 | 15 | while [ $# -gt 1 ] 16 | do 17 | key="$1" 18 | case $key in 19 | -lm|--lm) 20 | LM="$2" 21 | shift # past argument 22 | ;; 23 | -ch|--channel) 24 | CHANNEL="$2" 25 | shift # past argument 26 | ;; 27 | -i|--input) 28 | INPUT="$2" 29 | shift # past argument 30 | ;; 31 | -max|--max) 32 | MAX_LINES="$2" 33 | shift # past argument 34 | ;; 35 | -wlm|--wlm) 36 | W_LM="$2" 37 | shift # past argument 38 | ;; 39 | -wcm|--wcm) 40 | W_CM="$2" 41 | shift # past argument 42 | ;; 43 | *) 44 | # unknown option 45 | ;; 46 | esac 47 | shift 48 | done 49 | 50 | 51 | carmel="$CARMEL_DIR/bin/carmel" 52 | data_dir="$EXP_DIR/data" 53 | basedir="build" 54 | 55 | mkdir -p $EXP_DIR/models/ 56 | 57 | awk 'NF>0' $INPUT > $EXP_DIR/logs/$CHANNEL.temp.noe.cmpl 58 | mv $EXP_DIR/logs/$CHANNEL.temp.noe.cmpl $EXP_DIR/logs/$CHANNEL.temp.noe 59 | 60 | # decipher with Viterbi decoding 61 | # head -10 $EXP_DIR/logs/$CHANNEL.temp.noe | \ 62 | if [ $W_LM = 0 ]; then 63 | cat $EXP_DIR/logs/$CHANNEL.temp.noe | \ 64 | $carmel -qbsriWIEk 1 --exponents=$W_CM,1 \ 65 | $EXP_DIR/models/$CHANNEL \ 66 | > $EXP_DIR/logs/$CHANNEL.$W_LM.$W_CM.temp.dec 2> $EXP_DIR/logs/$CHANNEL.$W_LM.$W_CM.dec 67 | else 68 | cat $EXP_DIR/logs/$CHANNEL.temp.noe | \ 69 | $carmel -qbsriWIEk 1 --exponents=$W_LM,$W_CM,1 \ 70 | $LM $EXP_DIR/models/$CHANNEL \ 71 | > $EXP_DIR/logs/$CHANNEL.$W_LM.$W_CM.temp.dec 2> $EXP_DIR/logs/$CHANNEL.$W_LM.$W_CM.dec 72 | fi 73 | 74 | python3 $CODE_DIR/clean_map_decode.py $EXP_DIR/logs/$CHANNEL.$W_LM.$W_CM.temp.dec $INPUT.$CHANNEL.$W_LM.$W_CM.decoded 75 | rm $EXP_DIR/logs/$CHANNEL.$W_LM.$W_CM.temp.dec 76 | rm $EXP_DIR/logs/$CHANNEL.temp.noe 77 | -------------------------------------------------------------------------------- /src/code/train_combined_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | 6 | ORDER=2 7 | TAGSET="ud" # tagset code [ud,ut] 8 | LAN_CODES="en" 9 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../.. 
>/dev/null 2>&1 && pwd )" 10 | DATADIR=$BASEDIR/lm_data 11 | CODE_DIR="$BASEDIR/src/code" 12 | EXP_DIR="$BASEDIR/exp-cipher" 13 | 14 | if [ -z "$CARMEL_DIR" ]; then 15 | CARMEL_DIR="/usr/local" 16 | fi 17 | if [ -z "$SRILM_DIR" ]; then 18 | SRILM_DIR="$HOME/srilm-1.7.2" 19 | fi 20 | 21 | 22 | while [ $# -gt 1 ] 23 | do 24 | key="$1" 25 | case $key in 26 | -ts|--tagset) 27 | TAGSET="$2" 28 | shift # past argument 29 | ;; 30 | -ord|--order) 31 | ORDER="$2" 32 | shift # past argument 33 | ;; 34 | -l|--lang) 35 | LAN_CODES="$2" 36 | shift # past argument 37 | ;; 38 | -sri|--sridir) 39 | SRILM_DIR="$2" 40 | shift # past argument 41 | ;; 42 | -c|--carmel) 43 | CARMEL_DIR="$2" 44 | shift # past argument 45 | ;; 46 | -exp|--exp_dir) 47 | EXP_DIR="$2" 48 | shift # past argument 49 | ;; 50 | *) 51 | # unknown option 52 | ;; 53 | esac 54 | shift 55 | done 56 | 57 | export CARMEL_DIR=$CARMEL_DIR 58 | export SRILM_DIR=$SRILM_DIR 59 | 60 | LAN_CODES=(${LAN_CODES//,/ }) 61 | mkdir -p $EXP_DIR/lm 62 | 63 | cd $CODE_DIR 64 | g++ makelmfsa.cpp -o makelmfsa 65 | g++ makelmfsa_x.cpp -o makelmfsa_x 66 | 67 | 68 | echo "" > temp_accum 69 | for lang in "${LAN_CODES[@]}"; do 70 | cat $DATADIR/$lang/train.upos.ch >> temp_accum 71 | done 72 | 73 | #-addsmooth -kn \ 74 | $SRILM_DIR/bin/i686-m64/ngram-count -text temp_accum -order $ORDER \ 75 | -addsmooth 1 \ 76 | -lm $EXP_DIR/lm/comb.$ORDER.lm 77 | grep -vP "^$" < $EXP_DIR/lm/comb.$ORDER.lm > temp 78 | mv temp $EXP_DIR/lm/comb.$ORDER.lm 79 | 80 | # create fsa/fst 81 | ./makelmfsa $EXP_DIR/lm/comb.$ORDER.lm 82 | # ./makelmfsa_x $basedir/lms/$lang.$ORDER.lm 83 | 84 | $CARMEL_DIR/bin/carmel -n $EXP_DIR/lm/comb.$ORDER.lm.wfsa \ 85 | > $EXP_DIR/lm/comb.$ORDER.lm.norm 86 | 87 | # prepare Viterbi decoding 88 | $CARMEL_DIR/bin/carmel --project-right --project-identity-fsa -HJ $EXP_DIR/lm/comb.$ORDER.lm.wfsa \ 89 | > $EXP_DIR/lm/comb.$ORDER.fsa.noe 90 | 91 | 92 | rm makelmfsa makelmfsa_x -------------------------------------------------------------------------------- /train_srilm_langmodel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | ORDER=2 6 | INPUT="file.pos" 7 | TAGSET="ud" # tagset code [ud,ut] 8 | LAN_CODE="en" 9 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 10 | DATADIR=$BASEDIR/lm_data 11 | CODE_DIR="$BASEDIR/src/code" 12 | 13 | if [ -z "$CARMEL_DIR"]; then 14 | CARMEL_DIR="/usr/local" 15 | fi 16 | if [ -z "$SRILM_DIR" ]; then 17 | SRILM_DIR="$HOME/srilm-1.7.2" 18 | fi 19 | 20 | while [ $# -gt 1 ] 21 | do 22 | key="$1" 23 | case $key in 24 | -i|--input) 25 | INPUT="$2" 26 | shift # past argument 27 | ;; 28 | -ts|--tagset) 29 | TAGSET="$2" 30 | shift # past argument 31 | ;; 32 | -ord|--order) 33 | ORDER="$2" 34 | shift # past argument 35 | ;; 36 | -l|--lang) 37 | LAN_CODE="$2" 38 | shift # past argument 39 | ;; 40 | -sri|--sridir) 41 | SRILM_DIR="$2" 42 | shift # past argument 43 | ;; 44 | -carmel|--carmel) 45 | CARMEL_DIR="$2" 46 | shift # past argument 47 | ;; 48 | *) 49 | # unknown option 50 | ;; 51 | esac 52 | shift 53 | done 54 | 55 | export CARMEL_DIR=$CARMEL_DIR 56 | export SRILM_DIR=$SRILM_DIR 57 | 58 | cd $CODE_DIR 59 | 60 | g++ makelmfsa.cpp -o makelmfsa 61 | g++ makelmfsa_x.cpp -o makelmfsa_x 62 | 63 | 64 | echo "" 65 | echo "Lang: $LAN_CODE" 66 | mkdir -p $BASEDIR/lms/ 67 | mkdir -p $DATADIR/$LAN_CODE 68 | 69 | cp $INPUT $DATADIR/$LAN_CODE/train.upos 70 | python3 src/code/pos2char.py -ts $TAGSET < $INPUT > $DATADIR/$LAN_CODE/train.upos.ch 71 | 72 | 
72 | # run LM
73 | #-addsmooth -kn \
74 | $SRILM_DIR/bin/i686-m64/ngram-count -text $DATADIR/$LAN_CODE/train.upos.ch -order $ORDER \
75 | -addsmooth 1 \
76 | -lm $BASEDIR/lms/$LAN_CODE.$ORDER.lm
77 | grep -vP "^$" < $BASEDIR/lms/$LAN_CODE.$ORDER.lm > temp
78 | mv temp $BASEDIR/lms/$LAN_CODE.$ORDER.lm
79 |
80 |
81 | # create fsa/fst
82 | ./makelmfsa $BASEDIR/lms/$LAN_CODE.$ORDER.lm
83 |
84 | $CARMEL_DIR/bin/carmel -n $BASEDIR/lms/$LAN_CODE.$ORDER.lm.wfsa \
85 | > $BASEDIR/lms/$LAN_CODE.$ORDER.lm.norm
86 |
87 | # prepare Viterbi decoding
88 | $CARMEL_DIR/bin/carmel --project-right --project-identity-fsa -HJ $BASEDIR/lms/$LAN_CODE.$ORDER.lm.wfsa \
89 | > $BASEDIR/lms/$LAN_CODE.$ORDER.fsa.noe
--------------------------------------------------------------------------------
/src/marlin/basic/str.cc:
--------------------------------------------------------------------------------
1 | #include "stl-basic.h"
2 | #include <stdarg.h>
3 |
4 | string substr(const string &s, int i, int j) {
5 |   if(i < 0) i += len(s);
6 |   if(j < 0) j += len(s);
7 |   i = max(i, 0);
8 |   j = max(j, i);
9 |   return s.substr(i, j-i);
10 | }
11 | string substr(const string &s, int i) {
12 |   return substr(s, i, len(s));
13 | }
14 |
15 | string str_printf(const char *fmt, ...) {
16 |   char buf[16384];
17 |   va_list ap;
18 |   va_start(ap, fmt);
19 |   vsnprintf(buf, sizeof(buf), fmt, ap);
20 |   va_end(ap);
21 |   return buf;
22 | }
23 |
24 | char *copy_str(const char *s) {
25 |   char *t = new char[strlen(s)+1];
26 |   strcpy(t, s);
27 |   return t;
28 | }
29 |
30 | string int2str(int x) {
31 |   return str_printf("%d", x);
32 | }
33 |
34 | string double2str(double x) {
35 |   ostringstream os;
36 |   os << x;
37 |   return os.str();
38 | }
39 |
40 | StringVec split(const char *str, const char *delims, bool keep_empty) {
41 |   StringVec vec; // Store the result.
42 |   // Build quick lookup table.
43 |   BoolVec is_delim(256);
44 |   for(const char *p = delims; *p; p++) is_delim[*p] = true;
45 |   is_delim['\0'] = true;
46 |
47 |   const char *end = str;
48 |   while(true) {
49 |     if(is_delim[*end]) {
50 |       if(keep_empty || end-str > 0) // Extract token.
51 |         vec.push_back(string(str, end-str));
52 |       str = end+1;
53 |     }
54 |     if(!*end) break;
55 |     end++;
56 |   }
57 |   return vec;
58 | }
59 |
60 | StrVec mutate_split(char *str, const char *delims) {
61 |   StrVec vec;
62 |   for(char *p = strtok(str, delims); p; p = strtok(NULL, delims))
63 |     vec.push_back(p);
64 |   return vec;
65 | }
66 |
67 | // Remove leading and trailing white space.
68 | char *trim(char *s) {
69 |   // Removing leading spaces.
70 |   while(*s && isspace(*s)) s++;
71 |
72 |   // Remove trailing spaces.
73 |   char *t;
74 |   for(t = s+strlen(s)-1; t != s && isspace(*t); t--);
75 |   t[1] = '\0';
76 |   return s;
77 | }
78 |
79 | string tolower(const char *s) {
80 |   string t = s;
81 |   foridx(i, len(t)) t[i] = tolower(t[i]);
82 |   return t;
83 | }
84 |
85 | // String matching with brute force.
86 | int index_of(const char *s, const char *t) {
87 |   int ns = strlen(s), nt = strlen(t);
88 |   foridx(i, ns-nt+1)
89 |     if(strncmp(s+i, t, nt) == 0) return i;
90 |   return -1;
91 | }
92 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # UTagger: A Grounded Unsupervised Universal Part-of-Speech Tagger for Low-Resource Languages
2 |
3 | This repository contains the code necessary to reproduce the results in the paper:
4 |
5 | [1] A Grounded Unsupervised Universal Part-of-Speech Tagger for Low-Resource Languages.
6 | Ronald Cardenas, Ying Lin, Heng Ji and Jonathan May. NAACL 2019, Minneapolis, USA.
7 |
8 |
9 |
10 | ## Requirements
11 |
12 | Training of Language Models is done with [SRILM v1.7.2](http://www.speech.sri.com/projects/srilm/download.html). However, any library that produces an ARPA file can be used.
13 |
14 | For FST manipulation and cipher training, we use [Carmel](https://github.com/isi-nlp/carmel) (included as a submodule).
15 |
16 | You will also need:
17 | * Python 3.6
18 | * NumPy >= 1.16.2
19 | * SciPy >= 1.2.1
20 | * lxml >= 4.3.3
21 |
22 |
23 | ## Setup
24 |
25 | Initialize the submodules as follows:
26 |
27 | ```
28 | git submodule update --init --recursive
29 | ```
30 |
31 | Then build the code in `src/carmel`, `src/brown-cluster`, and `src/marlin` (see each folder's README file for reference).
32 |
33 | ## Using UTagger
34 |
35 |
36 | 0. Extract POS annotations from [UniversalDependencies](http://universaldependencies.org) treebanks
37 |
38 | If you want to use annotations from UD treebanks, you can extract the POS sequences by running
39 |
40 | ```
41 | ./setup_ud-treebank_data.sh -td <ud-treebanks-dir>
42 | ```
43 |
44 | This will extract only the POS tags of CONLLU train files for languages experimented with in [1].
45 |
46 |
47 | 1. Train POS language models
48 |
49 | * From UD data
50 |
51 | Training for several languages can be done by listing the iso-639-1 code of each language separated by commas. For instance, to train second order LMs for English and German, run:
52 |
53 | ```
54 | ./train_format_lm_ud.sh -l en,de -ord 2
55 | ```
56 |
57 | * From POS token sequences (one sentence per line)
58 |
59 | ```
60 | ./train_srilm_langmodel.sh -i <pos_file> -ord <order>
61 | ```
62 |
63 | * Reformatting an already trained LM in ARPA format
64 |
65 | Further down in the pipeline, Carmel reads trained language models in OpenFST format. Reformat an ARPA file as follows:
66 |
67 | ```
68 | ./src/code/arpa2wfst.sh -i <arpa_file> -l <lang_code> -o <order>
69 | ```
70 |
71 | 2. Train UTagger
72 |
73 |
74 | ```
75 | ./utagger -i sample.in -if txt -m train \
76 |           -lm_o 2 -pl en,de -ca brown -nc 500
77 | ```
78 |
79 |
80 | 3. Tag / eval
81 |
82 | ```
83 | ./utagger -i sample.in -if txt -m tag \
84 |           -lm_o 2 -pl en,de -ca brown -nc 500
85 | ```
86 |
87 |
--------------------------------------------------------------------------------
/src/code/train_channel.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 |
6 | NC=50
7 | ORD=3
8 | RL="en"
9 | IL="en"
10 | ID=1
11 | SEED=42
12 | ITERS=10
13 | IS_ELISA="-"
14 | CA="" # clustering algorithm
15 |
16 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../.. >/dev/null 2>&1 && pwd )"
>/dev/null 2>&1 && pwd )" 17 | CODE_DIR="$BASEDIR/src/code" 18 | # CARMEL_DIR defined above pipeline 19 | 20 | 21 | echo $CODE_DIR 22 | 23 | 24 | while [ $# -gt 1 ] 25 | do 26 | key="$1" 27 | case $key in 28 | -c|--nc|--num_clusters) 29 | NC="$2" 30 | shift # past argument 31 | ;; 32 | -o|--order) 33 | ORD="$2" 34 | shift # past argument 35 | ;; 36 | -rl) 37 | RL="$2" 38 | shift # past argument 39 | ;; 40 | -il) 41 | IL="$2" 42 | shift # past argument 43 | ;; 44 | -id) 45 | ID="$2" 46 | shift # past argument 47 | ;; 48 | -s|--seed) 49 | SEED="$2" 50 | shift # past argument 51 | ;; 52 | -it|--iters) 53 | ITERS="$2" 54 | shift # past argument 55 | ;; 56 | -elisa|--elisa) 57 | IS_ELISA="$2" 58 | shift # past argument 59 | ;; 60 | -ca|--ca) 61 | CA="$2" 62 | shift # past argument 63 | ;; 64 | -exp|--exp_dir) 65 | EXP_DIR="$2" 66 | shift # past argument 67 | ;; 68 | *) 69 | # unknown option 70 | ;; 71 | esac 72 | shift 73 | done 74 | 75 | 76 | carmel="$CARMEL_DIR/bin/carmel" 77 | data_dir="$EXP_DIR/data" 78 | 79 | channel="$RL$ORD-$IL.$CA.$NC.$ITERS.$ID" 80 | 81 | bash $CODE_DIR/create_fst.sh $NC > $EXP_DIR/models/$channel 82 | 83 | $carmel -1 -R $SEED $EXP_DIR/models/$channel > $EXP_DIR/models/$channel.rnd 84 | $carmel -HJn $EXP_DIR/models/$channel.rnd > $EXP_DIR/models/$channel.norm 85 | rm $EXP_DIR/models/$channel.rnd 86 | 87 | cp $BASEDIR/lms/$RL.$ORD.lm.wfsa $EXP_DIR/models/$channel.lm 88 | 89 | # head -10 $data_dir/$IL/$train_pref.$NC.$CA.carmel > $EXP_DIR/logs/$channel.in 90 | # $EXP_DIR/logs/$channel.in $EXP_DIR/models/$channel.lm \ 91 | 92 | echo ":: $data_dir/output.$NC.$CA.carmel.10k" 93 | echo ":: $channel" 94 | 95 | # # train the channel model 96 | $carmel --train-cascade -HJa -1 -M $ITERS -R $SEED -X 0.999999 \ 97 | $data_dir/output.$NC.$CA.carmel.10k \ 98 | $EXP_DIR/models/$channel.lm $EXP_DIR/models/$channel.norm \ 99 | 2> $EXP_DIR/logs/$channel 100 | 101 | mv $EXP_DIR/models/$channel.norm.trained $EXP_DIR/models/$channel 102 | 103 | #rm $EXP_DIR/logs/$channel.in 104 | rm $EXP_DIR/models/$channel.norm \ 105 | $EXP_DIR/models/$channel.lm $EXP_DIR/models/$channel.lm.trained 106 | -------------------------------------------------------------------------------- /train_format_lm_ud.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | UD_DIR="$HOME/ud-treebanks-v2.2" 6 | ORDER=2 7 | TAGSET="ud" # tagset code [ud,ut] 8 | LAN_CODES="en" 9 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 10 | DATADIR=$BASEDIR/lm_data 11 | CODE_DIR="$BASEDIR/src/code" 12 | 13 | 14 | if [ -z "$CARMEL_DIR"]; then 15 | CARMEL_DIR="/usr/local" 16 | fi 17 | if [ -z "$SRILM_DIR" ]; then 18 | SRILM_DIR="$HOME/srilm-1.7.2" 19 | fi 20 | 21 | 22 | while [ $# -gt 1 ] 23 | do 24 | key="$1" 25 | case $key in 26 | -td|--tb_dir) 27 | UD_DIR="$2" 28 | shift # past argument 29 | ;; 30 | -ts|--tagset) 31 | TAGSET="$2" 32 | shift # past argument 33 | ;; 34 | -ord|--order) 35 | ORDER="$2" 36 | shift # past argument 37 | ;; 38 | -l|--lang) 39 | LAN_CODES="$2" 40 | shift # past argument 41 | ;; 42 | -sri|--sridir) 43 | SRILM_DIR="$2" 44 | shift # past argument 45 | ;; 46 | -carmel|--carmel) 47 | CARMEL_DIR="$2" 48 | shift # past argument 49 | ;; 50 | *) 51 | # unknown option 52 | ;; 53 | esac 54 | shift 55 | done 56 | 57 | export CARMEL_DIR=$CARMEL_DIR 58 | export SRILM_DIR=$SRILM_DIR 59 | 60 | cd $CODE_DIR 61 | 62 | g++ makelmfsa.cpp -o makelmfsa 63 | g++ makelmfsa_x.cpp -o makelmfsa_x 64 | 65 | 66 | LAN_CODES=(${LAN_CODES//,/ }) 
67 |
68 | for lang in "${LAN_CODES[@]}"; do
69 |   echo ""
70 |   echo "Lang: $lang"
71 |   mkdir -p $BASEDIR/lms/
72 |   mkdir -p $DATADIR/$lang
73 |
74 |   suf=""
75 |   if [ $lang = "ja" ]||[ $lang = "ar" ]; then
76 |     suf=".all"
77 |   fi
78 |
79 |   python3 conllu2txt.py -i $DATADIR/$lang/train.conllu$suf \
80 |   -m ch -c 3 -tb $TAGSET > $DATADIR/$lang/train.upos.ch
81 |
82 |   python3 conllu2txt.py -i $DATADIR/$lang/train.conllu$suf \
83 |   -m tag -c 3 -tb $TAGSET > $DATADIR/$lang/train.upos
84 |
85 |   # run LM
86 |   #-addsmooth -kn \
87 |   $SRILM_DIR/bin/i686-m64/ngram-count -text $DATADIR/$lang/train.upos.ch -order $ORDER \
88 |   -addsmooth 1 \
89 |   -lm $BASEDIR/lms/$lang.$ORDER.lm
90 |   grep -vP "^$" < $BASEDIR/lms/$lang.$ORDER.lm > temp
91 |   mv temp $BASEDIR/lms/$lang.$ORDER.lm
92 |
93 |   # create fsa/fst
94 |   ./makelmfsa $BASEDIR/lms/$lang.$ORDER.lm
95 |   # ./makelmfsa_x $basedir/lms/$lang.$ORDER.lm
96 |
97 |   $CARMEL_DIR/bin/carmel -n $BASEDIR/lms/$lang.$ORDER.lm.wfsa \
98 |   > $BASEDIR/lms/$lang.$ORDER.lm.norm
99 |
100 |   # prepare Viterbi decoding
101 |   $CARMEL_DIR/bin/carmel --project-right --project-identity-fsa -HJ $BASEDIR/lms/$lang.$ORDER.lm.wfsa \
102 |   > $BASEDIR/lms/$lang.$ORDER.fsa.noe
103 |
104 | done
105 |
106 |
107 | rm makelmfsa makelmfsa_x
--------------------------------------------------------------------------------
/src/marlin/basic/std.cc:
--------------------------------------------------------------------------------
1 | #include <unistd.h>
2 | #include <sys/stat.h>
3 | #include <dirent.h>
4 | #include "std.h"
5 | #include "str.h"
6 |
7 | // Return the current date/time.
8 | string now() {
9 |   time_t t = time(NULL);
10 |   return substr(ctime(&t), 0, -1);
11 | }
12 |
13 | string hostname() {
14 |   char buf[1024];
15 |   gethostname(buf, sizeof(buf));
16 |   return buf;
17 | }
18 |
19 | // Return the amount of memory (kB) used by this process
20 | int mem_usage() {
21 |   ifstream in("/proc/self/status");
22 |   if(!in) return 0;
23 |   char buf[1024];
24 |   static const char *key = "VmRSS";
25 |
26 |   while(in.getline(buf, sizeof(buf))) {
27 |     if(strncmp(buf, key, strlen(key)) != 0) continue;
28 |     char *s = strchr(buf, ':');
29 |     if(!s) return 0;
30 |     int x;
31 |     sscanf(s+1, "%d", &x);
32 |     return x;
33 |   }
34 |   return -1;
35 | }
36 |
37 | // Return whether the file exists.
38 | bool file_exists(const char *file) {
39 |   return access(file, F_OK) == 0;
40 | }
41 |
42 | // Create an empty file. Return success.
43 | bool create_file(const char *file) {
44 |   ofstream out(file);
45 |   if(!out) return false;
46 |   out.close();
47 |   return true;
48 | }
49 |
50 | time_t file_modified_time(const char *file) {
51 |   struct stat stat_buf;
52 |   if(stat(file, &stat_buf) != 0)
53 |     return 0;
54 |   return stat_buf.st_mtime;
55 | }
56 |
57 | // Return the cpu speed in MHz.
58 | int cpu_speed_mhz() {
59 |   ifstream in("/proc/cpuinfo");
60 |   if(!in) return 0;
61 |   char buf[1024];
62 |   static const char *key = "cpu MHz";
63 |
64 |   while(in.getline(buf, sizeof(buf))) {
65 |     if(strncmp(buf, key, strlen(key)) != 0) continue;
66 |     char *s = strchr(buf, ':');
67 |     if(!s) return 0;
68 |     double x;
69 |     sscanf(s+1, "%lf", &x);
70 |     return (int)x;
71 |   }
72 |   return 0;
73 | }
74 |
75 | // "file" -> "file"
76 | // "dir/file" -> "file"
77 | string strip_dir(string s) {
78 |   return substr(s, s.rfind('/')+1);
79 | }
80 |
81 | // "file" -> "file"
82 | // "dir/file" -> "dir"
83 | string get_dir(string s) {
84 |   int i = s.rfind('/');
85 |   return i == -1 ? "." : substr(s, 0, s.rfind('/'));
86 | }
87 |
88 | // "base" -> "base"
89 | // "base.ext" -> "base"
90 | string file_base(string s) {
91 |   int i = s.rfind('.');
92 |   return i == -1 ? s : substr(s, 0, i);
93 | }
94 |
95 | bool get_files_in_dir(string dirname, bool fullpath, vector<string> &files) {
96 |   DIR *dir = opendir(dirname.c_str());
97 |   if(!dir) return false;
98 |   while(true) {
99 |     dirent *ent = readdir(dir);
100 |     if(!ent) break;
101 |     // For some reason, sometimes files show up as d_type == DT_UNKNOWN, I
102 |     // think due to AFS issues
103 |     //cout << "FFF " << ent->d_name << ' ' << (int)ent->d_type << endl;
104 |     if(ent->d_type != DT_DIR) {
105 |       files.push_back((fullpath ? dirname+"/" : string()) + ent->d_name);
106 |     }
107 |   }
108 |   closedir(dir);
109 |   return true;
110 | }
111 |
--------------------------------------------------------------------------------
/src/code/combine_channels.py:
--------------------------------------------------------------------------------
1 | """
2 | Combine channel tables
3 | """
4 |
5 | import os,sys
6 | import argparse
7 | import pdb
8 | import numpy as np
9 | import subprocess as sp
10 | from utils import *
11 | from collections import defaultdict
12 | from label_dictionary import LabelDictionary
13 | from multiprocessing import Pool
14 | import re
15 |
16 | import warnings
17 | warnings.filterwarnings("ignore")
18 |
19 | regex = re.compile(r'\(0 \(0 "(?P<T>[A-Z])" "(?P<C>[0-9]+)" (?P<P>[-.0-9e^]+)\)\)')
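# A matching transition line in a trained carmel channel model looks like,
# e.g., (0 (0 "N" "137" 0.25)) or (0 (0 "N" "137" e^-1.386)): T is the
# one-character tag, C the cluster id, and P a probability that carmel prints
# either as a plain decimal or in e^x notation (both cases are handled below).
# The tag/cluster/probability values shown here are illustrative only.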

20 |
21 |
22 | if __name__ == "__main__":
23 |     parser = argparse.ArgumentParser()
24 |     parser.add_argument("--il","-il",type=str, default=500, help="Incident language")
25 |     parser.add_argument("--rl" ,"-rl", type=str, default=None, help="Related Languages")
26 |     parser.add_argument("--num_clusters","-nc",type=int, default=500, help="Number of clusters")
27 |     parser.add_argument("--iters","-it",type=int, default=500, help="Number of iterations")
28 |     parser.add_argument("--ca" ,"-ca", type=str, default="br", help="Clustering algorithm [brown,anchor,]")
29 |     parser.add_argument("--exp_dir" ,"-exp", type=str, default='', help="Experiment folder")
30 |     args = parser.parse_args()
31 |
32 |     #rls = "en.de.fr.it.es.ja.ar.cs.ru.sw-hcs".split('.')
33 |     rls = args.rl.split(",")
34 |     il = args.il
35 |
36 |     if il[:2]=='tl':
37 |         il = 'tl'
38 |     p_c_t = np.zeros([17,500])
39 |     t2id = LabelDictionary()
40 |     for rl in rls:
41 |         if il==rl: continue
42 |         model = "%s/models/%s2-%s.%s.%d.%d" % (args.exp_dir,rl,il,args.ca,args.num_clusters,args.iters)
43 |         temp = np.zeros([17,500])
44 |
45 |         for line in open(model,'r'):
46 |             line = line.strip('\n')
47 |             if line=='' or line=='0': continue
48 |             match = regex.match(line)
49 |             if match==None:
50 |                 # print("not found!",line)
51 |                 # pdb.set_trace()
52 |                 continue
53 |             # pdb.set_trace()
54 |             t = match.group("T")
55 |             c = int(match.group("C"))
56 |             ps = match.group("P")
57 |             if t=="<s>" or t=="</s>":
58 |                 continue
59 |             if ps[0]!="e":
60 |                 p = float(ps)
61 |             else:
62 |                 p = np.exp(float(ps[2:]))
63 |             tid = t2id.add(t)
64 |             p_c_t[tid,c] += p
65 |             temp[tid,c] = p
66 |         #END-FOR-LINE
67 |
68 |     #END-FOR-RLS
69 |
70 |     # normalize
71 |     for t in range(17):
72 |         # print(t2id.get_label_name(t),p_c_t[t,:].sum(),len(rls), (p_c_t[t,:]/len(rls)).sum() )
73 |         p_c_t[t,:] /= p_c_t[t,:].sum()
74 |
75 |     # print out result
76 |     outfile_fn = "%s/models/%s.%s.%d.500.comb" % (args.exp_dir,il,args.ca,args.num_clusters)
77 |     outfile = open(outfile_fn,'w')
78 |     print("0",file=outfile)
79 |     print('(0 (0 "<s>" "<s>" 1))',file=outfile)
80 |     print('(0 (0 "</s>" "</s>" 1))',file=outfile)
81 |     for t in range(17):
82 |         for c in range(500):
83 |             tag = t2id.get_label_name(t)
84 |             prob = str(p_c_t[t,c])
85 |             print('(0 (0 "%s" "%d" %s))' % (tag,c,prob ), file=outfile )
86 |             if p_c_t[t,c]==0:
87 |                 print(il,tag,c)
88 |     outfile.close()
89 |
90 |     for rl in rls:
91 |         model_name = "%s/models/%s2-%s.%s.%d.500.comb" % (args.exp_dir,rl,il,args.ca,args.num_clusters)
92 |         sp.Popen(["cp",outfile_fn,model_name])
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
--------------------------------------------------------------------------------
/src/code/tag_text.py:
--------------------------------------------------------------------------------
1 | from label_dictionary import LabelDictionary
2 | from collections import defaultdict
3 | from utils import *
4 | import os,sys
5 | import argparse
6 | import pdb
7 | import numpy as np
8 |
9 |
10 | START="<s>"
11 | END="</s>"
12 |
13 |
14 | if __name__ == "__main__":
15 |     parser = argparse.ArgumentParser()
16 |     parser.add_argument("--input","-i", type=str, help="Cluster dict")
17 |     parser.add_argument("--baseline","-b", type=str, default="brown", help="Clustering model used")
18 |     parser.add_argument("--mode","-m", type=str, help="train / eval")
19 |     parser.add_argument("--mapper","-v", type=str, help="label dict")
20 |     parser.add_argument("--clt_vocab","-c", type=str, help="cluster vocab")
21 |     parser.add_argument("--nclusters","-nc", type=int, default=50, help="number of clusters")
parser.add_argument("--output_pref","-op", type=str,default="train",help="output filename prefix") 23 | parser.add_argument("--subs","-subs", type=int,default=10000,help="subsample size for carmel") 24 | 25 | args = parser.parse_args() 26 | 27 | np.random.seed(42) 28 | w2cid = {} 29 | 30 | if args.mode == 'train': 31 | 32 | cl2cid = LabelDictionary() 33 | mapper_fn = os.path.join(os.path.dirname(args.clt_vocab),'clt.mapper') 34 | 35 | output_file = open(args.clt_vocab+".norm",'w') 36 | 37 | for line in open(args.clt_vocab,'r'): 38 | line = line.strip('\n') 39 | if line=='': continue 40 | w,c = '','' 41 | if args.baseline=='brown': 42 | c,w,_ = line.split('\t') 43 | elif args.baseline=='clark': 44 | w,c,_ = line.split(' ') 45 | elif args.baseline[0] in "lp": 46 | w,c = line.split('\t') 47 | elif args.baseline == "marlin": 48 | w,c = line.split(' ') 49 | 50 | cid = cl2cid.add(c) 51 | w2cid[w] = str(cid) 52 | print("%s\t%d" % (w,cid),file=output_file) 53 | ## 54 | saveObject(w2cid,mapper_fn) 55 | 56 | else: 57 | if args.mapper==None: 58 | print("Error: LabelDictionary object not specified!\nCheck arguments list with -h option") 59 | sys.exit(1) 60 | elif not os.path.exists(args.mapper): 61 | print("Error: LabelDictionary object does not exist!") 62 | sys.exit(1) 63 | else: 64 | w2cid = uploadObject(args.mapper) 65 | ## 66 | 67 | 68 | # pdb.set_trace() 69 | 70 | outfile = open(os.path.join(os.path.dirname(args.input), "%s.%d.%s.ctag" % (args.output_pref,args.nclusters,args.baseline) ),'w') 71 | outfile_carmel = open(os.path.join(os.path.dirname(args.input), "%s.%d.%s.carmel" % (args.output_pref,args.nclusters,args.baseline) ),'w') 72 | outfile_carmel_10k = open(os.path.join(os.path.dirname(args.input), "%s.%d.%s.carmel.10k" % (args.output_pref,args.nclusters,args.baseline) ),'w') 73 | lines = [] 74 | 75 | for line in open(args.input,'r'): 76 | line = line.strip('\n') 77 | if line=='': continue 78 | clts = [] 79 | for w in line.split(' '): 80 | if w == '#eos': continue 81 | if w not in w2cid: 82 | clts.append(w2cid[""]) 83 | else: 84 | clts.append(w2cid[w]) 85 | print(" ".join(clts),file=outfile) 86 | 87 | clts = [START] + clts + [END] 88 | txt = " ".join(['"%s"' % x for x in clts]) 89 | lines.append(txt) 90 | print("",file=outfile_carmel) 91 | print(txt,file=outfile_carmel) 92 | # print(" ".join(clts),file=outfile_carmel) 93 | 94 | ## 95 | idxs = np.arange(len(lines)) 96 | np.random.shuffle(idxs) 97 | for idx in idxs[:args.subs]: 98 | print("",file=outfile_carmel_10k) 99 | print(lines[idx],file=outfile_carmel_10k) 100 | ## -------------------------------------------------------------------------------- /src/marlin/basic/opt.h: -------------------------------------------------------------------------------- 1 | #ifndef __OPT_H__ 2 | #define __OPT_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | // First thing to call in main(). 
11 | void init_opt(int argc, char *argv[]);
12 |
13 | ////////////////////////////////////////////////////////////////////////
14 | // command-line arguments
15 |
16 | class GetOpt {
17 |  public:
18 |   GetOpt() { }
19 |
20 |   void AddOpt(const string &name, bool has_arg);
21 |   void Parse(int argc, char *argv[]);
22 |   int Lookup(const string &name) const;
23 |
24 |   bool Exists(const string &name) const;
25 |   string Get(const string &name, const string &default_value) const;
26 |   string Get(const string &name) const;
27 |   int GetInt(const string &name) const;
28 |   int GetInt(const string &name, int default_value) const;
29 |   double GetDouble(const string &name) const;
30 |   double GetDouble(const string &name, double default_value) const;
31 |
32 |  private:
33 |   vector< pair<string, bool> > opts;
34 |   vector<string> values;
35 | };
36 |
37 | template<class T> struct OptInfo {
38 |   OptInfo(const string &name, T *var, const string &msg, bool required)
39 |     : name(name), var(var), msg(msg), required(required) { }
40 |
41 |   string name;
42 |   T *var; // location of the variable that stores this value
43 |   string msg;
44 |   bool required;
45 | };
46 |
47 | extern vector< OptInfo<bool> > bool_opts;
48 | extern vector< OptInfo<int> > int_opts;
49 | extern vector< OptInfo<double> > double_opts;
50 | extern vector< OptInfo<string> > string_opts;
51 |
52 | ////////////////////////////////////////////////////////////
53 |
54 | // two versions: in one, option is required
55 | #define opt_define_bool_req(var, name, msg) \
56 |   bool var = opt_define_bool_wrap(name, &var, false, msg, true)
57 | #define opt_define_bool(var, name, val, msg) \
58 |   bool var = opt_define_bool_wrap(name, &var, val, msg, false)
59 | #define opt_define_int_req(var, name, msg) \
60 |   int var = opt_define_int_wrap(name, &var, 0, msg, true)
61 | #define opt_define_int(var, name, val, msg) \
62 |   int var = opt_define_int_wrap(name, &var, val, msg, false)
63 | #define opt_define_double_req(var, name, msg) \
64 |   double var = opt_define_double_wrap(name, &var, 0.0, msg, true)
65 | #define opt_define_double(var, name, val, msg) \
66 |   double var = opt_define_double_wrap(name, &var, val, msg, false)
67 | #define opt_define_string_req(var, name, msg) \
68 |   string var = opt_define_string_wrap(name, &var, "", msg, true)
69 | #define opt_define_string(var, name, val, msg) \
70 |   string var = opt_define_string_wrap(name, &var, val, msg, false)
71 |
72 | inline bool opt_define_bool_wrap(const string &name, bool *var, bool val, const string &msg, bool required) {
73 |   bool_opts.push_back(OptInfo<bool>(name, var, msg, required));
74 |   return val;
75 | }
76 |
77 | inline int opt_define_int_wrap(const string &name, int *var, int val, const string &msg, bool required) {
78 |   //printf("HELLO %s\n", name.c_str());
79 |   int_opts.push_back(OptInfo<int>(name, var, msg, required));
80 |   //printf("N %d\n", (int)int_opts.size());
81 |   return val;
82 | }
83 | inline double opt_define_double_wrap(const string &name, double *var, double val, const string &msg, bool required) {
84 |   double_opts.push_back(OptInfo<double>(name, var, msg, required));
85 |   return val;
86 | }
87 | inline string opt_define_string_wrap(const string &name, string *var, const string &val, const string &msg, bool required) {
88 |   string_opts.push_back(OptInfo<string>(name, var, msg, required));
89 |   return val;
90 | }
91 |
92 | ////////////////////////////////////////////////////////////
93 |
94 | void print_opts();
95 |
96 | extern int rand_seed;
97 |
98 | #endif
99 |
--------------------------------------------------------------------------------
/src/marlin/basic/stl-basic.h:
--------------------------------------------------------------------------------
1 | #ifndef __STL_BASIC_H__
2 | #define __STL_BASIC_H__
3 |
4 | #include "std.h"
5 | #include "city.h"
6 |
7 | ////////////////////////////////////////////////////////////
8 |
9 | typedef double real;
10 | //typedef float real;
11 |
12 | typedef pair<int, int> IntPair;
13 | typedef pair<int, double> IntDouble;
14 | typedef pair<double, int> DoubleInt;
15 | typedef pair<double, double> DoublePair;
16 | typedef vector<IntPair> IntPairVec;
17 | typedef vector<DoubleInt> DoubleIntVec;
18 | typedef vector<bool> BoolVec;
19 | typedef vector<int> IntVec;
20 | typedef vector<string> StringVec;
21 | typedef vector<IntVec> IntMat;
22 | typedef vector<IntVec> IntVecVec;
23 | typedef vector<IntVecVec> IntVecVecVec;
24 | typedef vector<IntVecVecVec> IntVecVecVecVec;
25 | typedef vector<double> DoubleVec;
26 | typedef vector<DoubleVec> DoubleVecVec;
27 | typedef vector<DoubleVecVec> DoubleVecVecVec;
28 | typedef vector<DoubleVecVecVec> DoubleVecVecVecVec;
29 | typedef vector<IntDouble> IntDoubleVec;
30 | typedef vector<IntDoubleVec> IntDoubleVecVec;
31 | typedef vector<IntDoubleVecVec> IntDoubleVecVecVec;
32 | typedef vector<IntDoubleVecVecVec> IntDoubleVecVecVecVec;
33 |
34 | typedef IntVec ivector;
35 | typedef DoubleVec fvector;
36 | typedef DoubleVecVec fmatrix;
37 |
38 | ////////////////////////////////////////////////////////////
39 |
40 | struct vector_eq {
41 |   bool operator()(const IntVec &v1, const IntVec &v2) const {
42 |     return v1 == v2;
43 |   }
44 | };
45 | struct vector_hf {
46 |   size_t operator()(const IntVec &v) const {
47 |     return CityHash64(reinterpret_cast<const char *>(&v[0]), sizeof(int) * v.size());
48 | #if 0
49 |     int h = 0;
50 |     foridx(i, len(v))
51 |       h = (h<<4)^(h>>28)^v[i];
52 |     return h;
53 | #endif
54 |   }
55 | };
56 |
57 | struct pair_eq {
58 |   bool operator()(const IntPair &p1, const IntPair &p2) const {
59 |     return p1 == p2;
60 |   }
61 | };
62 | struct pair_hf {
63 |   size_t operator()(const IntPair &p) const {
64 |     return (p.first<<4)^(p.first>>28) ^ p.second;
65 |   }
66 | };
67 |
68 | struct str_eq {
69 |   bool operator()(const char *s1, const char *s2) const {
70 |     return strcmp(s1, s2) == 0;
71 |   }
72 | };
73 | struct str_hf {
74 |   size_t operator()(const char *s) const {
75 |     return CityHash64(s, strlen(s));
76 |   }
77 | };
78 |
79 | struct string_eq {
80 |   bool operator()(const string &s1, const string &s2) const {
81 |     return s1 == s2;
82 |   }
83 | };
84 | struct string_hf {
85 |   size_t operator()(const string &s) const {
86 |     return CityHash64(s.c_str(), s.size());
87 |   }
88 | };
89 |
90 | ////////////////////////////////////////////////////////////
91 |
92 | typedef unordered_set<int> IntSet;
93 | typedef unordered_set<IntPair, pair_hf, pair_eq> IntPairSet;
94 | typedef unordered_set<IntVec, vector_hf, vector_eq> IntVecSet;
95 | typedef unordered_map<IntVec, double, vector_hf, vector_eq> IntVecDoubleMap;
96 | typedef unordered_map<IntVec, int, vector_hf, vector_eq> IntVecIntMap;
97 | typedef unordered_map<int, int> IntIntMap;
98 | typedef unordered_map<int, double> IntDoubleMap;
99 | typedef unordered_map<int, IntPair> IntIntPairMap;
100 | typedef unordered_map<int, IntVec> IntIntVecMap;
101 | typedef unordered_map<int, IntIntMap> IntIntIntMapMap;
102 | typedef unordered_map<IntPair, int, pair_hf, pair_eq> IntPairIntMap;
103 | typedef unordered_map<IntPair, double, pair_hf, pair_eq> IntPairDoubleMap;
104 | typedef unordered_map<IntPair, DoubleVec, pair_hf, pair_eq> IntPairDoubleVecMap;
105 | typedef unordered_map<IntVec, IntVec, vector_hf, vector_eq> IntVecIntVecMap;
106 | typedef unordered_map<IntVec, DoubleVec, vector_hf, vector_eq> IntVecDoubleVecMap;
107 | typedef vector<IntIntMap> IntIntMapVec;
108 |
109 | typedef vector<char *> StrVec;
110 | typedef unordered_map<const char *, int, str_hf, str_eq> StrIntMap;
111 | typedef unordered_map<const char *, const char *, str_hf, str_eq> StrStrMap;
112 |
113 | #endif
114 |
--------------------------------------------------------------------------------
/src/marlin/basic/city.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2011 Google, Inc.
/src/marlin/basic/city.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2011 Google, Inc.
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 | //
21 | // CityHash, by Geoff Pike and Jyrki Alakuijala
22 | //
23 | // This file provides a few functions for hashing strings. On x86-64
24 | // hardware in 2011, CityHash64() is faster than other high-quality
25 | // hash functions, such as Murmur. This is largely due to higher
26 | // instruction-level parallelism. CityHash64() and CityHash128() also perform
27 | // well on hash-quality tests.
28 | //
29 | // CityHash128() is optimized for relatively long strings and returns
30 | // a 128-bit hash. For strings more than about 2000 bytes it can be
31 | // faster than CityHash64().
32 | //
33 | // Functions in the CityHash family are not suitable for cryptography.
34 | //
35 | // WARNING: This code has not been tested on big-endian platforms!
36 | // It is known to work well on little-endian platforms that have a small penalty
37 | // for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs.
38 | //
39 | // By the way, for some hash functions, given strings a and b, the hash
40 | // of a+b is easily derived from the hashes of a and b. This property
41 | // doesn't hold for any hash functions in this file.
42 | 
43 | #ifndef CITY_HASH_H_
44 | #define CITY_HASH_H_
45 | 
46 | #include <stdlib.h>  // for size_t.
47 | #include <stdint.h>
48 | #include <utility>
49 | 
50 | typedef uint8_t uint8;
51 | typedef uint32_t uint32;
52 | typedef uint64_t uint64;
53 | typedef std::pair<uint64, uint64> uint128;
54 | 
55 | inline uint64 Uint128Low64(const uint128& x) { return x.first; }
56 | inline uint64 Uint128High64(const uint128& x) { return x.second; }
57 | 
58 | // Hash function for a byte array.
59 | uint64 CityHash64(const char *buf, size_t len);
60 | 
61 | // Hash function for a byte array. For convenience, a 64-bit seed is also
62 | // hashed into the result.
63 | uint64 CityHash64WithSeed(const char *buf, size_t len, uint64 seed);
64 | 
65 | // Hash function for a byte array. For convenience, two seeds are also
66 | // hashed into the result.
67 | uint64 CityHash64WithSeeds(const char *buf, size_t len,
68 |                            uint64 seed0, uint64 seed1);
69 | 
70 | // Hash function for a byte array.
71 | uint128 CityHash128(const char *s, size_t len);
72 | 
73 | // Hash function for a byte array. For convenience, a 128-bit seed is also
74 | // hashed into the result.
75 | uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed);
76 | 
77 | // Hash 128 input bits down to 64 bits of output.
78 | // This is intended to be a reasonably good hash function. 79 | inline uint64 Hash128to64(const uint128& x) { 80 | // Murmur-inspired hashing. 81 | const uint64 kMul = 0x9ddfea08eb382d69ULL; 82 | uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; 83 | a ^= (a >> 47); 84 | uint64 b = (Uint128High64(x) ^ a) * kMul; 85 | b ^= (b >> 47); 86 | b *= kMul; 87 | return b; 88 | } 89 | 90 | #endif // CITY_HASH_H_ 91 | -------------------------------------------------------------------------------- /src/code/makelmfsa_x.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | bool isfloat(wstring s) { 16 | if (s[0] == L'-' || isdigit(s[0])) return true; 17 | return false; 18 | } 19 | 20 | double tofloat(wstring s) { 21 | wstringstream ss(s); 22 | double d; 23 | ss >> d; 24 | return d; 25 | } 26 | 27 | 28 | int main(int argc, char** argv) { 29 | locale::global(locale("")); 30 | string infile_fn(argv[1]); 31 | string out1=infile_fn+".wfst"; 32 | string out2=infile_fn+".wfsa"; // <---------------- fout2 :: wfsa 33 | string in1= infile_fn; 34 | wifstream fin(in1.c_str()); 35 | wofstream fout(out1.c_str()); 36 | wofstream fout2(out2.c_str()); 37 | wstring l = L"",s; 38 | int order; 39 | double score,backoff; 40 | wstring dest; 41 | while (l[0] != L'n') getline(fin,l); 42 | while (l[0] == L'n') {getline(fin,l); order++;} 43 | int current_ngram = 1; 44 | fout << "FINAL" << endl; 45 | fout << "(START ( 1!))" << endl; 46 | fout2 << "FINAL" << endl; 47 | fout2 << "(START ( *e* 1!))" << endl; 48 | while (getline(fin,l)) { 49 | if (l[0] == L'\\') {current_ngram++; cout << current_ngram << endl; continue;} 50 | wstringstream ss(l); 51 | vector data; 52 | while (ss >> s) data.push_back(s); 53 | if (current_ngram > 0 && data.size() > 1 && isfloat(data[0])) { 54 | score=pow(10.0,tofloat(data[0])); 55 | //if (score < 0.1) continue; 56 | if (current_ngram == 1) { 57 | //cout << data[1] << " " << score << endl; 58 | if (data.size() > 2) { 59 | backoff = pow(10.0,tofloat(data[2])); 60 | dest = data[1]; 61 | fout << "(" << data[1] << " (NULL *e* *e* " << backoff << "!))" << endl; 62 | fout2 << "(" << data[1] << " (NULL *e* *e* " << backoff << "!))" << endl; 63 | } 64 | else { 65 | if (data[1] == L"") 66 | dest = data[1]; 67 | else 68 | dest = L"NULL"; 69 | } 70 | if (dest != L"") { 71 | if (data[current_ngram] == L"") dest = L"FINAL"; 72 | fout << "(NULL (" << dest << " " << data[1] << " " << data[1] << " " << score << "!))" << endl; 73 | fout2 << "(NULL (" << dest << " *e* " << data[1] << " " << score << "!))" << endl; 74 | } 75 | continue; 76 | } 77 | if (current_ngram < order) { 78 | if (data.size() > current_ngram+1) { 79 | backoff = pow(10.0,tofloat(data[current_ngram+1])); 80 | dest = data[1]; 81 | for (int i = 2; i <= current_ngram; i++) dest += L"."+data[i]; 82 | wstring _dest = data[2]; 83 | for (int i = 3; i <= current_ngram; i++) _dest+= L"."+data[i]; 84 | fout << L"(" << dest << " (" << _dest << " *e* *e* " << backoff << "!))" << endl; 85 | fout2 << L"(" << dest << " (" << _dest << " *e* *e* " << backoff << "!))" << endl; 86 | } 87 | else { 88 | dest = data[2]; 89 | for (int i = 3; i <= current_ngram; i++) dest+= L"."+data[i]; 90 | } 91 | wstring dest_ = data[1]; 92 | for (int i = 2; i < current_ngram; i++) dest_+= L"."+data[i]; 93 | if (data[current_ngram] == L"") dest = L"FINAL"; 94 | 
fout << "(" << dest_ << " (" << dest << " " << data[current_ngram] << " " << data[current_ngram] << " " << score << "!))" << endl; 95 | fout2 << "(" << dest_ << " (" << dest << " *e* " << data[current_ngram] << " " << score << "!))" << endl; 96 | continue; 97 | } 98 | if (current_ngram == order) { 99 | wstring dest_ = data[1]; 100 | for (int i = 2; i < current_ngram; i++) dest_+= L"."+data[i]; 101 | dest = data[2]; 102 | for (int i = 3; i <= current_ngram; i++) dest += L"."+data[i]; 103 | if (data[current_ngram] == L"") dest = L"FINAL"; 104 | fout << "(" << dest_ << " (" << dest << " " << data[current_ngram] << " " << data[current_ngram] << " " << score << "!))" << endl; 105 | fout2 << "(" << dest_ << " (" << dest << " *e* " << data[current_ngram] << " " << score << "!))" << endl; 106 | } 107 | 108 | } 109 | } 110 | return 0; 111 | } 112 | -------------------------------------------------------------------------------- /src/marlin/basic/std.h: -------------------------------------------------------------------------------- 1 | #ifndef __STD_H__ 2 | #define __STD_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | using namespace std; 21 | 22 | //////////////////////////////////////////////////////////// 23 | 24 | #define len(vec) (int)(vec).size() 25 | #define sq(x) ((x)*(x)) 26 | 27 | // For loop sugar. This is such a hack! 28 | #define foridx(i, n) for(int i = 0; i < n; i++) 29 | #define forvec(i, tx, x, vec) for(int i = 0, _##i = 0; i < len(vec); i++) \ 30 | for(tx x = (vec)[i]; i == _##i; _##i++) 31 | #define formap(tx, x, ty, y, t, map) forstl(t, _##x##y, map) _mapvars(tx, x, ty, y) 32 | #define forcmap(tx, x, ty, y, t, map) forcstl(t, _##x##y, map) _mapvars(tx, x, ty, y) 33 | #define forstl(t, x, container) for(t::iterator x = (container).begin(); x != (container).end(); x++) 34 | #define forcstl(t, x, container) for(t::const_iterator x = (container).begin(); x != (container).end(); x++) 35 | #define _mapvars(tx, x, ty, y) for(tx x = _##x##y->first, *_##x = &x; _##x; _##x = NULL) \ 36 | for(ty y = _##x##y->second, *_##y = &y; _##y; _##y = NULL) 37 | 38 | //////////////////////////////////////////////////////////// 39 | // Generate random numbers. 40 | 41 | inline int mrand(int a) { return rand() % a; } 42 | inline int mrand(int a, int b) { return rand() % (b-a) + a; } 43 | inline double rand_double() { 44 | static const int BASE = 100000; 45 | return (double)(rand()%BASE)/BASE; 46 | } 47 | 48 | //////////////////////////////////////////////////////////// 49 | // Floating point stuff. 50 | 51 | const double TOL = 1e-10; 52 | 53 | inline bool flt(double u, double v) { return u + TOL < v; } 54 | inline bool fgt(double u, double v) { return u - TOL > v; } 55 | 56 | // Comparing floating point numbers. 
57 | inline bool feq(double u, double v, double tol = TOL) { return fabs(u-v) < tol; }
58 | 
59 | template<class T> inline int sign(T u) {
60 |   if(u < 0) return -1;
61 |   if(u > 0) return 1;
62 |   return 0;
63 | }
64 | 
65 | #define assert_feq(u, v) do { _assert_feq(u, v, __FILE__, __LINE__); } while(0);
66 | #define assert_feq2(u, v, tol) do { _assert_feq(u, v, tol, __FILE__, __LINE__); } while(0);
67 | #define assert_fneq(u, v) do { _assert_fneq(u, v, __FILE__, __LINE__); } while(0);
68 | inline void _assert_feq(double u, double v, const char *file, int line) {
69 |   if(!feq(u, v)) { printf("At %s:%d, %f != %f\n", file, line, u, v); assert(0); }
70 | }
71 | inline void _assert_feq(double u, double v, double tol, const char *file, int line) {
72 |   if(!feq(u, v, tol)) { printf("At %s:%d, %f != %f\n", file, line, u, v); assert(0); }
73 | }
74 | inline void _assert_fneq(double u, double v, const char *file, int line) {
75 |   if(feq(u, v)) { printf("At %s:%d, %f == %f\n", file, line, u, v); assert(0); }
76 | }
77 | #define assert_eq(u, v) do { _assert_eq(u, v, __STRING(u), __STRING(v), __FILE__, __LINE__); } while(0)
78 | template<class T> inline void _assert_eq(const T &u, const T &v, const char *us, const char *vs, const char *file, int line) {
79 |   if(u != v) {
80 |     cout << "At " << file << ':' << line << ", " <<
81 |       us << '(' << u << ')' << " != " <<
82 |       vs << '(' << v << ')' << endl;
83 |     assert(0);
84 |   }
85 | }
86 | 
87 | #define assert2(x, reason) \
88 |   do { \
89 |     if(!(x)) { \
90 |       cout << "\nFAILURE REASON: " << reason << endl; \
91 |       assert(x); \
92 |     } \
93 |   } while(0)
94 | 
95 | string now();
96 | string hostname();
97 | int cpu_speed_mhz();
98 | int mem_usage(); // in kB
99 | 
100 | bool create_file(const char *file);
101 | bool file_exists(const char *file);
102 | time_t file_modified_time(const char *file);
103 | 
104 | string strip_dir(string s);
105 | string get_dir(string s);
106 | string file_base(string s);
107 | bool get_files_in_dir(string dirname, bool fullpath, vector<string> &files);
108 | 
109 | #endif
110 | --------------------------------------------------------------------------------
/src/code/setup_ud-treebank_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -e
4 | 
5 | 
6 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../..
>/dev/null 2>&1 && pwd )" 7 | CODE_DIR="$BASEDIR/src/code" 8 | UR_DIR="$BASEDIR/src/uroman" 9 | UD_DIR="$HOME/ud-treebanks-v2.2" 10 | data_dir=$BASEDIR/lm_data 11 | 12 | # format UD 13 | langs="en da nl de fr it es pt ja cs ru pl ar fa id hi" 14 | 15 | 16 | while [ $# -gt 1 ] 17 | do 18 | key="$1" 19 | case $key in 20 | -td|--tb_dir) # treebank directory 21 | UD_DIR="$2" 22 | shift # past argument 23 | ;; 24 | *) 25 | # unknown option 26 | ;; 27 | esac 28 | shift 29 | done 30 | 31 | 32 | ######################################################################################## 33 | 34 | 35 | for lang in $langs; do 36 | mkdir -p $data_dir/$lang; 37 | done 38 | 39 | split="train" 40 | # english treebanks 41 | cat $UD_DIR/UD_English-EWT/en_ewt-ud-$split.conllu \ 42 | $UD_DIR/UD_English-GUM/en_gum-ud-$split.conllu \ 43 | $UD_DIR/UD_English-LinES/en_lines-ud-$split.conllu \ 44 | $UD_DIR/UD_English-ParTUT/en_partut-ud-$split.conllu \ 45 | > $data_dir/en/$split.conllu 46 | 47 | # Danish 48 | cp $UD_DIR/UD_Danish-DDT/da_ddt-ud-$split.conllu \ 49 | $data_dir/da/$split.conllu 50 | 51 | # Dutch 52 | cat $UD_DIR/UD_Dutch-Alpino/nl_alpino-ud-$split.conllu \ 53 | $UD_DIR/UD_Dutch-LassySmall/nl_lassysmall-ud-$split.conllu \ 54 | > $data_dir/nl/$split.conllu 55 | 56 | # german 57 | cp $UD_DIR/UD_German-GSD/de_gsd-ud-$split.conllu \ 58 | $data_dir/de/$split.conllu 59 | 60 | 61 | # french treebanks 62 | cat $UD_DIR/UD_French-GSD/fr_gsd-ud-$split.conllu \ 63 | $UD_DIR/UD_French-ParTUT/fr_partut-ud-$split.conllu \ 64 | $UD_DIR/UD_French-Sequoia/fr_sequoia-ud-$split.conllu \ 65 | $UD_DIR/UD_French-Spoken/fr_spoken-ud-$split.conllu \ 66 | > $data_dir/fr/$split.conllu 67 | 68 | # spanish 69 | cat $UD_DIR/UD_Spanish-AnCora/es_ancora-ud-$split.conllu \ 70 | $UD_DIR/UD_Spanish-GSD/es_gsd-ud-$split.conllu \ 71 | > $data_dir/es/$split.conllu 72 | 73 | # italian 74 | cat $UD_DIR/UD_Italian-ISDT/it_isdt-ud-$split.conllu \ 75 | $UD_DIR/UD_Italian-ParTUT/it_partut-ud-$split.conllu \ 76 | $UD_DIR/UD_Italian-PoSTWITA/it_postwita-ud-$split.conllu \ 77 | > $data_dir/it/$split.conllu 78 | 79 | # portuguese 80 | cat $UD_DIR/UD_Portuguese-Bosque/pt_bosque-ud-$split.conllu \ 81 | $UD_DIR/UD_Portuguese-GSD/pt_gsd-ud-$split.conllu \ 82 | > $data_dir/pt/$split.conllu 83 | 84 | 85 | # japanese 86 | cat $UD_DIR/UD_Japanese-GSD/ja_gsd-ud-$split.conllu \ 87 | $UD_DIR/UD_Japanese-BCCWJ/ja_bccwj-ud-$split.conllu \ 88 | > $data_dir/ja/$split.conllu.all 89 | 90 | cat $UD_DIR/UD_Japanese-GSD/ja_gsd-ud-$split.conllu \ 91 | > $data_dir/ja/$split.conllu 92 | 93 | 94 | # czech 95 | cat $UD_DIR/UD_Czech-PDT/cs_pdt-ud-$split.conllu \ 96 | $UD_DIR/UD_Czech-CAC/cs_cac-ud-$split.conllu \ 97 | $UD_DIR/UD_Czech-FicTree/cs_fictree-ud-$split.conllu \ 98 | > $data_dir/cs/$split.conllu 99 | 100 | # russian 101 | cat $UD_DIR/UD_Russian-GSD/ru_gsd-ud-$split.conllu \ 102 | $UD_DIR/UD_Russian-SynTagRus/ru_syntagrus-ud-$split.conllu \ 103 | $UD_DIR/UD_Russian-Taiga/ru_taiga-ud-$split.conllu \ 104 | > $data_dir/ru/$split.conllu 105 | 106 | # polish 107 | cat $UD_DIR/UD_Polish-LFG/pl_lfg-ud-$split.conllu \ 108 | $UD_DIR/UD_Polish-SZ/pl_sz-ud-$split.conllu \ 109 | > $data_dir/pl/$split.conllu 110 | 111 | 112 | # arabic 113 | cat $UD_DIR/UD_Arabic-PADT/ar_padt-ud-$split.conllu \ 114 | $UD_DIR/UD_Arabic-NYUAD/ar_nyuad-ud-$split.conllu \ 115 | > $data_dir/ar/$split.conllu.all 116 | 117 | cat $UD_DIR/UD_Arabic-PADT/ar_padt-ud-$split.conllu \ 118 | > $data_dir/ar/$split.conllu 119 | 120 | # persian 121 | cp $UD_DIR/UD_Persian-Seraji/fa_seraji-ud-$split.conllu \ 122 | 
$data_dir/fa/$split.conllu 123 | 124 | 125 | # Indonesian 126 | cp $UD_DIR/UD_Indonesian-GSD/id_gsd-ud-$split.conllu \ 127 | $data_dir/id/$split.conllu 128 | 129 | # hindi 130 | cp $UD_DIR/UD_Hindi-HDTB/hi_hdtb-ud-$split.conllu \ 131 | $data_dir/hi/$split.conllu 132 | 133 | 134 | for lang in $langs; do 135 | echo "lang- split :: $lang - $split" 136 | grep -v "^#" $data_dir/$lang/$split.conllu | grep -v "^\s*$" | \ 137 | grep -vP "^[0-9]+-[0-9]+" > temp1 138 | 139 | cp temp1 temp2 140 | if [ $lang = "ja" ] || [ $lang = "fa" ] || [ $lang = "ar" ] || [ $lang = "ru" ] || [ $lang = "hi" ]; then 141 | bash $CODE_DIR/rom_conllu.sh $lang temp1 temp2 $UR_DIR 142 | fi 143 | 144 | mv temp2 $data_dir/$lang/$split.conllu 145 | rm temp1 146 | 147 | if [ $lang = "ja" ] || [ $lang = "ar" ]; then 148 | grep -v "^#" $data_dir/$lang/$split.conllu.all | grep -v "^\s*$" | \ 149 | grep -vP "^[0-9]+-[0-9]+" > temp1 150 | 151 | mv temp1 $data_dir/$lang/$split.conllu.all 152 | fi 153 | done 154 | 155 | -------------------------------------------------------------------------------- /src/code/makelmfsa.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | bool isfloat(wstring s) { 16 | if (s[0] == L'-' || isdigit(s[0])) return true; 17 | return false; 18 | } 19 | 20 | double tofloat(wstring s) { 21 | wstringstream ss(s); 22 | double d; 23 | ss >> d; 24 | return d; 25 | } 26 | 27 | 28 | int main(int argc, char** argv) { 29 | locale::global(locale("")); 30 | string infile_fn(argv[1]); 31 | string out1=infile_fn+".wfst"; 32 | string out2=infile_fn+".wfsa"; 33 | string in1= infile_fn; 34 | wifstream fin(in1.c_str()); 35 | wofstream fout(out1.c_str()); 36 | wofstream fout2(out2.c_str()); 37 | wstring l = L"",s; 38 | int order; 39 | double score,backoff; 40 | wstring dest; 41 | while (l[0] != L'n') getline(fin,l); 42 | while (l[0] == L'n') {getline(fin,l); order++;} 43 | int current_ngram = 1; 44 | fout << "FINAL" << endl; 45 | fout << "(START ( \"\" \"\" 1!))" << endl; 46 | fout2 << "FINAL" << endl; 47 | fout2 << "(START ( *e* \"\" 1!))" << endl; 48 | while (getline(fin,l)) { 49 | if (l[0] == L'\\') {current_ngram++; cout << current_ngram << endl; continue;} 50 | wstringstream ss(l); 51 | vector data; 52 | while (ss >> s) data.push_back(s); 53 | if (current_ngram > 0 && data.size() > 1 && isfloat(data[0])) { 54 | score=pow(10.0,tofloat(data[0])); 55 | //if (score < 0.1) continue; 56 | if (current_ngram == 1) { 57 | //cout << data[1] << " " << score << endl; 58 | if (data.size() > 2) { 59 | backoff = pow(10.0,tofloat(data[2])); 60 | dest = data[1]; 61 | fout << "(" << data[1] << " (NULL *e* *e* " << backoff << "!))" << endl; 62 | fout2 << "(" << data[1] << " (NULL *e* *e* " << backoff << "!))" << endl; 63 | } 64 | else { 65 | if (data[1] == L"") 66 | dest = data[1]; 67 | else 68 | dest = L"NULL"; 69 | } 70 | if (dest != L"") { 71 | if (data[current_ngram] == L"") dest = L"FINAL"; 72 | fout << "(NULL (" << dest << " \"" << data[1] << "\" \"" << data[1] << "\" " << score << "!))" << endl; 73 | fout2 << "(NULL (" << dest << " *e* \"" << data[1] << "\" " << score << "!))" << endl; 74 | } 75 | continue; 76 | } 77 | if (current_ngram < order) { 78 | if (data.size() > current_ngram+1) { 79 | backoff = pow(10.0,tofloat(data[current_ngram+1])); 80 | // dest = L""; 81 | // for (int i = 1; i <= current_ngram; i++) 
dest+=data[i]; 82 | dest = data[1]; 83 | for (int i = 2; i <= current_ngram; i++) dest += L"."+data[i]; 84 | // wstring _dest = L""; 85 | // for (int i = 2; i <= current_ngram; i++) _dest+=data[i]; 86 | wstring _dest = data[2]; 87 | for (int i = 3; i <= current_ngram; i++) _dest+= L"."+data[i]; 88 | fout << L"(" << dest << " (" << _dest << " *e* *e* " << backoff << "!))" << endl; 89 | fout2 << L"(" << dest << " (" << _dest << " *e* *e* " << backoff << "!))" << endl; 90 | } 91 | else { 92 | // dest = L""; 93 | // for (int i = 2; i <= current_ngram; i++) dest+=data[i]; 94 | dest = data[2]; 95 | for (int i = 3; i <= current_ngram; i++) dest+= L"."+data[i]; 96 | } 97 | // wstring dest_ = L""; 98 | // for (int i = 1; i < current_ngram; i++) dest_+=data[i]; 99 | wstring dest_ = data[1]; 100 | for (int i = 2; i < current_ngram; i++) dest_+= L"."+data[i]; 101 | if (data[current_ngram] == L"") dest = L"FINAL"; 102 | fout << "(" << dest_ << " (" << dest << " \"" << data[current_ngram] << "\" \"" << data[current_ngram] << "\" " << score << "!))" << endl; 103 | fout2 << "(" << dest_ << " (" << dest << " *e* \"" << data[current_ngram] << "\" " << score << "!))" << endl; 104 | continue; 105 | } 106 | if (current_ngram == order) { 107 | // wstring dest_ = L""; 108 | // for (int i = 1; i < current_ngram; i++) dest_+=data[i]; 109 | wstring dest_ = data[1]; 110 | for (int i = 2; i < current_ngram; i++) dest_+= L"."+data[i]; 111 | // dest = L""; 112 | // for (int i = 2; i <= current_ngram; i++) dest+=data[i]; 113 | dest = data[2]; 114 | for (int i = 3; i <= current_ngram; i++) dest += L"."+data[i]; 115 | if (data[current_ngram] == L"") dest = L"FINAL"; 116 | fout << "(" << dest_ << " (" << dest << " \"" << data[current_ngram] << "\" \"" << data[current_ngram] << "\" " << score << "!))" << endl; 117 | fout2 << "(" << dest_ << " (" << dest << " *e* \"" << data[current_ngram] << "\" " << score << "!))" << endl; 118 | } 119 | 120 | } 121 | } 122 | return 0; 123 | } 124 | -------------------------------------------------------------------------------- /src/code/elisa2flat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import sys 4 | import codecs 5 | 6 | from lxml import etree as ET # pip install lxml 7 | #from xml.etree import ElementTree as ET 8 | from collections import defaultdict as dd 9 | import re 10 | import os.path 11 | import gzip 12 | scriptdir = os.path.dirname(os.path.abspath(__file__)) 13 | 14 | reader = codecs.getreader('utf8') 15 | writer = codecs.getwriter('utf8') 16 | 17 | 18 | def prepfile(fh, code): 19 | ret = gzip.open(fh.name, code if code.endswith("t") else code+"t") if fh.name.endswith(".gz") else fh 20 | if sys.version_info[0] == 2: 21 | if code.startswith('r'): 22 | ret = reader(fh) 23 | elif code.startswith('w'): 24 | ret = writer(fh) 25 | else: 26 | sys.stderr.write("I didn't understand code "+code+"\n") 27 | sys.exit(1) 28 | return ret 29 | 30 | # this code is used below but not in this form 31 | #http://stackoverflow.com/questions/7171140/using-python-iterparse-for-large-xml-files 32 | # def fast_iter(context, func, *args, **kwargs): 33 | # """ 34 | # http://lxml.de/parsing.html#modifying-the-tree 35 | # Based on Liza Daly's fast_iter 36 | # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ 37 | # See also http://effbot.org/zone/element-iterparse.htm 38 | # """ 39 | # for event, elem in context: 40 | # func(elem, *args, **kwargs) 41 | # # It's safe to call clear() here because no 
descendants will be 42 | # # accessed 43 | # elem.clear() 44 | # # Also eliminate now-empty references from the root node to elem 45 | # for ancestor in elem.xpath('ancestor-or-self::*'): 46 | # while ancestor.getprevious() is not None: 47 | # del ancestor.getparent()[0] 48 | # del context 49 | 50 | 51 | # def process_element(elem): 52 | # print elem.xpath( 'description/text( )' ) 53 | 54 | # context = etree.iterparse( MYFILE, tag='item' ) 55 | # fast_iter(context,process_element) 56 | 57 | def main(): 58 | parser = argparse.ArgumentParser(description="Given a compressed elisa xml file and list of attributes, print them out, tab separated", 59 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 60 | parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('rb'), default=sys.stdin, help="input file") 61 | parser.add_argument("--fields", "-f", nargs='+', help="list of fields to extract text from. if attribute is desired, use field.attribute. Separate fallback fields with :") 62 | parser.add_argument("--segment", "-s", default="SEGMENT", help="segment name. pre v4, PARALLEL for x-eng, SEGMENT for monolingual. Otherwise SEGMENT. More than one match per segment will be concatenated") 63 | parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="output file") 64 | 65 | 66 | 67 | try: 68 | args = parser.parse_args() 69 | except IOError as msg: 70 | parser.error(str(msg)) 71 | 72 | 73 | infile = args.infile 74 | infile = gzip.open(infile.name, 'rb') if infile.name.endswith(".gz") else infile 75 | outfile = prepfile(args.outfile, 'w') 76 | 77 | 78 | ctxt = ET.iterparse(infile, events=("end", "start")) 79 | # don't delete when in the middle of an element you want to investigate 80 | lock = False 81 | for event, element in ctxt: 82 | if event == "start" and element.tag == args.segment: 83 | lock = True 84 | if event == "end" and element.tag == args.segment: 85 | outfields = [] 86 | for fieldopts in args.fields: 87 | wrotesomething = False 88 | fieldopts = fieldopts.split(":") 89 | while len(fieldopts) > 0: 90 | field = fieldopts.pop(0) 91 | subfields = field.split(".") 92 | matches = [element,] if subfields[0] == args.segment else element.findall(".//"+subfields[0]) 93 | for match in matches: 94 | value = match.get(subfields[1]) if len(subfields) > 1 else match.text 95 | value = value.replace('\n', ' ') if value is not None else None 96 | value = value.replace('\t', ' ') if value is not None else None 97 | if value is not None: 98 | outfields.append(value) 99 | wrotesomething = True 100 | del matches 101 | if wrotesomething: 102 | break 103 | if not wrotesomething: 104 | outfields.append("") 105 | ostr = "\t".join(outfields)+"\n" 106 | outfile.write(ostr) 107 | lock = False 108 | # recover memory 109 | if event == "end" and not lock: 110 | element.clear() 111 | for ancestor in element.xpath('ancestor-or-self::*'): 112 | while ancestor.getprevious() is not None and ancestor.getparent() is not None and ancestor.getparent()[0] is not None: 113 | del ancestor.getparent()[0] 114 | del ctxt 115 | 116 | 117 | if __name__ == '__main__': 118 | main() 119 | 120 | -------------------------------------------------------------------------------- /src/marlin/marlin_count: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | marlin_count, extract bigram staitics from a file. 
4 | """
5 | 
6 | import optparse
7 | import collections
8 | import gzip
9 | import bz2
10 | 
11 | 
12 | def myopen(filename, mode='r'):
13 |     """
14 |     Open file. Use gzip or bzip2 if appropriate.
15 |     """
16 |     if filename.endswith('.gz'):
17 |         return gzip.open(filename, mode)
18 | 
19 |     if filename.endswith('.bz2'):
20 |         return bz2.BZ2File(filename, mode)
21 | 
22 |     return open(filename, mode)
23 | 
24 | 
25 | def read_unigram_counts(file_handle, sent_limit=-1):
26 |     """
27 |     Read unigram counts from text file.
28 |     If sent_limit is positive, only read that many sentences/lines.
29 |     """
30 |     unigram_counts = collections.defaultdict(int)
31 |     for number, line in enumerate(file_handle):
32 |         tokens = line.split()
33 |         for token in tokens:
34 |             unigram_counts[token] += 1
35 |         if sent_limit >= 0 and number + 1 >= sent_limit:
36 |             break
37 |     return unigram_counts
38 | 
39 | 
40 | def write_words(file_handle, table):
41 |     """
42 |     Write words in order of their index to file.
43 |     """
44 |     table_inv = [None] * len(table)
45 |     for word, index in table.items():
46 |         table_inv[index] = word
47 |     for index, word in enumerate(table_inv):
48 |         file_handle.write(word)
49 |         file_handle.write('\n')
50 | 
51 | 
52 | def read_bigram_counts(file_handle, table, stop_index, rare_index, sent_limit):
53 |     """
54 |     Count bigram co-occurrences. Use rare_index for words not in table.
55 |     If sent_limit is positive, only read that many sentences/lines.
56 |     """
57 |     counts = []
58 |     for _ in table:
59 |         counts.append(collections.defaultdict(int))
60 |     for number, line in enumerate(file_handle):
61 |         tokens = line.split()
62 |         last = stop_index
63 |         for token in tokens:
64 |             current = table.get(token, None)
65 |             if current is None:
66 |                 current = rare_index
67 |             counts[last][current] += 1
68 |             last = current
69 |         counts[last][stop_index] += 1
70 |         if sent_limit >= 0 and number + 1 >= sent_limit:
71 |             break
72 |     return counts
73 | 
74 | 
75 | def write_bigram_counts(file_handle, counts):
76 |     """
77 |     Write bigram statistics to file.
78 |     """
79 |     for _, neighbors in enumerate(counts):
80 |         items = []
81 |         for neighbor, count in neighbors.items():
82 |             items.append('%d:%d' % (neighbor, count))
83 |         print >>file_handle, ' '.join(items)
84 | 
85 | 
86 | def main():
87 |     """
88 |     Main function.
89 |     """
90 | 
91 |     parser = optparse.OptionParser()
92 |     parser.add_option("-t", "--text", dest="text",
93 |                       help="Input text. 
(one sentence per line, whitespace separated)", 94 | metavar="FILE") 95 | parser.add_option("-w", "--words", dest="words", 96 | help="Output: Word list.", metavar="FILE") 97 | parser.add_option("-b", "--bigrams", dest="bigrams", 98 | help="Output: Bigrams counts.", metavar="FILE") 99 | parser.add_option("-r", "--rank-limit", dest="rank_limit", default=250000, 100 | help="If positive, only extract the r most frequent words.", 101 | type=int) 102 | parser.add_option("-s", "--sent-limit", dest="sent_limit", default=-1, 103 | help="If positive, only process the s first sentences/lines.", 104 | type=int) 105 | 106 | options, _ = parser.parse_args() 107 | 108 | with myopen(options.text) as file_handle: 109 | unigram_counts = read_unigram_counts(file_handle, options.sent_limit) 110 | 111 | table = {} 112 | stop_index = 0 113 | rare_index = 1 114 | table[''] = stop_index 115 | table[''] = rare_index 116 | 117 | unigram_counts = unigram_counts.items() 118 | unigram_counts.sort(key=lambda (word, count): count, reverse=True) 119 | # Add high rank words to table 120 | for word, _ in unigram_counts: 121 | table[word] = len(table) 122 | if options.rank_limit >= 0 and len(table) >= options.rank_limit: 123 | break 124 | # Don't need this anymore: 125 | del unigram_counts 126 | 127 | with myopen(options.words, 'w') as file_handle: 128 | write_words(file_handle, table) 129 | 130 | with myopen(options.text) as file_handle: 131 | bigram_counts = read_bigram_counts(file_handle, table, stop_index, rare_index, 132 | options.sent_limit) 133 | 134 | with myopen(options.bigrams, 'w') as file_handle: 135 | write_bigram_counts(file_handle, bigram_counts) 136 | 137 | 138 | if __name__ == '__main__': 139 | main() 140 | 141 | -------------------------------------------------------------------------------- /utagger: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | 6 | LM=2 # LM order 7 | CHANNEL="" 8 | INPUT="" 9 | OUTPUT="output" 10 | 11 | MODE="train" # [train,test] 12 | ROM="false" 13 | IL="xx" # incident language 14 | PL="en" # parent language(s), comman separated 15 | 16 | BASELINE="brown" 17 | NCLUSTERS=500 18 | NJOBS=2 19 | NITERS=500 20 | 21 | W_LM=1 22 | W_CM=1 23 | 24 | INPUT_FORMAT="txt" #[elisa:tgz,xml;txt,bio] 25 | BIO_DELIM="" 26 | LRLP_FIELD="ULF_LRLP_TOKENIZED_SOURCE" 27 | 28 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 29 | EXP_DIR="$BASEDIR/exp-cipher" 30 | CODE_DIR="$BASEDIR/src/code" 31 | 32 | if [ -z "$CARMEL_DIR" ]; then 33 | CARMEL_DIR="/usr/local" 34 | fi 35 | if [ -z "$SRILM_DIR" ]; then 36 | SRILM_DIR="$HOME/srilm-1.7.2" 37 | fi 38 | 39 | 40 | while [ $# -gt 1 ] 41 | do 42 | key="$1" 43 | case $key in 44 | -lm_o|--lm_order) 45 | LM="$2" 46 | shift # past argument 47 | ;; 48 | -i|--input) 49 | INPUT="$2" 50 | shift # past argument 51 | ;; 52 | -if|--in_format) 53 | INPUT_FORMAT="$2" 54 | shift # past argument 55 | ;; 56 | -bio_delim|--bio_delim) 57 | BIO_DELIM="$2" 58 | shift # past argument 59 | ;; 60 | -m|--mode) 61 | MODE="$2" 62 | shift # past argument 63 | ;; 64 | -rom|--rom) 65 | ROM="$2" 66 | shift # past argument 67 | ;; 68 | -cl|--child_lang) 69 | IL="$2" 70 | shift # past argument 71 | ;; 72 | -pl|--par_langs) 73 | PL="$2" 74 | shift # past argument 75 | ;; 76 | -o|--output) 77 | OUTPUT="$2" 78 | shift # past argument 79 | ;; 80 | -exp|--exp_dir) 81 | EXP_DIR="$2" 82 | shift # past argument 83 | ;; 84 | -ca|--clust_alg) 85 | BASELINE="$2" 86 | shift # past argument 87 | ;; 88 | 
-nc|--nclusters) 89 | NCLUSTERS="$2" 90 | shift # past argument 91 | ;; 92 | -nj|--njobs) 93 | NJOBS="$2" 94 | shift # past argument 95 | ;; 96 | -wlm|--wlm) 97 | W_LM="$2" 98 | shift # past argument 99 | ;; 100 | -wcm|--wcm) 101 | W_CM="$2" 102 | shift # past argument 103 | ;; 104 | -carmel|--carmel) 105 | CARMEL_DIR="$2" 106 | shift # past argument 107 | ;; 108 | -sri|--sridir) 109 | SRILM_DIR="$2" 110 | shift # past argument 111 | ;; 112 | *) 113 | # unknown option 114 | ;; 115 | esac 116 | shift 117 | done 118 | 119 | export CARMEL_DIR=$CARMEL_DIR 120 | export SRILM_DIR=$SRILM_DIR 121 | export EXP_DIR=$EXP_DIR 122 | 123 | 124 | PL_CODES=(${PL//,/ }) 125 | mkdir -p $EXP_DIR/logs $EXP_DIR/models $EXP_DIR/data 126 | datadir="$EXP_DIR/data" 127 | 128 | 129 | if [ $OUTPUT = "output" ];then 130 | OUTPUT="$datadir/output.tagged" 131 | fi 132 | 133 | 134 | ########################################################## 135 | # PREPROCESS 136 | ########################################################## 137 | 138 | #extract from xml 139 | echo "Extracting..." 140 | 141 | if [ $INPUT_FORMAT = "xml" ]; then 142 | python3 src/code/elisa2flat.py --infile $INPUT \ 143 | --fields $LRLP_FIELD \ 144 | --outfile $datadir/input.raw 145 | elif [ $INPUT_FORMAT = "bio" ]; then 146 | cat $INPUT | cut -f 1 -d " " | sed 's/^$/#eos/g' | tr '\n' ' ' | \ 147 | sed 's/ #eos /\n/g' > "$datadir"/input.raw 148 | else 149 | cp $INPUT $datadir/input.raw 150 | fi 151 | 152 | # preprocess 153 | echo "Preprocessing..." 154 | bash $CODE_DIR/preprocess.sh -i $datadir/input.raw -rom $ROM -m $MODE -l input -exp $EXP_DIR 155 | 156 | 157 | 158 | ########################################################## 159 | # RUN TAGGER 160 | ########################################################## 161 | 162 | # train combined LM 163 | if [ $MODE = "train" ]&&[ "${#PL_CODES[@]}" -gt 1 ]; then 164 | echo "Training combined language model..." 165 | bash $CODE_DIR/train_combined_lm.sh -l $PL -ord $LM -exp $EXP_DIR 166 | fi 167 | 168 | # run clustering 169 | if [ $MODE = "train" ]; then 170 | echo "Training clustering..." 171 | bash src/code/run_clustering.sh -i $datadir/input.clean.filt \ 172 | -b $BASELINE -nc $NCLUSTERS -nj $NJOBS -exp $EXP_DIR 173 | fi 174 | 175 | echo "Tagging text with cluster ids..." 176 | bash $CODE_DIR/tag_with_clusters.sh -b $BASELINE -n $NCLUSTERS -i $datadir/input.clean.filt -e $EXP_DIR 177 | 178 | 179 | if [ $MODE = "train" ]; then 180 | echo "Training cipher model..." 181 | python3 $CODE_DIR/train_cipher.py -il $IL -rl $PL -exp $EXP_DIR \ 182 | -it $NITERS -rc 100 -lm 2 -b $BASELINE -nc $NCLUSTERS -j $NJOBS -m train -dc 1.1 183 | 184 | python3 $CODE_DIR/combine_channels.py -nc $NCLUSTERS -ca $BASELINE -exp $EXP_DIR -it $NITERS \ 185 | -il $IL -rl $PL 186 | fi 187 | 188 | 189 | echo "Decoding with cipher model..." 
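
# Note (expository comments; decode.sh itself is not shown in this dump):
# in eval mode train_cipher.py reads the cluster-id sequences written above
# to $datadir/output.$NCLUSTERS.$BASELINE.carmel, hands decode.sh the
# combined tag LM ($EXP_DIR/lm/comb.2.fsa.noe) plus the combined channel
# model, and splits the -dc value into the two decoder weights (here "1.1",
# i.e. LM weight 1 and channel weight 1). The tagged result is then read
# back from the *.decoded file that decode.sh leaves next to the input.
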
190 | if [ $INPUT_FORMAT = "xml" ]||[ $INPUT_FORMAT = "txt" ]; then 191 | python3 $CODE_DIR/train_cipher.py -il $IL -rl comb -exp $EXP_DIR \ 192 | -o $OUTPUT -tk $datadir/input.raw -tkr $datadir/input.raw.roman \ 193 | -fm $INPUT_FORMAT \ 194 | -it $NITERS -rc 100 -lm 2 -b $BASELINE -nc $NCLUSTERS -j 1 -m eval -dc 1.1 -ct 195 | 196 | elif [ $INPUT_FORMAT = "bio" ]; then 197 | python3 src/code/train_cipher.py -il $IL -rl comb -exp $EXP_DIR \ 198 | -o $OUTPUT -tk $INPUT -tkr $datadir/input.raw.roman \ 199 | -fm $INPUT_FORMAT \ 200 | -it $NITERS -rc 100 -lm 2 -b $BASELINE -nc $NCLUSTERS -j 1 -m eval -dc 1.1 -ct 201 | fi -------------------------------------------------------------------------------- /src/marlin/basic/opt.cc: -------------------------------------------------------------------------------- 1 | #include "opt.h" 2 | #include "std.h" 3 | #include 4 | 5 | //////////////////////////////////////////////////////////////////////// 6 | // command-line arguments 7 | 8 | void GetOpt::AddOpt(const string &name, bool has_arg) { 9 | opts.push_back(pair(name, has_arg)); 10 | } 11 | 12 | void GetOpt::Parse(int argc, char *argv[]) { 13 | option *opt_list = new option[opts.size()+1]; 14 | for(int i = 0; i <= (int)opts.size(); i++) { 15 | option *o = &opt_list[i]; 16 | if(i < (int)opts.size()) { 17 | o->name = opts[i].first.c_str(); 18 | o->has_arg = opts[i].second; 19 | //printf("N %s\n", o->name); 20 | } 21 | else { 22 | o->name = NULL; 23 | o->has_arg = 0; 24 | } 25 | o->flag = NULL; 26 | o->val = 0; 27 | } 28 | 29 | int i; 30 | 31 | values.clear(); 32 | values.resize(opts.size()); 33 | while(true) { 34 | int status = getopt_long(argc, argv, "", opt_list, &i); 35 | if(status == -1) break; 36 | assert(status == 0); 37 | //debug("%d %s -> %s\n", i, opt_list[i].name, optarg); 38 | // put a 1 to signify that the argument exists 39 | values[i] = optarg ? optarg : "1"; 40 | } 41 | 42 | delete [] opt_list; 43 | } 44 | 45 | int GetOpt::Lookup(const string &name) const { 46 | for(int i = 0; i < (int)opts.size(); i++) { 47 | if(opts[i].first == name) return i; 48 | } 49 | return -1; 50 | } 51 | 52 | string GetOpt::Get(const string &name, const string &default_value) const { 53 | int i = Lookup(name); 54 | return i != -1 && !values[i].empty() ? values[i] : default_value; 55 | } 56 | 57 | string GetOpt::Get(const string &name) const { 58 | string x = Get(name, ""); 59 | if(x.empty()) { 60 | fprintf(stderr, "Missing required parameter `%s'.\n", name.c_str()); 61 | exit(1); 62 | } 63 | return x; 64 | } 65 | 66 | bool GetOpt::Exists(const string &name) const { 67 | return !Get(name, "").empty(); 68 | } 69 | 70 | int GetOpt::GetInt(const string &name) const { 71 | int x; 72 | int r = sscanf(Get(name).c_str(), "%d", &x); 73 | assert(r == 1); 74 | return x; 75 | } 76 | 77 | int GetOpt::GetInt(const string &name, int default_value) const { 78 | return Exists(name) ? GetInt(name) : default_value; 79 | } 80 | 81 | double GetOpt::GetDouble(const string &name) const { 82 | double x; 83 | int r = sscanf(Get(name).c_str(), "%lf", &x); 84 | assert(r == 1); 85 | return x; 86 | } 87 | 88 | double GetOpt::GetDouble(const string &name, double default_value) const { 89 | return Exists(name) ? 
GetDouble(name) : default_value;
90 | }
91 | 
92 | ////////////////////////////////////////////////////////////
93 | 
94 | void process_opt(int argc, char *argv[]) {
95 |   GetOpt opt;
96 | 
97 |   // set up GetOpt to parse
98 |   for(int i = 0; i < (int)bool_opts.size(); i++) {
99 |     opt.AddOpt(bool_opts[i].name, false);
100 |     opt.AddOpt("no" + bool_opts[i].name, false);
101 |   }
102 |   for(int i = 0; i < (int)int_opts.size(); i++)
103 |     opt.AddOpt(int_opts[i].name, true);
104 |   for(int i = 0; i < (int)double_opts.size(); i++)
105 |     opt.AddOpt(double_opts[i].name, true);
106 |   for(int i = 0; i < (int)string_opts.size(); i++)
107 |     opt.AddOpt(string_opts[i].name, true);
108 |   opt.AddOpt("help", false);
109 | 
110 |   // parse
111 |   opt.Parse(argc, argv);
112 | 
113 |   // print help if called for
114 |   if(opt.Exists("help")) {
115 |     printf("usage: %s\n", argv[0]);
116 |     for(int i = 0; i < (int)bool_opts.size(); i++) {
117 |       const OptInfo<bool> &o = bool_opts[i];
118 |       printf("  %c%-20s: %s", " *"[o.required], o.name.c_str(), o.msg.c_str());
119 |       if(!o.required) printf(" [%s]", *(o.var) ? "true" : "false");
120 |       printf("\n");
121 |     }
122 |     for(int i = 0; i < (int)int_opts.size(); i++) {
123 |       const OptInfo<int> &o = int_opts[i];
124 |       printf("  %c%-13s : %s", " *"[o.required], o.name.c_str(), o.msg.c_str());
125 |       if(!o.required) printf(" [%d]", *(o.var));
126 |       printf("\n");
127 |     }
128 |     for(int i = 0; i < (int)double_opts.size(); i++) {
129 |       const OptInfo<double> &o = double_opts[i];
130 |       printf("  %c%-13s : %s", " *"[o.required], o.name.c_str(), o.msg.c_str());
131 |       if(!o.required) printf(" [%f]", *(o.var));
132 |       printf("\n");
133 |     }
134 |     for(int i = 0; i < (int)string_opts.size(); i++) {
135 |       const OptInfo<string> &o = string_opts[i];
136 |       printf("  %c%-13s : %s", " *"[o.required], o.name.c_str(), o.msg.c_str());
137 |       if(!o.required) printf(" [%s]", (o.var)->c_str());
138 |       printf("\n");
139 |     }
140 |     exit(1);
141 |   }
142 | 
143 |   // retrieve data; store the variables
144 |   for(int i = 0; i < (int)bool_opts.size(); i++) {
145 |     const OptInfo<bool> &o = bool_opts[i];
146 |     bool yes = opt.Exists(o.name);
147 |     bool no = opt.Exists("no" + o.name);
148 |     assert(!o.required || (yes || no));
149 |     assert(!yes || !no);
150 |     if(yes) *(o.var) = true;
151 |     if(no) *(o.var) = false;
152 |   }
153 |   for(int i = 0; i < (int)int_opts.size(); i++) {
154 |     const OptInfo<int> &o = int_opts[i];
155 |     *(o.var) = o.required ? opt.GetInt(o.name) : opt.GetInt(o.name, *(o.var));
156 |   }
157 |   for(int i = 0; i < (int)double_opts.size(); i++) {
158 |     const OptInfo<double> &o = double_opts[i];
159 |     *(o.var) = o.required ? opt.GetDouble(o.name) : opt.GetDouble(o.name, *(o.var));
160 |   }
161 |   for(int i = 0; i < (int)string_opts.size(); i++) {
162 |     const OptInfo<string> &o = string_opts[i];
163 |     *(o.var) = o.required ? opt.Get(o.name) : opt.Get(o.name, *(o.var));
164 |   }
165 | }
166 | 
167 | void init_opt(int argc, char *argv[]) {
168 |   process_opt(argc, argv);
169 |   srand(rand_seed);
170 | }
171 | 
172 | void print_opts() {
173 |   forvec(_, const OptInfo<bool> &, o, bool_opts)
174 |     cerr << o.name << " = " << (*o.var ? "true" : "false") << endl;
175 |   forvec(_, const OptInfo<int> &, o, int_opts)
176 |     cerr << o.name << " = " << *o.var << endl;
177 |   forvec(_, const OptInfo<double> &, o, double_opts)
178 |     cerr << o.name << " = " << *o.var << endl;
179 |   forvec(_, const OptInfo<string> &, o, string_opts)
180 |     cerr << o.name << " = " << *o.var << endl;
181 | }
182 | 
183 | ////////////////////////////////////////////////////////////
184 | // Pre defined options.
185 | 
186 | // allow user to specify a comment always, so some arbitrary description
187 | // of this program execution can be embedded in the command-line
188 | --------------------------------------------------------------------------------
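
process_opt() funnels all four registries into a single GetOpt pass: every boolean also gets a "no"-prefixed negation, --help prints each flag with its default (required ones starred), and parsed values are written back through the stored pointers. GetOpt can also be driven directly; a hedged sketch (flag names illustrative, and assuming opt.h pulls in std.h as the other basic/ headers do):

    #include "opt.h"

    int main(int argc, char *argv[]) {
      GetOpt opt;
      opt.AddOpt("words", true);     // --words FILE, takes an argument
      opt.AddOpt("verbose", false);  // bare flag; stored internally as "1"
      opt.Parse(argc, argv);
      string words = opt.Get("words", "words.txt");  // fall back to a default
      bool verbose = opt.Exists("verbose");
      printf("words=%s verbose=%d\n", words.c_str(), (int)verbose);
      return 0;
    }
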
/src/marlin/basic/stl-utils.h:
--------------------------------------------------------------------------------
1 | #ifndef __STL_UTILS__
2 | #define __STL_UTILS__
3 | 
4 | #include "stl-basic.h"
5 | #include <stdarg.h>
6 | 
7 | #define contains(X, x) ((X).find(x) != (X).end())
8 | 
9 | inline void improve(DoubleInt &x, const DoubleInt &y) {
10 |   if(y.first > x.first) x = y; // Bigger is better.
11 | }
12 | 
13 | template<class Compare> inline void improve(DoubleInt &x, const DoubleInt &y, Compare compare) {
14 |   if(compare(y.first, x.first)) x = y;
15 | }
16 | 
17 | // Free up the memory in a vector or hash_map.
18 | template<class T> void destroy(T &obj) {
19 |   T empty_obj;
20 |   obj.swap(empty_obj);
21 | }
22 | 
23 | template<class T> int index_of(const vector<T> &vec, const T &x, int i0 = 0) {
24 |   for(int i = i0; i < len(vec); i++)
25 |     if(vec[i] == x) return i;
26 |   return -1;
27 | }
28 | 
29 | template<class T> int count_of(const vector<T> &vec, const T &x) {
30 |   int n = 0;
31 |   forvec(_, const T &, y, vec)
32 |     if(x == y) n++;
33 |   return n;
34 | }
35 | 
36 | // Get vec[i], but if i is out of range, expand the vector and fill
37 | // everything with x.
38 | template<class T> T &expand_get(vector<T> &vec, int i, const T &x) {
39 |   int n = len(vec);
40 |   if(i >= n) {
41 |     vec.resize(i+1);
42 |     for(int ii = n; ii <= i; ii++) vec[ii] = x;
43 |   }
44 |   return vec[i];
45 | }
46 | template<class T> T &expand_get(vector< vector<T> > &mat, int i, int j, const T &x) {
47 |   int n = len(mat);
48 |   if(i >= n) mat.resize(i+1);
49 |   return expand_get(mat[i], j, x);
50 | }
51 | template<class T> T &expand_get(vector< vector< vector<T> > > &mat, int i, int j, int k, const T &x) {
52 |   int n = len(mat);
53 |   if(i >= n) mat.resize(i+1);
54 |   return expand_get(mat[i], j, k, x);
55 | }
56 | 
57 | // Assuming this vector/matrix will not grow any more,
58 | // we can safely call compact to reduce the memory usage.
59 | // This is only effective after deletions.
60 | // This isn't necessary if we haven't actually touched
61 | // the memory past size (i.e., we didn't have a bigger
62 | // structure).
63 | template<class T> void vector_compact(vector<T> &vec) {
64 |   vector<T> new_vec(len(vec));
65 |   new_vec = vec;
66 |   vec.swap(new_vec);
67 | }
68 | template<class T> void matrix_compact(vector< vector<T> > &mat) {
69 |   vector< vector<T> > new_mat(len(mat));
70 |   foridx(i, len(mat)) vector_compact(mat[i]);
71 |   new_mat = mat;
72 |   mat.swap(new_mat);
73 | }
74 | 
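
expand_get() is what lets counting code index into vectors that grow on demand, and vector_compact() trims the leftover capacity afterwards. A small sketch (data made up):

    #include "stl-utils.h"

    int main() {
      IntVec counts;                  // starts empty
      expand_get(counts, 7, 0) += 1;  // grows to 8 zero-filled slots, then bumps slot 7
      assert_eq(len(counts), 8);
      vector_compact(counts);         // drop capacity left over from growth
      return 0;
    }
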
75 | // Append to a vector and return the value type.
76 | template<class T> inline T &push_back(vector<T> &vec, const T &x = T()) {
77 |   vec.push_back(x);
78 |   return vec[len(vec)-1];
79 | }
80 | 
81 | template<class T> inline void matrix_resize(vector< vector<T> > &mat, int nr, int nc) {
82 |   mat.resize(nr);
83 |   foridx(r, nr) mat[r].resize(nc);
84 | }
85 | 
86 | template<class T> inline void matrix_resize(vector< vector< vector<T> > > &mat, int n1, int n2, int n3) {
87 |   mat.resize(n1);
88 |   foridx(i, n1) {
89 |     mat[i].resize(n2);
90 |     foridx(j, n2)
91 |       mat[i][j].resize(n3);
92 |   }
93 | }
94 | 
95 | template<class T> inline vector< vector<T> > new_matrix(int nr, int nc, T v) {
96 |   vector< vector<T> > mat;
97 |   mat.resize(nr);
98 |   foridx(r, nr) {
99 |     mat[r].resize(nc);
100 |     foridx(c, nc)
101 |       mat[r][c] = v;
102 |   }
103 |   return mat;
104 | }
105 | 
106 | template<class T> inline void matrix_fill(vector< vector<T> > &mat, T v) {
107 |   foridx(i, len(mat)) vector_fill(mat[i], v);
108 | }
109 | 
110 | template<class T> inline void vector_fill(vector<T> &vec, T v) {
111 |   foridx(i, len(vec)) vec[i] = v;
112 | }
113 | 
114 | template<class T> inline T vector_sum(const vector<T> &vec) {
115 |   T sum = 0;
116 |   foridx(i, len(vec)) sum += vec[i];
117 |   return sum;
118 | }
119 | 
120 | // Returns the index of the minimum element in vec.
121 | template<class T> inline int vector_index_min(const vector<T> &vec) {
122 |   T min = vec[0];
123 |   int best_i = 0;
124 |   foridx(i, len(vec)) {
125 |     if(vec[i] < min) {
126 |       min = vec[i];
127 |       best_i = i;
128 |     }
129 |   }
130 |   return best_i;
131 | }
132 | 
133 | template<class T> inline int vector_min(const vector<T> &vec) {
134 |   return vec[vector_index_min(vec)];
135 | }
136 | 
137 | // Returns the index of the maximum element in vec.
138 | template<class T> inline int vector_index_max(const vector<T> &vec) {
139 |   T max = vec[0];
140 |   int best_i = 0;
141 |   foridx(i, len(vec)) {
142 |     if(vec[i] > max) {
143 |       max = vec[i];
144 |       best_i = i;
145 |     }
146 |   }
147 |   return best_i;
148 | }
149 | 
150 | template<class T> inline int vector_max(const vector<T> &vec) {
151 |   return vec[vector_index_max(vec)];
152 | }
153 | 
154 | // Returns the index of the maximum element in mat.
155 | template<class T> inline IntPair matrix_index_max(const vector< vector<T> > &mat) {
156 |   T max = mat[0][0];
157 |   IntPair best_ij = IntPair(0, 0);
158 |   foridx(i, len(mat)) {
159 |     foridx(j, len(mat[i])) {
160 |       if(mat[i][j] > max) {
161 |         max = mat[i][j];
162 |         best_ij = IntPair(i, j);
163 |       }
164 |     }
165 |   }
166 |   return best_ij;
167 | }
168 | 
169 | // Returns the sum of the elements in column c.
170 | template<class T> inline T matrix_col_sum(const vector< vector<T> > &mat, int c) {
171 |   T sum = 0;
172 |   foridx(r, len(mat)) sum += mat[r][c];
173 |   return sum;
174 | }
175 | 
176 | template<class A, class B> ostream &operator<<(ostream &out, const pair<A, B> &p) {
177 |   return out << p.first << ' ' << p.second;
178 | }
179 | 
180 | template<class T> ostream &operator<<(ostream &out, const vector<T> &vec) {
181 |   foridx(i, len(vec)) {
182 |     if(i > 0) out << ' ';
183 |     out << vec[i];
184 |   }
185 |   return out;
186 | }
187 | 
188 | template<class T> ostream &operator<<(ostream &out, const vector< vector<T> > &mat) {
189 |   foridx(r, len(mat)) out << mat[r] << endl;
190 |   return out;
191 | }
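
Together with new_matrix(), these operator<< overloads let whole vectors and matrices be streamed for quick debugging dumps. A small sketch (values made up):

    #include "stl-utils.h"

    int main() {
      DoubleVecVec m = new_matrix(2, 3, 0.0);  // 2x3 matrix, zero-filled
      m[0][2] = 1.5;
      cout << m;                               // one row per line, space-separated
      cout << vector_sum(m[0]) << endl;        // prints 1.5
      return 0;
    }
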
192 | 
193 | template<class T> vector<T> subvector(const vector<T> &vec, int i, int j = -1) {
194 |   int N = len(vec);
195 |   if(j < 0) j += N;
196 |   if(j < i) j = i;
197 | 
198 |   // Probably some fancy STL way to do this.
199 |   vector<T> subvec(j-i);
200 |   foridx(k, j-i) subvec[k] = vec[i+k];
201 |   return subvec;
202 | }
203 | 
204 | template<class T> vector<T> to_vector(T arr[], int n) {
205 |   vector<T> vec(n);
206 |   foridx(i, n) vec[i] = arr[i];
207 |   return vec;
208 | }
209 | 
210 | inline IntVec to_vector(int n, ...) {
211 |   va_list ap;
212 |   IntVec vec;
213 |   va_start(ap, n);
214 |   foridx(i, n) vec.push_back(va_arg(ap, int));
215 |   va_end(ap);
216 |   return vec;
217 | }
218 | 
219 | inline DoubleVec to_fvector(int n, ...) {
220 |   va_list ap;
221 |   DoubleVec vec;
222 |   va_start(ap, n);
223 |   foridx(i, n) vec.push_back(va_arg(ap, double));
224 |   va_end(ap);
225 |   return vec;
226 | }
227 | 
228 | template<class T> inline void operator+=(vector<T> &vec1, const vector<T> &vec2) {
229 |   foridx(i, len(vec1)) vec1[i] += vec2[i];
230 | }
231 | 
232 | #endif
233 | --------------------------------------------------------------------------------
/src/code/filter_lowfreq.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import argparse
3 | from collections import Counter
4 | import re
5 | import pdb
6 | 
7 | filters_detailed = [
8 |     ("url" , [re.compile(r'^https?[:/]{1,3}(www\.)?[a-z]+(\.?[a-z]+\/?)+.*?$',re.UNICODE),
9 |               re.compile(r'^[wW]{3}\.[a-zA-Z]+(\.?[A-Z]+\/?)+.*?$',re.UNICODE),
10 |               re.compile(r'^([a-zA-Z][^@])[a-zA-Z.]+\.com$',re.UNICODE),
11 |     ]),
12 |     ('email', [re.compile(r'^[-a-zA-Z0-9_.]+\@([a-zA-Z0-9]+\.)+[a-zA-Z]+$',re.UNICODE) ]),
13 |     ("00:00" , [re.compile(r'[0-9](:[0-9]{2})+',re.UNICODE),
14 |                 re.compile(r'[0-9](:[0-9]{2})*[aApP][mM]$',re.UNICODE),
15 |                 re.compile(r'[0-9]hour$',re.UNICODE),] ),
16 |     ("00km", [re.compile(r'[0-9]km$',re.UNICODE)]),
17 |     ("00kg", [re.compile(r'[0-9]kg$',re.UNICODE)]),
18 |     ("haha", [re.compile(r'^haha$',re.UNICODE),
19 |               re.compile(r'^wkwk$',re.UNICODE)]),
20 | 
21 | ]
22 | 
23 | filters = [
24 |     ("snUser" , [re.compile(r'^[@]([0-9]*[-a-zA-Z._]+[0-9]*[!?]?)+$',re.UNICODE)] ),
25 |     ("hashTag" , [re.compile(r'^[#][-a-zA-Z._]{3,}[0-9]*[!?]?$',re.UNICODE),
26 |                   re.compile(r'^[#][0-9]+[-a-zA-Z._]{3,}[!?]?$',re.UNICODE),
27 |                   re.compile(r'^[#][0-9]+[-a-zA-Z._]{3,}[0-9]+[!?]?$',re.UNICODE), ]),
28 |     ("twoDigitNum" , [re.compile(r'^[0-9]{2}$',re.UNICODE)] ),
29 |     ("fourDigitNum" , [re.compile(r'^[0-9]{4}$',re.UNICODE)] ),
30 |     ("hasDigitAndAlpha" , [re.compile(r'[0-9].*[a-zA-Z]',re.UNICODE) ,
31 |                            re.compile(r'[a-zA-Z].*[0-9]',re.UNICODE) ]) ,
32 |     ("hasDigitAndDash" , [re.compile(r'[0-9]-[0-9]',re.UNICODE)] ),
33 |     ("hasDigitAndSlash" , [re.compile(r'[0-9]/[0-9]',re.UNICODE)] ),
34 |     ("hasDigitAndComma" , [re.compile(r'[0-9],[0-9]',re.UNICODE)] ),
35 |     ("hasDigitAndPeriod" , [re.compile(r'[0-9][.][0-9]',re.UNICODE)] ),
36 |     ("isHour" , [re.compile(r'[0-9]:[0-9]',re.UNICODE),
37 |                  re.compile(r'[0-9][aApP][mM]$',re.UNICODE)] ),
38 |     ("othernum" , [re.compile(r'^[0-9]+$',re.UNICODE)] ),
39 |     ("allCaps" , [re.compile(r'^[A-Z]+$',re.UNICODE)] ),
40 |     ("capPeriod" , [re.compile(r'^[A-Z][.]$',re.UNICODE)] ),
41 |     ("initCap" , [re.compile(r'^[A-Z][a-z]+$',re.UNICODE)] ),
42 |     ("lowercase" , [re.compile(r'^[a-z]$',re.UNICODE)] ),
43 | ]
44 | 
45 | is_prob_word = re.compile(r"^([a-zA-Z]+[-._',&]?)+$",re.UNICODE)
46 | 
47 | 
48 | def get_filter_tag(word,filter_list):
49 |     for tag,reg_list in filter_list:
50 |         for reg in reg_list:
51 |             if reg.search(word)!=None:
52 |                 return tag
53 |     return word
54 | 
55 | 
56 | 
57 | 
58 | 
59 | if __name__ == "__main__":
60 |     parser = argparse.ArgumentParser()
61 |     #parser.add_argument("--l","-l", type=str, help="Language -aaa-")
62 | 
parser.add_argument("--input","-i", type=str, help="Input file") 63 | parser.add_argument("--mode","-m", type=str, default="train", help="Mode [train,eval]") 64 | parser.add_argument("--vocab","-v", type=str, default=None, help="Filtered vocabulary") 65 | parser.add_argument("--thr","-t", type=int, default=3, help="Cut-off threshold") 66 | #parser.add_argument("--sent_len","-sl", type=int, default=190, help="Filter threshold for long sentences") 67 | parser.add_argument("--dom","-d", type=str, default=None, help="Test domain (valid only for outd exps)") 68 | parser.add_argument("--aggr","-aggr", action='store_true', help="Perform aggresive filtering (threshold oriented)") 69 | parser.add_argument("--ign_emp","-ig", action='store_true', help="Ignore empty lines/sentences.") 70 | parser.add_argument("--lower","-low", action='store_true', help="Lowercase all text") 71 | args = parser.parse_args() 72 | 73 | vocab = set() 74 | 75 | # load input 76 | data = open(args.input,'r').read().split('\n') 77 | data = [line for line in data] 78 | if data[-1] == '': data = data[:-1] 79 | 80 | ### aggressive filtering mode 81 | 82 | ## train mode 83 | # create vocabulary 84 | if args.mode == "train": 85 | vocab = Counter() 86 | for sent in data: 87 | if sent=='': continue 88 | if args.lower: sent = sent.lower() 89 | vocab.update(sent.split(' ')) 90 | filt = [] 91 | count = 0 92 | 93 | for x,y in vocab.most_common(): 94 | # if aggresive, evth below threshold is ignored 95 | if y<=args.thr and args.aggr: 96 | break 97 | if len(x)>40: 98 | continue 99 | # if not aggressive, evth be;pw thre that is not a word is ignored 100 | if y<=args.thr and is_prob_word.search(x)==None: 101 | continue 102 | 103 | # all possible urls, email and hours are ignored 104 | if get_filter_tag(x,filters_detailed)!=x: 105 | continue 106 | filt.append([x,y]) 107 | if count%100000 == 0: 108 | print('->',count) 109 | count += 1 110 | #filt = [[x,y] for x,y in vocab.most_common() if y>args.thr] 111 | dom_pref = '' if args.dom==None else '.'+args.dom 112 | vocab_fn = os.path.join(os.path.dirname(args.input),"vocab"+dom_pref) 113 | open(vocab_fn,'w').write('\n'.join(["%s\t%d" % (w,f) for w,f in filt]) + '\n') 114 | vocab = set([x for x,y in filt]) 115 | 116 | del filt 117 | 118 | # eval mode 119 | # load vocabulary 120 | else: 121 | if args.vocab==None: 122 | print("Error: Filtered vocabulary file not specified!\nCheck arguments list with -h option") 123 | sys.exit(1) 124 | elif not os.path.exists(args.vocab): 125 | print("Error: Filtered vocabulary file does not exist!") 126 | sys.exit(1) 127 | else: 128 | for line in open(args.vocab,'r'): 129 | line = line.strip('\n').strip(' ') 130 | if line=='': continue 131 | w,f = line.split('\t') 132 | vocab.add(w) 133 | # 134 | #END-IF-MODE 135 | 136 | outfile = open(args.input+".filt",'w') 137 | count = 0 138 | 139 | # filter data 140 | for sent in data: 141 | if sent=='' and not args.ign_emp: 142 | print('',file=outfile) 143 | continue 144 | 145 | new_sent = [] 146 | if args.lower: 147 | sent = sent.lower() 148 | sent_tok = sent.split(' ') 149 | #if args.ign_emp and len(sent_tok)>args.sent_len-1: 150 | # continue 151 | for word in sent_tok: 152 | if word in vocab: 153 | new_sent.append(word) 154 | else: 155 | tag = get_filter_tag(word,filters_detailed) 156 | if tag!=word: 157 | new_sent.append(tag) 158 | continue 159 | tag = get_filter_tag(word,filters) 160 | if tag==word: 161 | tag = 'unk' 162 | new_sent.append("<"+tag+">") 163 | #END-IF-VOCAB 164 | #END-FOR-W 165 | new_sent.append("#eos") 166 | 
print(' '.join(new_sent),file=outfile) 167 | 168 | if count % 100000 == 0: 169 | print("->",count) 170 | count+=1 171 | #END-FOR-SENT 172 | 173 | 174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /src/code/train_cipher.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | import argparse 3 | from multiprocessing import Pool 4 | import subprocess as sp 5 | import numpy as np 6 | from utils import * 7 | import pdb 8 | 9 | 10 | import warnings 11 | warnings.filterwarnings("ignore") 12 | 13 | PRIME_MOD = 19751 14 | 15 | 16 | def run_train_channel(conf): 17 | rl,order,il,c_alg,cl,it,_id,IS_ELISA,exp_dir = conf.split('.') 18 | print(" running conf: ",conf) 19 | print(" ",rl,order,il,c_alg,cl,it,_id,IS_ELISA,exp_dir) 20 | 21 | IS_ELISA = bool(int(IS_ELISA)) 22 | seed = (int(_id)+1) * PRIME_MOD * 100 23 | argums = ["bash","src/code/train_channel.sh", 24 | "-rl",rl,'-o',order, 25 | "-il",il,'-c',cl, 26 | "-it",it,'-id',_id, 27 | "-s",str(seed), 28 | "-exp",exp_dir 29 | ] 30 | 31 | if IS_ELISA: 32 | argums.extend(["-elisa","e"]) 33 | if c_alg!="": 34 | argums.extend(["-ca",c_alg]) 35 | 36 | pobj = sp.Popen(argums) 37 | while pobj.wait(): continue 38 | 39 | channel_name = "%s%s-%s.%s.%s.%s.%s" % (rl,order,il,c_alg,cl,it,_id) 40 | 41 | if not os.path.exists(exp_dir + "/logs/" + channel_name): 42 | return None 43 | 44 | lines = open(exp_dir + "/logs/" + channel_name,'r').read().strip('\n').split('\n') 45 | to_mine = "" 46 | for line in lines[-10:]: 47 | if line.startswith("Setting weights to model"): 48 | to_mine = line 49 | break 50 | idx = to_mine.rfind("^") 51 | print(to_mine,idx) 52 | score = float(to_mine[idx+1:].strip(' ')) 53 | 54 | return [int(_id),score] 55 | 56 | 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument("--tokens" ,"-tk" , type=str, default='', help="Input raw to tag --non-romanized, if applicable") 61 | parser.add_argument("--tokens_roman","-tkr", type=str, default='', help="Input raw to tag --romanized") 62 | parser.add_argument("--format" ,"-fm" , type=str, default='txt', help="Format of input tokens [txt,bio]") 63 | parser.add_argument("--bio_delim" ,"-bio_delim" , type=str, default=' ', help="Delimiter in BIO column format [' ', \n]") 64 | 65 | parser.add_argument("--output" ,"-o", type=str, default='', help="Output file") 66 | parser.add_argument("--exp_dir" ,"-exp", type=str, default='', help="Experiment folder") 67 | parser.add_argument("--il" ,"-il", type=str, default='en', help="Incident Language") 68 | parser.add_argument("--rl" ,"-rl", type=str, default=None, help="Related Language") 69 | parser.add_argument("--iter" ,"-it", type=int, default=10, help="N. iterations per cipher run") 70 | parser.add_argument("--run_per_ch" ,"-rc", type=int, default=100, help="N. 
runs per cipher conf") 71 | parser.add_argument("--lm_order" ,"-lm", type=int, default=2, help="LM order") 72 | parser.add_argument("--baseline" ,"-b", type=str, default="brown", help="clustering algorithm [brown,ah,{l,p}{k,s}{100,300}{mono,multi}]") 73 | parser.add_argument("--num_clusters","-nc", type=int, default=500, help="Number of clusters on cipher side") 74 | parser.add_argument("--njobs" ,"-j", type=int, default=4, help="Number of jobs") 75 | parser.add_argument("--mode" ,"-m", type=str, default="train", help="mode [train,eval]") 76 | parser.add_argument("--test_data" ,"-td", type=str, default="ud", help="which test data to evaluate [ud,elisa]") 77 | parser.add_argument("--dec_conf" ,"-dc", type=str, default="1.1", help="weights for decoder (LM,CM)") 78 | parser.add_argument("--comb_table","-ct", action='store_true', help="Used combined cipher table channel model to decode") 79 | 80 | args = parser.parse_args() 81 | 82 | IS_ELISA = False 83 | CLUST_ALG = args.baseline 84 | 85 | 86 | if args.mode == "train": 87 | rl_list = [] 88 | def_rfs = "en,de,fr,it,es,ja,ar,cs,ru,sw-hcs,hi" 89 | 90 | # No RL spec : default list 91 | if args.rl==None: 92 | rl_list = def_rfs.split(",") 93 | # Single RL spec : will run only one RL-IL pair 94 | elif ',' not in args.rl: 95 | rl_list = [args.rl] 96 | # Multiple RL spec: arg format "en.de.du.da", will run for RL-IL for all RL specified 97 | else: 98 | rl_list = args.rl.split(',') 99 | 100 | with Pool(args.njobs) as pool: 101 | for rl in rl_list: 102 | print() 103 | print("RL: ",rl) 104 | print("-"*60) 105 | conf_pref = "%s.%d.%s.%s.%d.%d" % (rl,args.lm_order,args.il,CLUST_ALG,args.num_clusters,args.iter) 106 | channel_name = "%s%d-%s.%s.%d.%d" % (rl,args.lm_order,args.il,CLUST_ALG,args.num_clusters,args.iter) 107 | # train channel 108 | if "elisa" in args.il or args.il in ["ta","tl"]: 109 | IS_ELISA = True 110 | 111 | confs = ["%s.%d.%d.%s" % (conf_pref,_id,IS_ELISA,args.exp_dir) for _id in range(args.run_per_ch) ] 112 | res = pool.map(run_train_channel,confs) 113 | res = [x for x in res if x!=None] 114 | idxs = [x for x,y in res] 115 | idx = np.array([y for x,y in res]).argmin() 116 | print("best model:",res[idx]) 117 | open("%s/logs/%s.scores" % (args.exp_dir,channel_name),'w').write('\n'.join(["%d %f" % (x,y) for x,y in res])) 118 | 119 | # clean directories 120 | to_rm = ["%s/logs/%s.%d" % (args.exp_dir,channel_name,_id) for _id in idxs if _id!=res[idx][0] ] 121 | if len(to_rm)>0: 122 | sp.run(["rm"] + to_rm) 123 | sp.run(["mv","%s/logs/%s.%d" % (args.exp_dir,channel_name,res[idx][0]),args.exp_dir + "/logs/"+channel_name]) 124 | to_rm = ["%s/models/%s.%d" % (args.exp_dir,channel_name,_id) for _id in idxs if _id!=res[idx][0] ] 125 | if len(to_rm)>0: 126 | sp.run(["rm"] + to_rm) 127 | sp.run(["mv","%s/models/%s.%d" % (args.exp_dir,channel_name,res[idx][0]),args.exp_dir+"/models/"+channel_name]) 128 | #END-FOR 129 | #END-WITH 130 | 131 | 132 | # eval & tag 133 | else: 134 | rl = args.rl 135 | lm_dir = "../../lms" 136 | if args.comb_table: 137 | rl = "comb" # placeholder for combination code 138 | lm_dir = args.exp_dir + "/lm" 139 | 140 | channel_name = "%s.%s.%d.%d.comb" % (args.il,args.baseline,args.num_clusters,args.iter) 141 | lm_file = "%s/%s.%d.fsa.noe" % (lm_dir,rl,args.lm_order) 142 | wlm,wcm = args.dec_conf.split(".") 143 | 144 | test_file = "%s/data/output.%d.%s.carmel" % (args.exp_dir,args.num_clusters,args.baseline) 145 | 146 | agms = ["bash","src/code/decode.sh","-lm",lm_file, 147 | "-ch",channel_name, 148 | "-i",test_file, 149 | 
"-wlm",wlm, 150 | "-wcm",wcm 151 | ] 152 | pobj = sp.Popen(agms) 153 | while pobj.wait(): continue 154 | 155 | outfile = open(args.output,'w') 156 | outfile_rom = open(args.output+".roman",'w') 157 | toks_file = open(args.tokens,'r') 158 | toks_rom_file = open(args.tokens_roman,'r') 159 | tags_file = open("%s.%s.%s.%s.decoded" % \ 160 | (test_file,channel_name,wlm,wcm), 'r' ) 161 | 162 | 163 | for tag_line in tags_file: 164 | tags = tag_line.split() 165 | tok_rom_line = toks_rom_file.readline().strip('\n') # raw, romanized text, always in txt format 166 | tok_roms = tok_rom_line.split() 167 | 168 | if args.format == 'txt': 169 | tok_line = toks_file.readline().strip('\n') 170 | ntags=[] 171 | for tk,tag in zip(tok_roms,tags): 172 | ntags.append( ground_tag(tk,tag) ) 173 | # 174 | pairs = zip(tok_line.split(),ntags) 175 | print(" ".join(["%s/%s"%(x,y) for x,y in pairs]), file=outfile) 176 | 177 | pairs = zip(tok_roms,ntags) 178 | print(" ".join(["%s/%s"%(x,y) for x,y in pairs]), file=outfile_rom) 179 | 180 | elif args.format == 'bio': 181 | idx = 0 182 | while(True): 183 | tok_line = toks_file.readline().strip('\n') 184 | if tok_line=='': 185 | print("",file=outfile) 186 | break 187 | tk = tok_roms[idx] 188 | tag = ground_tag(tk,tags[idx]) 189 | print("%s%s%s" % (tok_line,args.bio_delim,tag),file=outfile) 190 | idx += 1 191 | # 192 | # 193 | 194 | #END-FOR 195 | -------------------------------------------------------------------------------- /src/code/utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from sklearn.metrics import classification_report,\ 3 | accuracy_score, v_measure_score, \ 4 | precision_score, recall_score, f1_score 5 | from collections import defaultdict, Counter 6 | import subprocess as sp 7 | import unicodedata 8 | import pdb 9 | import pickle 10 | import sys 11 | sys.path.append("cpos") 12 | 13 | # UDv2 & UTv2 support 14 | upos2char = { 15 | "NOUN" :'N', 16 | "PROPN" :'O', 17 | "ADJ" :'A', 18 | "ADV" :'R', 19 | "ADP" :"I", 20 | "AUX" :"B", 21 | "CCONJ" :"C", 22 | "SCONJ" :"J", 23 | "DET" :"D", 24 | "INTJ" :'T', 25 | "NUM" :"M", 26 | "PART" :"F", 27 | "PRON" :"P", 28 | "PUNCT" :"E", 29 | "SYM" :"Y", 30 | "VERB" :"V", 31 | "X" :"X", 32 | } 33 | 34 | 35 | char2upos = {v:k for k,v in upos2char.items() } 36 | 37 | ut2char = { 38 | "NOUN" :'N', 39 | "PROPN" :'O', 40 | "ADJ" :'A', 41 | "ADV" :'R', 42 | "ADP" :"I", 43 | "AUX" :"B", 44 | "CONJ" :"C", 45 | "DET" :"D", 46 | "INTJ" :'T', 47 | "NUM" :"M", 48 | "PRT" :"F", 49 | "PRON" :"P", 50 | "." 
:"E", 51 | "SYM" :"Y", 52 | "VERB" :"V", 53 | "X" :"X", 54 | } 55 | 56 | char2ut = {v:k for k,v in ut2char.items() } 57 | 58 | 59 | MAX_LINES_DECODE=20000 60 | # MAX_LINES_DECODE=10 61 | 62 | mapper = { 63 | 'en' : ['en','fr','de','es','it','ar','ja','cs','ru','sw-hcs'], 64 | 'fr' : ['fr','de','es','ja','ar','it','en','cs','ru','sw-hcs'], 65 | 'fa' : ['fa','fr','de,''es','ja','cs','ar','it','en','ru','sw-hcs'], 66 | 'sw-hcs' : ['sw-hcs','de','fr','es','ja','ar', 'it','en','cs','ru'], 67 | #'tl-elisa' : ['id','fr','es','ja','it','en','cs','ru','sw-hcs'], 68 | #'tl-elisa' : ['fr','es','ja','it','en','cs','ru','sw-hcs'], 69 | 'si-elisa' : ['de','fr','es','ja','ar', 'it','en','cs','ru','sw-hcs'], 70 | 'rw-elisa' : ['de','fr','es','ja','ar', 'it','en','cs','ru','sw-hcs'] 71 | } 72 | 73 | default_rls = ['en','de','fr','es','it','ja','ar','cs','ru','sw-hcs'] 74 | 75 | 76 | def saveObject(obj, name='model'): 77 | with open(name + '.pickle', 'wb') as fd: 78 | pickle.dump(obj, fd, protocol=pickle.HIGHEST_PROTOCOL) 79 | 80 | 81 | def uploadObject(obj_name): 82 | # Load tagger 83 | with open(obj_name, 'rb') as fd: 84 | obj = pickle.load(fd) 85 | return obj 86 | 87 | 88 | def test_punct(token): 89 | for c in token: 90 | if unicodedata.category(c)[0] != 'P': 91 | return False 92 | return True 93 | 94 | 95 | def test_num(token): 96 | for c in token: 97 | if unicodedata.category(c)[0] != 'N': 98 | return False 99 | return True 100 | 101 | 102 | def ground_tag(tk,tag,pos_tagset="ud"): 103 | mapper = char2ut if pos_tagset=="ut" else char2upos 104 | if tk.isdigit() or test_num(tk): 105 | return mapper["M"] 106 | else: 107 | is_punct = test_punct(tk) 108 | # false negatives 109 | if is_punct: 110 | return mapper["E"] 111 | # false positives 112 | elif tag==mapper["E"] and not is_punct: 113 | return "X" 114 | # the rest 115 | return tag 116 | 117 | 118 | def evaluate_core(gold_fn,pred_fn): 119 | gold,pred = [],[] 120 | count = 1 121 | gls = [] 122 | for line in open(gold_fn,'r'): 123 | gold.extend(line.strip('\n').split(' ')) 124 | gls.append(line.strip('\n').split(' ')) 125 | if count>=MAX_LINES_DECODE: 126 | break 127 | count += 1 128 | count = 0 129 | for line in open(pred_fn,'r'): 130 | pline = line.strip('\n').split(' ') 131 | pred.extend(pline) 132 | if len(pline)!= len(gls[count]): 133 | print("->",count,len(gls[count]), len(pline)) 134 | print(gls[count]) 135 | print(pline) 136 | print("-"*50) 137 | pdb.set_trace() 138 | count += 1 139 | return gold,pred 140 | 141 | 142 | def evaluate(gold_fn,pred_fn,report=True): 143 | gold,pred = evaluate_core(gold_fn,pred_fn) 144 | 145 | acc = accuracy_score(gold,pred) 146 | if report: 147 | print("ACC: %.4f" % acc ) 148 | print("VM : %.4f" % v_measure_score(gold,pred)) 149 | print(classification_report(gold,pred,digits=4)) 150 | return acc 151 | 152 | def evaluate_all_metrics(gold_fn,pred_fn): 153 | gold,pred = evaluate_core(gold_fn,pred_fn) 154 | acc = accuracy_score(gold,pred) 155 | p = precision_score(gold,pred) 156 | r = recall_score(gold,pred) 157 | f1 = f1_score(gold,pred) 158 | support = Counter(gold) 159 | 160 | 161 | 162 | 163 | def eval_lexicon(lexicon_fn,pred_fn,words_fn,report=True): 164 | # read lexicon 165 | lexicon = defaultdict(set) 166 | for line in open(lexicon_fn,'r'): 167 | line= line.strip('\t') 168 | if line=='': continue 169 | w,pos,_ = line.split("\t") 170 | if pos=='PRT': pos = 'PART' 171 | lexicon[w].add(pos) 172 | 173 | # read pred file 174 | gold,pred = [],[] 175 | pred_vocab = defaultdict(set) 176 | predpos_lines = 
open(pred_fn,'r').read().strip('\n').split('\n') 177 | word_lines = open(words_fn,'r').read().strip('\n').split('\n') 178 | for wform_line,pred_line in zip(word_lines,predpos_lines): 179 | wforms = wform_line.lower().split(" ")[:-1] 180 | ptags = pred_line.split(" ") 181 | for w,pos in zip(wforms,ptags): 182 | if w not in lexicon: continue 183 | pred_vocab[w].add(pos) 184 | # 185 | 186 | #compare 187 | correct = 0.0 188 | for w,pred_pos_list in pred_vocab.items(): 189 | if len(pred_pos_list & lexicon[w])>0: 190 | correct += 1 191 | acc = correct / len(pred_vocab) 192 | if report: 193 | print("ACC: %.4f" % acc ) 194 | print("Inters. size: ",len(pred_vocab) ) 195 | return acc 196 | 197 | 198 | def get_ppl(channel,wlm,wcm): 199 | fn = 'logs/%s.%d.%d.dec' % (channel,wlm,wcm) 200 | 201 | lines = open(fn,'r').read().strip('\n').split('\n') 202 | 203 | idx = lines[-1].rfind("^") 204 | pl_sc = float(lines[-1][idx+1:].strip(' ')) 205 | 206 | tmp = lines[-1][:idx] 207 | idx = tmp.rfind("^") 208 | idx2 = tmp.rfind(" ") 209 | pt_sc = float(tmp[idx+1:idx2].strip(' ')) 210 | 211 | return pl_sc,pt_sc 212 | 213 | 214 | def decoder_acc(channel,conf,wlm,wcm): 215 | rl,order,il,c_alg,cl,IS_ELISA = conf.split('.') 216 | IS_ELISA = bool(int(IS_ELISA)) 217 | lm_file = "lms/%s.%s.fsa.noe" % (rl,order) 218 | acc = 0.0 219 | 220 | # cases sw si ta tl 221 | if IS_ELISA: 222 | test_file = "data/%s/test.elisa.%s.carmel" % (il,cl) if c_alg=="br" else \ 223 | "data/%s/test.elisa.%s.%s.carmel" % (il,cl,c_alg) 224 | test_wf_file = "data/%s/test.elisa.true.filt" % (il) 225 | lexicon_fn = "data/%s/lexicon.elisa" % (il) 226 | 227 | agms = ["sh","decode.sh","-lm",lm_file, 228 | "-ch",channel, 229 | "-i",test_file, 230 | "-wlm",str(wlm), 231 | "-wcm",str(wcm) 232 | ] 233 | pobj = sp.Popen(agms) 234 | while pobj.wait(): continue 235 | 236 | acc = eval_lexicon(lexicon_fn, '%s.%s.%d.%d.decoded' % (test_file,channel,wlm,wcm),test_wf_file,False) 237 | 238 | else: 239 | test_file = "data/%s/test.%s.carmel" % (il,cl) if c_alg=="br" else \ 240 | "data/%s/test.%s.%s.carmel" % (il,cl,c_alg) 241 | goldfn = "data/%s/test.upos" % (il) 242 | 243 | agms = ["sh","decode.sh","-lm",lm_file, 244 | "-ch",channel, 245 | "-i",test_file, 246 | "-wlm",str(wlm), 247 | "-wcm",str(wcm) 248 | ] 249 | pobj = sp.Popen(agms) 250 | while pobj.wait(): continue 251 | 252 | # if args.dec_ch: 253 | acc = evaluate(goldfn,'%s.%s.%d.%d.decoded' % (test_file,channel,wlm,wcm),False) 254 | 255 | return acc 256 | 257 | 258 | def post_process(tk_rom_fn, dec_fn,out_fn,pos_tagset="ud"): 259 | outfile = open(out_fn,'w') 260 | toks_rom_file = open(tk_rom_fn,'r') 261 | tags_file = open(dec_fn,'r') 262 | 263 | for tag_line in tags_file: 264 | tags = tag_line.split() 265 | tok_rom_line = toks_rom_file.readline().strip('\n') # raw, romanized text, always in txt format 266 | tok_roms = tok_rom_line.split() 267 | 268 | ntags=[] 269 | for tk,tag in zip(tok_roms,tags): 270 | ntags.append( ground_tag(tk,tag,pos_tagset) ) 271 | # 272 | # what if carmel could not decode input? fallback to all nouns 273 | if ntags==[]: 274 | ntags = ["NOUN"]*len(tok_roms) 275 | 276 | print(" ".join(ntags), file=outfile) 277 | 278 | 279 | #END-FOR -------------------------------------------------------------------------------- /src/marlin/basic/city.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 Google, Inc. 
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 | //
21 | // CityHash, by Geoff Pike and Jyrki Alakuijala
22 | //
23 | // This file provides CityHash64() and related functions.
24 | //
25 | // It's probably possible to create even faster hash functions by
26 | // writing a program that systematically explores some of the space of
27 | // possible hash functions, by using SIMD instructions, or by
28 | // compromising on hash quality.
29 | 
30 | #include "city.h"
31 | 
32 | #include <algorithm>
33 | #include <string.h>  // for memcpy and memset
34 | 
35 | using namespace std;
36 | 
37 | static uint64 UNALIGNED_LOAD64(const char *p) {
38 |   uint64 result;
39 |   memcpy(&result, p, sizeof(result));
40 |   return result;
41 | }
42 | 
43 | static uint32 UNALIGNED_LOAD32(const char *p) {
44 |   uint32 result;
45 |   memcpy(&result, p, sizeof(result));
46 |   return result;
47 | }
48 | 
49 | #if !defined(WORDS_BIGENDIAN)
50 | 
51 | #define uint32_in_expected_order(x) (x)
52 | #define uint64_in_expected_order(x) (x)
53 | 
54 | #else
55 | 
56 | #ifdef _MSC_VER
57 | #include <stdlib.h>
58 | #define bswap_32(x) _byteswap_ulong(x)
59 | #define bswap_64(x) _byteswap_uint64(x)
60 | 
61 | #elif defined(__APPLE__)
62 | // Mac OS X / Darwin features
63 | #include <libkern/OSByteOrder.h>
64 | #define bswap_32(x) OSSwapInt32(x)
65 | #define bswap_64(x) OSSwapInt64(x)
66 | 
67 | #else
68 | #include <byteswap.h>
69 | #endif
70 | 
71 | #define uint32_in_expected_order(x) (bswap_32(x))
72 | #define uint64_in_expected_order(x) (bswap_64(x))
73 | 
74 | #endif  // WORDS_BIGENDIAN
75 | 
76 | #if !defined(LIKELY)
77 | #if HAVE_BUILTIN_EXPECT
78 | #define LIKELY(x) (__builtin_expect(!!(x), 1))
79 | #else
80 | #define LIKELY(x) (x)
81 | #endif
82 | #endif
83 | 
84 | static uint64 Fetch64(const char *p) {
85 |   return uint64_in_expected_order(UNALIGNED_LOAD64(p));
86 | }
87 | 
88 | static uint32 Fetch32(const char *p) {
89 |   return uint32_in_expected_order(UNALIGNED_LOAD32(p));
90 | }
91 | 
92 | // Some primes between 2^63 and 2^64 for various uses.
93 | static const uint64 k0 = 0xc3a5c85c97cb3127ULL;
94 | static const uint64 k1 = 0xb492b66fbe98f273ULL;
95 | static const uint64 k2 = 0x9ae16a3b2f90404fULL;
96 | static const uint64 k3 = 0xc949d7c7509e6557ULL;
97 | 
98 | // Bitwise right rotate. Normally this will compile to a single
99 | // instruction, especially if the shift is a manifest constant.
100 | static uint64 Rotate(uint64 val, int shift) {
101 |   // Avoid shifting by 64: doing so yields an undefined result.
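    | // (Worked example: rotating 0x0000000000000001 right by 1 moves the low
    | // bit to the top: Rotate(1, 1) == (1 >> 1) | (1 << 63) == 0x8000000000000000.)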
102 |   return shift == 0 ? val : ((val >> shift) | (val << (64 - shift)));
103 | }
104 | 
105 | // Equivalent to Rotate(), but requires the second arg to be non-zero.
106 | // On x86-64, and probably others, it's possible for this to compile
107 | // to a single instruction if both args are already in registers.
108 | static uint64 RotateByAtLeast1(uint64 val, int shift) {
109 |   return (val >> shift) | (val << (64 - shift));
110 | }
111 | 
112 | static uint64 ShiftMix(uint64 val) {
113 |   return val ^ (val >> 47);
114 | }
115 | 
116 | static uint64 HashLen16(uint64 u, uint64 v) {
117 |   return Hash128to64(uint128(u, v));
118 | }
119 | 
120 | static uint64 HashLen0to16(const char *s, size_t len) {
121 |   if (len > 8) {
122 |     uint64 a = Fetch64(s);
123 |     uint64 b = Fetch64(s + len - 8);
124 |     return HashLen16(a, RotateByAtLeast1(b + len, len)) ^ b;
125 |   }
126 |   if (len >= 4) {
127 |     uint64 a = Fetch32(s);
128 |     return HashLen16(len + (a << 3), Fetch32(s + len - 4));
129 |   }
130 |   if (len > 0) {
131 |     uint8 a = s[0];
132 |     uint8 b = s[len >> 1];
133 |     uint8 c = s[len - 1];
134 |     uint32 y = static_cast<uint32>(a) + (static_cast<uint32>(b) << 8);
135 |     uint32 z = len + (static_cast<uint32>(c) << 2);
136 |     return ShiftMix(y * k2 ^ z * k3) * k2;
137 |   }
138 |   return k2;
139 | }
140 | 
141 | // This probably works well for 16-byte strings as well, but it may be overkill
142 | // in that case.
143 | static uint64 HashLen17to32(const char *s, size_t len) {
144 |   uint64 a = Fetch64(s) * k1;
145 |   uint64 b = Fetch64(s + 8);
146 |   uint64 c = Fetch64(s + len - 8) * k2;
147 |   uint64 d = Fetch64(s + len - 16) * k0;
148 |   return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d,
149 |                    a + Rotate(b ^ k3, 20) - c + len);
150 | }
151 | 
152 | // Return a 16-byte hash for 48 bytes.  Quick and dirty.
153 | // Callers do best to use "random-looking" values for a and b.
154 | static pair<uint64, uint64> WeakHashLen32WithSeeds(
155 |     uint64 w, uint64 x, uint64 y, uint64 z, uint64 a, uint64 b) {
156 |   a += w;
157 |   b = Rotate(b + a + z, 21);
158 |   uint64 c = a;
159 |   a += x;
160 |   a += y;
161 |   b += Rotate(a, 44);
162 |   return make_pair(a + z, b + c);
163 | }
164 | 
165 | // Return a 16-byte hash for s[0] ... s[31], a, and b.  Quick and dirty.
166 | static pair<uint64, uint64> WeakHashLen32WithSeeds(
167 |     const char* s, uint64 a, uint64 b) {
168 |   return WeakHashLen32WithSeeds(Fetch64(s),
169 |                                 Fetch64(s + 8),
170 |                                 Fetch64(s + 16),
171 |                                 Fetch64(s + 24),
172 |                                 a,
173 |                                 b);
174 | }
175 | 
176 | // Return an 8-byte hash for 33 to 64 bytes.
177 | static uint64 HashLen33to64(const char *s, size_t len) {
178 |   uint64 z = Fetch64(s + 24);
179 |   uint64 a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0;
180 |   uint64 b = Rotate(a + z, 52);
181 |   uint64 c = Rotate(a, 37);
182 |   a += Fetch64(s + 8);
183 |   c += Rotate(a, 7);
184 |   a += Fetch64(s + 16);
185 |   uint64 vf = a + z;
186 |   uint64 vs = b + Rotate(a, 31) + c;
187 |   a = Fetch64(s + 16) + Fetch64(s + len - 32);
188 |   z = Fetch64(s + len - 8);
189 |   b = Rotate(a + z, 52);
190 |   c = Rotate(a, 37);
191 |   a += Fetch64(s + len - 24);
192 |   c += Rotate(a, 7);
193 |   a += Fetch64(s + len - 16);
194 |   uint64 wf = a + z;
195 |   uint64 ws = b + Rotate(a, 31) + c;
196 |   uint64 r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0);
197 |   return ShiftMix(r * k0 + vs) * k2;
198 | }
199 | 
200 | uint64 CityHash64(const char *s, size_t len) {
201 |   if (len <= 32) {
202 |     if (len <= 16) {
203 |       return HashLen0to16(s, len);
204 |     } else {
205 |       return HashLen17to32(s, len);
206 |     }
207 |   } else if (len <= 64) {
208 |     return HashLen33to64(s, len);
209 |   }
210 | 
211 |   // For strings over 64 bytes we hash the end first, and then as we
212 |   // loop we keep 56 bytes of state: v, w, x, y, and z.
213 |   uint64 x = Fetch64(s + len - 40);
214 |   uint64 y = Fetch64(s + len - 16) + Fetch64(s + len - 56);
215 |   uint64 z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24));
216 |   pair<uint64, uint64> v = WeakHashLen32WithSeeds(s + len - 64, len, z);
217 |   pair<uint64, uint64> w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x);
218 |   x = x * k1 + Fetch64(s);
219 | 
220 |   // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks.
221 |   len = (len - 1) & ~static_cast<size_t>(63);
222 |   do {
223 |     x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
224 |     y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
225 |     x ^= w.second;
226 |     y += v.first + Fetch64(s + 40);
227 |     z = Rotate(z + w.first, 33) * k1;
228 |     v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
229 |     w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
230 |     std::swap(z, x);
231 |     s += 64;
232 |     len -= 64;
233 |   } while (len != 0);
234 |   return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z,
235 |                    HashLen16(v.second, w.second) + x);
236 | }
237 | 
238 | uint64 CityHash64WithSeed(const char *s, size_t len, uint64 seed) {
239 |   return CityHash64WithSeeds(s, len, k2, seed);
240 | }
241 | 
242 | uint64 CityHash64WithSeeds(const char *s, size_t len,
243 |                            uint64 seed0, uint64 seed1) {
244 |   return HashLen16(CityHash64(s, len) - seed0, seed1);
245 | }
246 | 
247 | // A subroutine for CityHash128().  Returns a decent 128-bit hash for strings
248 | // of any length representable in signed long.  Based on City and Murmur.
249 | static uint128 CityMurmur(const char *s, size_t len, uint128 seed) {
250 |   uint64 a = Uint128Low64(seed);
251 |   uint64 b = Uint128High64(seed);
252 |   uint64 c = 0;
253 |   uint64 d = 0;
254 |   signed long l = len - 16;
255 |   if (l <= 0) {  // len <= 16
256 |     a = ShiftMix(a * k1) * k1;
257 |     c = b * k1 + HashLen0to16(s, len);
258 |     d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c));
259 |   } else {  // len > 16
260 |     c = HashLen16(Fetch64(s + len - 8) + k1, a);
261 |     d = HashLen16(b + len, c + Fetch64(s + len - 16));
262 |     a += d;
263 |     do {
264 |       a ^= ShiftMix(Fetch64(s) * k1) * k1;
265 |       a *= k1;
266 |       b ^= a;
267 |       c ^= ShiftMix(Fetch64(s + 8) * k1) * k1;
268 |       c *= k1;
269 |       d ^= c;
270 |       s += 16;
271 |       l -= 16;
272 |     } while (l > 0);
273 |   }
274 |   a = HashLen16(a, c);
275 |   b = HashLen16(d, b);
276 |   return uint128(a ^ b, HashLen16(b, a));
277 | }
278 | 
279 | uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed) {
280 |   if (len < 128) {
281 |     return CityMurmur(s, len, seed);
282 |   }
283 | 
284 |   // We expect len >= 128 to be the common case.  Keep 56 bytes of state:
285 |   // v, w, x, y, and z.
286 |   pair<uint64, uint64> v, w;
287 |   uint64 x = Uint128Low64(seed);
288 |   uint64 y = Uint128High64(seed);
289 |   uint64 z = len * k1;
290 |   v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s);
291 |   v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8);
292 |   w.first = Rotate(y + z, 35) * k1 + x;
293 |   w.second = Rotate(x + Fetch64(s + 88), 53) * k1;
294 | 
295 |   // This is the same inner loop as CityHash64(), manually unrolled.
296 |   do {
297 |     x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
298 |     y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
299 |     x ^= w.second;
300 |     y += v.first + Fetch64(s + 40);
301 |     z = Rotate(z + w.first, 33) * k1;
302 |     v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
303 |     w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
304 |     std::swap(z, x);
305 |     s += 64;
306 |     x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
307 |     y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
308 |     x ^= w.second;
309 |     y += v.first + Fetch64(s + 40);
310 |     z = Rotate(z + w.first, 33) * k1;
311 |     v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
312 |     w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
313 |     std::swap(z, x);
314 |     s += 64;
315 |     len -= 128;
316 |   } while (LIKELY(len >= 128));
317 |   x += Rotate(v.first + z, 49) * k0;
318 |   z += Rotate(w.first, 37) * k0;
319 |   // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s.
320 |   for (size_t tail_done = 0; tail_done < len; ) {
321 |     tail_done += 32;
322 |     y = Rotate(x + y, 42) * k0 + v.second;
323 |     w.first += Fetch64(s + len - tail_done + 16);
324 |     x = x * k0 + w.first;
325 |     z += w.second + Fetch64(s + len - tail_done);
326 |     w.second += v.first;
327 |     v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second);
328 |   }
329 |   // At this point our 56 bytes of state should contain more than
330 |   // enough information for a strong 128-bit hash.  We use two
331 |   // different 56-byte-to-8-byte hashes to get a 16-byte final result.
332 |   x = HashLen16(x, v.first);
333 |   y = HashLen16(y + z, w.first);
334 |   return uint128(HashLen16(x + v.second, w.second) + y,
335 |                  HashLen16(x + w.second, y + v.second));
336 | }
337 | 
338 | uint128 CityHash128(const char *s, size_t len) {
339 |   if (len >= 16) {
340 |     return CityHash128WithSeed(s + 16,
341 |                                len - 16,
342 |                                uint128(Fetch64(s) ^ k3,
343 |                                        Fetch64(s + 8)));
344 |   } else if (len >= 8) {
345 |     return CityHash128WithSeed(NULL,
346 |                                0,
347 |                                uint128(Fetch64(s) ^ (len * k0),
348 |                                        Fetch64(s + len - 8) ^ k1));
349 |   } else {
350 |     return CityHash128WithSeed(s, len, uint128(k0, k1));
351 |   }
352 | }
353 | 
354 | #ifdef __SSE4_2__
355 | #include <citycrc.h>
356 | #include <nmmintrin.h>
357 | 
358 | // Requires len >= 240.
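    | // (Each pass of the main loop below consumes six 40-byte CHUNKs, i.e. 240
    | // bytes of input per iteration, which is why callers must supply len >= 240.)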
359 | static void CityHashCrc256Long(const char *s, size_t len, 360 | uint32 seed, uint64 *result) { 361 | uint64 a = Fetch64(s + 56) + k0; 362 | uint64 b = Fetch64(s + 96) + k0; 363 | uint64 c = result[0] = HashLen16(b, len); 364 | uint64 d = result[1] = Fetch64(s + 120) * k0 + len; 365 | uint64 e = Fetch64(s + 184) + seed; 366 | uint64 f = seed; 367 | uint64 g = 0; 368 | uint64 h = 0; 369 | uint64 i = 0; 370 | uint64 j = 0; 371 | uint64 t = c + d; 372 | 373 | // 240 bytes of input per iter. 374 | size_t iters = len / 240; 375 | len -= iters * 240; 376 | do { 377 | #define CHUNK(multiplier, z) \ 378 | { \ 379 | uint64 old_a = a; \ 380 | a = Rotate(b, 41 ^ z) * multiplier + Fetch64(s); \ 381 | b = Rotate(c, 27 ^ z) * multiplier + Fetch64(s + 8); \ 382 | c = Rotate(d, 41 ^ z) * multiplier + Fetch64(s + 16); \ 383 | d = Rotate(e, 33 ^ z) * multiplier + Fetch64(s + 24); \ 384 | e = Rotate(t, 25 ^ z) * multiplier + Fetch64(s + 32); \ 385 | t = old_a; \ 386 | } \ 387 | f = _mm_crc32_u64(f, a); \ 388 | g = _mm_crc32_u64(g, b); \ 389 | h = _mm_crc32_u64(h, c); \ 390 | i = _mm_crc32_u64(i, d); \ 391 | j = _mm_crc32_u64(j, e); \ 392 | s += 40 393 | 394 | CHUNK(1, 1); CHUNK(k0, 0); 395 | CHUNK(1, 1); CHUNK(k0, 0); 396 | CHUNK(1, 1); CHUNK(k0, 0); 397 | } while (--iters > 0); 398 | 399 | while (len >= 40) { 400 | CHUNK(k0, 0); 401 | len -= 40; 402 | } 403 | if (len > 0) { 404 | s = s + len - 40; 405 | CHUNK(k0, 0); 406 | } 407 | j += i << 32; 408 | a = HashLen16(a, j); 409 | h += g << 32; 410 | b += h; 411 | c = HashLen16(c, f) + i; 412 | d = HashLen16(d, e + result[0]); 413 | j += e; 414 | i += HashLen16(h, t); 415 | e = HashLen16(a, d) + j; 416 | f = HashLen16(b, c) + a; 417 | g = HashLen16(j, i) + c; 418 | result[0] = e + f + g + h; 419 | a = ShiftMix((a + g) * k0) * k0 + b; 420 | result[1] += a + result[0]; 421 | a = ShiftMix(a * k0) * k0 + c; 422 | result[2] = a + result[1]; 423 | a = ShiftMix((a + e) * k0) * k0; 424 | result[3] = a + result[2]; 425 | } 426 | 427 | // Requires len < 240. 
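    | // (The short case zero-pads the input into a fixed 240-byte buffer and
    | // reuses the long routine; seeding with ~len keeps inputs of different
    | // lengths from hashing alike after padding.)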
428 | static void CityHashCrc256Short(const char *s, size_t len, uint64 *result) {
429 |   char buf[240];
430 |   memcpy(buf, s, len);
431 |   memset(buf + len, 0, 240 - len);
432 |   CityHashCrc256Long(buf, 240, ~static_cast<uint32>(len), result);
433 | }
434 | 
435 | void CityHashCrc256(const char *s, size_t len, uint64 *result) {
436 |   if (LIKELY(len >= 240)) {
437 |     CityHashCrc256Long(s, len, 0, result);
438 |   } else {
439 |     CityHashCrc256Short(s, len, result);
440 |   }
441 | }
442 | 
443 | uint128 CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed) {
444 |   if (len <= 900) {
445 |     return CityHash128WithSeed(s, len, seed);
446 |   } else {
447 |     uint64 result[4];
448 |     CityHashCrc256(s, len, result);
449 |     uint64 u = Uint128High64(seed) + result[0];
450 |     uint64 v = Uint128Low64(seed) + result[1];
451 |     return uint128(HashLen16(u, v + result[2]),
452 |                    HashLen16(Rotate(v, 32), u * k0 + result[3]));
453 |   }
454 | }
455 | 
456 | uint128 CityHashCrc128(const char *s, size_t len) {
457 |   if (len <= 900) {
458 |     return CityHash128(s, len);
459 |   } else {
460 |     uint64 result[4];
461 |     CityHashCrc256(s, len, result);
462 |     return uint128(result[2], result[3]);
463 |   }
464 | }
465 | 
466 | #endif
467 | 
-------------------------------------------------------------------------------- /src/marlin/marlin_cluster.cc: --------------------------------------------------------------------------------
1 | #include "basic/std.h"
2 | #include "basic/stl-basic.h"
3 | #include "basic/stl-utils.h"
4 | #include "basic/opt.h"
5 | #include <map>        // NOTE: bracketed include lost in extraction; inferred from the map use below
6 | #include <algorithm>  // NOTE: bracketed include lost in extraction; inferred from the lower_bound use below
7 | 
8 | vector< OptInfo<bool> > bool_opts;
9 | vector< OptInfo<int> > int_opts;
10 | vector< OptInfo<double> > double_opts;
11 | vector< OptInfo<string> > string_opts;
12 | 
13 | opt_define_string_req(unigram_file_, "words", "Text file with words.");
14 | opt_define_string_req(bigram_file_, "bigrams", "Text file with bigrams.");
15 | opt_define_string_req(output_, "output", "Output.");
16 | 
17 | opt_define_double(alpha_, "alpha", 0.0, "Character Model Strength [0,1].");
18 | opt_define_int(num_classes_, "c", 1000, "Number of clusters.");
19 | opt_define_int(num_steps_, "steps", -1, "Number of steps or -1 to run until convergence.");
20 | opt_define_int(rand_seed, "rand", time(NULL)*getpid(), "Number to call srand with.");
21 | opt_define_int(verbose_, "v", 2, "Verbosity level.");
22 | opt_define_int(sanity_checks_, "sanity", 0, "Sanity Check level. Makes things slow!");
Makes things slow!"); 23 | opt_define_int(cache_size_, "cache-size", 100000, "N up to which to cache n * log(n)."); 24 | 25 | const real REAL_NEG_INF = -1e99; 26 | 27 | struct UpdateResult { 28 | int current_w; 29 | int current_c; 30 | int best_c; 31 | real best_delta; 32 | DoubleVec deltas; 33 | 34 | UpdateResult() { 35 | deltas.resize(num_classes_); 36 | } 37 | 38 | }; 39 | 40 | struct Entry{ 41 | int item; 42 | int count; 43 | }; 44 | 45 | bool compare_entry(const Entry& entry, const Entry& other) { 46 | return entry.item < other.item; 47 | } 48 | 49 | typedef vector EntryVec; 50 | typedef vector EntryVecVec; 51 | 52 | void addToSparseEntryVec(EntryVec& vector, int klass, int count) { 53 | Entry entry; 54 | entry.item = klass; 55 | entry.count = count; 56 | 57 | EntryVec::iterator pos = lower_bound (vector.begin(), vector.end(), entry, compare_entry); 58 | 59 | if (pos != vector.end() && pos->item == klass) { 60 | pos->count += count; 61 | } else { 62 | vector.insert(pos, entry); 63 | } 64 | 65 | } 66 | 67 | struct BiEntry{ 68 | int item; 69 | int item2; 70 | int count; 71 | }; 72 | 73 | typedef vector BiEntryVec; 74 | typedef vector BiEntryVecVec; 75 | 76 | // Observed counts 77 | EntryVecVec left_context_; 78 | EntryVecVec right_context_; 79 | 80 | IntVec word_counts_; 81 | 82 | IntVec class_class_counts_; 83 | IntVec class_counts_; 84 | IntVecVec class_char_char_counts_; 85 | IntVecVec class_char_counts_; 86 | 87 | IntVec word_assignment_; 88 | 89 | int num_words_; 90 | int num_chars_; 91 | 92 | // Whether a character bigram model is used. Depends on alpha. 93 | bool character_; 94 | 95 | StringVec word_forms_; 96 | IntVecVec word_chars_; 97 | 98 | // These data structures are needed to make update() efficient. 99 | 100 | // Number of times a class has been seen left (right) of the current word. 101 | EntryVec left_class_counts_; 102 | EntryVec right_class_counts_; 103 | 104 | EntryVec left_tri_class_counts_; 105 | EntryVec center_tri_class_counts_; 106 | EntryVec right_tri_class_counts_; 107 | 108 | // Current word 109 | int current_word_; 110 | 111 | // Character bigrams and unigrams occuring in current word. 112 | EntryVec char_bigram_counts_; 113 | EntryVec char_unigram_counts_; 114 | 115 | // Class bigrams affected by current class change. 116 | IntVec current_class_class_counts_; 117 | 118 | // Number of times a word cooccurs with itself. 
119 | int word_word_count_; 120 | int word_word_word_count_; 121 | EntryVec word_word_x_counts_; 122 | EntryVec word_x_word_counts_; 123 | EntryVec x_word_word_counts_; 124 | 125 | // Constant term of the Likelihood that depends on word form counts 126 | real word_ll_term_; 127 | 128 | DoubleVec nlogn_cache_; 129 | 130 | real nlogn(int n) { 131 | if (n<0) 132 | cout << "nlogn : " << n << endl; 133 | assert (n >= 0); 134 | 135 | if (n == 0) { 136 | return 0; 137 | } 138 | 139 | if (n - 1 < cache_size_) { 140 | return nlogn_cache_[n - 1]; 141 | } 142 | 143 | return n * log(n); 144 | } 145 | 146 | int classIndex(int i, int j) { 147 | assert (i >= 0 && i < num_classes_); 148 | assert (j >= 0 && j < num_classes_); 149 | int index = i * num_classes_ + j; 150 | assert (index >= 0 && index < num_classes_ * num_classes_); 151 | return index; 152 | } 153 | 154 | void addCharBigram(int klass, int c, int k, int count) { 155 | int index = c * num_chars_ + k; 156 | class_char_char_counts_[klass][index] += count; 157 | } 158 | 159 | void incrementChars(int word, int klass, int factor) { 160 | int last = 0; 161 | IntVec& chars = word_chars_[word]; 162 | 163 | forvec(_, int, c, chars) { 164 | addCharBigram(klass, last, c, factor * word_counts_[word]); 165 | 166 | class_char_counts_[klass][c] += factor * word_counts_[word]; 167 | assert (class_char_counts_[klass][c] >= 0); 168 | 169 | last = c; 170 | } 171 | 172 | addCharBigram(klass, last, 0, factor * word_counts_[word]); 173 | 174 | class_char_counts_[klass][0] += factor * word_counts_[word]; 175 | assert (class_char_counts_[klass][last] >= 0); 176 | } 177 | 178 | void assignToZero() { 179 | 180 | fill(word_assignment_.begin(), word_assignment_.end(), 0); 181 | 182 | for (int word = 0; word < num_words_; word++) { 183 | class_counts_[0] += word_counts_[word]; 184 | if (character_) { 185 | if (word > 0) { 186 | incrementChars(word, 0, +1); 187 | } 188 | } 189 | } 190 | class_class_counts_[0] = class_counts_[0]; 191 | } 192 | 193 | void addTagTagCount(int klass, int cclass, int count) { 194 | class_class_counts_[classIndex(klass, cclass)] += count; 195 | } 196 | 197 | void incrementBigrams(int word, int klass, int factor) { 198 | forvec (_, Entry, entry, left_context_[word]) { 199 | int cword = entry.item; 200 | if (cword != word) { 201 | int cclass = word_assignment_[cword]; 202 | addTagTagCount(cclass, klass, factor * entry.count); 203 | } else { 204 | addTagTagCount(klass, klass, factor * entry.count); 205 | } 206 | } 207 | 208 | forvec (_, Entry, entry, right_context_[word]) { 209 | int cword = entry.item; 210 | if (cword != word) { 211 | int cclass = word_assignment_[cword]; 212 | addTagTagCount(klass, cclass, factor * entry.count); 213 | } 214 | } 215 | } 216 | 217 | void increment(int word, int klass, int factor) { 218 | 219 | assert (word > 0); 220 | assert (klass > 0 || factor < 0); 221 | 222 | class_counts_[klass] += factor * word_counts_[word]; 223 | 224 | incrementBigrams(word, klass, factor); 225 | 226 | if (character_) { 227 | incrementChars(word, klass, factor); 228 | } 229 | 230 | word_assignment_[word] = klass; 231 | } 232 | 233 | void randomInit() { 234 | assert (num_words_ > num_classes_); 235 | 236 | assignToZero(); 237 | 238 | int half_num_classes = num_classes_ / 2; 239 | 240 | for (int word = 1; word < half_num_classes; word++) { 241 | increment(word, 0, -1); 242 | increment(word, word, +1); 243 | } 244 | 245 | for (int word = half_num_classes; word < num_words_; word++) { 246 | int klass = half_num_classes + 
mrand((int)ceil(num_classes_ / 2.));
247 |         increment(word, 0, -1);
248 |         increment(word, klass, +1);
249 |     }
250 | }
251 | 
252 | void strtok(StringVec& vec, string string, char delim) {
253 |     uint last = 0;
254 |     for (uint i=0; i < string.length(); i++) {
255 |         char c = string[i];
256 |         if (c == ' ' || (delim > 0 && delim == c )) {
257 |             int length = i - last;
258 |             if (length > 0)
259 |                 vec.push_back(string.substr(last, length));
260 |             last = i + 1;
261 |         }
262 |     }
263 | 
264 |     if (last < string.length()) {
265 |         vec.push_back(string.substr(last, string.length() - last));
266 |     }
267 | }
268 | 
269 | void readWordForms() {
270 |     ifstream in(unigram_file_.c_str());
271 |     string buf;
272 |     while(getline(in, buf)) {
273 |         StringVec tokens;
274 |         strtok(tokens, buf, (char) 0);
275 |         const string& word_form = tokens[0];
276 |         word_forms_.push_back(word_form);
277 |     }
278 |     num_words_ = word_forms_.size();
279 | }
280 | 
281 | void readBigrams() {
282 |     ifstream in(bigram_file_.c_str());
283 |     string buf;
284 |     int word = 0;
285 | 
286 |     while(getline(in, buf)) {
287 |         EntryVec& cwords = right_context_[word];
288 | 
289 |         StringVec line;
290 |         strtok(line, buf, (char) 0);
291 | 
292 |         forvec (_, string, pair_string, line) {
293 |             Entry entry;
294 | 
295 |             StringVec tokens;
296 |             strtok(tokens, pair_string, ':');
297 | 
298 |             const string& word_string = tokens[0];
299 |             const string& count_string = tokens[1];
300 | 
301 |             int cword = atoi(word_string.c_str());
302 |             int count = atoi(count_string.c_str());
303 | 
304 |             // Add to right contexts
305 |             entry.item = cword;
306 |             entry.count = count;
307 |             cwords.push_back(entry);
308 | 
309 |             // Add to left contexts
310 |             entry.item = word;
311 |             left_context_[cword].push_back(entry);
312 | 
313 |             word_counts_[word] += count;
314 |         }
315 | 
316 |         word++;
317 |     }
318 | 
319 | 
320 |     // Check consistency
321 |     for (word = 0; word < num_words_; word++) {
322 |         int count = word_counts_[word];
323 | 
324 |         int left_count = 0;
325 |         forvec (_, const Entry&, entry, left_context_[word]) {
326 |             left_count += entry.count;
327 |         }
328 | 
329 |         int right_count = 0;
330 |         forvec (_, const Entry&, entry, right_context_[word]) {
331 |             right_count += entry.count;
332 |         }
333 | 
334 |         assert (count == left_count && count == right_count);
335 |     }
336 | 
337 | }
338 | 
339 | void readData() {
340 |     readWordForms();
341 | 
342 |     if (alpha_ > 1e-5) {
343 |         character_ = true;
344 |     } else {
345 |         character_ = false;
346 |     }
347 | 
348 |     // Fill nlogn cache.
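    | // (The clustering objective below is a sum of n*log(n) terms over integer
    | // counts; nlogn(n) returns nlogn_cache_[n-1], so entry i caches
    | // (i+1)*log(i+1) for the first cache_size_ values.)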
349 |     nlogn_cache_.resize(cache_size_);
350 |     for (int i = 0; i < cache_size_; i++) {
    |         nlogn_cache_[i] = (i + 1) * log(i + 1);
    |     }
    | 
    |     // NOTE: the original statements between the cache-filling loop and the
    |     // CIMap typedef were lost in extraction; the initialization below is
    |     // reconstructed from how the rest of the program uses these structures.
    |     left_context_.resize(num_words_);
    |     right_context_.resize(num_words_);
    |     word_counts_.resize(num_words_);
    | 
    |     readBigrams();
    | 
    |     class_counts_.resize(num_classes_);
    |     class_class_counts_.resize(num_classes_ * num_classes_);
    |     current_class_class_counts_.resize(num_classes_ * num_classes_);
    |     word_assignment_.resize(num_words_);
    | 
    |     if (character_) {
371 |         typedef map<char, int> CIMap;
372 | 
373 |         CIMap table;
374 |         table['^'] = 0;
375 | 
376 |         word_chars_.resize(num_words_);
377 | 
378 |         for (int w = 1; w < num_words_; w++) {
379 |             const string& form = word_forms_[w];
380 | 
381 |             IntVec& chars = word_chars_[w];
382 |             chars.resize(form.length());
383 | 
384 |             for (uint i = 0; i < form.length(); i++) {
385 |                 char c = form[i];
386 |                 if (c=='^') c='"';
387 | 
388 |                 assert (c != '^');
389 | 
390 |                 int k = table[c];
391 | 
392 |                 if (k == 0) {
393 |                     k = table.size();
394 |                     table[c] = k;
395 |                 }
396 | 
397 |                 chars[i] = k;
398 |             }
399 | 
400 |         }
401 | 
402 |         num_chars_ = table.size();
403 | 
404 |         matrix_resize(class_char_char_counts_, num_classes_, num_chars_* num_chars_);
405 |         matrix_resize(class_char_counts_, num_classes_, num_chars_);
406 | 
407 |     }
408 | 
409 |     word_ll_term_ = 0;
410 |     for (int w = 1; w < num_words_; w++) {
411 |         word_ll_term_ += nlogn(word_counts_[w]);
412 |     }
413 | }
414 | 
415 | real calcCharLikelihood(int klass) {
416 |     real ll = 0;
417 | 
418 |     // Bigram Counts
419 |     for (int c = 0; c < num_chars_; c++) {
420 |         for (int k = 0; k < num_chars_; k++) {
421 |             int index = c * num_chars_ + k;
422 |             ll += nlogn(class_char_char_counts_[klass][index]);
423 |         }
424 |     }
425 | 
426 |     // Unigram Counts
427 |     for (int c = 0; c < num_chars_; c++) {
428 |         ll -= nlogn(class_char_counts_[klass][c]);
429 |     }
430 | 
431 |     return ll;
432 | }
433 | 
434 | real calcLikelihood() {
435 |     real ll = 0;
436 | 
437 |     // Bigram Context Counts.
438 |     real ll_b = 0;
439 | 
440 |     for (int c = 0; c < num_classes_; c++) {
441 |         for (int k = 0; k < num_classes_; k++) {
442 |             ll_b += nlogn(class_class_counts_[classIndex(c, k)]);
443 |         }
444 |     }
445 | 
446 |     for (int c = 0; c < num_classes_; c++) {
447 |         ll_b -= nlogn(class_counts_[c]);
448 |     }
449 | 
450 |     ll += ll_b;
451 | 
452 |     // Unigram Word Emission Counts.
453 |     real ll_w = word_ll_term_;
454 |     for (int c = 0; c < num_classes_; c++) {
455 |         ll_w -= nlogn(class_counts_[c]);
456 |     }
457 |     ll += (1.0 - alpha_) * ll_w;
458 | 
459 |     // Character Model Counts.
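    | // (The pieces are interpolated as ll = ll_b + (1 - alpha) * ll_w + alpha * ll_c,
    | // so the --alpha "Character Model Strength" option trades word emissions
    | // against the character bigram model.)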
460 | 461 | real ll_c = 0; 462 | if (character_) { 463 | for (int c = 0; c < num_classes_; c++) { 464 | ll_c += calcCharLikelihood(c); 465 | } 466 | } 467 | ll += alpha_ * ll_c; 468 | 469 | return ll; 470 | } 471 | 472 | void setCharCounts(int w) { 473 | char_bigram_counts_.clear(); 474 | char_unigram_counts_.clear(); 475 | 476 | const IntVec& chars = word_chars_[w]; 477 | 478 | int last = 0; 479 | forvec (_, int, c, chars) { 480 | addToSparseEntryVec(char_bigram_counts_, last * num_chars_ + c, 1); 481 | addToSparseEntryVec(char_unigram_counts_, c, 1); 482 | last = c; 483 | } 484 | 485 | addToSparseEntryVec(char_bigram_counts_, last * num_chars_ + 0, 1); 486 | addToSparseEntryVec(char_unigram_counts_, 0, 1); 487 | } 488 | 489 | void setContextClassCount(int word) { 490 | left_class_counts_.clear(); 491 | right_class_counts_.clear(); 492 | word_word_count_ = 0; 493 | 494 | forvec (_, Entry, entry, left_context_[word]) { 495 | int cword = entry.item; 496 | if (cword != word) { 497 | int cclass = word_assignment_[cword]; 498 | 499 | addToSparseEntryVec(left_class_counts_, cclass, entry.count); 500 | } else { 501 | word_word_count_ = entry.count; 502 | } 503 | } 504 | 505 | forvec (_, Entry, entry, right_context_[word]) { 506 | int cword = entry.item; 507 | if (cword != word) { 508 | int cclass = word_assignment_[cword]; 509 | addToSparseEntryVec(right_class_counts_, cclass, entry.count); 510 | } 511 | } 512 | } 513 | 514 | void setCurrentWord(int w) { 515 | current_word_ = w; 516 | 517 | setContextClassCount(w); 518 | 519 | if (character_) 520 | setCharCounts(w); 521 | } 522 | 523 | real calcCharDelta(int klass) { 524 | int wcount = word_counts_[current_word_]; 525 | real delta = 0; 526 | 527 | forvec (_, const Entry&, entry, char_bigram_counts_) { 528 | int old_count = class_char_char_counts_[klass][entry.item]; 529 | int new_count = old_count + entry.count * wcount; 530 | delta += nlogn(new_count) - nlogn(old_count); 531 | } 532 | 533 | forvec (_, const Entry&, entry, char_unigram_counts_) { 534 | int old_count = class_char_counts_[klass][entry.item]; 535 | int new_count = old_count + entry.count * wcount; 536 | delta -= nlogn(new_count) - nlogn(old_count); 537 | } 538 | 539 | return delta; 540 | } 541 | 542 | real calcLocalDelta(int c, int k, int count) { 543 | assert (k < num_classes_); 544 | assert (c < num_classes_); 545 | assert (count >= 0); 546 | int index = classIndex(c, k); 547 | int oldcount = class_class_counts_[index] + current_class_class_counts_[index]; 548 | int newcount = oldcount + count; 549 | current_class_class_counts_[index] += count; 550 | return nlogn(newcount) - nlogn(oldcount); 551 | } 552 | 553 | void setZero(int c, int k) { 554 | assert (k < num_classes_); 555 | assert (c < num_classes_); 556 | int index = classIndex(c, k); 557 | current_class_class_counts_[index] = 0; 558 | } 559 | 560 | real calcBigramDelta(int klass) { 561 | real delta = 0; 562 | 563 | forvec (_, Entry, entry, left_class_counts_) { 564 | delta += calcLocalDelta(entry.item, klass, entry.count); 565 | } 566 | 567 | forvec (_, Entry, entry, right_class_counts_) { 568 | delta += calcLocalDelta(klass, entry.item, entry.count); 569 | } 570 | 571 | delta += calcLocalDelta(klass, klass, word_word_count_); 572 | 573 | forvec (_, Entry, entry, left_class_counts_) { 574 | setZero(entry.item, klass); 575 | } 576 | 577 | forvec (_, Entry, entry, right_class_counts_) { 578 | setZero(klass, entry.item); 579 | } 580 | 581 | setZero(klass, klass); 582 | 583 | return delta; 584 | } 585 | 586 | real calcDelta(int 
klass) { 587 | int wcount = word_counts_[current_word_]; 588 | real delta = 0; 589 | 590 | real bigram_delta = calcBigramDelta(klass); 591 | 592 | real delta_b = 0; 593 | delta_b += bigram_delta; 594 | delta_b -= (nlogn(class_counts_[klass] + wcount ) - nlogn(class_counts_[klass])); 595 | delta += delta_b; 596 | 597 | // Word Emission 598 | delta -= (1.0 - alpha_) * (nlogn(class_counts_[klass] + wcount ) - nlogn(class_counts_[klass])); 599 | 600 | // Character Bigram Emission 601 | if (character_) { 602 | delta += alpha_ * calcCharDelta(klass); 603 | } 604 | 605 | return delta; 606 | } 607 | 608 | 609 | void update(UpdateResult &r) { 610 | setCurrentWord(r.current_w); 611 | 612 | for (int c = 1; c < num_classes_; c++) { 613 | real delta = calcDelta(c); 614 | r.deltas[c] = delta; 615 | 616 | if (delta > r.best_delta) { 617 | r.best_delta = delta; 618 | r.best_c = c; 619 | } 620 | } 621 | } 622 | 623 | int update() { 624 | int swaps = 0; 625 | 626 | real current_ll = calcLikelihood(); 627 | 628 | UpdateResult r; 629 | 630 | for (int w = 1; w < num_words_; w++) { 631 | r.current_w = w; 632 | r.current_c = word_assignment_[w]; 633 | fill(r.deltas.begin(), r.deltas.end(), 0.0); 634 | r.best_delta = REAL_NEG_INF; 635 | r.best_c = -1; 636 | 637 | increment(w, r.current_c, -1); 638 | update(r); 639 | 640 | current_ll = current_ll - r.deltas[r.current_c] + r.deltas[r.best_c]; 641 | increment(w, r.best_c, +1); 642 | 643 | if (r.current_c != r.best_c) { 644 | swaps += 1; 645 | } 646 | 647 | if (sanity_checks_ > 0) { 648 | real actual_ll = calcLikelihood(); 649 | real delta = fabs((actual_ll - current_ll) / current_ll); 650 | if (delta > 1e-5) { 651 | cerr << "Sanity check failed: " << delta << " " << actual_ll << " " << current_ll << endl; 652 | assert (false); 653 | } 654 | } 655 | 656 | if (verbose_ > 1 && w % (num_words_ / 4) == 0) { 657 | fprintf(stderr, "W:%6d / %6d LL: %g Swaps: %5d\n", w, num_words_, current_ll, swaps); 658 | } 659 | } 660 | 661 | if (verbose_ > 0) { 662 | fprintf(stderr, "W:%6d / %6d LL: %g Swaps: %5d\n", num_words_, num_words_, current_ll, swaps); 663 | } 664 | 665 | return swaps; 666 | } 667 | 668 | void writeAssignment() { 669 | ofstream os; 670 | os.open(output_); 671 | for (int w = 0; w < num_words_; w++) { 672 | os << word_forms_[w] << ' ' << word_assignment_[w] << endl; 673 | } 674 | os.close(); 675 | } 676 | 677 | int main(int argc, char** argv) { 678 | clock_t t1; 679 | t1=clock(); 680 | init_opt(argc, argv); 681 | 682 | readData(); 683 | randomInit(); 684 | 685 | if (verbose_ > 0) { 686 | cerr << "W: Number of words processed / total" << endl 687 | << "LL: Current log-likelihood" << endl 688 | << "Swaps: Number of words that changed their class" << endl 689 | << endl; 690 | } 691 | 692 | for (int step = 0; num_steps_ < 0 || step < num_steps_; step++) { 693 | if (verbose_ > 0) 694 | cerr << "iter: " << step << endl; 695 | 696 | int swaps = update(); 697 | 698 | if (swaps == 0) { 699 | break; 700 | } 701 | 702 | } 703 | 704 | writeAssignment(); 705 | 706 | float secs = ((float)clock()-(float)t1) / CLOCKS_PER_SEC; 707 | cerr << "Time: " << secs << endl; 708 | 709 | return 0; 710 | } 711 | 712 | --------------------------------------------------------------------------------