├── src
│   ├── marlin
│   │   ├── basic
│   │   │   ├── stl-basic.cc
│   │   │   ├── stl-utils.cc
│   │   │   ├── city.o
│   │   │   ├── opt.o
│   │   │   ├── std.o
│   │   │   ├── str.o
│   │   │   ├── stl-basic.o
│   │   │   ├── stl-utils.o
│   │   │   ├── str.h
│   │   │   ├── COPYRIGHT
│   │   │   ├── str.cc
│   │   │   ├── std.cc
│   │   │   ├── opt.h
│   │   │   ├── stl-basic.h
│   │   │   ├── city.h
│   │   │   ├── std.h
│   │   │   ├── opt.cc
│   │   │   ├── stl-utils.h
│   │   │   └── city.cc
│   │   ├── marlin_cluster
│   │   ├── marlin_cluster.o
│   │   ├── example
│   │   │   ├── example.txt.bz2
│   │   │   └── example.sh
│   │   ├── Makefile
│   │   ├── README
│   │   ├── log
│   │   ├── marlin_count
│   │   └── marlin_cluster.cc
│   ├── brown
│   └── code
│       ├── create_fst.sh
│       ├── rom_conllu.sh
│       ├── clean_map_decode.py
│       ├── pos2char.py
│       ├── label_dictionary.py
│       ├── tag_with_clusters.sh
│       ├── replace-unicode-punctuation.perl
│       ├── preprocess.sh
│       ├── conllu2txt.py
│       ├── arpa2wfst.sh
│       ├── run_clustering.sh
│       ├── decode.sh
│       ├── train_combined_lm.sh
│       ├── train_channel.sh
│       ├── combine_channels.py
│       ├── tag_text.py
│       ├── makelmfsa_x.cpp
│       ├── setup_ud-treebank_data.sh
│       ├── makelmfsa.cpp
│       ├── elisa2flat.py
│       ├── filter_lowfreq.py
│       ├── train_cipher.py
│       └── utils.py
├── .gitignore
├── .gitmodules
├── train_srilm_langmodel.sh
├── README.md
├── train_format_lm_ud.sh
└── utagger
--------------------------------------------------------------------------------
/src/marlin/basic/stl-basic.cc:
--------------------------------------------------------------------------------
1 | #include "stl-basic.h"
2 |
--------------------------------------------------------------------------------
/src/marlin/basic/stl-utils.cc:
--------------------------------------------------------------------------------
1 | #include "stl-utils.h"
2 |
--------------------------------------------------------------------------------
/src/brown:
--------------------------------------------------------------------------------
1 | /home/ronald/universal-cipher-pos-tagging/src/brown-cluster
--------------------------------------------------------------------------------
/src/marlin/basic/city.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/basic/city.o
--------------------------------------------------------------------------------
/src/marlin/basic/opt.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/basic/opt.o
--------------------------------------------------------------------------------
/src/marlin/basic/std.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/basic/std.o
--------------------------------------------------------------------------------
/src/marlin/basic/str.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/basic/str.o
--------------------------------------------------------------------------------
/src/marlin/marlin_cluster:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/marlin_cluster
--------------------------------------------------------------------------------
/src/marlin/basic/stl-basic.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/basic/stl-basic.o
--------------------------------------------------------------------------------
/src/marlin/basic/stl-utils.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/basic/stl-utils.o
--------------------------------------------------------------------------------
/src/marlin/marlin_cluster.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/marlin_cluster.o
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/*
2 | src/code/__pycache__/*
3 | lm_data/*
4 | lms/*
5 | exp-cipher/*
6 | *.pyc
7 | *.o
8 | utagger_hpc
9 | gen_exp.sh
--------------------------------------------------------------------------------
/src/marlin/example/example.txt.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isi-nlp/universal-cipher-pos-tagging/HEAD/src/marlin/example/example.txt.bz2
--------------------------------------------------------------------------------
/src/marlin/example/example.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | set -ue
4 |
5 | ../marlin_count --text example.txt.bz2 --bigrams bigrams --words words
6 | ../marlin_cluster --words words --bigrams bigrams --output classes --c 100 --steps 5
7 | rm words bigrams
8 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "src/uroman"]
2 | 	path = src/uroman
3 | 	url = git@github.com:isi-nlp/uroman.git
4 | [submodule "src/brown-cluster"]
5 | 	path = src/brown-cluster
6 | 	url = git@github.com:percyliang/brown-cluster.git
7 | [submodule "src/carmel"]
8 | 	path = src/carmel
9 | 	url = git@github.com:isi-nlp/carmel.git
10 |
--------------------------------------------------------------------------------
/src/code/create_fst.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | nclusters=$1
4 | tagset="N O A R I B C J D T M F P E Y V X"
5 | nc1=$(($nclusters - 1))
6 |
7 | echo "S"
8 | echo '(S (S "<s>" "<s>" 1))'
9 | echo '(S (S "</s>" "</s>" 1))'
10 | for tag in $tagset
11 | do
12 |   for nc in `seq 0 $nc1`
13 |   do
14 |     echo '(S (S "'$tag'" "'$nc'" 1))'
15 |   done
16 | done
--------------------------------------------------------------------------------
/src/code/rom_conllu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | lang=$1
4 | input=$2
5 | output=$3
6 | UR_DIR=$4
7 |
8 | cut -f 2 $input | $UR_DIR/bin/uroman.pl -l $lang | \
9 | awk -F'\t' '{OFS = FS} FNR==NR{a[NR]=$1;next}{$2=a[FNR]}1' \
10 | /dev/stdin $input > temp
11 |
12 | cut -f 3 temp | $UR_DIR/bin/uroman.pl -l $lang | \
13 | awk -F'\t' '{OFS = FS} FNR==NR{a[NR]=$1;next}{$3=a[FNR]}1' \
14 | /dev/stdin temp > $output
15 |
16 | rm temp
--------------------------------------------------------------------------------
/src/code/clean_map_decode.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from utils import char2upos
3 |
4 | inputfn = sys.argv[1]
5 | outputfn = sys.argv[2]
6 | outfile = open(outputfn,'w')
7 | for line in open(inputfn,'r'):
8 |     line=line.strip('\n')
9 |     if line=='': continue
10 |     dec = [char2upos[c.strip('"')] for c in line.split()[1:-1] ]
11 |     # assuming line always like this: <s> ... </s> (#eos already strips while cl_tagging)
12 |     print(" ".join(dec),file=outfile)
--------------------------------------------------------------------------------
/src/marlin/Makefile:
--------------------------------------------------------------------------------
1 | # 1.2: need to make sure opt.o goes in the right order to get the right scope on the command-line arguments
2 | # Use this for Linux
3 | files=$(subst .cc,.o,$(shell /bin/ls basic/*.cc))
4 |
5 | all: marlin_cluster
6 |
7 | marlin_cluster: marlin_cluster.o $(files)
8 | 	g++ -std=c++11 -Wall -g -O3 -o marlin_cluster marlin_cluster.o $(files)
9 |
10 | %.o: %.cc
11 | 	g++ -Wunused -std=c++11 -Wall -g -O3 -o $@ -c $<
12 |
13 | clean:
14 | 	rm -rf marlin_cluster basic/*.o *.o
15 |
--------------------------------------------------------------------------------
/src/code/pos2char.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from utils import *
3 | import pdb
4 | import argparse
5 |
6 |
7 | if __name__ == "__main__":
8 |     parser = argparse.ArgumentParser()
9 |     parser.add_argument("--ts" ,"-ts", type=str, default="ud", help="Tagset name [ud,ut]")
10 |
11 |     args = parser.parse_args()
12 |
13 |     mapper = upos2char if args.ts=="ud" else ut2char
14 |
15 |     for line in sys.stdin:
16 |         line = line.strip("\n")
17 |         if line=='': continue
18 |         mapped = [mapper[tag] for tag in line.split(" ") ]
19 |         print(" ".join(mapped))
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/src/marlin/basic/str.h:
--------------------------------------------------------------------------------
1 | #ifndef __STR_H__
2 | #define __STR_H__
3 |
4 | #include "stl-basic.h"
5 |
6 | string substr(const string &s, int i, int j);
7 | string substr(const string &s, int i);
8 |
9 | string str_printf(const char *fmt, ...);
10 | char *copy_str(const char *s);
11 | string int2str(int x);
12 | string double2str(double x);
13 |
14 | StringVec split(const char *str, const char *delims, bool keep_empty);
15 | StrVec mutate_split(char *str, const char *delims);
16 |
17 | char *trim(char *s);
18 | string tolower(const char *s);
19 |
20 | int index_of(const char *s, const char *t);
21 |
22 | #endif
23 |
--------------------------------------------------------------------------------
/src/code/label_dictionary.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import warnings
3 |
4 | class LabelDictionary(dict):
5 |     '''This class implements a dictionary of labels. Labels are mapped to
6 |     integers, and it is efficient to retrieve the label name from its
7 |     integer representation, and vice-versa.'''
8 |     def __init__(self, label_names=[]):
9 |         self.names = []
10 |         for name in label_names:
11 |             self.add(name)
12 |
13 |     def add(self, name):
14 |         if name in self:
15 |             return self[name]
16 |             #warnings.warn('Ignoring duplicated label ' + name)
17 |         label_id = len(self.names)
18 |         self[name] = label_id
19 |         self.names.append(name)
20 |         return label_id
21 |
22 |     def get_label_name(self, label_id):
23 |         return self.names[label_id]
24 |
25 |     def get_label_id(self, name):
26 |         return self[name]
27 |
--------------------------------------------------------------------------------
/src/marlin/basic/COPYRIGHT:
--------------------------------------------------------------------------------
1 | The code in basic/ is taken from:
2 | https://github.com/percyliang/brown-cluster
3 |
4 | The following copyright / usage agreement applies:
5 |
6 | (C) Copyright 2007-2012, Percy Liang
7 |
8 | Implementation of the Brown hierarchical word clustering algorithm.
9 | Percy Liang
10 | Release 1.3
11 | 2012.07.24
12 |
13 | http://cs.stanford.edu/~pliang
14 |
15 | Permission is granted for anyone to copy, use, or modify these programs and
16 | accompanying documents for purposes of research or education, provided this
17 | copyright notice is retained, and note is made of any changes that have been
18 | made.
19 |
20 | These programs and documents are distributed without any warranty, express or
21 | implied. As the programs were written for research purposes only, they have
22 | not been tested to the degree that would be advisable in any important
23 | application. All use of these programs is entirely at the user's own risk.
24 |
25 |
--------------------------------------------------------------------------------
/src/code/tag_with_clusters.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=50:00:00
3 | #SBATCH --partition=isi
4 |
5 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../.. >/dev/null 2>&1 && pwd )"
6 | BASELINE="brown"
7 | NCLUSTERS=500
8 | INPUT=""
9 | EXP_DIR="$BASEDIR/exp-cipher"
10 |
11 | while getopts "h?b:n:i:e:" opt; do
12 |     case "$opt" in
13 |     h|\?)
14 |         show_help
15 |         exit 0
16 |         ;;
17 |     b)  BASELINE=$OPTARG
18 |         ;;
19 |     n)  NCLUSTERS=$OPTARG
20 |         ;;
21 |     i)  INPUT=$OPTARG
22 |         ;;
23 |     e)
24 |         EXP_DIR=$OPTARG
25 |         ;;
26 |     esac
27 | done
28 |
29 | clt_out="$EXP_DIR/$BASELINE-$NCLUSTERS"
30 |
31 | if [ ! -f $clt_out/clt.mapper.pickle ]; then
32 |   python3 $BASEDIR/src/code/tag_text.py -i $INPUT \
33 |   -b $BASELINE -m train -c $clt_out/clusters.$BASELINE -op output -nc $NCLUSTERS
34 | else
35 |   python3 $BASEDIR/src/code/tag_text.py -i $INPUT \
36 |   -b $BASELINE -m eval -v $clt_out/clt.mapper.pickle -op output -nc $NCLUSTERS
37 | fi
38 |
--------------------------------------------------------------------------------
/src/marlin/README:
--------------------------------------------------------------------------------
1 | For more information visit http://cistern.cis.lmu.de/.
2 |
3 | Documentation can be found at https://github.com/muelletm/cistern/blob/wiki/marlin.md.
4 |
5 | (C) Copyright 2013-2015, Thomas Müller
6 |
7 | The code in basic/ is taken from:
8 | https://github.com/percyliang/brown-cluster
9 |
10 | The following copyright / usage agreement applies:
11 |
12 | (C) Copyright 2007-2012, Percy Liang
13 |
14 | Implementation of the Brown hierarchical word clustering algorithm.
15 | Percy Liang
16 | Release 1.3
17 | 2012.07.24
18 |
19 | http://cs.stanford.edu/~pliang
20 |
21 | Permission is granted for anyone to copy, use, or modify these programs and
22 | accompanying documents for purposes of research or education, provided this
23 | copyright notice is retained, and note is made of any changes that have been
24 | made.
25 |
26 | These programs and documents are distributed without any warranty, express or
27 | implied. As the programs were written for research purposes only, they have
28 | not been tested to the degree that would be advisable in any important
29 | application. All use of these programs is entirely at the user's own risk.
30 |
--------------------------------------------------------------------------------
/src/code/replace-unicode-punctuation.perl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | #
3 | # This file is part of moses. Its use is licensed under the GNU Lesser General
4 | # Public License version 2.1 or, at your option, any later version.
5 |
6 | use warnings;
7 | use strict;
8 | use utf8;
9 |
10 | binmode(STDIN, ":utf8");
11 | binmode(STDOUT, ":utf8");
12 |
13 | while(<STDIN>) {
14 |   s/，/,/g;
15 |   s/。 */. /g;
16 |   s/、/,/g;
17 |   s/”/"/g;
18 |   s/“/"/g;
19 |   s/¨/"/g;
20 |   s/[∶׃：]/:/g;
21 |   s/？/\?/g;
22 |   s/《/"/g;
23 |   s/》/"/g;
24 |   s/）/\)/g;
25 |   s/！/\!/g;
26 |   s/（/\(/g;
27 |   s/；/;/g;
28 |   s/１/"/g;
29 |   s/」/"/g;
30 |   s/「/"/g;
31 |   s/０/0/g;
32 |   s/３/3/g;
33 |   s/２/2/g;
34 |   s/５/5/g;
35 |   s/６/6/g;
36 |   s/９/9/g;
37 |   s/７/7/g;
38 |   s/８/8/g;
39 |   s/４/4/g;
40 |   s/． */. /g;
41 |   s/～/\~/g;
42 |   s/[’ʼ′`]/\'/g;
43 |   s/…/\.\.\./g;
44 |   s/━/\-/g;
45 |   s/─/-/g;
46 |   s/〈/\</g;
47 |   s/〉/\>/g;
49 |   s/【/\[/g;
50 |   s/】/\]/g;
51 |   s/％/\%/g;
52 |   s/¬/-/g;
53 |   s/×+/x/g;
54 |   s/≤/<=/g;
55 |   s/≥/>=/g;
56 |   s/≠/\!=/g;
57 |   s/→/-/g;
58 |   # s/[↔💪😍╥؟ヽ↓▯◘∞►◄♦°░✔▓⚛☻↑¤╰╮➖★♪♫™🏻👇†😭😥😎😢️│·‧·ェ•●▽❤♥💕♡☉¶§📌✌®╟╢╩╔╗╣╠╝╚═¸┈┉✽♈̷̴̶⌣̊┼╫♉▒▒┌┘└┐┘┌┴‾̲☐÷☆┬✰☰]+/\*/g;
59 |
60 |   print $_;
61 | }
62 |
--------------------------------------------------------------------------------
/src/marlin/log:
--------------------------------------------------------------------------------
1 | W: Number of words processed / total
2 | LL: Current log-likelihood
3 | Swaps: Number of words that changed their class
4 |
5 | iter: 0
6 | W: 1688 / 6755 LL: -389785 Swaps: 1495
7 | W: 3376 / 6755 LL: -385828 Swaps: 2981
8 | W: 5064 / 6755 LL: -382624 Swaps: 4499
9 | W: 6752 / 6755 LL: -379357 Swaps: 5997
10 | W: 6755 / 6755 LL: -379352 Swaps: 5999
11 | iter: 1
12 | W: 1688 / 6755 LL: -376172 Swaps: 449
13 | W: 3376 / 6755 LL: -375490 Swaps: 984
14 | W: 5064 / 6755 LL: -375067 Swaps: 1393
15 | W: 6752 / 6755 LL: -374700 Swaps: 1758
16 | W: 6755 / 6755 LL: -374700 Swaps: 1758
17 | iter: 2
18 | W: 1688 / 6755 LL: -374131 Swaps: 209
19 | W: 3376 / 6755 LL: -373951 Swaps: 479
20 | W: 5064 / 6755 LL: -373831 Swaps: 685
21 | W: 6752 / 6755 LL: -373683 Swaps: 866
22 | W: 6755 / 6755 LL: -373683 Swaps: 866
23 | iter: 3
24 | W: 1688 / 6755 LL: -373387 Swaps: 126
25 | W: 3376 / 6755 LL: -373284 Swaps: 286
26 | W: 5064 / 6755 LL: -373191 Swaps: 412
27 | W: 6752 / 6755 LL: -373104 Swaps: 514
28 | W: 6755 / 6755 LL: -373104 Swaps: 514
29 | iter: 4
30 | W: 1688 / 6755 LL: -373060 Swaps: 50
31 | W: 3376 / 6755 LL: -373031 Swaps: 101
32 | W: 5064 / 6755 LL: -373011 Swaps: 128
33 | W: 6752 / 6755 LL: -372996 Swaps: 148
34 | W: 6755 / 6755 LL: -372996 Swaps: 148
35 |
--------------------------------------------------------------------------------
/src/code/preprocess.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # preprocessing data
4 |
5 | set -e
6 |
7 | INPUT=""
8 | ROM="false"
9 | MODE="test"
10 | IL="en"
11 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../.. >/dev/null 2>&1 && pwd )"
12 | EXP_DIR="$BASEDIR/exp-cipher"
13 |
14 | while [ $# -gt 1 ]
15 | do
16 | key="$1"
17 | case $key in
18 |     -i|--input)
19 |     INPUT="$2"
20 |     shift # past argument
21 |     ;;
22 |     -rom|--rom)
23 |     ROM="$2"
24 |     shift # past argument
25 |     ;;
26 |     -m|--mode)
27 |     MODE="$2"
28 |     shift # past argument
29 |     ;;
30 |     -l|--lang)
31 |     IL="$2"
32 |     shift # past argument
33 |     ;;
34 |     -exp|--exp_dir)
35 |     EXP_DIR="$2"
36 |     shift # past argument
37 |     ;;
38 |     *)
39 |     # unknown option
40 |     ;;
41 | esac
42 | shift
43 | done
44 |
45 | datadir="$EXP_DIR/data"
46 |
47 | # romanize if needed
48 | if [ $ROM = "true" ]; then
49 |   echo "romanizing"
50 |   src/uroman/bin/uroman.pl < $INPUT > "$INPUT".roman
51 | else
52 |   cp $INPUT "$INPUT".roman
53 | fi
54 |
55 | echo "cleaning/filtering..."
56 | #normalize / filter noise
57 |
58 | src/code/replace-unicode-punctuation.perl < "$INPUT".roman > $datadir/$IL.clean
59 |
60 | if [ $MODE = "train" ]; then
61 |   python3 src/code/filter_lowfreq.py -i $datadir/$IL.clean -m train -ig
62 |   mv $datadir/vocab $datadir/vocab.$IL
63 | else
64 |   python3 src/code/filter_lowfreq.py -i $datadir/$IL.clean -m eval -t 1 -v $datadir/vocab.$IL
65 | fi
66 |
67 |
--------------------------------------------------------------------------------
/src/code/conllu2txt.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from utils import *
3 | import pdb
4 | import argparse
5 |
6 |
7 | if __name__ == "__main__":
8 |     parser = argparse.ArgumentParser()
9 |     parser.add_argument("--input","-i", type=str, default=None, help="input conllu file")
10 |     parser.add_argument("--mode" ,"-m", type=str, default="ch", help="Tag mode [ch,tag]")
11 |     parser.add_argument("--col" ,"-c", type=int, default=1, help="Column to extract [0-9]")
12 |     parser.add_argument("--tb" ,"-tb", type=str, default="ud", help="Treebank name [ud,ut]")
13 |     parser.add_argument("--lid","-lid", action='store_true', help="Keep lang_id from text")
14 |
15 |     args = parser.parse_args()
16 |
17 |     text=""
18 |     idx = args.col
19 |     mode = args.mode
20 |
21 |     mapper = upos2char if args.tb=="ud" else ut2char
22 |
23 |     count = 1
24 |
25 |     for line in open(args.input,'r'):
26 |         line = line.strip("\n")
27 |         if line=='': continue
28 |         cols = line.split('\t')
29 |         if cols[0]=="1" and text!='':
30 |             print(text.strip(' '))
31 |             text=''
32 |             count = 1
33 |         token = ''
34 |         datum = cols[idx]
35 |         if idx==1 and not args.lid:
36 |             datum = datum[:-3]
37 |
38 |         if mode=='ch' and idx==3:
39 |             datum = mapper[datum]
40 |
41 |         token = datum.strip(' ')
42 |
43 |         all_dig = True
44 |         for sw in token.split(' '):
45 |             if not sw.isdigit():
46 |                 all_dig=False
47 |                 break
48 |         token = token.replace(" ","") if all_dig else token.replace(" ","_")
49 |         text += " "+token
50 |
51 |         # print("|",token,"|")
52 |         count += 1
53 |
54 |     print(text.strip(' '))
55 |
56 |
57 |
--------------------------------------------------------------------------------
/src/code/arpa2wfst.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ## Transforms ARPA lm file into WFST format using Carmel
4 |
5 | set -e
6 |
7 | INPUT="arpa.lang"  # arpa formatted input file
8 | LAN_CODE="en"      # language code for name-formatting purposes
9 | ORDER=2            # LM order for name-formatting purposes
CARMEL_DIR="/usr/local" 11 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../.. >/dev/null 2>&1 && pwd )" 12 | DATADIR=$BASEDIR/data 13 | CODE_DIR="$BASEDIR/src/code" 14 | 15 | while [ $# -gt 1 ] 16 | do 17 | key="$1" 18 | case $key in 19 | -i|--input) 20 | INPUT="$2" 21 | shift # past argument 22 | ;; 23 | -l|--lang) 24 | LAN_CODE="$2" 25 | shift # past argument 26 | ;; 27 | -o|--order) 28 | ORDER="$2" 29 | shift # past argument 30 | ;; 31 | -c|--carmel) 32 | CARMEL_DIR="$2" 33 | shift # past argument 34 | ;; 35 | *) 36 | # unknown option 37 | ;; 38 | esac 39 | shift 40 | done 41 | 42 | 43 | INPUT=$(readlink -f $INPUT) 44 | 45 | cd $CODE_DIR 46 | g++ makelmfsa.cpp -o makelmfsa 47 | g++ makelmfsa_x.cpp -o makelmfsa_x 48 | 49 | 50 | # create fsa/fst 51 | ./makelmfsa $INPUT 52 | # ./makelmfsa_x $basedir/lms/$lang.$order.lm 53 | 54 | $CARMEL_DIR/bin/carmel -n $INPUT.wfsa \ 55 | > $INPUT.norm 56 | 57 | # prepare Viterbi decoding 58 | $CARMEL_DIR/bin/carmel --project-right --project-identity-fsa -HJ $INPUT.wfsa \ 59 | > $INPUT.fsa.noe 60 | 61 | if [ "$INPUT.wfsa" != "$BASEDIR/lms/$LAN_CODE.$ORDER.lm.wfsa" ]; then 62 | mv $INPUT.wfsa $BASEDIR/lms/$LAN_CODE.$ORDER.lm.wfsa 63 | mv $INPUT.norm $BASEDIR/lms/$LAN_CODE.$ORDER.lm.norm 64 | mv $INPUT.fsa.noe $BASEDIR/lms/$LAN_CODE.$ORDER.fsa.noe 65 | fi 66 | 67 | rm makelmfsa makelmfsa_x -------------------------------------------------------------------------------- /src/code/run_clustering.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # POSIX variable 6 | OPTIND=1 7 | 8 | NCPUS=4 9 | BASELINE="brown" # clark, anchor, emb-loc-mon 10 | NCLUSTERS=500 11 | 12 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../.. >/dev/null 2>&1 && pwd )" 13 | EXP_DIR="$BASEDIR/exp-cipher" 14 | 15 | while [ $# -gt 1 ] 16 | do 17 | key="$1" 18 | case $key in 19 | -i|--input) 20 | INPUT="$2" 21 | shift # past argument 22 | ;; 23 | -b|--b|--baseline) 24 | BASELINE="$2" 25 | shift # past argument 26 | ;; 27 | -nj|--njobs) 28 | NCPUS="$2" 29 | shift # past argument 30 | ;; 31 | -nc|--nclusters) 32 | NCLUSTERS="$2" 33 | shift # past argument 34 | ;; 35 | -exp|--exp_dir) 36 | EXP_DIR="$2" 37 | shift # past argument 38 | ;; 39 | *) 40 | # unknown option 41 | ;; 42 | esac 43 | shift 44 | done 45 | 46 | 47 | clt_out="$EXP_DIR/$BASELINE-$NCLUSTERS" 48 | mkdir -p $clt_out 49 | 50 | 51 | ## run clustering algorithm 52 | 53 | if [ $BASELINE = "brown" ]; then 54 | if [ ! -d "$BASEDIR/src/brown" ]; then 55 | ln -s "$BASEDIR/src/brown-cluster" "$BASEDIR/src/brown" 56 | fi 57 | 58 | src/brown/wcluster --text $INPUT \ 59 | --threads $NCPUS --c $NCLUSTERS --rand 42 \ 60 | --output_dir $clt_out 61 | mv $clt_out/paths $clt_out/clusters.brown 62 | fi 63 | 64 | 65 | if [ $BASELINE = "marlin" ]; then 66 | src/marlin/marlin_count --text $INPUT \ 67 | --bigrams $clt_out/bigrams --words $clt_out/words --rank-limit -1 68 | 69 | $model_dir/marlin_cluster --bigrams $clt_out/bigrams --words $clt_out/words \ 70 | --output $clt_out/clusters --rand 42 --c $NCLUSTERS --alpha 0.0 2> $clt_out/log 71 | fi 72 | 73 | -------------------------------------------------------------------------------- /src/code/decode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | LM="lm_fst_file" 5 | CHANNEL="" 6 | INPUT="" 7 | MAX_LINES=20000 8 | DEC_CH="False" 9 | W_LM=1 10 | W_CM=1 11 | 12 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../.. 
>/dev/null 2>&1 && pwd )" 13 | CODE_DIR="$BASEDIR/src/code" 14 | 15 | while [ $# -gt 1 ] 16 | do 17 | key="$1" 18 | case $key in 19 | -lm|--lm) 20 | LM="$2" 21 | shift # past argument 22 | ;; 23 | -ch|--channel) 24 | CHANNEL="$2" 25 | shift # past argument 26 | ;; 27 | -i|--input) 28 | INPUT="$2" 29 | shift # past argument 30 | ;; 31 | -max|--max) 32 | MAX_LINES="$2" 33 | shift # past argument 34 | ;; 35 | -wlm|--wlm) 36 | W_LM="$2" 37 | shift # past argument 38 | ;; 39 | -wcm|--wcm) 40 | W_CM="$2" 41 | shift # past argument 42 | ;; 43 | *) 44 | # unknown option 45 | ;; 46 | esac 47 | shift 48 | done 49 | 50 | 51 | carmel="$CARMEL_DIR/bin/carmel" 52 | data_dir="$EXP_DIR/data" 53 | basedir="build" 54 | 55 | mkdir -p $EXP_DIR/models/ 56 | 57 | awk 'NF>0' $INPUT > $EXP_DIR/logs/$CHANNEL.temp.noe.cmpl 58 | mv $EXP_DIR/logs/$CHANNEL.temp.noe.cmpl $EXP_DIR/logs/$CHANNEL.temp.noe 59 | 60 | # decipher with Viterbi decoding 61 | # head -10 $EXP_DIR/logs/$CHANNEL.temp.noe | \ 62 | if [ $W_LM = 0 ]; then 63 | cat $EXP_DIR/logs/$CHANNEL.temp.noe | \ 64 | $carmel -qbsriWIEk 1 --exponents=$W_CM,1 \ 65 | $EXP_DIR/models/$CHANNEL \ 66 | > $EXP_DIR/logs/$CHANNEL.$W_LM.$W_CM.temp.dec 2> $EXP_DIR/logs/$CHANNEL.$W_LM.$W_CM.dec 67 | else 68 | cat $EXP_DIR/logs/$CHANNEL.temp.noe | \ 69 | $carmel -qbsriWIEk 1 --exponents=$W_LM,$W_CM,1 \ 70 | $LM $EXP_DIR/models/$CHANNEL \ 71 | > $EXP_DIR/logs/$CHANNEL.$W_LM.$W_CM.temp.dec 2> $EXP_DIR/logs/$CHANNEL.$W_LM.$W_CM.dec 72 | fi 73 | 74 | python3 $CODE_DIR/clean_map_decode.py $EXP_DIR/logs/$CHANNEL.$W_LM.$W_CM.temp.dec $INPUT.$CHANNEL.$W_LM.$W_CM.decoded 75 | rm $EXP_DIR/logs/$CHANNEL.$W_LM.$W_CM.temp.dec 76 | rm $EXP_DIR/logs/$CHANNEL.temp.noe 77 | -------------------------------------------------------------------------------- /src/code/train_combined_lm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | 6 | ORDER=2 7 | TAGSET="ud" # tagset code [ud,ut] 8 | LAN_CODES="en" 9 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../.. 
>/dev/null 2>&1 && pwd )" 10 | DATADIR=$BASEDIR/lm_data 11 | CODE_DIR="$BASEDIR/src/code" 12 | EXP_DIR="$BASEDIR/exp-cipher" 13 | 14 | if [ -z "$CARMEL_DIR" ]; then 15 | CARMEL_DIR="/usr/local" 16 | fi 17 | if [ -z "$SRILM_DIR" ]; then 18 | SRILM_DIR="$HOME/srilm-1.7.2" 19 | fi 20 | 21 | 22 | while [ $# -gt 1 ] 23 | do 24 | key="$1" 25 | case $key in 26 | -ts|--tagset) 27 | TAGSET="$2" 28 | shift # past argument 29 | ;; 30 | -ord|--order) 31 | ORDER="$2" 32 | shift # past argument 33 | ;; 34 | -l|--lang) 35 | LAN_CODES="$2" 36 | shift # past argument 37 | ;; 38 | -sri|--sridir) 39 | SRILM_DIR="$2" 40 | shift # past argument 41 | ;; 42 | -c|--carmel) 43 | CARMEL_DIR="$2" 44 | shift # past argument 45 | ;; 46 | -exp|--exp_dir) 47 | EXP_DIR="$2" 48 | shift # past argument 49 | ;; 50 | *) 51 | # unknown option 52 | ;; 53 | esac 54 | shift 55 | done 56 | 57 | export CARMEL_DIR=$CARMEL_DIR 58 | export SRILM_DIR=$SRILM_DIR 59 | 60 | LAN_CODES=(${LAN_CODES//,/ }) 61 | mkdir -p $EXP_DIR/lm 62 | 63 | cd $CODE_DIR 64 | g++ makelmfsa.cpp -o makelmfsa 65 | g++ makelmfsa_x.cpp -o makelmfsa_x 66 | 67 | 68 | echo "" > temp_accum 69 | for lang in "${LAN_CODES[@]}"; do 70 | cat $DATADIR/$lang/train.upos.ch >> temp_accum 71 | done 72 | 73 | #-addsmooth -kn \ 74 | $SRILM_DIR/bin/i686-m64/ngram-count -text temp_accum -order $ORDER \ 75 | -addsmooth 1 \ 76 | -lm $EXP_DIR/lm/comb.$ORDER.lm 77 | grep -vP "^$" < $EXP_DIR/lm/comb.$ORDER.lm > temp 78 | mv temp $EXP_DIR/lm/comb.$ORDER.lm 79 | 80 | # create fsa/fst 81 | ./makelmfsa $EXP_DIR/lm/comb.$ORDER.lm 82 | # ./makelmfsa_x $basedir/lms/$lang.$ORDER.lm 83 | 84 | $CARMEL_DIR/bin/carmel -n $EXP_DIR/lm/comb.$ORDER.lm.wfsa \ 85 | > $EXP_DIR/lm/comb.$ORDER.lm.norm 86 | 87 | # prepare Viterbi decoding 88 | $CARMEL_DIR/bin/carmel --project-right --project-identity-fsa -HJ $EXP_DIR/lm/comb.$ORDER.lm.wfsa \ 89 | > $EXP_DIR/lm/comb.$ORDER.fsa.noe 90 | 91 | 92 | rm makelmfsa makelmfsa_x -------------------------------------------------------------------------------- /train_srilm_langmodel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | ORDER=2 6 | INPUT="file.pos" 7 | TAGSET="ud" # tagset code [ud,ut] 8 | LAN_CODE="en" 9 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 10 | DATADIR=$BASEDIR/lm_data 11 | CODE_DIR="$BASEDIR/src/code" 12 | 13 | if [ -z "$CARMEL_DIR"]; then 14 | CARMEL_DIR="/usr/local" 15 | fi 16 | if [ -z "$SRILM_DIR" ]; then 17 | SRILM_DIR="$HOME/srilm-1.7.2" 18 | fi 19 | 20 | while [ $# -gt 1 ] 21 | do 22 | key="$1" 23 | case $key in 24 | -i|--input) 25 | INPUT="$2" 26 | shift # past argument 27 | ;; 28 | -ts|--tagset) 29 | TAGSET="$2" 30 | shift # past argument 31 | ;; 32 | -ord|--order) 33 | ORDER="$2" 34 | shift # past argument 35 | ;; 36 | -l|--lang) 37 | LAN_CODE="$2" 38 | shift # past argument 39 | ;; 40 | -sri|--sridir) 41 | SRILM_DIR="$2" 42 | shift # past argument 43 | ;; 44 | -carmel|--carmel) 45 | CARMEL_DIR="$2" 46 | shift # past argument 47 | ;; 48 | *) 49 | # unknown option 50 | ;; 51 | esac 52 | shift 53 | done 54 | 55 | export CARMEL_DIR=$CARMEL_DIR 56 | export SRILM_DIR=$SRILM_DIR 57 | 58 | cd $CODE_DIR 59 | 60 | g++ makelmfsa.cpp -o makelmfsa 61 | g++ makelmfsa_x.cpp -o makelmfsa_x 62 | 63 | 64 | echo "" 65 | echo "Lang: $LAN_CODE" 66 | mkdir -p $BASEDIR/lms/ 67 | mkdir -p $DATADIR/$LAN_CODE 68 | 69 | cp $INPUT $DATADIR/$LAN_CODE/train.upos 70 | python3 src/code/pos2char.py -ts $TAGSET < $INPUT > $DATADIR/$LAN_CODE/train.upos.ch 71 | 72 | 
72 | # run LM
73 | #-addsmooth -kn \
74 | $SRILM_DIR/bin/i686-m64/ngram-count -text $DATADIR/$LAN_CODE/train.upos.ch -order $ORDER \
75 | -addsmooth 1 \
76 | -lm $BASEDIR/lms/$LAN_CODE.$ORDER.lm
77 | grep -vP "^$" < $BASEDIR/lms/$LAN_CODE.$ORDER.lm > temp
78 | mv temp $BASEDIR/lms/$LAN_CODE.$ORDER.lm
79 |
80 |
81 | # create fsa/fst
82 | ./makelmfsa $BASEDIR/lms/$LAN_CODE.$ORDER.lm
83 |
84 | $CARMEL_DIR/bin/carmel -n $BASEDIR/lms/$LAN_CODE.$ORDER.lm.wfsa \
85 | > $BASEDIR/lms/$LAN_CODE.$ORDER.lm.norm
86 |
87 | # prepare Viterbi decoding
88 | $CARMEL_DIR/bin/carmel --project-right --project-identity-fsa -HJ $BASEDIR/lms/$LAN_CODE.$ORDER.lm.wfsa \
89 | > $BASEDIR/lms/$LAN_CODE.$ORDER.fsa.noe
--------------------------------------------------------------------------------
/src/marlin/basic/str.cc:
--------------------------------------------------------------------------------
1 | #include "stl-basic.h"
2 | #include <stdarg.h>
3 |
4 | string substr(const string &s, int i, int j) {
5 |   if(i < 0) i += len(s);
6 |   if(j < 0) j += len(s);
7 |   i = max(i, 0);
8 |   j = max(j, i);
9 |   return s.substr(i, j-i);
10 | }
11 | string substr(const string &s, int i) {
12 |   return substr(s, i, len(s));
13 | }
14 |
15 | string str_printf(const char *fmt, ...) {
16 |   char buf[16384];
17 |   va_list ap;
18 |   va_start(ap, fmt);
19 |   vsnprintf(buf, sizeof(buf), fmt, ap);
20 |   va_end(ap);
21 |   return buf;
22 | }
23 |
24 | char *copy_str(const char *s) {
25 |   char *t = new char[strlen(s)+1];
26 |   strcpy(t, s);
27 |   return t;
28 | }
29 |
30 | string int2str(int x) {
31 |   return str_printf("%d", x);
32 | }
33 |
34 | string double2str(double x) {
35 |   ostringstream os;
36 |   os << x;
37 |   return os.str();
38 | }
39 |
40 | StringVec split(const char *str, const char *delims, bool keep_empty) {
41 |   StringVec vec; // Store the result.
42 |   // Build quick lookup table.
43 |   BoolVec is_delim(256);
44 |   for(const char *p = delims; *p; p++) is_delim[*p] = true;
45 |   is_delim['\0'] = true;
46 |
47 |   const char *end = str;
48 |   while(true) {
49 |     if(is_delim[*end]) {
50 |       if(keep_empty || end-str > 0) // Extract token.
51 |         vec.push_back(string(str, end-str));
52 |       str = end+1;
53 |     }
54 |     if(!*end) break;
55 |     end++;
56 |   }
57 |   return vec;
58 | }
59 |
60 | StrVec mutate_split(char *str, const char *delims) {
61 |   StrVec vec;
62 |   for(char *p = strtok(str, delims); p; p = strtok(NULL, delims))
63 |     vec.push_back(p);
64 |   return vec;
65 | }
66 |
67 | // Remove leading and trailing white space.
68 | char *trim(char *s) {
69 |   // Removing leading spaces.
70 |   while(*s && isspace(*s)) s++;
71 |
72 |   // Remove trailing spaces.
73 |   char *t;
74 |   for(t = s+strlen(s)-1; t != s && isspace(*t); t--);
75 |   t[1] = '\0';
76 |   return s;
77 | }
78 |
79 | string tolower(const char *s) {
80 |   string t = s;
81 |   foridx(i, len(t)) t[i] = tolower(t[i]);
82 |   return t;
83 | }
84 |
85 | // String matching with brute force.
86 | int index_of(const char *s, const char *t) {
87 |   int ns = strlen(s), nt = strlen(t);
88 |   foridx(i, ns-nt+1)
89 |     if(strncmp(s+i, t, nt) == 0) return i;
90 |   return -1;
91 | }
92 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # UTagger: A Grounded Unsupervised Universal Part-of-Speech Tagger for Low-Resource Languages
2 |
3 | This repository contains the code necessary to reproduce the results in the paper:
4 |
5 | [1] A Grounded Unsupervised Universal Part-of-Speech Tagger for Low-Resource Languages.
6 | Ronald Cardenas, Ying Lin, Heng Ji and Jonathan May. NAACL 2019, Minneapolis, USA.
7 |
8 |
9 |
10 | ## Requirements
11 |
12 | Training of Language Models is done with [SRILM v1.7.2](http://www.speech.sri.com/projects/srilm/download.html). However, any library that produces an ARPA file can be used.
13 |
14 | For FST manipulation and cipher training, we use [Carmel](https://github.com/isi-nlp/carmel) (included as a submodule).
15 |
16 | You will also need:
17 | * Python 3.6
18 | * NumPy >= 1.16.2
19 | * SciPy >= 1.2.1
20 | * lxml >= 4.3.3
21 |
22 |
23 | ## Setup
24 |
25 | Initialize the submodules as follows:
26 |
27 | ```
28 | git submodule update --init --recursive
29 | ```
30 |
31 | Then build the code in `src/carmel`, `src/brown-cluster`, and `src/marlin` (see each folder's README file for reference).
32 |
33 | ## Using UTagger
34 |
35 |
36 | 0. Extract POS annotations from [UniversalDependencies](http://universaldependencies.org) treebanks
37 |
38 | If you want to use annotations from UD treebanks, you can extract the POS sequences by running
39 |
40 | ```
41 | ./setup_ud-treebank_data.sh -td <ud-treebanks-dir>
42 | ```
43 |
44 | This will extract only the POS tags of CONLLU train files for languages experimented with in [1].
45 |
46 |
47 | 1. Train POS language models
48 |
49 | * From UD data
50 |
51 | Training for several languages can be done by listing the iso-639-1 code of each language separated by commas. For instance, to train second order LMs for English and German, run:
52 |
53 | ```
54 | ./train_format_lm_ud.sh -l en,de -ord 2
55 | ```
56 |
57 | * From POS token sequences (one sentence per line)
58 |
59 | ```
60 | ./train_srilm_langmodel.sh -i <pos_file> -ord <order>
61 | ```
62 |
63 | * Reformatting an already trained LM in ARPA format
64 |
65 | Further down in the pipeline, Carmel reads trained language models in OpenFST format. Reformat an ARPA file as follows:
66 |
67 | ```
68 | ./src/code/arpa2wfst.sh -i <arpa_file> -l <lang_code> -o <order>
69 | ```
70 |
71 | 2. Train UTagger
72 |
73 |
74 | ```
75 | ./utagger -i sample.in -if txt -m train \
76 |           -lm_o 2 -pl en,de -ca brown -nc 500
77 | ```
78 |
79 |
80 | 3. Tag / eval
81 |
82 | ```
83 | ./utagger -i sample.in -if txt -m tag \
84 |           -lm_o 2 -pl en,de -ca brown -nc 500
85 | ```
86 |
87 |
--------------------------------------------------------------------------------
/src/code/train_channel.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 |
6 | NC=50
7 | ORD=3
8 | RL="en"
9 | IL="en"
10 | ID=1
11 | SEED=42
12 | ITERS=10
13 | IS_ELISA="-"
14 | CA="" # clustering algorithm
15 |
16 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../.. >/dev/null 2>&1 && pwd )"
>/dev/null 2>&1 && pwd )" 17 | CODE_DIR="$BASEDIR/src/code" 18 | # CARMEL_DIR defined above pipeline 19 | 20 | 21 | echo $CODE_DIR 22 | 23 | 24 | while [ $# -gt 1 ] 25 | do 26 | key="$1" 27 | case $key in 28 | -c|--nc|--num_clusters) 29 | NC="$2" 30 | shift # past argument 31 | ;; 32 | -o|--order) 33 | ORD="$2" 34 | shift # past argument 35 | ;; 36 | -rl) 37 | RL="$2" 38 | shift # past argument 39 | ;; 40 | -il) 41 | IL="$2" 42 | shift # past argument 43 | ;; 44 | -id) 45 | ID="$2" 46 | shift # past argument 47 | ;; 48 | -s|--seed) 49 | SEED="$2" 50 | shift # past argument 51 | ;; 52 | -it|--iters) 53 | ITERS="$2" 54 | shift # past argument 55 | ;; 56 | -elisa|--elisa) 57 | IS_ELISA="$2" 58 | shift # past argument 59 | ;; 60 | -ca|--ca) 61 | CA="$2" 62 | shift # past argument 63 | ;; 64 | -exp|--exp_dir) 65 | EXP_DIR="$2" 66 | shift # past argument 67 | ;; 68 | *) 69 | # unknown option 70 | ;; 71 | esac 72 | shift 73 | done 74 | 75 | 76 | carmel="$CARMEL_DIR/bin/carmel" 77 | data_dir="$EXP_DIR/data" 78 | 79 | channel="$RL$ORD-$IL.$CA.$NC.$ITERS.$ID" 80 | 81 | bash $CODE_DIR/create_fst.sh $NC > $EXP_DIR/models/$channel 82 | 83 | $carmel -1 -R $SEED $EXP_DIR/models/$channel > $EXP_DIR/models/$channel.rnd 84 | $carmel -HJn $EXP_DIR/models/$channel.rnd > $EXP_DIR/models/$channel.norm 85 | rm $EXP_DIR/models/$channel.rnd 86 | 87 | cp $BASEDIR/lms/$RL.$ORD.lm.wfsa $EXP_DIR/models/$channel.lm 88 | 89 | # head -10 $data_dir/$IL/$train_pref.$NC.$CA.carmel > $EXP_DIR/logs/$channel.in 90 | # $EXP_DIR/logs/$channel.in $EXP_DIR/models/$channel.lm \ 91 | 92 | echo ":: $data_dir/output.$NC.$CA.carmel.10k" 93 | echo ":: $channel" 94 | 95 | # # train the channel model 96 | $carmel --train-cascade -HJa -1 -M $ITERS -R $SEED -X 0.999999 \ 97 | $data_dir/output.$NC.$CA.carmel.10k \ 98 | $EXP_DIR/models/$channel.lm $EXP_DIR/models/$channel.norm \ 99 | 2> $EXP_DIR/logs/$channel 100 | 101 | mv $EXP_DIR/models/$channel.norm.trained $EXP_DIR/models/$channel 102 | 103 | #rm $EXP_DIR/logs/$channel.in 104 | rm $EXP_DIR/models/$channel.norm \ 105 | $EXP_DIR/models/$channel.lm $EXP_DIR/models/$channel.lm.trained 106 | -------------------------------------------------------------------------------- /train_format_lm_ud.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | UD_DIR="$HOME/ud-treebanks-v2.2" 6 | ORDER=2 7 | TAGSET="ud" # tagset code [ud,ut] 8 | LAN_CODES="en" 9 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 10 | DATADIR=$BASEDIR/lm_data 11 | CODE_DIR="$BASEDIR/src/code" 12 | 13 | 14 | if [ -z "$CARMEL_DIR"]; then 15 | CARMEL_DIR="/usr/local" 16 | fi 17 | if [ -z "$SRILM_DIR" ]; then 18 | SRILM_DIR="$HOME/srilm-1.7.2" 19 | fi 20 | 21 | 22 | while [ $# -gt 1 ] 23 | do 24 | key="$1" 25 | case $key in 26 | -td|--tb_dir) 27 | UD_DIR="$2" 28 | shift # past argument 29 | ;; 30 | -ts|--tagset) 31 | TAGSET="$2" 32 | shift # past argument 33 | ;; 34 | -ord|--order) 35 | ORDER="$2" 36 | shift # past argument 37 | ;; 38 | -l|--lang) 39 | LAN_CODES="$2" 40 | shift # past argument 41 | ;; 42 | -sri|--sridir) 43 | SRILM_DIR="$2" 44 | shift # past argument 45 | ;; 46 | -carmel|--carmel) 47 | CARMEL_DIR="$2" 48 | shift # past argument 49 | ;; 50 | *) 51 | # unknown option 52 | ;; 53 | esac 54 | shift 55 | done 56 | 57 | export CARMEL_DIR=$CARMEL_DIR 58 | export SRILM_DIR=$SRILM_DIR 59 | 60 | cd $CODE_DIR 61 | 62 | g++ makelmfsa.cpp -o makelmfsa 63 | g++ makelmfsa_x.cpp -o makelmfsa_x 64 | 65 | 66 | LAN_CODES=(${LAN_CODES//,/ }) 
67 |
68 | for lang in "${LAN_CODES[@]}"; do
69 |   echo ""
70 |   echo "Lang: $lang"
71 |   mkdir -p $BASEDIR/lms/
72 |   mkdir -p $DATADIR/$lang
73 |
74 |   suf=""
75 |   if [ $lang = "ja" ]||[ $lang = "ar" ]; then
76 |     suf=".all"
77 |   fi
78 |
79 |   python3 conllu2txt.py -i $DATADIR/$lang/train.conllu$suf \
80 |   -m ch -c 3 -tb $TAGSET > $DATADIR/$lang/train.upos.ch
81 |
82 |   python3 conllu2txt.py -i $DATADIR/$lang/train.conllu$suf \
83 |   -m tag -c 3 -tb $TAGSET > $DATADIR/$lang/train.upos
84 |
85 |   # run LM
86 |   #-addsmooth -kn \
87 |   $SRILM_DIR/bin/i686-m64/ngram-count -text $DATADIR/$lang/train.upos.ch -order $ORDER \
88 |   -addsmooth 1 \
89 |   -lm $BASEDIR/lms/$lang.$ORDER.lm
90 |   grep -vP "^$" < $BASEDIR/lms/$lang.$ORDER.lm > temp
91 |   mv temp $BASEDIR/lms/$lang.$ORDER.lm
92 |
93 |   # create fsa/fst
94 |   ./makelmfsa $BASEDIR/lms/$lang.$ORDER.lm
95 |   # ./makelmfsa_x $basedir/lms/$lang.$ORDER.lm
96 |
97 |   $CARMEL_DIR/bin/carmel -n $BASEDIR/lms/$lang.$ORDER.lm.wfsa \
98 |   > $BASEDIR/lms/$lang.$ORDER.lm.norm
99 |
100 |   # prepare Viterbi decoding
101 |   $CARMEL_DIR/bin/carmel --project-right --project-identity-fsa -HJ $BASEDIR/lms/$lang.$ORDER.lm.wfsa \
102 |   > $BASEDIR/lms/$lang.$ORDER.fsa.noe
103 |
104 | done
105 |
106 |
107 | rm makelmfsa makelmfsa_x
--------------------------------------------------------------------------------
/src/marlin/basic/std.cc:
--------------------------------------------------------------------------------
1 | #include <unistd.h>
2 | #include <sys/stat.h>
3 | #include <dirent.h>
4 | #include "std.h"
5 | #include "str.h"
6 |
7 | // Return the current date/time.
8 | string now() {
9 |   time_t t = time(NULL);
10 |   return substr(ctime(&t), 0, -1);
11 | }
12 |
13 | string hostname() {
14 |   char buf[1024];
15 |   gethostname(buf, sizeof(buf));
16 |   return buf;
17 | }
18 |
19 | // Return the amount of memory (kB) used by this process
20 | int mem_usage() {
21 |   ifstream in("/proc/self/status");
22 |   if(!in) return 0;
23 |   char buf[1024];
24 |   static const char *key = "VmRSS";
25 |
26 |   while(in.getline(buf, sizeof(buf))) {
27 |     if(strncmp(buf, key, strlen(key)) != 0) continue;
28 |     char *s = strchr(buf, ':');
29 |     if(!s) return 0;
30 |     int x;
31 |     sscanf(s+1, "%d", &x);
32 |     return x;
33 |   }
34 |   return -1;
35 | }
36 |
37 | // Return whether the file exists.
38 | bool file_exists(const char *file) {
39 |   return access(file, F_OK) == 0;
40 | }
41 |
42 | // Create an empty file. Return success.
43 | bool create_file(const char *file) {
44 |   ofstream out(file);
45 |   if(!out) return false;
46 |   out.close();
47 |   return true;
48 | }
49 |
50 | time_t file_modified_time(const char *file) {
51 |   struct stat stat_buf;
52 |   if(stat(file, &stat_buf) != 0)
53 |     return 0;
54 |   return stat_buf.st_mtime;
55 | }
56 |
57 | // Return the cpu speed in MHz.
58 | int cpu_speed_mhz() {
59 |   ifstream in("/proc/cpuinfo");
60 |   if(!in) return 0;
61 |   char buf[1024];
62 |   static const char *key = "cpu MHz";
63 |
64 |   while(in.getline(buf, sizeof(buf))) {
65 |     if(strncmp(buf, key, strlen(key)) != 0) continue;
66 |     char *s = strchr(buf, ':');
67 |     if(!s) return 0;
68 |     double x;
69 |     sscanf(s+1, "%lf", &x);
70 |     return (int)x;
71 |   }
72 |   return 0;
73 | }
74 |
75 | // "file" -> "file"
76 | // "dir/file" -> "file"
77 | string strip_dir(string s) {
78 |   return substr(s, s.rfind('/')+1);
79 | }
80 |
81 | // "file" -> "file"
82 | // "dir/file" -> "dir"
83 | string get_dir(string s) {
84 |   int i = s.rfind('/');
85 |   return i == -1 ? "." : substr(s, 0, s.rfind('/'));
86 | }
87 |
88 | // "base" -> "base"
89 | // "base.ext" -> "base"
90 | string file_base(string s) {
91 |   int i = s.rfind('.');
92 |   return i == -1 ? s : substr(s, 0, i);
93 | }
94 |
95 | bool get_files_in_dir(string dirname, bool fullpath, vector<string> &files) {
96 |   DIR *dir = opendir(dirname.c_str());
97 |   if(!dir) return false;
98 |   while(true) {
99 |     dirent *ent = readdir(dir);
100 |     if(!ent) break;
101 |     // For some reason, sometimes files show up as d_type == DT_UNKNOWN, I
102 |     // think due to AFS issues
103 |     //cout << "FFF " << ent->d_name << ' ' << (int)ent->d_type << endl;
104 |     if(ent->d_type != DT_DIR) {
105 |       files.push_back((fullpath ? dirname+"/" : string()) + ent->d_name);
106 |     }
107 |   }
108 |   closedir(dir);
109 |   return true;
110 | }
111 |
--------------------------------------------------------------------------------
/src/code/combine_channels.py:
--------------------------------------------------------------------------------
1 | """
2 | Combine channel tables
3 | """
4 |
5 | import os,sys
6 | import argparse
7 | import pdb
8 | import numpy as np
9 | import subprocess as sp
10 | from utils import *
11 | from collections import defaultdict
12 | from label_dictionary import LabelDictionary
13 | from multiprocessing import Pool
14 | import re
15 |
16 | import warnings
17 | warnings.filterwarnings("ignore")
18 |
19 | regex = re.compile(r'\(0 \(0 "(?P<T>[A-Z])" "(?P<C>[0-9]+)" (?P<P>[-.0-9e^]+)\)\)')
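# A matching transition line in a trained carmel channel model looks like,
# e.g., (0 (0 "N" "137" 0.25)) or (0 (0 "N" "137" e^-1.386)): T is the
# one-character tag, C the cluster id, and P a probability that carmel prints
# either as a plain decimal or in e^x notation (both cases are handled below).
# The tag/cluster/probability values shown here are illustrative only.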

20 |
21 |
22 | if __name__ == "__main__":
23 |     parser = argparse.ArgumentParser()
24 |     parser.add_argument("--il","-il",type=str, default=500, help="Incident language")
25 |     parser.add_argument("--rl" ,"-rl", type=str, default=None, help="Related Languages")
26 |     parser.add_argument("--num_clusters","-nc",type=int, default=500, help="Number of clusters")
27 |     parser.add_argument("--iters","-it",type=int, default=500, help="Number of iterations")
28 |     parser.add_argument("--ca" ,"-ca", type=str, default="br", help="Clustering algorithm [brown,anchor,]")
29 |     parser.add_argument("--exp_dir" ,"-exp", type=str, default='', help="Experiment folder")
30 |     args = parser.parse_args()
31 |
32 |     #rls = "en.de.fr.it.es.ja.ar.cs.ru.sw-hcs".split('.')
33 |     rls = args.rl.split(",")
34 |     il = args.il
35 |
36 |     if il[:2]=='tl':
37 |         il = 'tl'
38 |     p_c_t = np.zeros([17,500])
39 |     t2id = LabelDictionary()
40 |     for rl in rls:
41 |         if il==rl: continue
42 |         model = "%s/models/%s2-%s.%s.%d.%d" % (args.exp_dir,rl,il,args.ca,args.num_clusters,args.iters)
43 |         temp = np.zeros([17,500])
44 |
45 |         for line in open(model,'r'):
46 |             line = line.strip('\n')
47 |             if line=='' or line=='0': continue
48 |             match = regex.match(line)
49 |             if match==None:
50 |                 # print("not found!",line)
51 |                 # pdb.set_trace()
52 |                 continue
53 |             # pdb.set_trace()
54 |             t = match.group("T")
55 |             c = int(match.group("C"))
56 |             ps = match.group("P")
57 |             if t=="<s>" or t=="</s>":
58 |                 continue
59 |             if ps[0]!="e":
60 |                 p = float(ps)
61 |             else:
62 |                 p = np.exp(float(ps[2:]))
63 |             tid = t2id.add(t)
64 |             p_c_t[tid,c] += p
65 |             temp[tid,c] = p
66 |         #END-FOR-LINE
67 |
68 |     #END-FOR-RLS
69 |
70 |     # normalize
71 |     for t in range(17):
72 |         # print(t2id.get_label_name(t),p_c_t[t,:].sum(),len(rls), (p_c_t[t,:]/len(rls)).sum() )
73 |         p_c_t[t,:] /= p_c_t[t,:].sum()
74 |
75 |     # print out result
76 |     outfile_fn = "%s/models/%s.%s.%d.500.comb" % (args.exp_dir,il,args.ca,args.num_clusters)
77 |     outfile = open(outfile_fn,'w')
78 |     print("0",file=outfile)
79 |     print('(0 (0 "<s>" "<s>" 1))',file=outfile)
80 |     print('(0 (0 "</s>" "</s>" 1))',file=outfile)
81 |     for t in range(17):
82 |         for c in range(500):
83 |             tag = t2id.get_label_name(t)
84 |             prob = str(p_c_t[t,c])
85 |             print('(0 (0 "%s" "%d" %s))' % (tag,c,prob ), file=outfile )
86 |             if p_c_t[t,c]==0:
87 |                 print(il,tag,c)
88 |     outfile.close()
89 |
90 |     for rl in rls:
91 |         model_name = "%s/models/%s2-%s.%s.%d.500.comb" % (args.exp_dir,rl,il,args.ca,args.num_clusters)
92 |         sp.Popen(["cp",outfile_fn,model_name])
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
--------------------------------------------------------------------------------
/src/code/tag_text.py:
--------------------------------------------------------------------------------
1 | from label_dictionary import LabelDictionary
2 | from collections import defaultdict
3 | from utils import *
4 | import os,sys
5 | import argparse
6 | import pdb
7 | import numpy as np
8 |
9 |
10 | START="<s>"
11 | END="</s>"
12 |
13 |
14 | if __name__ == "__main__":
15 |     parser = argparse.ArgumentParser()
16 |     parser.add_argument("--input","-i", type=str, help="Cluster dict")
17 |     parser.add_argument("--baseline","-b", type=str, default="brown", help="Clustering model used")
18 |     parser.add_argument("--mode","-m", type=str, help="train / eval")
19 |     parser.add_argument("--mapper","-v", type=str, help="label dict")
20 |     parser.add_argument("--clt_vocab","-c", type=str, help="cluster vocab")
21 |     parser.add_argument("--nclusters","-nc", type=int, default=50, help="number of clusters")
parser.add_argument("--output_pref","-op", type=str,default="train",help="output filename prefix") 23 | parser.add_argument("--subs","-subs", type=int,default=10000,help="subsample size for carmel") 24 | 25 | args = parser.parse_args() 26 | 27 | np.random.seed(42) 28 | w2cid = {} 29 | 30 | if args.mode == 'train': 31 | 32 | cl2cid = LabelDictionary() 33 | mapper_fn = os.path.join(os.path.dirname(args.clt_vocab),'clt.mapper') 34 | 35 | output_file = open(args.clt_vocab+".norm",'w') 36 | 37 | for line in open(args.clt_vocab,'r'): 38 | line = line.strip('\n') 39 | if line=='': continue 40 | w,c = '','' 41 | if args.baseline=='brown': 42 | c,w,_ = line.split('\t') 43 | elif args.baseline=='clark': 44 | w,c,_ = line.split(' ') 45 | elif args.baseline[0] in "lp": 46 | w,c = line.split('\t') 47 | elif args.baseline == "marlin": 48 | w,c = line.split(' ') 49 | 50 | cid = cl2cid.add(c) 51 | w2cid[w] = str(cid) 52 | print("%s\t%d" % (w,cid),file=output_file) 53 | ## 54 | saveObject(w2cid,mapper_fn) 55 | 56 | else: 57 | if args.mapper==None: 58 | print("Error: LabelDictionary object not specified!\nCheck arguments list with -h option") 59 | sys.exit(1) 60 | elif not os.path.exists(args.mapper): 61 | print("Error: LabelDictionary object does not exist!") 62 | sys.exit(1) 63 | else: 64 | w2cid = uploadObject(args.mapper) 65 | ## 66 | 67 | 68 | # pdb.set_trace() 69 | 70 | outfile = open(os.path.join(os.path.dirname(args.input), "%s.%d.%s.ctag" % (args.output_pref,args.nclusters,args.baseline) ),'w') 71 | outfile_carmel = open(os.path.join(os.path.dirname(args.input), "%s.%d.%s.carmel" % (args.output_pref,args.nclusters,args.baseline) ),'w') 72 | outfile_carmel_10k = open(os.path.join(os.path.dirname(args.input), "%s.%d.%s.carmel.10k" % (args.output_pref,args.nclusters,args.baseline) ),'w') 73 | lines = [] 74 | 75 | for line in open(args.input,'r'): 76 | line = line.strip('\n') 77 | if line=='': continue 78 | clts = [] 79 | for w in line.split(' '): 80 | if w == '#eos': continue 81 | if w not in w2cid: 82 | clts.append(w2cid[""]) 83 | else: 84 | clts.append(w2cid[w]) 85 | print(" ".join(clts),file=outfile) 86 | 87 | clts = [START] + clts + [END] 88 | txt = " ".join(['"%s"' % x for x in clts]) 89 | lines.append(txt) 90 | print("",file=outfile_carmel) 91 | print(txt,file=outfile_carmel) 92 | # print(" ".join(clts),file=outfile_carmel) 93 | 94 | ## 95 | idxs = np.arange(len(lines)) 96 | np.random.shuffle(idxs) 97 | for idx in idxs[:args.subs]: 98 | print("",file=outfile_carmel_10k) 99 | print(lines[idx],file=outfile_carmel_10k) 100 | ## -------------------------------------------------------------------------------- /src/marlin/basic/opt.h: -------------------------------------------------------------------------------- 1 | #ifndef __OPT_H__ 2 | #define __OPT_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | 10 | // First thing to call in main(). 
11 | void init_opt(int argc, char *argv[]);
12 |
13 | ////////////////////////////////////////////////////////////////////////
14 | // command-line arguments
15 |
16 | class GetOpt {
17 |  public:
18 |   GetOpt() { }
19 |
20 |   void AddOpt(const string &name, bool has_arg);
21 |   void Parse(int argc, char *argv[]);
22 |   int Lookup(const string &name) const;
23 |
24 |   bool Exists(const string &name) const;
25 |   string Get(const string &name, const string &default_value) const;
26 |   string Get(const string &name) const;
27 |   int GetInt(const string &name) const;
28 |   int GetInt(const string &name, int default_value) const;
29 |   double GetDouble(const string &name) const;
30 |   double GetDouble(const string &name, double default_value) const;
31 |
32 |  private:
33 |   vector< pair<string, bool> > opts;
34 |   vector<string> values;
35 | };
36 |
37 | template<class T> struct OptInfo {
38 |   OptInfo(const string &name, T *var, const string &msg, bool required)
39 |     : name(name), var(var), msg(msg), required(required) { }
40 |
41 |   string name;
42 |   T *var; // location of the variable that stores this value
43 |   string msg;
44 |   bool required;
45 | };
46 |
47 | extern vector< OptInfo<bool> > bool_opts;
48 | extern vector< OptInfo<int> > int_opts;
49 | extern vector< OptInfo<double> > double_opts;
50 | extern vector< OptInfo<string> > string_opts;
51 |
52 | ////////////////////////////////////////////////////////////
53 |
54 | // two versions: in one, option is required
55 | #define opt_define_bool_req(var, name, msg) \
56 |   bool var = opt_define_bool_wrap(name, &var, false, msg, true)
57 | #define opt_define_bool(var, name, val, msg) \
58 |   bool var = opt_define_bool_wrap(name, &var, val, msg, false)
59 | #define opt_define_int_req(var, name, msg) \
60 |   int var = opt_define_int_wrap(name, &var, 0, msg, true)
61 | #define opt_define_int(var, name, val, msg) \
62 |   int var = opt_define_int_wrap(name, &var, val, msg, false)
63 | #define opt_define_double_req(var, name, msg) \
64 |   double var = opt_define_double_wrap(name, &var, 0.0, msg, true)
65 | #define opt_define_double(var, name, val, msg) \
66 |   double var = opt_define_double_wrap(name, &var, val, msg, false)
67 | #define opt_define_string_req(var, name, msg) \
68 |   string var = opt_define_string_wrap(name, &var, "", msg, true)
69 | #define opt_define_string(var, name, val, msg) \
70 |   string var = opt_define_string_wrap(name, &var, val, msg, false)
71 |
72 | inline bool opt_define_bool_wrap(const string &name, bool *var, bool val, const string &msg, bool required) {
73 |   bool_opts.push_back(OptInfo<bool>(name, var, msg, required));
74 |   return val;
75 | }
76 |
77 | inline int opt_define_int_wrap(const string &name, int *var, int val, const string &msg, bool required) {
78 |   //printf("HELLO %s\n", name.c_str());
79 |   int_opts.push_back(OptInfo<int>(name, var, msg, required));
80 |   //printf("N %d\n", (int)int_opts.size());
81 |   return val;
82 | }
83 | inline double opt_define_double_wrap(const string &name, double *var, double val, const string &msg, bool required) {
84 |   double_opts.push_back(OptInfo<double>(name, var, msg, required));
85 |   return val;
86 | }
87 | inline string opt_define_string_wrap(const string &name, string *var, const string &val, const string &msg, bool required) {
88 |   string_opts.push_back(OptInfo<string>(name, var, msg, required));
89 |   return val;
90 | }
91 |
92 | ////////////////////////////////////////////////////////////
93 |
94 | void print_opts();
95 |
96 | extern int rand_seed;
97 |
98 | #endif
99 |
--------------------------------------------------------------------------------
/src/marlin/basic/stl-basic.h:
--------------------------------------------------------------------------------
1 | #ifndef __STL_BASIC_H__
2 | #define __STL_BASIC_H__
3 |
4 | #include "std.h"
5 | #include "city.h"
6 |
7 | ////////////////////////////////////////////////////////////
8 |
9 | typedef double real;
10 | //typedef float real;
11 |
12 | typedef pair<int, int> IntPair;
13 | typedef pair<int, double> IntDouble;
14 | typedef pair<double, int> DoubleInt;
15 | typedef pair<double, double> DoublePair;
16 | typedef vector<IntPair> IntPairVec;
17 | typedef vector<DoubleInt> DoubleIntVec;
18 | typedef vector<bool> BoolVec;
19 | typedef vector<int> IntVec;
20 | typedef vector<string> StringVec;
21 | typedef vector<IntVec> IntMat;
22 | typedef vector<IntVec> IntVecVec;
23 | typedef vector<IntVecVec> IntVecVecVec;
24 | typedef vector<IntVecVecVec> IntVecVecVecVec;
25 | typedef vector<double> DoubleVec;
26 | typedef vector<DoubleVec> DoubleVecVec;
27 | typedef vector<DoubleVecVec> DoubleVecVecVec;
28 | typedef vector<DoubleVecVecVec> DoubleVecVecVecVec;
29 | typedef vector<IntDouble> IntDoubleVec;
30 | typedef vector<IntDoubleVec> IntDoubleVecVec;
31 | typedef vector<IntDoubleVecVec> IntDoubleVecVecVec;
32 | typedef vector<IntDoubleVecVecVec> IntDoubleVecVecVecVec;
33 |
34 | typedef IntVec ivector;
35 | typedef DoubleVec fvector;
36 | typedef DoubleVecVec fmatrix;
37 |
38 | ////////////////////////////////////////////////////////////
39 |
40 | struct vector_eq {
41 |   bool operator()(const IntVec &v1, const IntVec &v2) const {
42 |     return v1 == v2;
43 |   }
44 | };
45 | struct vector_hf {
46 |   size_t operator()(const IntVec &v) const {
47 |     return CityHash64(reinterpret_cast<const char *>(&v[0]), sizeof(int) * v.size());
48 | #if 0
49 |     int h = 0;
50 |     foridx(i, len(v))
51 |       h = (h<<4)^(h>>28)^v[i];
52 |     return h;
53 | #endif
54 |   }
55 | };
56 |
57 | struct pair_eq {
58 |   bool operator()(const IntPair &p1, const IntPair &p2) const {
59 |     return p1 == p2;
60 |   }
61 | };
62 | struct pair_hf {
63 |   size_t operator()(const IntPair &p) const {
64 |     return (p.first<<4)^(p.first>>28) ^ p.second;
65 |   }
66 | };
67 |
68 | struct str_eq {
69 |   bool operator()(const char *s1, const char *s2) const {
70 |     return strcmp(s1, s2) == 0;
71 |   }
72 | };
73 | struct str_hf {
74 |   size_t operator()(const char *s) const {
75 |     return CityHash64(s, strlen(s));
76 |   }
77 | };
78 |
79 | struct string_eq {
80 |   bool operator()(const string &s1, const string &s2) const {
81 |     return s1 == s2;
82 |   }
83 | };
84 | struct string_hf {
85 |   size_t operator()(const string &s) const {
86 |     return CityHash64(s.c_str(), s.size());
87 |   }
88 | };
89 |
90 | ////////////////////////////////////////////////////////////
91 |
92 | typedef unordered_set<int> IntSet;
93 | typedef unordered_set<IntPair, pair_hf, pair_eq> IntPairSet;
94 | typedef unordered_set<IntVec, vector_hf, vector_eq> IntVecSet;
95 | typedef unordered_map<IntVec, double, vector_hf, vector_eq> IntVecDoubleMap;
96 | typedef unordered_map<IntVec, int, vector_hf, vector_eq> IntVecIntMap;
97 | typedef unordered_map<int, int> IntIntMap;
98 | typedef unordered_map<int, double> IntDoubleMap;
99 | typedef unordered_map<int, IntPair> IntIntPairMap;
100 | typedef unordered_map<int, IntVec> IntIntVecMap;
101 | typedef unordered_map<int, IntIntMap> IntIntIntMapMap;
102 | typedef unordered_map<IntPair, int, pair_hf, pair_eq> IntPairIntMap;
103 | typedef unordered_map<IntPair, double, pair_hf, pair_eq> IntPairDoubleMap;
104 | typedef unordered_map<IntPair, DoubleVec, pair_hf, pair_eq> IntPairDoubleVecMap;
105 | typedef unordered_map<IntVec, IntVec, vector_hf, vector_eq> IntVecIntVecMap;
106 | typedef unordered_map<IntVec, DoubleVec, vector_hf, vector_eq> IntVecDoubleVecMap;
107 | typedef vector<IntIntMap> IntIntMapVec;
108 |
109 | typedef vector<char *> StrVec;
110 | typedef unordered_map<const char *, int, str_hf, str_eq> StrIntMap;
111 | typedef unordered_map<const char *, const char *, str_hf, str_eq> StrStrMap;
112 |
113 | #endif
114 |
--------------------------------------------------------------------------------
/src/marlin/basic/city.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2011 Google, Inc.
/src/marlin/basic/city.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2011 Google, Inc.
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 | //
21 | // CityHash, by Geoff Pike and Jyrki Alakuijala
22 | //
23 | // This file provides a few functions for hashing strings. On x86-64
24 | // hardware in 2011, CityHash64() is faster than other high-quality
25 | // hash functions, such as Murmur. This is largely due to higher
26 | // instruction-level parallelism. CityHash64() and CityHash128() also perform
27 | // well on hash-quality tests.
28 | //
29 | // CityHash128() is optimized for relatively long strings and returns
30 | // a 128-bit hash. For strings more than about 2000 bytes it can be
31 | // faster than CityHash64().
32 | //
33 | // Functions in the CityHash family are not suitable for cryptography.
34 | //
35 | // WARNING: This code has not been tested on big-endian platforms!
36 | // It is known to work well on little-endian platforms that have a small penalty
37 | // for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs.
38 | //
39 | // By the way, for some hash functions, given strings a and b, the hash
40 | // of a+b is easily derived from the hashes of a and b. This property
41 | // doesn't hold for any hash functions in this file.
42 | 
43 | #ifndef CITY_HASH_H_
44 | #define CITY_HASH_H_
45 | 
46 | #include <stdlib.h>  // for size_t.
47 | #include <stdint.h>
48 | #include <utility>
49 | 
50 | typedef uint8_t uint8;
51 | typedef uint32_t uint32;
52 | typedef uint64_t uint64;
53 | typedef std::pair<uint64, uint64> uint128;
54 | 
55 | inline uint64 Uint128Low64(const uint128& x) { return x.first; }
56 | inline uint64 Uint128High64(const uint128& x) { return x.second; }
57 | 
58 | // Hash function for a byte array.
59 | uint64 CityHash64(const char *buf, size_t len);
60 | 
61 | // Hash function for a byte array. For convenience, a 64-bit seed is also
62 | // hashed into the result.
63 | uint64 CityHash64WithSeed(const char *buf, size_t len, uint64 seed);
64 | 
65 | // Hash function for a byte array. For convenience, two seeds are also
66 | // hashed into the result.
67 | uint64 CityHash64WithSeeds(const char *buf, size_t len,
68 |                            uint64 seed0, uint64 seed1);
69 | 
70 | // Hash function for a byte array.
71 | uint128 CityHash128(const char *s, size_t len);
72 | 
73 | // Hash function for a byte array. For convenience, a 128-bit seed is also
74 | // hashed into the result.
75 | uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed);
76 | 
77 | // Hash 128 input bits down to 64 bits of output.
78 | // This is intended to be a reasonably good hash function. 79 | inline uint64 Hash128to64(const uint128& x) { 80 | // Murmur-inspired hashing. 81 | const uint64 kMul = 0x9ddfea08eb382d69ULL; 82 | uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; 83 | a ^= (a >> 47); 84 | uint64 b = (Uint128High64(x) ^ a) * kMul; 85 | b ^= (b >> 47); 86 | b *= kMul; 87 | return b; 88 | } 89 | 90 | #endif // CITY_HASH_H_ 91 | -------------------------------------------------------------------------------- /src/code/makelmfsa_x.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | bool isfloat(wstring s) { 16 | if (s[0] == L'-' || isdigit(s[0])) return true; 17 | return false; 18 | } 19 | 20 | double tofloat(wstring s) { 21 | wstringstream ss(s); 22 | double d; 23 | ss >> d; 24 | return d; 25 | } 26 | 27 | 28 | int main(int argc, char** argv) { 29 | locale::global(locale("")); 30 | string infile_fn(argv[1]); 31 | string out1=infile_fn+".wfst"; 32 | string out2=infile_fn+".wfsa"; // <---------------- fout2 :: wfsa 33 | string in1= infile_fn; 34 | wifstream fin(in1.c_str()); 35 | wofstream fout(out1.c_str()); 36 | wofstream fout2(out2.c_str()); 37 | wstring l = L"",s; 38 | int order; 39 | double score,backoff; 40 | wstring dest; 41 | while (l[0] != L'n') getline(fin,l); 42 | while (l[0] == L'n') {getline(fin,l); order++;} 43 | int current_ngram = 1; 44 | fout << "FINAL" << endl; 45 | fout << "(START ( 1!))" << endl; 46 | fout2 << "FINAL" << endl; 47 | fout2 << "(START ( *e* 1!))" << endl; 48 | while (getline(fin,l)) { 49 | if (l[0] == L'\\') {current_ngram++; cout << current_ngram << endl; continue;} 50 | wstringstream ss(l); 51 | vector data; 52 | while (ss >> s) data.push_back(s); 53 | if (current_ngram > 0 && data.size() > 1 && isfloat(data[0])) { 54 | score=pow(10.0,tofloat(data[0])); 55 | //if (score < 0.1) continue; 56 | if (current_ngram == 1) { 57 | //cout << data[1] << " " << score << endl; 58 | if (data.size() > 2) { 59 | backoff = pow(10.0,tofloat(data[2])); 60 | dest = data[1]; 61 | fout << "(" << data[1] << " (NULL *e* *e* " << backoff << "!))" << endl; 62 | fout2 << "(" << data[1] << " (NULL *e* *e* " << backoff << "!))" << endl; 63 | } 64 | else { 65 | if (data[1] == L"") 66 | dest = data[1]; 67 | else 68 | dest = L"NULL"; 69 | } 70 | if (dest != L"") { 71 | if (data[current_ngram] == L"") dest = L"FINAL"; 72 | fout << "(NULL (" << dest << " " << data[1] << " " << data[1] << " " << score << "!))" << endl; 73 | fout2 << "(NULL (" << dest << " *e* " << data[1] << " " << score << "!))" << endl; 74 | } 75 | continue; 76 | } 77 | if (current_ngram < order) { 78 | if (data.size() > current_ngram+1) { 79 | backoff = pow(10.0,tofloat(data[current_ngram+1])); 80 | dest = data[1]; 81 | for (int i = 2; i <= current_ngram; i++) dest += L"."+data[i]; 82 | wstring _dest = data[2]; 83 | for (int i = 3; i <= current_ngram; i++) _dest+= L"."+data[i]; 84 | fout << L"(" << dest << " (" << _dest << " *e* *e* " << backoff << "!))" << endl; 85 | fout2 << L"(" << dest << " (" << _dest << " *e* *e* " << backoff << "!))" << endl; 86 | } 87 | else { 88 | dest = data[2]; 89 | for (int i = 3; i <= current_ngram; i++) dest+= L"."+data[i]; 90 | } 91 | wstring dest_ = data[1]; 92 | for (int i = 2; i < current_ngram; i++) dest_+= L"."+data[i]; 93 | if (data[current_ngram] == L"") dest = L"FINAL"; 94 | 
fout << "(" << dest_ << " (" << dest << " " << data[current_ngram] << " " << data[current_ngram] << " " << score << "!))" << endl; 95 | fout2 << "(" << dest_ << " (" << dest << " *e* " << data[current_ngram] << " " << score << "!))" << endl; 96 | continue; 97 | } 98 | if (current_ngram == order) { 99 | wstring dest_ = data[1]; 100 | for (int i = 2; i < current_ngram; i++) dest_+= L"."+data[i]; 101 | dest = data[2]; 102 | for (int i = 3; i <= current_ngram; i++) dest += L"."+data[i]; 103 | if (data[current_ngram] == L"") dest = L"FINAL"; 104 | fout << "(" << dest_ << " (" << dest << " " << data[current_ngram] << " " << data[current_ngram] << " " << score << "!))" << endl; 105 | fout2 << "(" << dest_ << " (" << dest << " *e* " << data[current_ngram] << " " << score << "!))" << endl; 106 | } 107 | 108 | } 109 | } 110 | return 0; 111 | } 112 | -------------------------------------------------------------------------------- /src/marlin/basic/std.h: -------------------------------------------------------------------------------- 1 | #ifndef __STD_H__ 2 | #define __STD_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | using namespace std; 21 | 22 | //////////////////////////////////////////////////////////// 23 | 24 | #define len(vec) (int)(vec).size() 25 | #define sq(x) ((x)*(x)) 26 | 27 | // For loop sugar. This is such a hack! 28 | #define foridx(i, n) for(int i = 0; i < n; i++) 29 | #define forvec(i, tx, x, vec) for(int i = 0, _##i = 0; i < len(vec); i++) \ 30 | for(tx x = (vec)[i]; i == _##i; _##i++) 31 | #define formap(tx, x, ty, y, t, map) forstl(t, _##x##y, map) _mapvars(tx, x, ty, y) 32 | #define forcmap(tx, x, ty, y, t, map) forcstl(t, _##x##y, map) _mapvars(tx, x, ty, y) 33 | #define forstl(t, x, container) for(t::iterator x = (container).begin(); x != (container).end(); x++) 34 | #define forcstl(t, x, container) for(t::const_iterator x = (container).begin(); x != (container).end(); x++) 35 | #define _mapvars(tx, x, ty, y) for(tx x = _##x##y->first, *_##x = &x; _##x; _##x = NULL) \ 36 | for(ty y = _##x##y->second, *_##y = &y; _##y; _##y = NULL) 37 | 38 | //////////////////////////////////////////////////////////// 39 | // Generate random numbers. 40 | 41 | inline int mrand(int a) { return rand() % a; } 42 | inline int mrand(int a, int b) { return rand() % (b-a) + a; } 43 | inline double rand_double() { 44 | static const int BASE = 100000; 45 | return (double)(rand()%BASE)/BASE; 46 | } 47 | 48 | //////////////////////////////////////////////////////////// 49 | // Floating point stuff. 50 | 51 | const double TOL = 1e-10; 52 | 53 | inline bool flt(double u, double v) { return u + TOL < v; } 54 | inline bool fgt(double u, double v) { return u - TOL > v; } 55 | 56 | // Comparing floating point numbers. 
57 | inline bool feq(double u, double v, double tol = TOL) { return fabs(u-v) < tol; }
58 | 
59 | template<class T> inline int sign(T u) {
60 |   if(u < 0) return -1;
61 |   if(u > 0) return 1;
62 |   return 0;
63 | }
64 | 
65 | #define assert_feq(u, v) do { _assert_feq(u, v, __FILE__, __LINE__); } while(0);
66 | #define assert_feq2(u, v, tol) do { _assert_feq(u, v, tol, __FILE__, __LINE__); } while(0);
67 | #define assert_fneq(u, v) do { _assert_fneq(u, v, __FILE__, __LINE__); } while(0);
68 | inline void _assert_feq(double u, double v, const char *file, int line) {
69 |   if(!feq(u, v)) { printf("At %s:%d, %f != %f\n", file, line, u, v); assert(0); }
70 | }
71 | inline void _assert_feq(double u, double v, double tol, const char *file, int line) {
72 |   if(!feq(u, v, tol)) { printf("At %s:%d, %f != %f\n", file, line, u, v); assert(0); }
73 | }
74 | inline void _assert_fneq(double u, double v, const char *file, int line) {
75 |   if(feq(u, v)) { printf("At %s:%d, %f == %f\n", file, line, u, v); assert(0); }
76 | }
77 | #define assert_eq(u, v) do { _assert_eq(u, v, __STRING(u), __STRING(v), __FILE__, __LINE__); } while(0)
78 | template<class T> inline void _assert_eq(const T &u, const T &v, const char *us, const char *vs, const char *file, int line) {
79 |   if(u != v) {
80 |     cout << "At " << file << ':' << line << ", " <<
81 |       us << '(' << u << ')' << " != " <<
82 |       vs << '(' << v << ')' << endl;
83 |     assert(0);
84 |   }
85 | }
86 | 
87 | #define assert2(x, reason) \
88 |   do { \
89 |     if(!(x)) { \
90 |       cout << "\nFAILURE REASON: " << reason << endl; \
91 |       assert(x); \
92 |     } \
93 |   } while(0)
94 | 
95 | string now();
96 | string hostname();
97 | int cpu_speed_mhz();
98 | int mem_usage(); // in kB
99 | 
100 | bool create_file(const char *file);
101 | bool file_exists(const char *file);
102 | time_t file_modified_time(const char *file);
103 | 
104 | string strip_dir(string s);
105 | string get_dir(string s);
106 | string file_base(string s);
107 | bool get_files_in_dir(string dirname, bool fullpath, vector<string> &files);
108 | 
109 | #endif
110 | --------------------------------------------------------------------------------
/src/code/setup_ud-treebank_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -e
4 | 
5 | 
6 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../..
>/dev/null 2>&1 && pwd )" 7 | CODE_DIR="$BASEDIR/src/code" 8 | UR_DIR="$BASEDIR/src/uroman" 9 | UD_DIR="$HOME/ud-treebanks-v2.2" 10 | data_dir=$BASEDIR/lm_data 11 | 12 | # format UD 13 | langs="en da nl de fr it es pt ja cs ru pl ar fa id hi" 14 | 15 | 16 | while [ $# -gt 1 ] 17 | do 18 | key="$1" 19 | case $key in 20 | -td|--tb_dir) # treebank directory 21 | UD_DIR="$2" 22 | shift # past argument 23 | ;; 24 | *) 25 | # unknown option 26 | ;; 27 | esac 28 | shift 29 | done 30 | 31 | 32 | ######################################################################################## 33 | 34 | 35 | for lang in $langs; do 36 | mkdir -p $data_dir/$lang; 37 | done 38 | 39 | split="train" 40 | # english treebanks 41 | cat $UD_DIR/UD_English-EWT/en_ewt-ud-$split.conllu \ 42 | $UD_DIR/UD_English-GUM/en_gum-ud-$split.conllu \ 43 | $UD_DIR/UD_English-LinES/en_lines-ud-$split.conllu \ 44 | $UD_DIR/UD_English-ParTUT/en_partut-ud-$split.conllu \ 45 | > $data_dir/en/$split.conllu 46 | 47 | # Danish 48 | cp $UD_DIR/UD_Danish-DDT/da_ddt-ud-$split.conllu \ 49 | $data_dir/da/$split.conllu 50 | 51 | # Dutch 52 | cat $UD_DIR/UD_Dutch-Alpino/nl_alpino-ud-$split.conllu \ 53 | $UD_DIR/UD_Dutch-LassySmall/nl_lassysmall-ud-$split.conllu \ 54 | > $data_dir/nl/$split.conllu 55 | 56 | # german 57 | cp $UD_DIR/UD_German-GSD/de_gsd-ud-$split.conllu \ 58 | $data_dir/de/$split.conllu 59 | 60 | 61 | # french treebanks 62 | cat $UD_DIR/UD_French-GSD/fr_gsd-ud-$split.conllu \ 63 | $UD_DIR/UD_French-ParTUT/fr_partut-ud-$split.conllu \ 64 | $UD_DIR/UD_French-Sequoia/fr_sequoia-ud-$split.conllu \ 65 | $UD_DIR/UD_French-Spoken/fr_spoken-ud-$split.conllu \ 66 | > $data_dir/fr/$split.conllu 67 | 68 | # spanish 69 | cat $UD_DIR/UD_Spanish-AnCora/es_ancora-ud-$split.conllu \ 70 | $UD_DIR/UD_Spanish-GSD/es_gsd-ud-$split.conllu \ 71 | > $data_dir/es/$split.conllu 72 | 73 | # italian 74 | cat $UD_DIR/UD_Italian-ISDT/it_isdt-ud-$split.conllu \ 75 | $UD_DIR/UD_Italian-ParTUT/it_partut-ud-$split.conllu \ 76 | $UD_DIR/UD_Italian-PoSTWITA/it_postwita-ud-$split.conllu \ 77 | > $data_dir/it/$split.conllu 78 | 79 | # portuguese 80 | cat $UD_DIR/UD_Portuguese-Bosque/pt_bosque-ud-$split.conllu \ 81 | $UD_DIR/UD_Portuguese-GSD/pt_gsd-ud-$split.conllu \ 82 | > $data_dir/pt/$split.conllu 83 | 84 | 85 | # japanese 86 | cat $UD_DIR/UD_Japanese-GSD/ja_gsd-ud-$split.conllu \ 87 | $UD_DIR/UD_Japanese-BCCWJ/ja_bccwj-ud-$split.conllu \ 88 | > $data_dir/ja/$split.conllu.all 89 | 90 | cat $UD_DIR/UD_Japanese-GSD/ja_gsd-ud-$split.conllu \ 91 | > $data_dir/ja/$split.conllu 92 | 93 | 94 | # czech 95 | cat $UD_DIR/UD_Czech-PDT/cs_pdt-ud-$split.conllu \ 96 | $UD_DIR/UD_Czech-CAC/cs_cac-ud-$split.conllu \ 97 | $UD_DIR/UD_Czech-FicTree/cs_fictree-ud-$split.conllu \ 98 | > $data_dir/cs/$split.conllu 99 | 100 | # russian 101 | cat $UD_DIR/UD_Russian-GSD/ru_gsd-ud-$split.conllu \ 102 | $UD_DIR/UD_Russian-SynTagRus/ru_syntagrus-ud-$split.conllu \ 103 | $UD_DIR/UD_Russian-Taiga/ru_taiga-ud-$split.conllu \ 104 | > $data_dir/ru/$split.conllu 105 | 106 | # polish 107 | cat $UD_DIR/UD_Polish-LFG/pl_lfg-ud-$split.conllu \ 108 | $UD_DIR/UD_Polish-SZ/pl_sz-ud-$split.conllu \ 109 | > $data_dir/pl/$split.conllu 110 | 111 | 112 | # arabic 113 | cat $UD_DIR/UD_Arabic-PADT/ar_padt-ud-$split.conllu \ 114 | $UD_DIR/UD_Arabic-NYUAD/ar_nyuad-ud-$split.conllu \ 115 | > $data_dir/ar/$split.conllu.all 116 | 117 | cat $UD_DIR/UD_Arabic-PADT/ar_padt-ud-$split.conllu \ 118 | > $data_dir/ar/$split.conllu 119 | 120 | # persian 121 | cp $UD_DIR/UD_Persian-Seraji/fa_seraji-ud-$split.conllu \ 122 | 
$data_dir/fa/$split.conllu 123 | 124 | 125 | # Indonesian 126 | cp $UD_DIR/UD_Indonesian-GSD/id_gsd-ud-$split.conllu \ 127 | $data_dir/id/$split.conllu 128 | 129 | # hindi 130 | cp $UD_DIR/UD_Hindi-HDTB/hi_hdtb-ud-$split.conllu \ 131 | $data_dir/hi/$split.conllu 132 | 133 | 134 | for lang in $langs; do 135 | echo "lang- split :: $lang - $split" 136 | grep -v "^#" $data_dir/$lang/$split.conllu | grep -v "^\s*$" | \ 137 | grep -vP "^[0-9]+-[0-9]+" > temp1 138 | 139 | cp temp1 temp2 140 | if [ $lang = "ja" ] || [ $lang = "fa" ] || [ $lang = "ar" ] || [ $lang = "ru" ] || [ $lang = "hi" ]; then 141 | bash $CODE_DIR/rom_conllu.sh $lang temp1 temp2 $UR_DIR 142 | fi 143 | 144 | mv temp2 $data_dir/$lang/$split.conllu 145 | rm temp1 146 | 147 | if [ $lang = "ja" ] || [ $lang = "ar" ]; then 148 | grep -v "^#" $data_dir/$lang/$split.conllu.all | grep -v "^\s*$" | \ 149 | grep -vP "^[0-9]+-[0-9]+" > temp1 150 | 151 | mv temp1 $data_dir/$lang/$split.conllu.all 152 | fi 153 | done 154 | 155 | -------------------------------------------------------------------------------- /src/code/makelmfsa.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | bool isfloat(wstring s) { 16 | if (s[0] == L'-' || isdigit(s[0])) return true; 17 | return false; 18 | } 19 | 20 | double tofloat(wstring s) { 21 | wstringstream ss(s); 22 | double d; 23 | ss >> d; 24 | return d; 25 | } 26 | 27 | 28 | int main(int argc, char** argv) { 29 | locale::global(locale("")); 30 | string infile_fn(argv[1]); 31 | string out1=infile_fn+".wfst"; 32 | string out2=infile_fn+".wfsa"; 33 | string in1= infile_fn; 34 | wifstream fin(in1.c_str()); 35 | wofstream fout(out1.c_str()); 36 | wofstream fout2(out2.c_str()); 37 | wstring l = L"",s; 38 | int order; 39 | double score,backoff; 40 | wstring dest; 41 | while (l[0] != L'n') getline(fin,l); 42 | while (l[0] == L'n') {getline(fin,l); order++;} 43 | int current_ngram = 1; 44 | fout << "FINAL" << endl; 45 | fout << "(START ( \"\" \"\" 1!))" << endl; 46 | fout2 << "FINAL" << endl; 47 | fout2 << "(START ( *e* \"\" 1!))" << endl; 48 | while (getline(fin,l)) { 49 | if (l[0] == L'\\') {current_ngram++; cout << current_ngram << endl; continue;} 50 | wstringstream ss(l); 51 | vector data; 52 | while (ss >> s) data.push_back(s); 53 | if (current_ngram > 0 && data.size() > 1 && isfloat(data[0])) { 54 | score=pow(10.0,tofloat(data[0])); 55 | //if (score < 0.1) continue; 56 | if (current_ngram == 1) { 57 | //cout << data[1] << " " << score << endl; 58 | if (data.size() > 2) { 59 | backoff = pow(10.0,tofloat(data[2])); 60 | dest = data[1]; 61 | fout << "(" << data[1] << " (NULL *e* *e* " << backoff << "!))" << endl; 62 | fout2 << "(" << data[1] << " (NULL *e* *e* " << backoff << "!))" << endl; 63 | } 64 | else { 65 | if (data[1] == L"") 66 | dest = data[1]; 67 | else 68 | dest = L"NULL"; 69 | } 70 | if (dest != L"") { 71 | if (data[current_ngram] == L"") dest = L"FINAL"; 72 | fout << "(NULL (" << dest << " \"" << data[1] << "\" \"" << data[1] << "\" " << score << "!))" << endl; 73 | fout2 << "(NULL (" << dest << " *e* \"" << data[1] << "\" " << score << "!))" << endl; 74 | } 75 | continue; 76 | } 77 | if (current_ngram < order) { 78 | if (data.size() > current_ngram+1) { 79 | backoff = pow(10.0,tofloat(data[current_ngram+1])); 80 | // dest = L""; 81 | // for (int i = 1; i <= current_ngram; i++) 
dest+=data[i]; 82 | dest = data[1]; 83 | for (int i = 2; i <= current_ngram; i++) dest += L"."+data[i]; 84 | // wstring _dest = L""; 85 | // for (int i = 2; i <= current_ngram; i++) _dest+=data[i]; 86 | wstring _dest = data[2]; 87 | for (int i = 3; i <= current_ngram; i++) _dest+= L"."+data[i]; 88 | fout << L"(" << dest << " (" << _dest << " *e* *e* " << backoff << "!))" << endl; 89 | fout2 << L"(" << dest << " (" << _dest << " *e* *e* " << backoff << "!))" << endl; 90 | } 91 | else { 92 | // dest = L""; 93 | // for (int i = 2; i <= current_ngram; i++) dest+=data[i]; 94 | dest = data[2]; 95 | for (int i = 3; i <= current_ngram; i++) dest+= L"."+data[i]; 96 | } 97 | // wstring dest_ = L""; 98 | // for (int i = 1; i < current_ngram; i++) dest_+=data[i]; 99 | wstring dest_ = data[1]; 100 | for (int i = 2; i < current_ngram; i++) dest_+= L"."+data[i]; 101 | if (data[current_ngram] == L"") dest = L"FINAL"; 102 | fout << "(" << dest_ << " (" << dest << " \"" << data[current_ngram] << "\" \"" << data[current_ngram] << "\" " << score << "!))" << endl; 103 | fout2 << "(" << dest_ << " (" << dest << " *e* \"" << data[current_ngram] << "\" " << score << "!))" << endl; 104 | continue; 105 | } 106 | if (current_ngram == order) { 107 | // wstring dest_ = L""; 108 | // for (int i = 1; i < current_ngram; i++) dest_+=data[i]; 109 | wstring dest_ = data[1]; 110 | for (int i = 2; i < current_ngram; i++) dest_+= L"."+data[i]; 111 | // dest = L""; 112 | // for (int i = 2; i <= current_ngram; i++) dest+=data[i]; 113 | dest = data[2]; 114 | for (int i = 3; i <= current_ngram; i++) dest += L"."+data[i]; 115 | if (data[current_ngram] == L"") dest = L"FINAL"; 116 | fout << "(" << dest_ << " (" << dest << " \"" << data[current_ngram] << "\" \"" << data[current_ngram] << "\" " << score << "!))" << endl; 117 | fout2 << "(" << dest_ << " (" << dest << " *e* \"" << data[current_ngram] << "\" " << score << "!))" << endl; 118 | } 119 | 120 | } 121 | } 122 | return 0; 123 | } 124 | -------------------------------------------------------------------------------- /src/code/elisa2flat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import sys 4 | import codecs 5 | 6 | from lxml import etree as ET # pip install lxml 7 | #from xml.etree import ElementTree as ET 8 | from collections import defaultdict as dd 9 | import re 10 | import os.path 11 | import gzip 12 | scriptdir = os.path.dirname(os.path.abspath(__file__)) 13 | 14 | reader = codecs.getreader('utf8') 15 | writer = codecs.getwriter('utf8') 16 | 17 | 18 | def prepfile(fh, code): 19 | ret = gzip.open(fh.name, code if code.endswith("t") else code+"t") if fh.name.endswith(".gz") else fh 20 | if sys.version_info[0] == 2: 21 | if code.startswith('r'): 22 | ret = reader(fh) 23 | elif code.startswith('w'): 24 | ret = writer(fh) 25 | else: 26 | sys.stderr.write("I didn't understand code "+code+"\n") 27 | sys.exit(1) 28 | return ret 29 | 30 | # this code is used below but not in this form 31 | #http://stackoverflow.com/questions/7171140/using-python-iterparse-for-large-xml-files 32 | # def fast_iter(context, func, *args, **kwargs): 33 | # """ 34 | # http://lxml.de/parsing.html#modifying-the-tree 35 | # Based on Liza Daly's fast_iter 36 | # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ 37 | # See also http://effbot.org/zone/element-iterparse.htm 38 | # """ 39 | # for event, elem in context: 40 | # func(elem, *args, **kwargs) 41 | # # It's safe to call clear() here because no 
descendants will be 42 | # # accessed 43 | # elem.clear() 44 | # # Also eliminate now-empty references from the root node to elem 45 | # for ancestor in elem.xpath('ancestor-or-self::*'): 46 | # while ancestor.getprevious() is not None: 47 | # del ancestor.getparent()[0] 48 | # del context 49 | 50 | 51 | # def process_element(elem): 52 | # print elem.xpath( 'description/text( )' ) 53 | 54 | # context = etree.iterparse( MYFILE, tag='item' ) 55 | # fast_iter(context,process_element) 56 | 57 | def main(): 58 | parser = argparse.ArgumentParser(description="Given a compressed elisa xml file and list of attributes, print them out, tab separated", 59 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 60 | parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('rb'), default=sys.stdin, help="input file") 61 | parser.add_argument("--fields", "-f", nargs='+', help="list of fields to extract text from. if attribute is desired, use field.attribute. Separate fallback fields with :") 62 | parser.add_argument("--segment", "-s", default="SEGMENT", help="segment name. pre v4, PARALLEL for x-eng, SEGMENT for monolingual. Otherwise SEGMENT. More than one match per segment will be concatenated") 63 | parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="output file") 64 | 65 | 66 | 67 | try: 68 | args = parser.parse_args() 69 | except IOError as msg: 70 | parser.error(str(msg)) 71 | 72 | 73 | infile = args.infile 74 | infile = gzip.open(infile.name, 'rb') if infile.name.endswith(".gz") else infile 75 | outfile = prepfile(args.outfile, 'w') 76 | 77 | 78 | ctxt = ET.iterparse(infile, events=("end", "start")) 79 | # don't delete when in the middle of an element you want to investigate 80 | lock = False 81 | for event, element in ctxt: 82 | if event == "start" and element.tag == args.segment: 83 | lock = True 84 | if event == "end" and element.tag == args.segment: 85 | outfields = [] 86 | for fieldopts in args.fields: 87 | wrotesomething = False 88 | fieldopts = fieldopts.split(":") 89 | while len(fieldopts) > 0: 90 | field = fieldopts.pop(0) 91 | subfields = field.split(".") 92 | matches = [element,] if subfields[0] == args.segment else element.findall(".//"+subfields[0]) 93 | for match in matches: 94 | value = match.get(subfields[1]) if len(subfields) > 1 else match.text 95 | value = value.replace('\n', ' ') if value is not None else None 96 | value = value.replace('\t', ' ') if value is not None else None 97 | if value is not None: 98 | outfields.append(value) 99 | wrotesomething = True 100 | del matches 101 | if wrotesomething: 102 | break 103 | if not wrotesomething: 104 | outfields.append("") 105 | ostr = "\t".join(outfields)+"\n" 106 | outfile.write(ostr) 107 | lock = False 108 | # recover memory 109 | if event == "end" and not lock: 110 | element.clear() 111 | for ancestor in element.xpath('ancestor-or-self::*'): 112 | while ancestor.getprevious() is not None and ancestor.getparent() is not None and ancestor.getparent()[0] is not None: 113 | del ancestor.getparent()[0] 114 | del ctxt 115 | 116 | 117 | if __name__ == '__main__': 118 | main() 119 | 120 | -------------------------------------------------------------------------------- /src/marlin/marlin_count: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | marlin_count, extract bigram staitics from a file. 
4 | """
5 | 
6 | import optparse
7 | import collections
8 | import gzip
9 | import bz2
10 | 
11 | 
12 | def myopen(filename, mode='r'):
13 |     """
14 |     Open file. Use gzip or bzip2 if appropriate.
15 |     """
16 |     if filename.endswith('.gz'):
17 |         return gzip.open(filename, mode)
18 | 
19 |     if filename.endswith('.bz2'):
20 |         return bz2.BZ2File(filename, mode)
21 | 
22 |     return open(filename, mode)
23 | 
24 | 
25 | def read_unigram_counts(file_handle, sent_limit=-1):
26 |     """
27 |     Read unigram counts from text file.
28 |     If sent_limit is positive, only read that many sentences/lines.
29 |     """
30 |     unigram_counts = collections.defaultdict(int)
31 |     for number, line in enumerate(file_handle):
32 |         tokens = line.split()
33 |         for token in tokens:
34 |             unigram_counts[token] += 1
35 |         if sent_limit >= 0 and number + 1 >= sent_limit:
36 |             break
37 |     return unigram_counts
38 | 
39 | 
40 | def write_words(file_handle, table):
41 |     """
42 |     Write words in order of their index to file.
43 |     """
44 |     table_inv = [None] * len(table)
45 |     for word, index in table.items():
46 |         table_inv[index] = word
47 |     for index, word in enumerate(table_inv):
48 |         file_handle.write(word)
49 |         file_handle.write('\n')
50 | 
51 | 
52 | def read_bigram_counts(file_handle, table, stop_index, rare_index, sent_limit):
53 |     """
54 |     Count bigram co-occurrences. Use rare_index for words not in table.
55 |     If sent_limit is positive, only read that many sentences/lines.
56 |     """
57 |     counts = []
58 |     for _ in table:
59 |         counts.append(collections.defaultdict(int))
60 |     for number, line in enumerate(file_handle):
61 |         tokens = line.split()
62 |         last = stop_index
63 |         for token in tokens:
64 |             current = table.get(token, None)
65 |             if current is None:
66 |                 current = rare_index
67 |             counts[last][current] += 1
68 |             last = current
69 |         counts[last][stop_index] += 1
70 |         if sent_limit >= 0 and number + 1 >= sent_limit:
71 |             break
72 |     return counts
73 | 
74 | 
75 | def write_bigram_counts(file_handle, counts):
76 |     """
77 |     Write bigram statistics to file.
78 |     """
79 |     for _, neighbors in enumerate(counts):
80 |         items = []
81 |         for neighbor, count in neighbors.items():
82 |             items.append('%d:%d' % (neighbor, count))
83 |         print >>file_handle, ' '.join(items)
84 | 
85 | 
86 | def main():
87 |     """
88 |     Main function.
89 |     """
90 | 
91 |     parser = optparse.OptionParser()
92 |     parser.add_option("-t", "--text", dest="text",
93 |                       help="Input text. 
(one sentence per line, whitespace separated)", 94 | metavar="FILE") 95 | parser.add_option("-w", "--words", dest="words", 96 | help="Output: Word list.", metavar="FILE") 97 | parser.add_option("-b", "--bigrams", dest="bigrams", 98 | help="Output: Bigrams counts.", metavar="FILE") 99 | parser.add_option("-r", "--rank-limit", dest="rank_limit", default=250000, 100 | help="If positive, only extract the r most frequent words.", 101 | type=int) 102 | parser.add_option("-s", "--sent-limit", dest="sent_limit", default=-1, 103 | help="If positive, only process the s first sentences/lines.", 104 | type=int) 105 | 106 | options, _ = parser.parse_args() 107 | 108 | with myopen(options.text) as file_handle: 109 | unigram_counts = read_unigram_counts(file_handle, options.sent_limit) 110 | 111 | table = {} 112 | stop_index = 0 113 | rare_index = 1 114 | table[''] = stop_index 115 | table[''] = rare_index 116 | 117 | unigram_counts = unigram_counts.items() 118 | unigram_counts.sort(key=lambda (word, count): count, reverse=True) 119 | # Add high rank words to table 120 | for word, _ in unigram_counts: 121 | table[word] = len(table) 122 | if options.rank_limit >= 0 and len(table) >= options.rank_limit: 123 | break 124 | # Don't need this anymore: 125 | del unigram_counts 126 | 127 | with myopen(options.words, 'w') as file_handle: 128 | write_words(file_handle, table) 129 | 130 | with myopen(options.text) as file_handle: 131 | bigram_counts = read_bigram_counts(file_handle, table, stop_index, rare_index, 132 | options.sent_limit) 133 | 134 | with myopen(options.bigrams, 'w') as file_handle: 135 | write_bigram_counts(file_handle, bigram_counts) 136 | 137 | 138 | if __name__ == '__main__': 139 | main() 140 | 141 | -------------------------------------------------------------------------------- /utagger: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | 6 | LM=2 # LM order 7 | CHANNEL="" 8 | INPUT="" 9 | OUTPUT="output" 10 | 11 | MODE="train" # [train,test] 12 | ROM="false" 13 | IL="xx" # incident language 14 | PL="en" # parent language(s), comman separated 15 | 16 | BASELINE="brown" 17 | NCLUSTERS=500 18 | NJOBS=2 19 | NITERS=500 20 | 21 | W_LM=1 22 | W_CM=1 23 | 24 | INPUT_FORMAT="txt" #[elisa:tgz,xml;txt,bio] 25 | BIO_DELIM="" 26 | LRLP_FIELD="ULF_LRLP_TOKENIZED_SOURCE" 27 | 28 | BASEDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 29 | EXP_DIR="$BASEDIR/exp-cipher" 30 | CODE_DIR="$BASEDIR/src/code" 31 | 32 | if [ -z "$CARMEL_DIR" ]; then 33 | CARMEL_DIR="/usr/local" 34 | fi 35 | if [ -z "$SRILM_DIR" ]; then 36 | SRILM_DIR="$HOME/srilm-1.7.2" 37 | fi 38 | 39 | 40 | while [ $# -gt 1 ] 41 | do 42 | key="$1" 43 | case $key in 44 | -lm_o|--lm_order) 45 | LM="$2" 46 | shift # past argument 47 | ;; 48 | -i|--input) 49 | INPUT="$2" 50 | shift # past argument 51 | ;; 52 | -if|--in_format) 53 | INPUT_FORMAT="$2" 54 | shift # past argument 55 | ;; 56 | -bio_delim|--bio_delim) 57 | BIO_DELIM="$2" 58 | shift # past argument 59 | ;; 60 | -m|--mode) 61 | MODE="$2" 62 | shift # past argument 63 | ;; 64 | -rom|--rom) 65 | ROM="$2" 66 | shift # past argument 67 | ;; 68 | -cl|--child_lang) 69 | IL="$2" 70 | shift # past argument 71 | ;; 72 | -pl|--par_langs) 73 | PL="$2" 74 | shift # past argument 75 | ;; 76 | -o|--output) 77 | OUTPUT="$2" 78 | shift # past argument 79 | ;; 80 | -exp|--exp_dir) 81 | EXP_DIR="$2" 82 | shift # past argument 83 | ;; 84 | -ca|--clust_alg) 85 | BASELINE="$2" 86 | shift # past argument 87 | ;; 88 | 
-nc|--nclusters) 89 | NCLUSTERS="$2" 90 | shift # past argument 91 | ;; 92 | -nj|--njobs) 93 | NJOBS="$2" 94 | shift # past argument 95 | ;; 96 | -wlm|--wlm) 97 | W_LM="$2" 98 | shift # past argument 99 | ;; 100 | -wcm|--wcm) 101 | W_CM="$2" 102 | shift # past argument 103 | ;; 104 | -carmel|--carmel) 105 | CARMEL_DIR="$2" 106 | shift # past argument 107 | ;; 108 | -sri|--sridir) 109 | SRILM_DIR="$2" 110 | shift # past argument 111 | ;; 112 | *) 113 | # unknown option 114 | ;; 115 | esac 116 | shift 117 | done 118 | 119 | export CARMEL_DIR=$CARMEL_DIR 120 | export SRILM_DIR=$SRILM_DIR 121 | export EXP_DIR=$EXP_DIR 122 | 123 | 124 | PL_CODES=(${PL//,/ }) 125 | mkdir -p $EXP_DIR/logs $EXP_DIR/models $EXP_DIR/data 126 | datadir="$EXP_DIR/data" 127 | 128 | 129 | if [ $OUTPUT = "output" ];then 130 | OUTPUT="$datadir/output.tagged" 131 | fi 132 | 133 | 134 | ########################################################## 135 | # PREPROCESS 136 | ########################################################## 137 | 138 | #extract from xml 139 | echo "Extracting..." 140 | 141 | if [ $INPUT_FORMAT = "xml" ]; then 142 | python3 src/code/elisa2flat.py --infile $INPUT \ 143 | --fields $LRLP_FIELD \ 144 | --outfile $datadir/input.raw 145 | elif [ $INPUT_FORMAT = "bio" ]; then 146 | cat $INPUT | cut -f 1 -d " " | sed 's/^$/#eos/g' | tr '\n' ' ' | \ 147 | sed 's/ #eos /\n/g' > "$datadir"/input.raw 148 | else 149 | cp $INPUT $datadir/input.raw 150 | fi 151 | 152 | # preprocess 153 | echo "Preprocessing..." 154 | bash $CODE_DIR/preprocess.sh -i $datadir/input.raw -rom $ROM -m $MODE -l input -exp $EXP_DIR 155 | 156 | 157 | 158 | ########################################################## 159 | # RUN TAGGER 160 | ########################################################## 161 | 162 | # train combined LM 163 | if [ $MODE = "train" ]&&[ "${#PL_CODES[@]}" -gt 1 ]; then 164 | echo "Training combined language model..." 165 | bash $CODE_DIR/train_combined_lm.sh -l $PL -ord $LM -exp $EXP_DIR 166 | fi 167 | 168 | # run clustering 169 | if [ $MODE = "train" ]; then 170 | echo "Training clustering..." 171 | bash src/code/run_clustering.sh -i $datadir/input.clean.filt \ 172 | -b $BASELINE -nc $NCLUSTERS -nj $NJOBS -exp $EXP_DIR 173 | fi 174 | 175 | echo "Tagging text with cluster ids..." 176 | bash $CODE_DIR/tag_with_clusters.sh -b $BASELINE -n $NCLUSTERS -i $datadir/input.clean.filt -e $EXP_DIR 177 | 178 | 179 | if [ $MODE = "train" ]; then 180 | echo "Training cipher model..." 181 | python3 $CODE_DIR/train_cipher.py -il $IL -rl $PL -exp $EXP_DIR \ 182 | -it $NITERS -rc 100 -lm 2 -b $BASELINE -nc $NCLUSTERS -j $NJOBS -m train -dc 1.1 183 | 184 | python3 $CODE_DIR/combine_channels.py -nc $NCLUSTERS -ca $BASELINE -exp $EXP_DIR -it $NITERS \ 185 | -il $IL -rl $PL 186 | fi 187 | 188 | 189 | echo "Decoding with cipher model..." 
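
# Note (expository comments; decode.sh itself is not shown in this dump):
# in eval mode train_cipher.py reads the cluster-id sequences written above
# to $datadir/output.$NCLUSTERS.$BASELINE.carmel, hands decode.sh the
# combined tag LM ($EXP_DIR/lm/comb.2.fsa.noe) plus the combined channel
# model, and splits the -dc value into the two decoder weights (here "1.1",
# i.e. LM weight 1 and channel weight 1). The tagged result is then read
# back from the *.decoded file that decode.sh leaves next to the input.
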
190 | if [ $INPUT_FORMAT = "xml" ]||[ $INPUT_FORMAT = "txt" ]; then 191 | python3 $CODE_DIR/train_cipher.py -il $IL -rl comb -exp $EXP_DIR \ 192 | -o $OUTPUT -tk $datadir/input.raw -tkr $datadir/input.raw.roman \ 193 | -fm $INPUT_FORMAT \ 194 | -it $NITERS -rc 100 -lm 2 -b $BASELINE -nc $NCLUSTERS -j 1 -m eval -dc 1.1 -ct 195 | 196 | elif [ $INPUT_FORMAT = "bio" ]; then 197 | python3 src/code/train_cipher.py -il $IL -rl comb -exp $EXP_DIR \ 198 | -o $OUTPUT -tk $INPUT -tkr $datadir/input.raw.roman \ 199 | -fm $INPUT_FORMAT \ 200 | -it $NITERS -rc 100 -lm 2 -b $BASELINE -nc $NCLUSTERS -j 1 -m eval -dc 1.1 -ct 201 | fi -------------------------------------------------------------------------------- /src/marlin/basic/opt.cc: -------------------------------------------------------------------------------- 1 | #include "opt.h" 2 | #include "std.h" 3 | #include 4 | 5 | //////////////////////////////////////////////////////////////////////// 6 | // command-line arguments 7 | 8 | void GetOpt::AddOpt(const string &name, bool has_arg) { 9 | opts.push_back(pair(name, has_arg)); 10 | } 11 | 12 | void GetOpt::Parse(int argc, char *argv[]) { 13 | option *opt_list = new option[opts.size()+1]; 14 | for(int i = 0; i <= (int)opts.size(); i++) { 15 | option *o = &opt_list[i]; 16 | if(i < (int)opts.size()) { 17 | o->name = opts[i].first.c_str(); 18 | o->has_arg = opts[i].second; 19 | //printf("N %s\n", o->name); 20 | } 21 | else { 22 | o->name = NULL; 23 | o->has_arg = 0; 24 | } 25 | o->flag = NULL; 26 | o->val = 0; 27 | } 28 | 29 | int i; 30 | 31 | values.clear(); 32 | values.resize(opts.size()); 33 | while(true) { 34 | int status = getopt_long(argc, argv, "", opt_list, &i); 35 | if(status == -1) break; 36 | assert(status == 0); 37 | //debug("%d %s -> %s\n", i, opt_list[i].name, optarg); 38 | // put a 1 to signify that the argument exists 39 | values[i] = optarg ? optarg : "1"; 40 | } 41 | 42 | delete [] opt_list; 43 | } 44 | 45 | int GetOpt::Lookup(const string &name) const { 46 | for(int i = 0; i < (int)opts.size(); i++) { 47 | if(opts[i].first == name) return i; 48 | } 49 | return -1; 50 | } 51 | 52 | string GetOpt::Get(const string &name, const string &default_value) const { 53 | int i = Lookup(name); 54 | return i != -1 && !values[i].empty() ? values[i] : default_value; 55 | } 56 | 57 | string GetOpt::Get(const string &name) const { 58 | string x = Get(name, ""); 59 | if(x.empty()) { 60 | fprintf(stderr, "Missing required parameter `%s'.\n", name.c_str()); 61 | exit(1); 62 | } 63 | return x; 64 | } 65 | 66 | bool GetOpt::Exists(const string &name) const { 67 | return !Get(name, "").empty(); 68 | } 69 | 70 | int GetOpt::GetInt(const string &name) const { 71 | int x; 72 | int r = sscanf(Get(name).c_str(), "%d", &x); 73 | assert(r == 1); 74 | return x; 75 | } 76 | 77 | int GetOpt::GetInt(const string &name, int default_value) const { 78 | return Exists(name) ? GetInt(name) : default_value; 79 | } 80 | 81 | double GetOpt::GetDouble(const string &name) const { 82 | double x; 83 | int r = sscanf(Get(name).c_str(), "%lf", &x); 84 | assert(r == 1); 85 | return x; 86 | } 87 | 88 | double GetOpt::GetDouble(const string &name, double default_value) const { 89 | return Exists(name) ? 
GetDouble(name) : default_value;
90 | }
91 | 
92 | ////////////////////////////////////////////////////////////
93 | 
94 | void process_opt(int argc, char *argv[]) {
95 |   GetOpt opt;
96 | 
97 |   // set up GetOpt to parse
98 |   for(int i = 0; i < (int)bool_opts.size(); i++) {
99 |     opt.AddOpt(bool_opts[i].name, false);
100 |     opt.AddOpt("no" + bool_opts[i].name, false);
101 |   }
102 |   for(int i = 0; i < (int)int_opts.size(); i++)
103 |     opt.AddOpt(int_opts[i].name, true);
104 |   for(int i = 0; i < (int)double_opts.size(); i++)
105 |     opt.AddOpt(double_opts[i].name, true);
106 |   for(int i = 0; i < (int)string_opts.size(); i++)
107 |     opt.AddOpt(string_opts[i].name, true);
108 |   opt.AddOpt("help", false);
109 | 
110 |   // parse
111 |   opt.Parse(argc, argv);
112 | 
113 |   // print help if called for
114 |   if(opt.Exists("help")) {
115 |     printf("usage: %s\n", argv[0]);
116 |     for(int i = 0; i < (int)bool_opts.size(); i++) {
117 |       const OptInfo<bool> &o = bool_opts[i];
118 |       printf("  %c%-20s: %s", " *"[o.required], o.name.c_str(), o.msg.c_str());
119 |       if(!o.required) printf(" [%s]", *(o.var) ? "true" : "false");
120 |       printf("\n");
121 |     }
122 |     for(int i = 0; i < (int)int_opts.size(); i++) {
123 |       const OptInfo<int> &o = int_opts[i];
124 |       printf("  %c%-13s : %s", " *"[o.required], o.name.c_str(), o.msg.c_str());
125 |       if(!o.required) printf(" [%d]", *(o.var));
126 |       printf("\n");
127 |     }
128 |     for(int i = 0; i < (int)double_opts.size(); i++) {
129 |       const OptInfo<double> &o = double_opts[i];
130 |       printf("  %c%-13s : %s", " *"[o.required], o.name.c_str(), o.msg.c_str());
131 |       if(!o.required) printf(" [%f]", *(o.var));
132 |       printf("\n");
133 |     }
134 |     for(int i = 0; i < (int)string_opts.size(); i++) {
135 |       const OptInfo<string> &o = string_opts[i];
136 |       printf("  %c%-13s : %s", " *"[o.required], o.name.c_str(), o.msg.c_str());
137 |       if(!o.required) printf(" [%s]", (o.var)->c_str());
138 |       printf("\n");
139 |     }
140 |     exit(1);
141 |   }
142 | 
143 |   // retrieve data; store the variables
144 |   for(int i = 0; i < (int)bool_opts.size(); i++) {
145 |     const OptInfo<bool> &o = bool_opts[i];
146 |     bool yes = opt.Exists(o.name);
147 |     bool no = opt.Exists("no" + o.name);
148 |     assert(!o.required || (yes || no));
149 |     assert(!yes || !no);
150 |     if(yes) *(o.var) = true;
151 |     if(no) *(o.var) = false;
152 |   }
153 |   for(int i = 0; i < (int)int_opts.size(); i++) {
154 |     const OptInfo<int> &o = int_opts[i];
155 |     *(o.var) = o.required ? opt.GetInt(o.name) : opt.GetInt(o.name, *(o.var));
156 |   }
157 |   for(int i = 0; i < (int)double_opts.size(); i++) {
158 |     const OptInfo<double> &o = double_opts[i];
159 |     *(o.var) = o.required ? opt.GetDouble(o.name) : opt.GetDouble(o.name, *(o.var));
160 |   }
161 |   for(int i = 0; i < (int)string_opts.size(); i++) {
162 |     const OptInfo<string> &o = string_opts[i];
163 |     *(o.var) = o.required ? opt.Get(o.name) : opt.Get(o.name, *(o.var));
164 |   }
165 | }
166 | 
167 | void init_opt(int argc, char *argv[]) {
168 |   process_opt(argc, argv);
169 |   srand(rand_seed);
170 | }
171 | 
172 | void print_opts() {
173 |   forvec(_, const OptInfo<bool> &, o, bool_opts)
174 |     cerr << o.name << " = " << (*o.var ? "true" : "false") << endl;
175 |   forvec(_, const OptInfo<int> &, o, int_opts)
176 |     cerr << o.name << " = " << *o.var << endl;
177 |   forvec(_, const OptInfo<double> &, o, double_opts)
178 |     cerr << o.name << " = " << *o.var << endl;
179 |   forvec(_, const OptInfo<string> &, o, string_opts)
180 |     cerr << o.name << " = " << *o.var << endl;
181 | }
182 | 
183 | ////////////////////////////////////////////////////////////
184 | // Pre defined options.
185 | 
186 | // allow user to specify a comment always, so some arbitrary description
187 | // of this program execution can be embedded in the command-line
188 | --------------------------------------------------------------------------------
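
process_opt() funnels all four registries into a single GetOpt pass: every boolean also gets a "no"-prefixed negation, --help prints each flag with its default (required ones starred), and parsed values are written back through the stored pointers. GetOpt can also be driven directly; a hedged sketch (flag names illustrative, and assuming opt.h pulls in std.h as the other basic/ headers do):

    #include "opt.h"

    int main(int argc, char *argv[]) {
      GetOpt opt;
      opt.AddOpt("words", true);     // --words FILE, takes an argument
      opt.AddOpt("verbose", false);  // bare flag; stored internally as "1"
      opt.Parse(argc, argv);
      string words = opt.Get("words", "words.txt");  // fall back to a default
      bool verbose = opt.Exists("verbose");
      printf("words=%s verbose=%d\n", words.c_str(), (int)verbose);
      return 0;
    }
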
/src/marlin/basic/stl-utils.h:
--------------------------------------------------------------------------------
1 | #ifndef __STL_UTILS__
2 | #define __STL_UTILS__
3 | 
4 | #include "stl-basic.h"
5 | #include <stdarg.h>
6 | 
7 | #define contains(X, x) ((X).find(x) != (X).end())
8 | 
9 | inline void improve(DoubleInt &x, const DoubleInt &y) {
10 |   if(y.first > x.first) x = y; // Bigger is better.
11 | }
12 | 
13 | template<class Compare> inline void improve(DoubleInt &x, const DoubleInt &y, Compare compare) {
14 |   if(compare(y.first, x.first)) x = y;
15 | }
16 | 
17 | // Free up the memory in a vector or hash_map.
18 | template<class T> void destroy(T &obj) {
19 |   T empty_obj;
20 |   obj.swap(empty_obj);
21 | }
22 | 
23 | template<class T> int index_of(const vector<T> &vec, const T &x, int i0 = 0) {
24 |   for(int i = i0; i < len(vec); i++)
25 |     if(vec[i] == x) return i;
26 |   return -1;
27 | }
28 | 
29 | template<class T> int count_of(const vector<T> &vec, const T &x) {
30 |   int n = 0;
31 |   forvec(_, const T &, y, vec)
32 |     if(x == y) n++;
33 |   return n;
34 | }
35 | 
36 | // Get vec[i], but if i is out of range, expand the vector and fill
37 | // everything with x.
38 | template<class T> T &expand_get(vector<T> &vec, int i, const T &x) {
39 |   int n = len(vec);
40 |   if(i >= n) {
41 |     vec.resize(i+1);
42 |     for(int ii = n; ii <= i; ii++) vec[ii] = x;
43 |   }
44 |   return vec[i];
45 | }
46 | template<class T> T &expand_get(vector< vector<T> > &mat, int i, int j, const T &x) {
47 |   int n = len(mat);
48 |   if(i >= n) mat.resize(i+1);
49 |   return expand_get(mat[i], j, x);
50 | }
51 | template<class T> T &expand_get(vector< vector< vector<T> > > &mat, int i, int j, int k, const T &x) {
52 |   int n = len(mat);
53 |   if(i >= n) mat.resize(i+1);
54 |   return expand_get(mat[i], j, k, x);
55 | }
56 | 
57 | // Assuming this vector/matrix will not grow any more,
58 | // we can safely call compact to reduce the memory usage.
59 | // This is only effective after deletions.
60 | // This isn't necessary if we haven't actually touched
61 | // the memory past size (i.e., we didn't have a bigger
62 | // structure).
63 | template<class T> void vector_compact(vector<T> &vec) {
64 |   vector<T> new_vec(len(vec));
65 |   new_vec = vec;
66 |   vec.swap(new_vec);
67 | }
68 | template<class T> void matrix_compact(vector< vector<T> > &mat) {
69 |   vector< vector<T> > new_mat(len(mat));
70 |   foridx(i, len(mat)) vector_compact(mat[i]);
71 |   new_mat = mat;
72 |   mat.swap(new_mat);
73 | }
74 | 
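
expand_get() is what lets counting code index into vectors that grow on demand, and vector_compact() trims the leftover capacity afterwards. A small sketch (data made up):

    #include "stl-utils.h"

    int main() {
      IntVec counts;                  // starts empty
      expand_get(counts, 7, 0) += 1;  // grows to 8 zero-filled slots, then bumps slot 7
      assert_eq(len(counts), 8);
      vector_compact(counts);         // drop capacity left over from growth
      return 0;
    }
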
75 | // Append to a vector and return the value type.
76 | template<class T> inline T &push_back(vector<T> &vec, const T &x = T()) {
77 |   vec.push_back(x);
78 |   return vec[len(vec)-1];
79 | }
80 | 
81 | template<class T> inline void matrix_resize(vector< vector<T> > &mat, int nr, int nc) {
82 |   mat.resize(nr);
83 |   foridx(r, nr) mat[r].resize(nc);
84 | }
85 | 
86 | template<class T> inline void matrix_resize(vector< vector< vector<T> > > &mat, int n1, int n2, int n3) {
87 |   mat.resize(n1);
88 |   foridx(i, n1) {
89 |     mat[i].resize(n2);
90 |     foridx(j, n2)
91 |       mat[i][j].resize(n3);
92 |   }
93 | }
94 | 
95 | template<class T> inline vector< vector<T> > new_matrix(int nr, int nc, T v) {
96 |   vector< vector<T> > mat;
97 |   mat.resize(nr);
98 |   foridx(r, nr) {
99 |     mat[r].resize(nc);
100 |     foridx(c, nc)
101 |       mat[r][c] = v;
102 |   }
103 |   return mat;
104 | }
105 | 
106 | template<class T> inline void matrix_fill(vector< vector<T> > &mat, T v) {
107 |   foridx(i, len(mat)) vector_fill(mat[i], v);
108 | }
109 | 
110 | template<class T> inline void vector_fill(vector<T> &vec, T v) {
111 |   foridx(i, len(vec)) vec[i] = v;
112 | }
113 | 
114 | template<class T> inline T vector_sum(const vector<T> &vec) {
115 |   T sum = 0;
116 |   foridx(i, len(vec)) sum += vec[i];
117 |   return sum;
118 | }
119 | 
120 | // Returns the index of the minimum element in vec.
121 | template<class T> inline int vector_index_min(const vector<T> &vec) {
122 |   T min = vec[0];
123 |   int best_i = 0;
124 |   foridx(i, len(vec)) {
125 |     if(vec[i] < min) {
126 |       min = vec[i];
127 |       best_i = i;
128 |     }
129 |   }
130 |   return best_i;
131 | }
132 | 
133 | template<class T> inline int vector_min(const vector<T> &vec) {
134 |   return vec[vector_index_min(vec)];
135 | }
136 | 
137 | // Returns the index of the maximum element in vec.
138 | template<class T> inline int vector_index_max(const vector<T> &vec) {
139 |   T max = vec[0];
140 |   int best_i = 0;
141 |   foridx(i, len(vec)) {
142 |     if(vec[i] > max) {
143 |       max = vec[i];
144 |       best_i = i;
145 |     }
146 |   }
147 |   return best_i;
148 | }
149 | 
150 | template<class T> inline int vector_max(const vector<T> &vec) {
151 |   return vec[vector_index_max(vec)];
152 | }
153 | 
154 | // Returns the index of the maximum element in mat.
155 | template<class T> inline IntPair matrix_index_max(const vector< vector<T> > &mat) {
156 |   T max = mat[0][0];
157 |   IntPair best_ij = IntPair(0, 0);
158 |   foridx(i, len(mat)) {
159 |     foridx(j, len(mat[i])) {
160 |       if(mat[i][j] > max) {
161 |         max = mat[i][j];
162 |         best_ij = IntPair(i, j);
163 |       }
164 |     }
165 |   }
166 |   return best_ij;
167 | }
168 | 
169 | // Returns the sum of the elements in column c.
170 | template<class T> inline T matrix_col_sum(const vector< vector<T> > &mat, int c) {
171 |   T sum = 0;
172 |   foridx(r, len(mat)) sum += mat[r][c];
173 |   return sum;
174 | }
175 | 
176 | template<class A, class B> ostream &operator<<(ostream &out, const pair<A, B> &p) {
177 |   return out << p.first << ' ' << p.second;
178 | }
179 | 
180 | template<class T> ostream &operator<<(ostream &out, const vector<T> &vec) {
181 |   foridx(i, len(vec)) {
182 |     if(i > 0) out << ' ';
183 |     out << vec[i];
184 |   }
185 |   return out;
186 | }
187 | 
188 | template<class T> ostream &operator<<(ostream &out, const vector< vector<T> > &mat) {
189 |   foridx(r, len(mat)) out << mat[r] << endl;
190 |   return out;
191 | }
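
Together with new_matrix(), these operator<< overloads let whole vectors and matrices be streamed for quick debugging dumps. A small sketch (values made up):

    #include "stl-utils.h"

    int main() {
      DoubleVecVec m = new_matrix(2, 3, 0.0);  // 2x3 matrix, zero-filled
      m[0][2] = 1.5;
      cout << m;                               // one row per line, space-separated
      cout << vector_sum(m[0]) << endl;        // prints 1.5
      return 0;
    }
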
192 | 
193 | template<class T> vector<T> subvector(const vector<T> &vec, int i, int j = -1) {
194 |   int N = len(vec);
195 |   if(j < 0) j += N;
196 |   if(j < i) j = i;
197 | 
198 |   // Probably some fancy STL way to do this.
199 |   vector<T> subvec(j-i);
200 |   foridx(k, j-i) subvec[k] = vec[i+k];
201 |   return subvec;
202 | }
203 | 
204 | template<class T> vector<T> to_vector(T arr[], int n) {
205 |   vector<T> vec(n);
206 |   foridx(i, n) vec[i] = arr[i];
207 |   return vec;
208 | }
209 | 
210 | inline IntVec to_vector(int n, ...) {
211 |   va_list ap;
212 |   IntVec vec;
213 |   va_start(ap, n);
214 |   foridx(i, n) vec.push_back(va_arg(ap, int));
215 |   va_end(ap);
216 |   return vec;
217 | }
218 | 
219 | inline DoubleVec to_fvector(int n, ...) {
220 |   va_list ap;
221 |   DoubleVec vec;
222 |   va_start(ap, n);
223 |   foridx(i, n) vec.push_back(va_arg(ap, double));
224 |   va_end(ap);
225 |   return vec;
226 | }
227 | 
228 | template<class T> inline void operator+=(vector<T> &vec1, const vector<T> &vec2) {
229 |   foridx(i, len(vec1)) vec1[i] += vec2[i];
230 | }
231 | 
232 | #endif
233 | --------------------------------------------------------------------------------
/src/code/filter_lowfreq.py:
--------------------------------------------------------------------------------
1 | import os,sys
2 | import argparse
3 | from collections import Counter
4 | import re
5 | import pdb
6 | 
7 | filters_detailed = [
8 |     ("url" , [re.compile(r'^https?[:/]{1,3}(www\.)?[a-z]+(\.?[a-z]+\/?)+.*?$',re.UNICODE),
9 |               re.compile(r'^[wW]{3}\.[a-zA-Z]+(\.?[A-Z]+\/?)+.*?$',re.UNICODE),
10 |               re.compile(r'^([a-zA-Z][^@])[a-zA-Z.]+\.com$',re.UNICODE),
11 |     ]),
12 |     ('email', [re.compile(r'^[-a-zA-Z0-9_.]+\@([a-zA-Z0-9]+\.)+[a-zA-Z]+$',re.UNICODE) ]),
13 |     ("00:00" , [re.compile(r'[0-9](:[0-9]{2})+',re.UNICODE),
14 |                 re.compile(r'[0-9](:[0-9]{2})*[aApP][mM]$',re.UNICODE),
15 |                 re.compile(r'[0-9]hour$',re.UNICODE),] ),
16 |     ("00km", [re.compile(r'[0-9]km$',re.UNICODE)]),
17 |     ("00kg", [re.compile(r'[0-9]kg$',re.UNICODE)]),
18 |     ("haha", [re.compile(r'^haha$',re.UNICODE),
19 |               re.compile(r'^wkwk$',re.UNICODE)]),
20 | 
21 | ]
22 | 
23 | filters = [
24 |     ("snUser" , [re.compile(r'^[@]([0-9]*[-a-zA-Z._]+[0-9]*[!?]?)+$',re.UNICODE)] ),
25 |     ("hashTag" , [re.compile(r'^[#][-a-zA-Z._]{3,}[0-9]*[!?]?$',re.UNICODE),
26 |                   re.compile(r'^[#][0-9]+[-a-zA-Z._]{3,}[!?]?$',re.UNICODE),
27 |                   re.compile(r'^[#][0-9]+[-a-zA-Z._]{3,}[0-9]+[!?]?$',re.UNICODE), ]),
28 |     ("twoDigitNum" , [re.compile(r'^[0-9]{2}$',re.UNICODE)] ),
29 |     ("fourDigitNum" , [re.compile(r'^[0-9]{4}$',re.UNICODE)] ),
30 |     ("hasDigitAndAlpha" , [re.compile(r'[0-9].*[a-zA-Z]',re.UNICODE) ,
31 |                            re.compile(r'[a-zA-Z].*[0-9]',re.UNICODE) ]) ,
32 |     ("hasDigitAndDash" , [re.compile(r'[0-9]-[0-9]',re.UNICODE)] ),
33 |     ("hasDigitAndSlash" , [re.compile(r'[0-9]/[0-9]',re.UNICODE)] ),
34 |     ("hasDigitAndComma" , [re.compile(r'[0-9],[0-9]',re.UNICODE)] ),
35 |     ("hasDigitAndPeriod" , [re.compile(r'[0-9][.][0-9]',re.UNICODE)] ),
36 |     ("isHour" , [re.compile(r'[0-9]:[0-9]',re.UNICODE),
37 |                  re.compile(r'[0-9][aApP][mM]$',re.UNICODE)] ),
38 |     ("othernum" , [re.compile(r'^[0-9]+$',re.UNICODE)] ),
39 |     ("allCaps" , [re.compile(r'^[A-Z]+$',re.UNICODE)] ),
40 |     ("capPeriod" , [re.compile(r'^[A-Z][.]$',re.UNICODE)] ),
41 |     ("initCap" , [re.compile(r'^[A-Z][a-z]+$',re.UNICODE)] ),
42 |     ("lowercase" , [re.compile(r'^[a-z]$',re.UNICODE)] ),
43 | ]
44 | 
45 | is_prob_word = re.compile(r"^([a-zA-Z]+[-._',&]?)+$",re.UNICODE)
46 | 
47 | 
48 | def get_filter_tag(word,filter_list):
49 |     for tag,reg_list in filter_list:
50 |         for reg in reg_list:
51 |             if reg.search(word)!=None:
52 |                 return tag
53 |     return word
54 | 
55 | 
56 | 
57 | 
58 | 
59 | if __name__ == "__main__":
60 |     parser = argparse.ArgumentParser()
61 |     #parser.add_argument("--l","-l", type=str, help="Language -aaa-")
62 | 
parser.add_argument("--input","-i", type=str, help="Input file") 63 | parser.add_argument("--mode","-m", type=str, default="train", help="Mode [train,eval]") 64 | parser.add_argument("--vocab","-v", type=str, default=None, help="Filtered vocabulary") 65 | parser.add_argument("--thr","-t", type=int, default=3, help="Cut-off threshold") 66 | #parser.add_argument("--sent_len","-sl", type=int, default=190, help="Filter threshold for long sentences") 67 | parser.add_argument("--dom","-d", type=str, default=None, help="Test domain (valid only for outd exps)") 68 | parser.add_argument("--aggr","-aggr", action='store_true', help="Perform aggresive filtering (threshold oriented)") 69 | parser.add_argument("--ign_emp","-ig", action='store_true', help="Ignore empty lines/sentences.") 70 | parser.add_argument("--lower","-low", action='store_true', help="Lowercase all text") 71 | args = parser.parse_args() 72 | 73 | vocab = set() 74 | 75 | # load input 76 | data = open(args.input,'r').read().split('\n') 77 | data = [line for line in data] 78 | if data[-1] == '': data = data[:-1] 79 | 80 | ### aggressive filtering mode 81 | 82 | ## train mode 83 | # create vocabulary 84 | if args.mode == "train": 85 | vocab = Counter() 86 | for sent in data: 87 | if sent=='': continue 88 | if args.lower: sent = sent.lower() 89 | vocab.update(sent.split(' ')) 90 | filt = [] 91 | count = 0 92 | 93 | for x,y in vocab.most_common(): 94 | # if aggresive, evth below threshold is ignored 95 | if y<=args.thr and args.aggr: 96 | break 97 | if len(x)>40: 98 | continue 99 | # if not aggressive, evth be;pw thre that is not a word is ignored 100 | if y<=args.thr and is_prob_word.search(x)==None: 101 | continue 102 | 103 | # all possible urls, email and hours are ignored 104 | if get_filter_tag(x,filters_detailed)!=x: 105 | continue 106 | filt.append([x,y]) 107 | if count%100000 == 0: 108 | print('->',count) 109 | count += 1 110 | #filt = [[x,y] for x,y in vocab.most_common() if y>args.thr] 111 | dom_pref = '' if args.dom==None else '.'+args.dom 112 | vocab_fn = os.path.join(os.path.dirname(args.input),"vocab"+dom_pref) 113 | open(vocab_fn,'w').write('\n'.join(["%s\t%d" % (w,f) for w,f in filt]) + '\n') 114 | vocab = set([x for x,y in filt]) 115 | 116 | del filt 117 | 118 | # eval mode 119 | # load vocabulary 120 | else: 121 | if args.vocab==None: 122 | print("Error: Filtered vocabulary file not specified!\nCheck arguments list with -h option") 123 | sys.exit(1) 124 | elif not os.path.exists(args.vocab): 125 | print("Error: Filtered vocabulary file does not exist!") 126 | sys.exit(1) 127 | else: 128 | for line in open(args.vocab,'r'): 129 | line = line.strip('\n').strip(' ') 130 | if line=='': continue 131 | w,f = line.split('\t') 132 | vocab.add(w) 133 | # 134 | #END-IF-MODE 135 | 136 | outfile = open(args.input+".filt",'w') 137 | count = 0 138 | 139 | # filter data 140 | for sent in data: 141 | if sent=='' and not args.ign_emp: 142 | print('',file=outfile) 143 | continue 144 | 145 | new_sent = [] 146 | if args.lower: 147 | sent = sent.lower() 148 | sent_tok = sent.split(' ') 149 | #if args.ign_emp and len(sent_tok)>args.sent_len-1: 150 | # continue 151 | for word in sent_tok: 152 | if word in vocab: 153 | new_sent.append(word) 154 | else: 155 | tag = get_filter_tag(word,filters_detailed) 156 | if tag!=word: 157 | new_sent.append(tag) 158 | continue 159 | tag = get_filter_tag(word,filters) 160 | if tag==word: 161 | tag = 'unk' 162 | new_sent.append("<"+tag+">") 163 | #END-IF-VOCAB 164 | #END-FOR-W 165 | new_sent.append("#eos") 166 | 
print(' '.join(new_sent),file=outfile) 167 | 168 | if count % 100000 == 0: 169 | print("->",count) 170 | count+=1 171 | #END-FOR-SENT 172 | 173 | 174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /src/code/train_cipher.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | import argparse 3 | from multiprocessing import Pool 4 | import subprocess as sp 5 | import numpy as np 6 | from utils import * 7 | import pdb 8 | 9 | 10 | import warnings 11 | warnings.filterwarnings("ignore") 12 | 13 | PRIME_MOD = 19751 14 | 15 | 16 | def run_train_channel(conf): 17 | rl,order,il,c_alg,cl,it,_id,IS_ELISA,exp_dir = conf.split('.') 18 | print(" running conf: ",conf) 19 | print(" ",rl,order,il,c_alg,cl,it,_id,IS_ELISA,exp_dir) 20 | 21 | IS_ELISA = bool(int(IS_ELISA)) 22 | seed = (int(_id)+1) * PRIME_MOD * 100 23 | argums = ["bash","src/code/train_channel.sh", 24 | "-rl",rl,'-o',order, 25 | "-il",il,'-c',cl, 26 | "-it",it,'-id',_id, 27 | "-s",str(seed), 28 | "-exp",exp_dir 29 | ] 30 | 31 | if IS_ELISA: 32 | argums.extend(["-elisa","e"]) 33 | if c_alg!="": 34 | argums.extend(["-ca",c_alg]) 35 | 36 | pobj = sp.Popen(argums) 37 | while pobj.wait(): continue 38 | 39 | channel_name = "%s%s-%s.%s.%s.%s.%s" % (rl,order,il,c_alg,cl,it,_id) 40 | 41 | if not os.path.exists(exp_dir + "/logs/" + channel_name): 42 | return None 43 | 44 | lines = open(exp_dir + "/logs/" + channel_name,'r').read().strip('\n').split('\n') 45 | to_mine = "" 46 | for line in lines[-10:]: 47 | if line.startswith("Setting weights to model"): 48 | to_mine = line 49 | break 50 | idx = to_mine.rfind("^") 51 | print(to_mine,idx) 52 | score = float(to_mine[idx+1:].strip(' ')) 53 | 54 | return [int(_id),score] 55 | 56 | 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser() 60 | parser.add_argument("--tokens" ,"-tk" , type=str, default='', help="Input raw to tag --non-romanized, if applicable") 61 | parser.add_argument("--tokens_roman","-tkr", type=str, default='', help="Input raw to tag --romanized") 62 | parser.add_argument("--format" ,"-fm" , type=str, default='txt', help="Format of input tokens [txt,bio]") 63 | parser.add_argument("--bio_delim" ,"-bio_delim" , type=str, default=' ', help="Delimiter in BIO column format [' ', \n]") 64 | 65 | parser.add_argument("--output" ,"-o", type=str, default='', help="Output file") 66 | parser.add_argument("--exp_dir" ,"-exp", type=str, default='', help="Experiment folder") 67 | parser.add_argument("--il" ,"-il", type=str, default='en', help="Incident Language") 68 | parser.add_argument("--rl" ,"-rl", type=str, default=None, help="Related Language") 69 | parser.add_argument("--iter" ,"-it", type=int, default=10, help="N. iterations per cipher run") 70 | parser.add_argument("--run_per_ch" ,"-rc", type=int, default=100, help="N. 
runs per cipher conf") 71 | parser.add_argument("--lm_order" ,"-lm", type=int, default=2, help="LM order") 72 | parser.add_argument("--baseline" ,"-b", type=str, default="brown", help="clustering algorithm [brown,ah,{l,p}{k,s}{100,300}{mono,multi}]") 73 | parser.add_argument("--num_clusters","-nc", type=int, default=500, help="Number of clusters on cipher side") 74 | parser.add_argument("--njobs" ,"-j", type=int, default=4, help="Number of jobs") 75 | parser.add_argument("--mode" ,"-m", type=str, default="train", help="mode [train,eval]") 76 | parser.add_argument("--test_data" ,"-td", type=str, default="ud", help="which test data to evaluate [ud,elisa]") 77 | parser.add_argument("--dec_conf" ,"-dc", type=str, default="1.1", help="weights for decoder (LM,CM)") 78 | parser.add_argument("--comb_table","-ct", action='store_true', help="Used combined cipher table channel model to decode") 79 | 80 | args = parser.parse_args() 81 | 82 | IS_ELISA = False 83 | CLUST_ALG = args.baseline 84 | 85 | 86 | if args.mode == "train": 87 | rl_list = [] 88 | def_rfs = "en,de,fr,it,es,ja,ar,cs,ru,sw-hcs,hi" 89 | 90 | # No RL spec : default list 91 | if args.rl==None: 92 | rl_list = def_rfs.split(",") 93 | # Single RL spec : will run only one RL-IL pair 94 | elif ',' not in args.rl: 95 | rl_list = [args.rl] 96 | # Multiple RL spec: arg format "en.de.du.da", will run for RL-IL for all RL specified 97 | else: 98 | rl_list = args.rl.split(',') 99 | 100 | with Pool(args.njobs) as pool: 101 | for rl in rl_list: 102 | print() 103 | print("RL: ",rl) 104 | print("-"*60) 105 | conf_pref = "%s.%d.%s.%s.%d.%d" % (rl,args.lm_order,args.il,CLUST_ALG,args.num_clusters,args.iter) 106 | channel_name = "%s%d-%s.%s.%d.%d" % (rl,args.lm_order,args.il,CLUST_ALG,args.num_clusters,args.iter) 107 | # train channel 108 | if "elisa" in args.il or args.il in ["ta","tl"]: 109 | IS_ELISA = True 110 | 111 | confs = ["%s.%d.%d.%s" % (conf_pref,_id,IS_ELISA,args.exp_dir) for _id in range(args.run_per_ch) ] 112 | res = pool.map(run_train_channel,confs) 113 | res = [x for x in res if x!=None] 114 | idxs = [x for x,y in res] 115 | idx = np.array([y for x,y in res]).argmin() 116 | print("best model:",res[idx]) 117 | open("%s/logs/%s.scores" % (args.exp_dir,channel_name),'w').write('\n'.join(["%d %f" % (x,y) for x,y in res])) 118 | 119 | # clean directories 120 | to_rm = ["%s/logs/%s.%d" % (args.exp_dir,channel_name,_id) for _id in idxs if _id!=res[idx][0] ] 121 | if len(to_rm)>0: 122 | sp.run(["rm"] + to_rm) 123 | sp.run(["mv","%s/logs/%s.%d" % (args.exp_dir,channel_name,res[idx][0]),args.exp_dir + "/logs/"+channel_name]) 124 | to_rm = ["%s/models/%s.%d" % (args.exp_dir,channel_name,_id) for _id in idxs if _id!=res[idx][0] ] 125 | if len(to_rm)>0: 126 | sp.run(["rm"] + to_rm) 127 | sp.run(["mv","%s/models/%s.%d" % (args.exp_dir,channel_name,res[idx][0]),args.exp_dir+"/models/"+channel_name]) 128 | #END-FOR 129 | #END-WITH 130 | 131 | 132 | # eval & tag 133 | else: 134 | rl = args.rl 135 | lm_dir = "../../lms" 136 | if args.comb_table: 137 | rl = "comb" # placeholder for combination code 138 | lm_dir = args.exp_dir + "/lm" 139 | 140 | channel_name = "%s.%s.%d.%d.comb" % (args.il,args.baseline,args.num_clusters,args.iter) 141 | lm_file = "%s/%s.%d.fsa.noe" % (lm_dir,rl,args.lm_order) 142 | wlm,wcm = args.dec_conf.split(".") 143 | 144 | test_file = "%s/data/output.%d.%s.carmel" % (args.exp_dir,args.num_clusters,args.baseline) 145 | 146 | agms = ["bash","src/code/decode.sh","-lm",lm_file, 147 | "-ch",channel_name, 148 | "-i",test_file, 149 | 
"-wlm",wlm, 150 | "-wcm",wcm 151 | ] 152 | pobj = sp.Popen(agms) 153 | while pobj.wait(): continue 154 | 155 | outfile = open(args.output,'w') 156 | outfile_rom = open(args.output+".roman",'w') 157 | toks_file = open(args.tokens,'r') 158 | toks_rom_file = open(args.tokens_roman,'r') 159 | tags_file = open("%s.%s.%s.%s.decoded" % \ 160 | (test_file,channel_name,wlm,wcm), 'r' ) 161 | 162 | 163 | for tag_line in tags_file: 164 | tags = tag_line.split() 165 | tok_rom_line = toks_rom_file.readline().strip('\n') # raw, romanized text, always in txt format 166 | tok_roms = tok_rom_line.split() 167 | 168 | if args.format == 'txt': 169 | tok_line = toks_file.readline().strip('\n') 170 | ntags=[] 171 | for tk,tag in zip(tok_roms,tags): 172 | ntags.append( ground_tag(tk,tag) ) 173 | # 174 | pairs = zip(tok_line.split(),ntags) 175 | print(" ".join(["%s/%s"%(x,y) for x,y in pairs]), file=outfile) 176 | 177 | pairs = zip(tok_roms,ntags) 178 | print(" ".join(["%s/%s"%(x,y) for x,y in pairs]), file=outfile_rom) 179 | 180 | elif args.format == 'bio': 181 | idx = 0 182 | while(True): 183 | tok_line = toks_file.readline().strip('\n') 184 | if tok_line=='': 185 | print("",file=outfile) 186 | break 187 | tk = tok_roms[idx] 188 | tag = ground_tag(tk,tags[idx]) 189 | print("%s%s%s" % (tok_line,args.bio_delim,tag),file=outfile) 190 | idx += 1 191 | # 192 | # 193 | 194 | #END-FOR 195 | -------------------------------------------------------------------------------- /src/code/utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from sklearn.metrics import classification_report,\ 3 | accuracy_score, v_measure_score, \ 4 | precision_score, recall_score, f1_score 5 | from collections import defaultdict, Counter 6 | import subprocess as sp 7 | import unicodedata 8 | import pdb 9 | import pickle 10 | import sys 11 | sys.path.append("cpos") 12 | 13 | # UDv2 & UTv2 support 14 | upos2char = { 15 | "NOUN" :'N', 16 | "PROPN" :'O', 17 | "ADJ" :'A', 18 | "ADV" :'R', 19 | "ADP" :"I", 20 | "AUX" :"B", 21 | "CCONJ" :"C", 22 | "SCONJ" :"J", 23 | "DET" :"D", 24 | "INTJ" :'T', 25 | "NUM" :"M", 26 | "PART" :"F", 27 | "PRON" :"P", 28 | "PUNCT" :"E", 29 | "SYM" :"Y", 30 | "VERB" :"V", 31 | "X" :"X", 32 | } 33 | 34 | 35 | char2upos = {v:k for k,v in upos2char.items() } 36 | 37 | ut2char = { 38 | "NOUN" :'N', 39 | "PROPN" :'O', 40 | "ADJ" :'A', 41 | "ADV" :'R', 42 | "ADP" :"I", 43 | "AUX" :"B", 44 | "CONJ" :"C", 45 | "DET" :"D", 46 | "INTJ" :'T', 47 | "NUM" :"M", 48 | "PRT" :"F", 49 | "PRON" :"P", 50 | "." 
:"E", 51 | "SYM" :"Y", 52 | "VERB" :"V", 53 | "X" :"X", 54 | } 55 | 56 | char2ut = {v:k for k,v in ut2char.items() } 57 | 58 | 59 | MAX_LINES_DECODE=20000 60 | # MAX_LINES_DECODE=10 61 | 62 | mapper = { 63 | 'en' : ['en','fr','de','es','it','ar','ja','cs','ru','sw-hcs'], 64 | 'fr' : ['fr','de','es','ja','ar','it','en','cs','ru','sw-hcs'], 65 | 'fa' : ['fa','fr','de,''es','ja','cs','ar','it','en','ru','sw-hcs'], 66 | 'sw-hcs' : ['sw-hcs','de','fr','es','ja','ar', 'it','en','cs','ru'], 67 | #'tl-elisa' : ['id','fr','es','ja','it','en','cs','ru','sw-hcs'], 68 | #'tl-elisa' : ['fr','es','ja','it','en','cs','ru','sw-hcs'], 69 | 'si-elisa' : ['de','fr','es','ja','ar', 'it','en','cs','ru','sw-hcs'], 70 | 'rw-elisa' : ['de','fr','es','ja','ar', 'it','en','cs','ru','sw-hcs'] 71 | } 72 | 73 | default_rls = ['en','de','fr','es','it','ja','ar','cs','ru','sw-hcs'] 74 | 75 | 76 | def saveObject(obj, name='model'): 77 | with open(name + '.pickle', 'wb') as fd: 78 | pickle.dump(obj, fd, protocol=pickle.HIGHEST_PROTOCOL) 79 | 80 | 81 | def uploadObject(obj_name): 82 | # Load tagger 83 | with open(obj_name, 'rb') as fd: 84 | obj = pickle.load(fd) 85 | return obj 86 | 87 | 88 | def test_punct(token): 89 | for c in token: 90 | if unicodedata.category(c)[0] != 'P': 91 | return False 92 | return True 93 | 94 | 95 | def test_num(token): 96 | for c in token: 97 | if unicodedata.category(c)[0] != 'N': 98 | return False 99 | return True 100 | 101 | 102 | def ground_tag(tk,tag,pos_tagset="ud"): 103 | mapper = char2ut if pos_tagset=="ut" else char2upos 104 | if tk.isdigit() or test_num(tk): 105 | return mapper["M"] 106 | else: 107 | is_punct = test_punct(tk) 108 | # false negatives 109 | if is_punct: 110 | return mapper["E"] 111 | # false positives 112 | elif tag==mapper["E"] and not is_punct: 113 | return "X" 114 | # the rest 115 | return tag 116 | 117 | 118 | def evaluate_core(gold_fn,pred_fn): 119 | gold,pred = [],[] 120 | count = 1 121 | gls = [] 122 | for line in open(gold_fn,'r'): 123 | gold.extend(line.strip('\n').split(' ')) 124 | gls.append(line.strip('\n').split(' ')) 125 | if count>=MAX_LINES_DECODE: 126 | break 127 | count += 1 128 | count = 0 129 | for line in open(pred_fn,'r'): 130 | pline = line.strip('\n').split(' ') 131 | pred.extend(pline) 132 | if len(pline)!= len(gls[count]): 133 | print("->",count,len(gls[count]), len(pline)) 134 | print(gls[count]) 135 | print(pline) 136 | print("-"*50) 137 | pdb.set_trace() 138 | count += 1 139 | return gold,pred 140 | 141 | 142 | def evaluate(gold_fn,pred_fn,report=True): 143 | gold,pred = evaluate_core(gold_fn,pred_fn) 144 | 145 | acc = accuracy_score(gold,pred) 146 | if report: 147 | print("ACC: %.4f" % acc ) 148 | print("VM : %.4f" % v_measure_score(gold,pred)) 149 | print(classification_report(gold,pred,digits=4)) 150 | return acc 151 | 152 | def evaluate_all_metrics(gold_fn,pred_fn): 153 | gold,pred = evaluate_core(gold_fn,pred_fn) 154 | acc = accuracy_score(gold,pred) 155 | p = precision_score(gold,pred) 156 | r = recall_score(gold,pred) 157 | f1 = f1_score(gold,pred) 158 | support = Counter(gold) 159 | 160 | 161 | 162 | 163 | def eval_lexicon(lexicon_fn,pred_fn,words_fn,report=True): 164 | # read lexicon 165 | lexicon = defaultdict(set) 166 | for line in open(lexicon_fn,'r'): 167 | line= line.strip('\t') 168 | if line=='': continue 169 | w,pos,_ = line.split("\t") 170 | if pos=='PRT': pos = 'PART' 171 | lexicon[w].add(pos) 172 | 173 | # read pred file 174 | gold,pred = [],[] 175 | pred_vocab = defaultdict(set) 176 | predpos_lines = 
open(pred_fn,'r').read().strip('\n').split('\n') 177 | word_lines = open(words_fn,'r').read().strip('\n').split('\n') 178 | for wform_line,pred_line in zip(word_lines,predpos_lines): 179 | wforms = wform_line.lower().split(" ")[:-1] 180 | ptags = pred_line.split(" ") 181 | for w,pos in zip(wforms,ptags): 182 | if w not in lexicon: continue 183 | pred_vocab[w].add(pos) 184 | # 185 | 186 | #compare 187 | correct = 0.0 188 | for w,pred_pos_list in pred_vocab.items(): 189 | if len(pred_pos_list & lexicon[w])>0: 190 | correct += 1 191 | acc = correct / len(pred_vocab) 192 | if report: 193 | print("ACC: %.4f" % acc ) 194 | print("Inters. size: ",len(pred_vocab) ) 195 | return acc 196 | 197 | 198 | def get_ppl(channel,wlm,wcm): 199 | fn = 'logs/%s.%d.%d.dec' % (channel,wlm,wcm) 200 | 201 | lines = open(fn,'r').read().strip('\n').split('\n') 202 | 203 | idx = lines[-1].rfind("^") 204 | pl_sc = float(lines[-1][idx+1:].strip(' ')) 205 | 206 | tmp = lines[-1][:idx] 207 | idx = tmp.rfind("^") 208 | idx2 = tmp.rfind(" ") 209 | pt_sc = float(tmp[idx+1:idx2].strip(' ')) 210 | 211 | return pl_sc,pt_sc 212 | 213 | 214 | def decoder_acc(channel,conf,wlm,wcm): 215 | rl,order,il,c_alg,cl,IS_ELISA = conf.split('.') 216 | IS_ELISA = bool(int(IS_ELISA)) 217 | lm_file = "lms/%s.%s.fsa.noe" % (rl,order) 218 | acc = 0.0 219 | 220 | # cases sw si ta tl 221 | if IS_ELISA: 222 | test_file = "data/%s/test.elisa.%s.carmel" % (il,cl) if c_alg=="br" else \ 223 | "data/%s/test.elisa.%s.%s.carmel" % (il,cl,c_alg) 224 | test_wf_file = "data/%s/test.elisa.true.filt" % (il) 225 | lexicon_fn = "data/%s/lexicon.elisa" % (il) 226 | 227 | agms = ["sh","decode.sh","-lm",lm_file, 228 | "-ch",channel, 229 | "-i",test_file, 230 | "-wlm",str(wlm), 231 | "-wcm",str(wcm) 232 | ] 233 | pobj = sp.Popen(agms) 234 | while pobj.wait(): continue 235 | 236 | acc = eval_lexicon(lexicon_fn, '%s.%s.%d.%d.decoded' % (test_file,channel,wlm,wcm),test_wf_file,False) 237 | 238 | else: 239 | test_file = "data/%s/test.%s.carmel" % (il,cl) if c_alg=="br" else \ 240 | "data/%s/test.%s.%s.carmel" % (il,cl,c_alg) 241 | goldfn = "data/%s/test.upos" % (il) 242 | 243 | agms = ["sh","decode.sh","-lm",lm_file, 244 | "-ch",channel, 245 | "-i",test_file, 246 | "-wlm",str(wlm), 247 | "-wcm",str(wcm) 248 | ] 249 | pobj = sp.Popen(agms) 250 | while pobj.wait(): continue 251 | 252 | # if args.dec_ch: 253 | acc = evaluate(goldfn,'%s.%s.%d.%d.decoded' % (test_file,channel,wlm,wcm),False) 254 | 255 | return acc 256 | 257 | 258 | def post_process(tk_rom_fn, dec_fn,out_fn,pos_tagset="ud"): 259 | outfile = open(out_fn,'w') 260 | toks_rom_file = open(tk_rom_fn,'r') 261 | tags_file = open(dec_fn,'r') 262 | 263 | for tag_line in tags_file: 264 | tags = tag_line.split() 265 | tok_rom_line = toks_rom_file.readline().strip('\n') # raw, romanized text, always in txt format 266 | tok_roms = tok_rom_line.split() 267 | 268 | ntags=[] 269 | for tk,tag in zip(tok_roms,tags): 270 | ntags.append( ground_tag(tk,tag,pos_tagset) ) 271 | # 272 | # what if carmel could not decode input? fallback to all nouns 273 | if ntags==[]: 274 | ntags = ["NOUN"]*len(tok_roms) 275 | 276 | print(" ".join(ntags), file=outfile) 277 | 278 | 279 | #END-FOR -------------------------------------------------------------------------------- /src/marlin/basic/city.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2011 Google, Inc. 
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 | //
21 | // CityHash, by Geoff Pike and Jyrki Alakuijala
22 | //
23 | // This file provides CityHash64() and related functions.
24 | //
25 | // It's probably possible to create even faster hash functions by
26 | // writing a program that systematically explores some of the space of
27 | // possible hash functions, by using SIMD instructions, or by
28 | // compromising on hash quality.
29 | 
30 | #include "city.h"
31 | 
32 | #include <algorithm>
33 | #include <string.h>  // for memcpy and memset
34 | 
35 | using namespace std;
36 | 
37 | static uint64 UNALIGNED_LOAD64(const char *p) {
38 |   uint64 result;
39 |   memcpy(&result, p, sizeof(result));
40 |   return result;
41 | }
42 | 
43 | static uint32 UNALIGNED_LOAD32(const char *p) {
44 |   uint32 result;
45 |   memcpy(&result, p, sizeof(result));
46 |   return result;
47 | }
48 | 
49 | #if !defined(WORDS_BIGENDIAN)
50 | 
51 | #define uint32_in_expected_order(x) (x)
52 | #define uint64_in_expected_order(x) (x)
53 | 
54 | #else
55 | 
56 | #ifdef _MSC_VER
57 | #include <stdlib.h>
58 | #define bswap_32(x) _byteswap_ulong(x)
59 | #define bswap_64(x) _byteswap_uint64(x)
60 | 
61 | #elif defined(__APPLE__)
62 | // Mac OS X / Darwin features
63 | #include <libkern/OSByteOrder.h>
64 | #define bswap_32(x) OSSwapInt32(x)
65 | #define bswap_64(x) OSSwapInt64(x)
66 | 
67 | #else
68 | #include <byteswap.h>
69 | #endif
70 | 
71 | #define uint32_in_expected_order(x) (bswap_32(x))
72 | #define uint64_in_expected_order(x) (bswap_64(x))
73 | 
74 | #endif  // WORDS_BIGENDIAN
75 | 
76 | #if !defined(LIKELY)
77 | #if HAVE_BUILTIN_EXPECT
78 | #define LIKELY(x) (__builtin_expect(!!(x), 1))
79 | #else
80 | #define LIKELY(x) (x)
81 | #endif
82 | #endif
83 | 
84 | static uint64 Fetch64(const char *p) {
85 |   return uint64_in_expected_order(UNALIGNED_LOAD64(p));
86 | }
87 | 
88 | static uint32 Fetch32(const char *p) {
89 |   return uint32_in_expected_order(UNALIGNED_LOAD32(p));
90 | }
91 | 
92 | // Some primes between 2^63 and 2^64 for various uses.
93 | static const uint64 k0 = 0xc3a5c85c97cb3127ULL;
94 | static const uint64 k1 = 0xb492b66fbe98f273ULL;
95 | static const uint64 k2 = 0x9ae16a3b2f90404fULL;
96 | static const uint64 k3 = 0xc949d7c7509e6557ULL;
97 | 
98 | // Bitwise right rotate. Normally this will compile to a single
99 | // instruction, especially if the shift is a manifest constant.
100 | static uint64 Rotate(uint64 val, int shift) {
101 |   // Avoid shifting by 64: doing so yields an undefined result.
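    | // (Worked example: rotating 0x0000000000000001 right by 1 moves the low
    | // bit to the top: Rotate(1, 1) == (1 >> 1) | (1 << 63) == 0x8000000000000000.)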
102 |   return shift == 0 ? val : ((val >> shift) | (val << (64 - shift)));
103 | }
104 | 
105 | // Equivalent to Rotate(), but requires the second arg to be non-zero.
106 | // On x86-64, and probably others, it's possible for this to compile
107 | // to a single instruction if both args are already in registers.
108 | static uint64 RotateByAtLeast1(uint64 val, int shift) {
109 |   return (val >> shift) | (val << (64 - shift));
110 | }
111 | 
112 | static uint64 ShiftMix(uint64 val) {
113 |   return val ^ (val >> 47);
114 | }
115 | 
116 | static uint64 HashLen16(uint64 u, uint64 v) {
117 |   return Hash128to64(uint128(u, v));
118 | }
119 | 
120 | static uint64 HashLen0to16(const char *s, size_t len) {
121 |   if (len > 8) {
122 |     uint64 a = Fetch64(s);
123 |     uint64 b = Fetch64(s + len - 8);
124 |     return HashLen16(a, RotateByAtLeast1(b + len, len)) ^ b;
125 |   }
126 |   if (len >= 4) {
127 |     uint64 a = Fetch32(s);
128 |     return HashLen16(len + (a << 3), Fetch32(s + len - 4));
129 |   }
130 |   if (len > 0) {
131 |     uint8 a = s[0];
132 |     uint8 b = s[len >> 1];
133 |     uint8 c = s[len - 1];
134 |     uint32 y = static_cast<uint32>(a) + (static_cast<uint32>(b) << 8);
135 |     uint32 z = len + (static_cast<uint32>(c) << 2);
136 |     return ShiftMix(y * k2 ^ z * k3) * k2;
137 |   }
138 |   return k2;
139 | }
140 | 
141 | // This probably works well for 16-byte strings as well, but it may be overkill
142 | // in that case.
143 | static uint64 HashLen17to32(const char *s, size_t len) {
144 |   uint64 a = Fetch64(s) * k1;
145 |   uint64 b = Fetch64(s + 8);
146 |   uint64 c = Fetch64(s + len - 8) * k2;
147 |   uint64 d = Fetch64(s + len - 16) * k0;
148 |   return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d,
149 |                    a + Rotate(b ^ k3, 20) - c + len);
150 | }
151 | 
152 | // Return a 16-byte hash for 48 bytes.  Quick and dirty.
153 | // Callers do best to use "random-looking" values for a and b.
154 | static pair<uint64, uint64> WeakHashLen32WithSeeds(
155 |     uint64 w, uint64 x, uint64 y, uint64 z, uint64 a, uint64 b) {
156 |   a += w;
157 |   b = Rotate(b + a + z, 21);
158 |   uint64 c = a;
159 |   a += x;
160 |   a += y;
161 |   b += Rotate(a, 44);
162 |   return make_pair(a + z, b + c);
163 | }
164 | 
165 | // Return a 16-byte hash for s[0] ... s[31], a, and b.  Quick and dirty.
166 | static pair<uint64, uint64> WeakHashLen32WithSeeds(
167 |     const char* s, uint64 a, uint64 b) {
168 |   return WeakHashLen32WithSeeds(Fetch64(s),
169 |                                 Fetch64(s + 8),
170 |                                 Fetch64(s + 16),
171 |                                 Fetch64(s + 24),
172 |                                 a,
173 |                                 b);
174 | }
175 | 
176 | // Return an 8-byte hash for 33 to 64 bytes.
177 | static uint64 HashLen33to64(const char *s, size_t len) {
178 |   uint64 z = Fetch64(s + 24);
179 |   uint64 a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0;
180 |   uint64 b = Rotate(a + z, 52);
181 |   uint64 c = Rotate(a, 37);
182 |   a += Fetch64(s + 8);
183 |   c += Rotate(a, 7);
184 |   a += Fetch64(s + 16);
185 |   uint64 vf = a + z;
186 |   uint64 vs = b + Rotate(a, 31) + c;
187 |   a = Fetch64(s + 16) + Fetch64(s + len - 32);
188 |   z = Fetch64(s + len - 8);
189 |   b = Rotate(a + z, 52);
190 |   c = Rotate(a, 37);
191 |   a += Fetch64(s + len - 24);
192 |   c += Rotate(a, 7);
193 |   a += Fetch64(s + len - 16);
194 |   uint64 wf = a + z;
195 |   uint64 ws = b + Rotate(a, 31) + c;
196 |   uint64 r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0);
197 |   return ShiftMix(r * k0 + vs) * k2;
198 | }
199 | 
200 | uint64 CityHash64(const char *s, size_t len) {
201 |   if (len <= 32) {
202 |     if (len <= 16) {
203 |       return HashLen0to16(s, len);
204 |     } else {
205 |       return HashLen17to32(s, len);
206 |     }
207 |   } else if (len <= 64) {
208 |     return HashLen33to64(s, len);
209 |   }
210 | 
211 |   // For strings over 64 bytes we hash the end first, and then as we
212 |   // loop we keep 56 bytes of state: v, w, x, y, and z.
213 |   uint64 x = Fetch64(s + len - 40);
214 |   uint64 y = Fetch64(s + len - 16) + Fetch64(s + len - 56);
215 |   uint64 z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24));
216 |   pair<uint64, uint64> v = WeakHashLen32WithSeeds(s + len - 64, len, z);
217 |   pair<uint64, uint64> w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x);
218 |   x = x * k1 + Fetch64(s);
219 | 
220 |   // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks.
221 |   len = (len - 1) & ~static_cast<size_t>(63);
222 |   do {
223 |     x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
224 |     y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
225 |     x ^= w.second;
226 |     y += v.first + Fetch64(s + 40);
227 |     z = Rotate(z + w.first, 33) * k1;
228 |     v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
229 |     w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
230 |     std::swap(z, x);
231 |     s += 64;
232 |     len -= 64;
233 |   } while (len != 0);
234 |   return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z,
235 |                    HashLen16(v.second, w.second) + x);
236 | }
237 | 
238 | uint64 CityHash64WithSeed(const char *s, size_t len, uint64 seed) {
239 |   return CityHash64WithSeeds(s, len, k2, seed);
240 | }
241 | 
242 | uint64 CityHash64WithSeeds(const char *s, size_t len,
243 |                            uint64 seed0, uint64 seed1) {
244 |   return HashLen16(CityHash64(s, len) - seed0, seed1);
245 | }
246 | 
247 | // A subroutine for CityHash128().  Returns a decent 128-bit hash for strings
248 | // of any length representable in signed long.  Based on City and Murmur.
249 | static uint128 CityMurmur(const char *s, size_t len, uint128 seed) {
250 |   uint64 a = Uint128Low64(seed);
251 |   uint64 b = Uint128High64(seed);
252 |   uint64 c = 0;
253 |   uint64 d = 0;
254 |   signed long l = len - 16;
255 |   if (l <= 0) {  // len <= 16
256 |     a = ShiftMix(a * k1) * k1;
257 |     c = b * k1 + HashLen0to16(s, len);
258 |     d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c));
259 |   } else {  // len > 16
260 |     c = HashLen16(Fetch64(s + len - 8) + k1, a);
261 |     d = HashLen16(b + len, c + Fetch64(s + len - 16));
262 |     a += d;
263 |     do {
264 |       a ^= ShiftMix(Fetch64(s) * k1) * k1;
265 |       a *= k1;
266 |       b ^= a;
267 |       c ^= ShiftMix(Fetch64(s + 8) * k1) * k1;
268 |       c *= k1;
269 |       d ^= c;
270 |       s += 16;
271 |       l -= 16;
272 |     } while (l > 0);
273 |   }
274 |   a = HashLen16(a, c);
275 |   b = HashLen16(d, b);
276 |   return uint128(a ^ b, HashLen16(b, a));
277 | }
278 | 
279 | uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed) {
280 |   if (len < 128) {
281 |     return CityMurmur(s, len, seed);
282 |   }
283 | 
284 |   // We expect len >= 128 to be the common case.  Keep 56 bytes of state:
285 |   // v, w, x, y, and z.
286 |   pair<uint64, uint64> v, w;
287 |   uint64 x = Uint128Low64(seed);
288 |   uint64 y = Uint128High64(seed);
289 |   uint64 z = len * k1;
290 |   v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s);
291 |   v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8);
292 |   w.first = Rotate(y + z, 35) * k1 + x;
293 |   w.second = Rotate(x + Fetch64(s + 88), 53) * k1;
294 | 
295 |   // This is the same inner loop as CityHash64(), manually unrolled.
296 |   do {
297 |     x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
298 |     y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
299 |     x ^= w.second;
300 |     y += v.first + Fetch64(s + 40);
301 |     z = Rotate(z + w.first, 33) * k1;
302 |     v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
303 |     w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
304 |     std::swap(z, x);
305 |     s += 64;
306 |     x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
307 |     y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
308 |     x ^= w.second;
309 |     y += v.first + Fetch64(s + 40);
310 |     z = Rotate(z + w.first, 33) * k1;
311 |     v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
312 |     w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
313 |     std::swap(z, x);
314 |     s += 64;
315 |     len -= 128;
316 |   } while (LIKELY(len >= 128));
317 |   x += Rotate(v.first + z, 49) * k0;
318 |   z += Rotate(w.first, 37) * k0;
319 |   // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s.
320 |   for (size_t tail_done = 0; tail_done < len; ) {
321 |     tail_done += 32;
322 |     y = Rotate(x + y, 42) * k0 + v.second;
323 |     w.first += Fetch64(s + len - tail_done + 16);
324 |     x = x * k0 + w.first;
325 |     z += w.second + Fetch64(s + len - tail_done);
326 |     w.second += v.first;
327 |     v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second);
328 |   }
329 |   // At this point our 56 bytes of state should contain more than
330 |   // enough information for a strong 128-bit hash.  We use two
331 |   // different 56-byte-to-8-byte hashes to get a 16-byte final result.
332 |   x = HashLen16(x, v.first);
333 |   y = HashLen16(y + z, w.first);
334 |   return uint128(HashLen16(x + v.second, w.second) + y,
335 |                  HashLen16(x + w.second, y + v.second));
336 | }
337 | 
338 | uint128 CityHash128(const char *s, size_t len) {
339 |   if (len >= 16) {
340 |     return CityHash128WithSeed(s + 16,
341 |                                len - 16,
342 |                                uint128(Fetch64(s) ^ k3,
343 |                                        Fetch64(s + 8)));
344 |   } else if (len >= 8) {
345 |     return CityHash128WithSeed(NULL,
346 |                                0,
347 |                                uint128(Fetch64(s) ^ (len * k0),
348 |                                        Fetch64(s + len - 8) ^ k1));
349 |   } else {
350 |     return CityHash128WithSeed(s, len, uint128(k0, k1));
351 |   }
352 | }
353 | 
354 | #ifdef __SSE4_2__
355 | #include <citycrc.h>
356 | #include <nmmintrin.h>
357 | 
358 | // Requires len >= 240.
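    | // (Each pass of the main loop below consumes six 40-byte CHUNKs, i.e. 240
    | // bytes of input per iteration, which is why callers must supply len >= 240.)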
359 | static void CityHashCrc256Long(const char *s, size_t len, 360 | uint32 seed, uint64 *result) { 361 | uint64 a = Fetch64(s + 56) + k0; 362 | uint64 b = Fetch64(s + 96) + k0; 363 | uint64 c = result[0] = HashLen16(b, len); 364 | uint64 d = result[1] = Fetch64(s + 120) * k0 + len; 365 | uint64 e = Fetch64(s + 184) + seed; 366 | uint64 f = seed; 367 | uint64 g = 0; 368 | uint64 h = 0; 369 | uint64 i = 0; 370 | uint64 j = 0; 371 | uint64 t = c + d; 372 | 373 | // 240 bytes of input per iter. 374 | size_t iters = len / 240; 375 | len -= iters * 240; 376 | do { 377 | #define CHUNK(multiplier, z) \ 378 | { \ 379 | uint64 old_a = a; \ 380 | a = Rotate(b, 41 ^ z) * multiplier + Fetch64(s); \ 381 | b = Rotate(c, 27 ^ z) * multiplier + Fetch64(s + 8); \ 382 | c = Rotate(d, 41 ^ z) * multiplier + Fetch64(s + 16); \ 383 | d = Rotate(e, 33 ^ z) * multiplier + Fetch64(s + 24); \ 384 | e = Rotate(t, 25 ^ z) * multiplier + Fetch64(s + 32); \ 385 | t = old_a; \ 386 | } \ 387 | f = _mm_crc32_u64(f, a); \ 388 | g = _mm_crc32_u64(g, b); \ 389 | h = _mm_crc32_u64(h, c); \ 390 | i = _mm_crc32_u64(i, d); \ 391 | j = _mm_crc32_u64(j, e); \ 392 | s += 40 393 | 394 | CHUNK(1, 1); CHUNK(k0, 0); 395 | CHUNK(1, 1); CHUNK(k0, 0); 396 | CHUNK(1, 1); CHUNK(k0, 0); 397 | } while (--iters > 0); 398 | 399 | while (len >= 40) { 400 | CHUNK(k0, 0); 401 | len -= 40; 402 | } 403 | if (len > 0) { 404 | s = s + len - 40; 405 | CHUNK(k0, 0); 406 | } 407 | j += i << 32; 408 | a = HashLen16(a, j); 409 | h += g << 32; 410 | b += h; 411 | c = HashLen16(c, f) + i; 412 | d = HashLen16(d, e + result[0]); 413 | j += e; 414 | i += HashLen16(h, t); 415 | e = HashLen16(a, d) + j; 416 | f = HashLen16(b, c) + a; 417 | g = HashLen16(j, i) + c; 418 | result[0] = e + f + g + h; 419 | a = ShiftMix((a + g) * k0) * k0 + b; 420 | result[1] += a + result[0]; 421 | a = ShiftMix(a * k0) * k0 + c; 422 | result[2] = a + result[1]; 423 | a = ShiftMix((a + e) * k0) * k0; 424 | result[3] = a + result[2]; 425 | } 426 | 427 | // Requires len < 240. 
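    | // (The short case zero-pads the input into a fixed 240-byte buffer and
    | // reuses the long routine; seeding with ~len keeps inputs of different
    | // lengths from hashing alike after padding.)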
428 | static void CityHashCrc256Short(const char *s, size_t len, uint64 *result) {
429 |   char buf[240];
430 |   memcpy(buf, s, len);
431 |   memset(buf + len, 0, 240 - len);
432 |   CityHashCrc256Long(buf, 240, ~static_cast<uint32>(len), result);
433 | }
434 | 
435 | void CityHashCrc256(const char *s, size_t len, uint64 *result) {
436 |   if (LIKELY(len >= 240)) {
437 |     CityHashCrc256Long(s, len, 0, result);
438 |   } else {
439 |     CityHashCrc256Short(s, len, result);
440 |   }
441 | }
442 | 
443 | uint128 CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed) {
444 |   if (len <= 900) {
445 |     return CityHash128WithSeed(s, len, seed);
446 |   } else {
447 |     uint64 result[4];
448 |     CityHashCrc256(s, len, result);
449 |     uint64 u = Uint128High64(seed) + result[0];
450 |     uint64 v = Uint128Low64(seed) + result[1];
451 |     return uint128(HashLen16(u, v + result[2]),
452 |                    HashLen16(Rotate(v, 32), u * k0 + result[3]));
453 |   }
454 | }
455 | 
456 | uint128 CityHashCrc128(const char *s, size_t len) {
457 |   if (len <= 900) {
458 |     return CityHash128(s, len);
459 |   } else {
460 |     uint64 result[4];
461 |     CityHashCrc256(s, len, result);
462 |     return uint128(result[2], result[3]);
463 |   }
464 | }
465 | 
466 | #endif
467 | 
-------------------------------------------------------------------------------- /src/marlin/marlin_cluster.cc: --------------------------------------------------------------------------------
1 | #include "basic/std.h"
2 | #include "basic/stl-basic.h"
3 | #include "basic/stl-utils.h"
4 | #include "basic/opt.h"
5 | #include <map>        // NOTE: bracketed include lost in extraction; inferred from the map use below
6 | #include <algorithm>  // NOTE: bracketed include lost in extraction; inferred from the lower_bound use below
7 | 
8 | vector< OptInfo<bool> > bool_opts;
9 | vector< OptInfo<int> > int_opts;
10 | vector< OptInfo<double> > double_opts;
11 | vector< OptInfo<string> > string_opts;
12 | 
13 | opt_define_string_req(unigram_file_, "words", "Text file with words.");
14 | opt_define_string_req(bigram_file_, "bigrams", "Text file with bigrams.");
15 | opt_define_string_req(output_, "output", "Output.");
16 | 
17 | opt_define_double(alpha_, "alpha", 0.0, "Character Model Strength [0,1].");
18 | opt_define_int(num_classes_, "c", 1000, "Number of clusters.");
19 | opt_define_int(num_steps_, "steps", -1, "Number of steps or -1 to run until convergence.");
20 | opt_define_int(rand_seed, "rand", time(NULL)*getpid(), "Number to call srand with.");
21 | opt_define_int(verbose_, "v", 2, "Verbosity level.");
22 | opt_define_int(sanity_checks_, "sanity", 0, "Sanity Check level. Makes things slow!");
Makes things slow!"); 23 | opt_define_int(cache_size_, "cache-size", 100000, "N up to which to cache n * log(n)."); 24 | 25 | const real REAL_NEG_INF = -1e99; 26 | 27 | struct UpdateResult { 28 | int current_w; 29 | int current_c; 30 | int best_c; 31 | real best_delta; 32 | DoubleVec deltas; 33 | 34 | UpdateResult() { 35 | deltas.resize(num_classes_); 36 | } 37 | 38 | }; 39 | 40 | struct Entry{ 41 | int item; 42 | int count; 43 | }; 44 | 45 | bool compare_entry(const Entry& entry, const Entry& other) { 46 | return entry.item < other.item; 47 | } 48 | 49 | typedef vector EntryVec; 50 | typedef vector EntryVecVec; 51 | 52 | void addToSparseEntryVec(EntryVec& vector, int klass, int count) { 53 | Entry entry; 54 | entry.item = klass; 55 | entry.count = count; 56 | 57 | EntryVec::iterator pos = lower_bound (vector.begin(), vector.end(), entry, compare_entry); 58 | 59 | if (pos != vector.end() && pos->item == klass) { 60 | pos->count += count; 61 | } else { 62 | vector.insert(pos, entry); 63 | } 64 | 65 | } 66 | 67 | struct BiEntry{ 68 | int item; 69 | int item2; 70 | int count; 71 | }; 72 | 73 | typedef vector BiEntryVec; 74 | typedef vector BiEntryVecVec; 75 | 76 | // Observed counts 77 | EntryVecVec left_context_; 78 | EntryVecVec right_context_; 79 | 80 | IntVec word_counts_; 81 | 82 | IntVec class_class_counts_; 83 | IntVec class_counts_; 84 | IntVecVec class_char_char_counts_; 85 | IntVecVec class_char_counts_; 86 | 87 | IntVec word_assignment_; 88 | 89 | int num_words_; 90 | int num_chars_; 91 | 92 | // Whether a character bigram model is used. Depends on alpha. 93 | bool character_; 94 | 95 | StringVec word_forms_; 96 | IntVecVec word_chars_; 97 | 98 | // These data structures are needed to make update() efficient. 99 | 100 | // Number of times a class has been seen left (right) of the current word. 101 | EntryVec left_class_counts_; 102 | EntryVec right_class_counts_; 103 | 104 | EntryVec left_tri_class_counts_; 105 | EntryVec center_tri_class_counts_; 106 | EntryVec right_tri_class_counts_; 107 | 108 | // Current word 109 | int current_word_; 110 | 111 | // Character bigrams and unigrams occuring in current word. 112 | EntryVec char_bigram_counts_; 113 | EntryVec char_unigram_counts_; 114 | 115 | // Class bigrams affected by current class change. 116 | IntVec current_class_class_counts_; 117 | 118 | // Number of times a word cooccurs with itself. 
119 | int word_word_count_; 120 | int word_word_word_count_; 121 | EntryVec word_word_x_counts_; 122 | EntryVec word_x_word_counts_; 123 | EntryVec x_word_word_counts_; 124 | 125 | // Constant term of the Likelihood that depends on word form counts 126 | real word_ll_term_; 127 | 128 | DoubleVec nlogn_cache_; 129 | 130 | real nlogn(int n) { 131 | if (n<0) 132 | cout << "nlogn : " << n << endl; 133 | assert (n >= 0); 134 | 135 | if (n == 0) { 136 | return 0; 137 | } 138 | 139 | if (n - 1 < cache_size_) { 140 | return nlogn_cache_[n - 1]; 141 | } 142 | 143 | return n * log(n); 144 | } 145 | 146 | int classIndex(int i, int j) { 147 | assert (i >= 0 && i < num_classes_); 148 | assert (j >= 0 && j < num_classes_); 149 | int index = i * num_classes_ + j; 150 | assert (index >= 0 && index < num_classes_ * num_classes_); 151 | return index; 152 | } 153 | 154 | void addCharBigram(int klass, int c, int k, int count) { 155 | int index = c * num_chars_ + k; 156 | class_char_char_counts_[klass][index] += count; 157 | } 158 | 159 | void incrementChars(int word, int klass, int factor) { 160 | int last = 0; 161 | IntVec& chars = word_chars_[word]; 162 | 163 | forvec(_, int, c, chars) { 164 | addCharBigram(klass, last, c, factor * word_counts_[word]); 165 | 166 | class_char_counts_[klass][c] += factor * word_counts_[word]; 167 | assert (class_char_counts_[klass][c] >= 0); 168 | 169 | last = c; 170 | } 171 | 172 | addCharBigram(klass, last, 0, factor * word_counts_[word]); 173 | 174 | class_char_counts_[klass][0] += factor * word_counts_[word]; 175 | assert (class_char_counts_[klass][last] >= 0); 176 | } 177 | 178 | void assignToZero() { 179 | 180 | fill(word_assignment_.begin(), word_assignment_.end(), 0); 181 | 182 | for (int word = 0; word < num_words_; word++) { 183 | class_counts_[0] += word_counts_[word]; 184 | if (character_) { 185 | if (word > 0) { 186 | incrementChars(word, 0, +1); 187 | } 188 | } 189 | } 190 | class_class_counts_[0] = class_counts_[0]; 191 | } 192 | 193 | void addTagTagCount(int klass, int cclass, int count) { 194 | class_class_counts_[classIndex(klass, cclass)] += count; 195 | } 196 | 197 | void incrementBigrams(int word, int klass, int factor) { 198 | forvec (_, Entry, entry, left_context_[word]) { 199 | int cword = entry.item; 200 | if (cword != word) { 201 | int cclass = word_assignment_[cword]; 202 | addTagTagCount(cclass, klass, factor * entry.count); 203 | } else { 204 | addTagTagCount(klass, klass, factor * entry.count); 205 | } 206 | } 207 | 208 | forvec (_, Entry, entry, right_context_[word]) { 209 | int cword = entry.item; 210 | if (cword != word) { 211 | int cclass = word_assignment_[cword]; 212 | addTagTagCount(klass, cclass, factor * entry.count); 213 | } 214 | } 215 | } 216 | 217 | void increment(int word, int klass, int factor) { 218 | 219 | assert (word > 0); 220 | assert (klass > 0 || factor < 0); 221 | 222 | class_counts_[klass] += factor * word_counts_[word]; 223 | 224 | incrementBigrams(word, klass, factor); 225 | 226 | if (character_) { 227 | incrementChars(word, klass, factor); 228 | } 229 | 230 | word_assignment_[word] = klass; 231 | } 232 | 233 | void randomInit() { 234 | assert (num_words_ > num_classes_); 235 | 236 | assignToZero(); 237 | 238 | int half_num_classes = num_classes_ / 2; 239 | 240 | for (int word = 1; word < half_num_classes; word++) { 241 | increment(word, 0, -1); 242 | increment(word, word, +1); 243 | } 244 | 245 | for (int word = half_num_classes; word < num_words_; word++) { 246 | int klass = half_num_classes + 
mrand((int)ceil(num_classes_ / 2.));
247 |         increment(word, 0, -1);
248 |         increment(word, klass, +1);
249 |     }
250 | }
251 | 
252 | void strtok(StringVec& vec, string string, char delim) {
253 |     uint last = 0;
254 |     for (uint i=0; i < string.length(); i++) {
255 |         char c = string[i];
256 |         if (c == ' ' || (delim > 0 && delim == c )) {
257 |             int length = i - last;
258 |             if (length > 0)
259 |                 vec.push_back(string.substr(last, length));
260 |             last = i + 1;
261 |         }
262 |     }
263 | 
264 |     if (last < string.length()) {
265 |         vec.push_back(string.substr(last, string.length() - last));
266 |     }
267 | }
268 | 
269 | void readWordForms() {
270 |     ifstream in(unigram_file_.c_str());
271 |     string buf;
272 |     while(getline(in, buf)) {
273 |         StringVec tokens;
274 |         strtok(tokens, buf, (char) 0);
275 |         const string& word_form = tokens[0];
276 |         word_forms_.push_back(word_form);
277 |     }
278 |     num_words_ = word_forms_.size();
279 | }
280 | 
281 | void readBigrams() {
282 |     ifstream in(bigram_file_.c_str());
283 |     string buf;
284 |     int word = 0;
285 | 
286 |     while(getline(in, buf)) {
287 |         EntryVec& cwords = right_context_[word];
288 | 
289 |         StringVec line;
290 |         strtok(line, buf, (char) 0);
291 | 
292 |         forvec (_, string, pair_string, line) {
293 |             Entry entry;
294 | 
295 |             StringVec tokens;
296 |             strtok(tokens, pair_string, ':');
297 | 
298 |             const string& word_string = tokens[0];
299 |             const string& count_string = tokens[1];
300 | 
301 |             int cword = atoi(word_string.c_str());
302 |             int count = atoi(count_string.c_str());
303 | 
304 |             // Add to right contexts
305 |             entry.item = cword;
306 |             entry.count = count;
307 |             cwords.push_back(entry);
308 | 
309 |             // Add to left contexts
310 |             entry.item = word;
311 |             left_context_[cword].push_back(entry);
312 | 
313 |             word_counts_[word] += count;
314 |         }
315 | 
316 |         word++;
317 |     }
318 | 
319 | 
320 |     // Check consistency
321 |     for (word = 0; word < num_words_; word++) {
322 |         int count = word_counts_[word];
323 | 
324 |         int left_count = 0;
325 |         forvec (_, const Entry&, entry, left_context_[word]) {
326 |             left_count += entry.count;
327 |         }
328 | 
329 |         int right_count = 0;
330 |         forvec (_, const Entry&, entry, right_context_[word]) {
331 |             right_count += entry.count;
332 |         }
333 | 
334 |         assert (count == left_count && count == right_count);
335 |     }
336 | 
337 | }
338 | 
339 | void readData() {
340 |     readWordForms();
341 | 
342 |     if (alpha_ > 1e-5) {
343 |         character_ = true;
344 |     } else {
345 |         character_ = false;
346 |     }
347 | 
348 |     // Fill nlogn cache.
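    | // (The clustering objective below is a sum of n*log(n) terms over integer
    | // counts; nlogn(n) returns nlogn_cache_[n-1], so entry i caches
    | // (i+1)*log(i+1) for the first cache_size_ values.)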
349 |     nlogn_cache_.resize(cache_size_);
350 |     for (int i = 0; i < cache_size_; i++) {
    |         nlogn_cache_[i] = (i + 1) * log(i + 1);
    |     }
    | 
    |     // NOTE: the original statements between the cache-filling loop and the
    |     // CIMap typedef were lost in extraction; the initialization below is
    |     // reconstructed from how the rest of the program uses these structures.
    |     left_context_.resize(num_words_);
    |     right_context_.resize(num_words_);
    |     word_counts_.resize(num_words_);
    | 
    |     readBigrams();
    | 
    |     class_counts_.resize(num_classes_);
    |     class_class_counts_.resize(num_classes_ * num_classes_);
    |     current_class_class_counts_.resize(num_classes_ * num_classes_);
    |     word_assignment_.resize(num_words_);
    | 
    |     if (character_) {
371 |         typedef map<char, int> CIMap;
372 | 
373 |         CIMap table;
374 |         table['^'] = 0;
375 | 
376 |         word_chars_.resize(num_words_);
377 | 
378 |         for (int w = 1; w < num_words_; w++) {
379 |             const string& form = word_forms_[w];
380 | 
381 |             IntVec& chars = word_chars_[w];
382 |             chars.resize(form.length());
383 | 
384 |             for (uint i = 0; i < form.length(); i++) {
385 |                 char c = form[i];
386 |                 if (c=='^') c='"';
387 | 
388 |                 assert (c != '^');
389 | 
390 |                 int k = table[c];
391 | 
392 |                 if (k == 0) {
393 |                     k = table.size();
394 |                     table[c] = k;
395 |                 }
396 | 
397 |                 chars[i] = k;
398 |             }
399 | 
400 |         }
401 | 
402 |         num_chars_ = table.size();
403 | 
404 |         matrix_resize(class_char_char_counts_, num_classes_, num_chars_* num_chars_);
405 |         matrix_resize(class_char_counts_, num_classes_, num_chars_);
406 | 
407 |     }
408 | 
409 |     word_ll_term_ = 0;
410 |     for (int w = 1; w < num_words_; w++) {
411 |         word_ll_term_ += nlogn(word_counts_[w]);
412 |     }
413 | }
414 | 
415 | real calcCharLikelihood(int klass) {
416 |     real ll = 0;
417 | 
418 |     // Bigram Counts
419 |     for (int c = 0; c < num_chars_; c++) {
420 |         for (int k = 0; k < num_chars_; k++) {
421 |             int index = c * num_chars_ + k;
422 |             ll += nlogn(class_char_char_counts_[klass][index]);
423 |         }
424 |     }
425 | 
426 |     // Unigram Counts
427 |     for (int c = 0; c < num_chars_; c++) {
428 |         ll -= nlogn(class_char_counts_[klass][c]);
429 |     }
430 | 
431 |     return ll;
432 | }
433 | 
434 | real calcLikelihood() {
435 |     real ll = 0;
436 | 
437 |     // Bigram Context Counts.
438 |     real ll_b = 0;
439 | 
440 |     for (int c = 0; c < num_classes_; c++) {
441 |         for (int k = 0; k < num_classes_; k++) {
442 |             ll_b += nlogn(class_class_counts_[classIndex(c, k)]);
443 |         }
444 |     }
445 | 
446 |     for (int c = 0; c < num_classes_; c++) {
447 |         ll_b -= nlogn(class_counts_[c]);
448 |     }
449 | 
450 |     ll += ll_b;
451 | 
452 |     // Unigram Word Emission Counts.
453 |     real ll_w = word_ll_term_;
454 |     for (int c = 0; c < num_classes_; c++) {
455 |         ll_w -= nlogn(class_counts_[c]);
456 |     }
457 |     ll += (1.0 - alpha_) * ll_w;
458 | 
459 |     // Character Model Counts.
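    | // (The pieces are interpolated as ll = ll_b + (1 - alpha) * ll_w + alpha * ll_c,
    | // so the --alpha "Character Model Strength" option trades word emissions
    | // against the character bigram model.)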
460 | 461 | real ll_c = 0; 462 | if (character_) { 463 | for (int c = 0; c < num_classes_; c++) { 464 | ll_c += calcCharLikelihood(c); 465 | } 466 | } 467 | ll += alpha_ * ll_c; 468 | 469 | return ll; 470 | } 471 | 472 | void setCharCounts(int w) { 473 | char_bigram_counts_.clear(); 474 | char_unigram_counts_.clear(); 475 | 476 | const IntVec& chars = word_chars_[w]; 477 | 478 | int last = 0; 479 | forvec (_, int, c, chars) { 480 | addToSparseEntryVec(char_bigram_counts_, last * num_chars_ + c, 1); 481 | addToSparseEntryVec(char_unigram_counts_, c, 1); 482 | last = c; 483 | } 484 | 485 | addToSparseEntryVec(char_bigram_counts_, last * num_chars_ + 0, 1); 486 | addToSparseEntryVec(char_unigram_counts_, 0, 1); 487 | } 488 | 489 | void setContextClassCount(int word) { 490 | left_class_counts_.clear(); 491 | right_class_counts_.clear(); 492 | word_word_count_ = 0; 493 | 494 | forvec (_, Entry, entry, left_context_[word]) { 495 | int cword = entry.item; 496 | if (cword != word) { 497 | int cclass = word_assignment_[cword]; 498 | 499 | addToSparseEntryVec(left_class_counts_, cclass, entry.count); 500 | } else { 501 | word_word_count_ = entry.count; 502 | } 503 | } 504 | 505 | forvec (_, Entry, entry, right_context_[word]) { 506 | int cword = entry.item; 507 | if (cword != word) { 508 | int cclass = word_assignment_[cword]; 509 | addToSparseEntryVec(right_class_counts_, cclass, entry.count); 510 | } 511 | } 512 | } 513 | 514 | void setCurrentWord(int w) { 515 | current_word_ = w; 516 | 517 | setContextClassCount(w); 518 | 519 | if (character_) 520 | setCharCounts(w); 521 | } 522 | 523 | real calcCharDelta(int klass) { 524 | int wcount = word_counts_[current_word_]; 525 | real delta = 0; 526 | 527 | forvec (_, const Entry&, entry, char_bigram_counts_) { 528 | int old_count = class_char_char_counts_[klass][entry.item]; 529 | int new_count = old_count + entry.count * wcount; 530 | delta += nlogn(new_count) - nlogn(old_count); 531 | } 532 | 533 | forvec (_, const Entry&, entry, char_unigram_counts_) { 534 | int old_count = class_char_counts_[klass][entry.item]; 535 | int new_count = old_count + entry.count * wcount; 536 | delta -= nlogn(new_count) - nlogn(old_count); 537 | } 538 | 539 | return delta; 540 | } 541 | 542 | real calcLocalDelta(int c, int k, int count) { 543 | assert (k < num_classes_); 544 | assert (c < num_classes_); 545 | assert (count >= 0); 546 | int index = classIndex(c, k); 547 | int oldcount = class_class_counts_[index] + current_class_class_counts_[index]; 548 | int newcount = oldcount + count; 549 | current_class_class_counts_[index] += count; 550 | return nlogn(newcount) - nlogn(oldcount); 551 | } 552 | 553 | void setZero(int c, int k) { 554 | assert (k < num_classes_); 555 | assert (c < num_classes_); 556 | int index = classIndex(c, k); 557 | current_class_class_counts_[index] = 0; 558 | } 559 | 560 | real calcBigramDelta(int klass) { 561 | real delta = 0; 562 | 563 | forvec (_, Entry, entry, left_class_counts_) { 564 | delta += calcLocalDelta(entry.item, klass, entry.count); 565 | } 566 | 567 | forvec (_, Entry, entry, right_class_counts_) { 568 | delta += calcLocalDelta(klass, entry.item, entry.count); 569 | } 570 | 571 | delta += calcLocalDelta(klass, klass, word_word_count_); 572 | 573 | forvec (_, Entry, entry, left_class_counts_) { 574 | setZero(entry.item, klass); 575 | } 576 | 577 | forvec (_, Entry, entry, right_class_counts_) { 578 | setZero(klass, entry.item); 579 | } 580 | 581 | setZero(klass, klass); 582 | 583 | return delta; 584 | } 585 | 586 | real calcDelta(int 
klass) { 587 | int wcount = word_counts_[current_word_]; 588 | real delta = 0; 589 | 590 | real bigram_delta = calcBigramDelta(klass); 591 | 592 | real delta_b = 0; 593 | delta_b += bigram_delta; 594 | delta_b -= (nlogn(class_counts_[klass] + wcount ) - nlogn(class_counts_[klass])); 595 | delta += delta_b; 596 | 597 | // Word Emission 598 | delta -= (1.0 - alpha_) * (nlogn(class_counts_[klass] + wcount ) - nlogn(class_counts_[klass])); 599 | 600 | // Character Bigram Emission 601 | if (character_) { 602 | delta += alpha_ * calcCharDelta(klass); 603 | } 604 | 605 | return delta; 606 | } 607 | 608 | 609 | void update(UpdateResult &r) { 610 | setCurrentWord(r.current_w); 611 | 612 | for (int c = 1; c < num_classes_; c++) { 613 | real delta = calcDelta(c); 614 | r.deltas[c] = delta; 615 | 616 | if (delta > r.best_delta) { 617 | r.best_delta = delta; 618 | r.best_c = c; 619 | } 620 | } 621 | } 622 | 623 | int update() { 624 | int swaps = 0; 625 | 626 | real current_ll = calcLikelihood(); 627 | 628 | UpdateResult r; 629 | 630 | for (int w = 1; w < num_words_; w++) { 631 | r.current_w = w; 632 | r.current_c = word_assignment_[w]; 633 | fill(r.deltas.begin(), r.deltas.end(), 0.0); 634 | r.best_delta = REAL_NEG_INF; 635 | r.best_c = -1; 636 | 637 | increment(w, r.current_c, -1); 638 | update(r); 639 | 640 | current_ll = current_ll - r.deltas[r.current_c] + r.deltas[r.best_c]; 641 | increment(w, r.best_c, +1); 642 | 643 | if (r.current_c != r.best_c) { 644 | swaps += 1; 645 | } 646 | 647 | if (sanity_checks_ > 0) { 648 | real actual_ll = calcLikelihood(); 649 | real delta = fabs((actual_ll - current_ll) / current_ll); 650 | if (delta > 1e-5) { 651 | cerr << "Sanity check failed: " << delta << " " << actual_ll << " " << current_ll << endl; 652 | assert (false); 653 | } 654 | } 655 | 656 | if (verbose_ > 1 && w % (num_words_ / 4) == 0) { 657 | fprintf(stderr, "W:%6d / %6d LL: %g Swaps: %5d\n", w, num_words_, current_ll, swaps); 658 | } 659 | } 660 | 661 | if (verbose_ > 0) { 662 | fprintf(stderr, "W:%6d / %6d LL: %g Swaps: %5d\n", num_words_, num_words_, current_ll, swaps); 663 | } 664 | 665 | return swaps; 666 | } 667 | 668 | void writeAssignment() { 669 | ofstream os; 670 | os.open(output_); 671 | for (int w = 0; w < num_words_; w++) { 672 | os << word_forms_[w] << ' ' << word_assignment_[w] << endl; 673 | } 674 | os.close(); 675 | } 676 | 677 | int main(int argc, char** argv) { 678 | clock_t t1; 679 | t1=clock(); 680 | init_opt(argc, argv); 681 | 682 | readData(); 683 | randomInit(); 684 | 685 | if (verbose_ > 0) { 686 | cerr << "W: Number of words processed / total" << endl 687 | << "LL: Current log-likelihood" << endl 688 | << "Swaps: Number of words that changed their class" << endl 689 | << endl; 690 | } 691 | 692 | for (int step = 0; num_steps_ < 0 || step < num_steps_; step++) { 693 | if (verbose_ > 0) 694 | cerr << "iter: " << step << endl; 695 | 696 | int swaps = update(); 697 | 698 | if (swaps == 0) { 699 | break; 700 | } 701 | 702 | } 703 | 704 | writeAssignment(); 705 | 706 | float secs = ((float)clock()-(float)t1) / CLOCKS_PER_SEC; 707 | cerr << "Time: " << secs << endl; 708 | 709 | return 0; 710 | } 711 | 712 | --------------------------------------------------------------------------------