[0-9]+)" (?P[-.0-9e^]+)\)\)')
20 |
21 |
if __name__ == "__main__":
    # Combine the per-related-language models for one incident language into
    # a single row-normalised probability table, write it out, and copy the
    # result under each related language's model name.
    #
    # NOTE(review): several string literals below are empty (the two token
    # checks and the two header records written to the output). They look like
    # markup-style tokens lost in extraction; they are reproduced unchanged
    # here -- confirm against the original repository.
    parser = argparse.ArgumentParser()
    # --il / --rl are required: without them the script used to crash later
    # (slicing a non-string default / calling .split on None).
    parser.add_argument("--il", "-il", type=str, required=True, help="Incident language")
    parser.add_argument("--rl", "-rl", type=str, required=True, help="Related Languages")
    parser.add_argument("--num_clusters", "-nc", type=int, default=500, help="Number of clusters")
    parser.add_argument("--iters", "-it", type=int, default=500, help="Number of iterations")
    parser.add_argument("--ca", "-ca", type=str, default="br", help="Clutering algorithm [brown,anchor,]")
    parser.add_argument("--exp_dir", "-exp", type=str, default='', help="Experiment folder")
    args = parser.parse_args()

    rls = args.rl.split(",")
    il = args.il
    if il[:2] == 'tl':
        il = 'tl'

    # Fixed table shape used throughout the script (rows indexed by label id,
    # columns by cluster id).
    n_tags = 17
    n_clusters = 500

    p_c_t = np.zeros([n_tags, n_clusters])
    t2id = LabelDictionary()
    for rl in rls:
        if il == rl:
            continue
        model = "%s/models/%s2-%s.%s.%d.%d" % (
            args.exp_dir, rl, il, args.ca, args.num_clusters, args.iters)

        with open(model, 'r') as model_file:
            for line in model_file:
                line = line.strip('\n')
                if line == '' or line == '0':
                    continue
                match = regex.match(line)
                if match is None:
                    # Lines that do not look like an emission record are ignored.
                    continue
                t = match.group("T")
                c = int(match.group("C"))
                ps = match.group("P")
                if t == "" or t == "":
                    continue
                if ps[0] != "e":
                    p = float(ps)
                else:
                    # Values starting with 'e' store an exponent after a
                    # two-character prefix; convert back to a probability.
                    p = np.exp(float(ps[2:]))
                tid = t2id.add(t)
                p_c_t[tid, c] += p

    # Normalise each row to sum to one. A row that received no mass is left
    # as zeros instead of being turned into NaN by a 0/0 division.
    for t in range(n_tags):
        row_total = p_c_t[t, :].sum()
        if row_total != 0:
            p_c_t[t, :] /= row_total

    # Write the combined model.
    outfile_fn = "%s/models/%s.%s.%d.500.comb" % (args.exp_dir, il, args.ca, args.num_clusters)
    with open(outfile_fn, 'w') as outfile:
        print("0", file=outfile)
        print('(0 (0 "" "" 1))', file=outfile)
        print('(0 (0 "" "" 1))', file=outfile)
        for t in range(n_tags):
            tag = t2id.get_label_name(t)
            for c in range(n_clusters):
                prob = str(p_c_t[t, c])
                print('(0 (0 "%s" "%d" %s))' % (tag, c, prob), file=outfile)
                if p_c_t[t, c] == 0:
                    # Report zero-probability entries on stdout.
                    print(il, tag, c)

    # The output file is closed at this point, so the copies see full content.
    # sp.call waits for each copy to finish before the script exits.
    for rl in rls:
        model_name = "%s/models/%s2-%s.%s.%d.500.comb" % (
            args.exp_dir, rl, il, args.ca, args.num_clusters)
        sp.call(["cp", outfile_fn, model_name])
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
--------------------------------------------------------------------------------
/src/code/tag_text.py:
--------------------------------------------------------------------------------
1 | from label_dictionary import LabelDictionary
2 | from collections import defaultdict
3 | from utils import *
4 | import os,sys
5 | import argparse
6 | import pdb
7 | import numpy as np
8 |
9 |
# NOTE(review): both markers are empty strings as seen here, so the quoted
# start/end tokens written to the carmel files are empty too. They look like
# markup-style tokens lost in extraction -- confirm before relying on them.
START=""
END=""


if __name__ == "__main__":
    # Map every word of a tokenised text to its cluster id and write three
    # files next to the input: the plain cluster sequence (.ctag), the quoted
    # carmel form (.carmel) and a random subsample of it (.carmel.10k).
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", "-i", type=str, help="Cluster dict")
    parser.add_argument("--baseline", "-b", type=str, default="brown", help="Clustering model used")
    parser.add_argument("--mode", "-m", type=str, help="train / eval")
    parser.add_argument("--mapper", "-v", type=str, help="label dict")
    parser.add_argument("--clt_vocab", "-c", type=str, help="cluster vocab")
    parser.add_argument("--nclusters", "-nc", type=int, default=50, help="number of clusters")
    parser.add_argument("--output_pref", "-op", type=str, default="train", help="output filename prefix")
    parser.add_argument("--subs", "-subs", type=int, default=10000, help="subsample size for carmel")

    args = parser.parse_args()

    # Fixed seed so the subsample drawn at the end is reproducible.
    np.random.seed(42)
    w2cid = {}

    if args.mode == 'train':
        # Build the word -> cluster-id mapping from the clustering output and
        # store it (plus a normalised "word<TAB>id" listing) beside the vocab.
        cl2cid = LabelDictionary()
        mapper_fn = os.path.join(os.path.dirname(args.clt_vocab), 'clt.mapper')

        with open(args.clt_vocab + ".norm", 'w') as output_file, \
                open(args.clt_vocab, 'r') as vocab_file:
            for line in vocab_file:
                line = line.strip('\n')
                if line == '':
                    continue
                # Column layout depends on the clustering tool. An unrecognised
                # baseline leaves both fields empty (original behaviour).
                w, c = '', ''
                if args.baseline == 'brown':
                    c, w, _ = line.split('\t')
                elif args.baseline == 'clark':
                    w, c, _ = line.split(' ')
                elif args.baseline[0] in "lp":
                    w, c = line.split('\t')
                elif args.baseline == "marlin":
                    w, c = line.split(' ')

                cid = cl2cid.add(c)
                w2cid[w] = str(cid)
                print("%s\t%d" % (w, cid), file=output_file)
        saveObject(w2cid, mapper_fn)

    else:
        # Any other mode needs a previously saved mapping.
        if args.mapper is None:
            print("Error: LabelDictionary object not specified!\nCheck arguments list with -h option")
            sys.exit(1)
        if not os.path.exists(args.mapper):
            print("Error: LabelDictionary object does not exist!")
            sys.exit(1)
        w2cid = uploadObject(args.mapper)

    out_dir = os.path.dirname(args.input)
    out_stem = "%s.%d.%s" % (args.output_pref, args.nclusters, args.baseline)
    lines = []

    with open(os.path.join(out_dir, out_stem + ".ctag"), 'w') as outfile, \
            open(os.path.join(out_dir, out_stem + ".carmel"), 'w') as outfile_carmel, \
            open(os.path.join(out_dir, out_stem + ".carmel.10k"), 'w') as outfile_carmel_10k, \
            open(args.input, 'r') as infile:
        for line in infile:
            line = line.strip('\n')
            if line == '':
                continue
            clts = []
            for w in line.split(' '):
                if w == '#eos':
                    continue
                # Words missing from the mapping fall back to the entry stored
                # under the key below (looked up only when actually needed).
                if w not in w2cid:
                    clts.append(w2cid[""])
                else:
                    clts.append(w2cid[w])
            print(" ".join(clts), file=outfile)

            # Carmel form: every symbol quoted, wrapped in START/END, and
            # preceded by an empty line.
            clts = [START] + clts + [END]
            txt = " ".join(['"%s"' % x for x in clts])
            lines.append(txt)
            print("", file=outfile_carmel)
            print(txt, file=outfile_carmel)

        # Random subsample of at most args.subs sentences.
        idxs = np.arange(len(lines))
        np.random.shuffle(idxs)
        for idx in idxs[:args.subs]:
            print("", file=outfile_carmel_10k)
            print(lines[idx], file=outfile_carmel_10k)
--------------------------------------------------------------------------------
/src/marlin/basic/opt.h:
--------------------------------------------------------------------------------
1 | #ifndef __OPT_H__
2 | #define __OPT_H__
3 |
4 | #include
5 | #include
6 | #include
7 |
8 | using namespace std;
9 |
10 | // First thing to call in main().
11 | void init_opt(int argc, char *argv[]);
12 |
13 | ////////////////////////////////////////////////////////////////////////
14 | // command-line arguments
15 |
// Command-line option table. Only declarations are visible in this header;
// the member functions are defined elsewhere, so the notes below describe the
// signatures rather than the exact parsing rules.
class GetOpt {
public:
  GetOpt() { }

  // Registers an option called `name`; `has_arg` states whether it takes a value.
  void AddOpt(const string &name, bool has_arg);
  // Processes the program's argument vector.
  void Parse(int argc, char *argv[]);
  // Looks an option up by name and returns an int
  // (presumably its index, or a sentinel when absent -- TODO confirm).
  int Lookup(const string &name) const;

  // Typed accessors. Overloads with a second parameter take a default value.
  bool Exists(const string &name) const;
  string Get(const string &name, const string &default_value) const;
  string Get(const string &name) const;
  int GetInt(const string &name) const;
  int GetInt(const string &name, int default_value) const;
  double GetDouble(const string &name) const;
  double GetDouble(const string &name, double default_value) const;

private:
  // NOTE(review): the template arguments were missing in this copy of the
  // header and have been reconstructed from the AddOpt()/Get() signatures
  // (name + has_arg flag, string values) -- confirm against the implementation.
  vector< pair<string, bool> > opts;
  vector<string> values;
};
36 |
// Describes one registered option whose value has type T: its name, where the
// parsed value is stored, a help message, and whether it is mandatory.
template <class T> struct OptInfo {
  OptInfo(const string &name, T *var, const string &msg, bool required)
    : name(name), var(var), msg(msg), required(required) { }

  string name;
  T *var; // location of the variable that stores this value
  string msg;
  bool required;
};

// Per-type registries filled by the opt_define_*_wrap helpers. The element
// types follow the pointer type each helper passes to OptInfo.
extern vector< OptInfo<bool> > bool_opts;
extern vector< OptInfo<int> > int_opts;
extern vector< OptInfo<double> > double_opts;
extern vector< OptInfo<string> > string_opts;
51 |
52 | ////////////////////////////////////////////////////////////
53 |
54 | // two versions: in one, option is required
55 | #define opt_define_bool_req(var, name, msg) \
56 | bool var = opt_define_bool_wrap(name, &var, false, msg, true)
57 | #define opt_define_bool(var, name, val, msg) \
58 | bool var = opt_define_bool_wrap(name, &var, val, msg, false)
59 | #define opt_define_int_req(var, name, msg) \
60 | int var = opt_define_int_wrap(name, &var, 0, msg, true)
61 | #define opt_define_int(var, name, val, msg) \
62 | int var = opt_define_int_wrap(name, &var, val, msg, false)
63 | #define opt_define_double_req(var, name, msg) \
64 | double var = opt_define_double_wrap(name, &var, 0.0, msg, true)
65 | #define opt_define_double(var, name, val, msg) \
66 | double var = opt_define_double_wrap(name, &var, val, msg, false)
67 | #define opt_define_string_req(var, name, msg) \
68 | string var = opt_define_string_wrap(name, &var, "", msg, true)
69 | #define opt_define_string(var, name, val, msg) \
70 | string var = opt_define_string_wrap(name, &var, val, msg, false)
71 |
72 | inline bool opt_define_bool_wrap(const string &name, bool *var, bool val, const string &msg, bool required) {
73 | bool_opts.push_back(OptInfo(name, var, msg, required));
74 | return val;
75 | }
76 |
77 | inline int opt_define_int_wrap(const string &name, int *var, int val, const string &msg, bool required) {
78 | //printf("HELLO %s\n", name.c_str());
79 | int_opts.push_back(OptInfo(name, var, msg, required));
80 | //printf("N %d\n", (int)int_opts.size());
81 | return val;
82 | }
83 | inline double opt_define_double_wrap(const string &name, double *var, double val, const string &msg, bool required) {
84 | double_opts.push_back(OptInfo(name, var, msg, required));
85 | return val;
86 | }
87 | inline string opt_define_string_wrap(const string &name, string *var, const string &val, const string &msg, bool required) {
88 | string_opts.push_back(OptInfo(name, var, msg, required));
89 | return val;
90 | }
91 |
92 | ////////////////////////////////////////////////////////////
93 |
94 | void print_opts();
95 |
96 | extern int rand_seed;
97 |
98 | #endif
99 |
--------------------------------------------------------------------------------
/src/marlin/basic/stl-basic.h:
--------------------------------------------------------------------------------
1 | #ifndef __STL_BASIC_H__
2 | #define __STL_BASIC_H__
3 |
4 | #include "std.h"
5 | #include "city.h"
6 |
7 | ////////////////////////////////////////////////////////////
8 |
typedef double real;
//typedef float real;

// NOTE(review): the template arguments of the typedefs below were missing in
// this copy of the header and have been reconstructed from the alias names
// (e.g. IntDouble -> pair<int, double>) -- confirm against the original file.
typedef pair<int, int> IntPair;
typedef pair<int, double> IntDouble;
typedef pair<double, int> DoubleInt;
typedef pair<double, double> DoublePair;
typedef vector<IntPair> IntPairVec;
typedef vector<DoubleInt> DoubleIntVec;
typedef vector<bool> BoolVec;
typedef vector<int> IntVec;
typedef vector<string> StringVec;
typedef vector<IntVec> IntMat;
typedef vector<IntVec> IntVecVec;
typedef vector<IntVecVec> IntVecVecVec;
typedef vector<IntVecVecVec> IntVecVecVecVec;
typedef vector<double> DoubleVec;
typedef vector<DoubleVec> DoubleVecVec;
typedef vector<DoubleVecVec> DoubleVecVecVec;
typedef vector<DoubleVecVecVec> DoubleVecVecVecVec;
typedef vector<IntDouble> IntDoubleVec;
typedef vector<IntDoubleVec> IntDoubleVecVec;
typedef vector<IntDoubleVecVec> IntDoubleVecVecVec;
typedef vector<IntDoubleVecVecVec> IntDoubleVecVecVecVec;

typedef IntVec ivector;
typedef DoubleVec fvector;
typedef DoubleVecVec fmatrix;
37 |
38 | ////////////////////////////////////////////////////////////
39 |
40 | struct vector_eq {
41 | bool operator()(const IntVec &v1, const IntVec &v2) const {
42 | return v1 == v2;
43 | }
44 | };
45 | struct vector_hf {
46 | size_t operator()(const IntVec &v) const {
47 | return CityHash64(reinterpret_cast(&v[0]), sizeof(int) * v.size());
48 | #if 0
49 | int h = 0;
50 | foridx(i, len(v))
51 | h = (h<<4)^(h>>28)^v[i];
52 | return h;
53 | #endif
54 | }
55 | };
56 |
57 | struct pair_eq {
58 | bool operator()(const IntPair &p1, const IntPair &p2) const {
59 | return p1 == p2;
60 | }
61 | };
62 | struct pair_hf {
63 | size_t operator()(const IntPair &p) const {
64 | return (p.first<<4)^(p.first>>28) ^ p.second;
65 | }
66 | };
67 |
68 | struct str_eq {
69 | bool operator()(const char *s1, const char *s2) const {
70 | return strcmp(s1, s2) == 0;
71 | }
72 | };
73 | struct str_hf {
74 | size_t operator()(const char *s) const {
75 | return CityHash64(s, strlen(s));
76 | }
77 | };
78 |
79 | struct string_eq {
80 | bool operator()(const string &s1, const string &s2) const {
81 | return s1 == s2;
82 | }
83 | };
84 | struct string_hf {
85 | size_t operator()(const string &s) const {
86 | return CityHash64(s.c_str(), s.size());
87 | }
88 | };
89 |
90 | ////////////////////////////////////////////////////////////
91 |
92 | typedef unordered_set IntSet;
93 | typedef unordered_set IntPairSet;
94 | typedef unordered_set IntVecSet;
95 | typedef unordered_map IntVecDoubleMap;
96 | typedef unordered_map IntVecIntMap;
97 | typedef unordered_map IntIntMap;
98 | typedef unordered_map IntDoubleMap;
99 | typedef unordered_map IntIntPairMap;
100 | typedef unordered_map IntIntVecMap;
101 | typedef unordered_map IntIntIntMapMap;
102 | typedef unordered_map IntPairIntMap;
103 | typedef unordered_map IntPairDoubleMap;
104 | typedef unordered_map IntPairDoubleVecMap;
105 | typedef unordered_map IntVecIntVecMap;
106 | typedef unordered_map IntVecDoubleVecMap;
107 | typedef vector IntIntMapVec;
108 |
109 | typedef vector StrVec;
110 | typedef unordered_map StrIntMap;
111 | typedef unordered_map StrStrMap;
112 |
113 | #endif
114 |
--------------------------------------------------------------------------------
/src/marlin/basic/city.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2011 Google, Inc.
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
4 | // of this software and associated documentation files (the "Software"), to deal
5 | // in the Software without restriction, including without limitation the rights
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | // copies of the Software, and to permit persons to whom the Software is
8 | // furnished to do so, subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in
11 | // all copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | // THE SOFTWARE.
20 | //
21 | // CityHash, by Geoff Pike and Jyrki Alakuijala
22 | //
23 | // This file provides a few functions for hashing strings. On x86-64
24 | // hardware in 2011, CityHash64() is faster than other high-quality
25 | // hash functions, such as Murmur. This is largely due to higher
26 | // instruction-level parallelism. CityHash64() and CityHash128() also perform
27 | // well on hash-quality tests.
28 | //
29 | // CityHash128() is optimized for relatively long strings and returns
30 | // a 128-bit hash. For strings more than about 2000 bytes it can be
31 | // faster than CityHash64().
32 | //
33 | // Functions in the CityHash family are not suitable for cryptography.
34 | //
35 | // WARNING: This code has not been tested on big-endian platforms!
36 | // It is known to work well on little-endian platforms that have a small penalty
37 | // for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs.
38 | //
39 | // By the way, for some hash functions, given strings a and b, the hash
40 | // of a+b is easily derived from the hashes of a and b. This property
41 | // doesn't hold for any hash functions in this file.
42 |
43 | #ifndef CITY_HASH_H_
44 | #define CITY_HASH_H_
45 |
46 | #include // for size_t.
47 | #include
48 | #include
49 |
// Short aliases for the fixed-width integer types used by the hash functions.
typedef uint8_t uint8;
typedef uint32_t uint32;
typedef uint64_t uint64;
// A 128-bit value stored as two 64-bit halves: first = low, second = high
// (see the accessors below). The pair's template arguments were missing in
// this copy of the header and are restored from those accessors' return type.
typedef std::pair<uint64, uint64> uint128;

inline uint64 Uint128Low64(const uint128& x) { return x.first; }
inline uint64 Uint128High64(const uint128& x) { return x.second; }
57 |
58 | // Hash function for a byte array.
59 | uint64 CityHash64(const char *buf, size_t len);
60 |
61 | // Hash function for a byte array. For convenience, a 64-bit seed is also
62 | // hashed into the result.
63 | uint64 CityHash64WithSeed(const char *buf, size_t len, uint64 seed);
64 |
65 | // Hash function for a byte array. For convenience, two seeds are also
66 | // hashed into the result.
67 | uint64 CityHash64WithSeeds(const char *buf, size_t len,
68 | uint64 seed0, uint64 seed1);
69 |
70 | // Hash function for a byte array.
71 | uint128 CityHash128(const char *s, size_t len);
72 |
73 | // Hash function for a byte array. For convenience, a 128-bit seed is also
74 | // hashed into the result.
75 | uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed);
76 |
77 | // Hash 128 input bits down to 64 bits of output.
78 | // This is intended to be a reasonably good hash function.
79 | inline uint64 Hash128to64(const uint128& x) {
80 | // Murmur-inspired hashing.
81 | const uint64 kMul = 0x9ddfea08eb382d69ULL;
82 | uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul;
83 | a ^= (a >> 47);
84 | uint64 b = (Uint128High64(x) ^ a) * kMul;
85 | b ^= (b >> 47);
86 | b *= kMul;
87 | return b;
88 | }
89 |
90 | #endif // CITY_HASH_H_
91 |
--------------------------------------------------------------------------------
/src/code/makelmfsa_x.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include