├── .gitignore
├── DependencyInstance.cpp
├── DependencyInstance.h
├── DependencyPipe.cpp
├── DependencyPipe.h
├── FeatureEncoder.cpp
├── FeatureEncoder.h
├── FeatureExtractor.cpp
├── FeatureExtractor.h
├── LICENSE
├── Options.cpp
├── Options.h
├── Parameters.cpp
├── Parameters.h
├── README.md
├── Release
    ├── SharedTaskCommon.py
    ├── SharedTaskCommon.pyc
    ├── decoder
    │   └── subdir.mk
    ├── io
    │   └── subdir.mk
    ├── lattice_to_segmentation.py
    ├── makefile
    ├── objects.mk
    ├── run.sh
    ├── run_chinese.sh
    ├── run_chinese_test.sh
    ├── run_classical.sh
    ├── run_classical_test.sh
    ├── run_spmrl.sh
    ├── run_spmrl_test.sh
    ├── run_test.sh
    ├── sources.mk
    ├── subdir.mk
    ├── test.txt
    ├── util
    │   └── subdir.mk
    └── validateFormat.py
├── SegParser.cpp
├── SegParser.h
├── TedWrappers_20131015
    ├── SharedTaskCommon.py
    ├── SharedTaskCommon.pyc
    ├── TedEvalApps.jar
    ├── TedPart.jar
    ├── cleanconll.pl
    ├── cleanptb.pl
    ├── debug
    │   ├── check_sourceid.pl
    │   └── do_check.sh
    ├── genere_tfm_tedeval.pl
    ├── get_cutoffed_sent.pl
    ├── get_ted_res.pl
    ├── lattice_to_segmentation.py
    ├── lines
    ├── pproj_24934
    │   ├── conllx.xml
    │   └── pproj_24934_pseudo.info
    ├── reprojectivize.sh
    ├── skip_lines.pl
    ├── tedeval-2.2.jar
    ├── tedeval.jar
    ├── tedeval.sh
    ├── tedeval_cross.sh
    ├── tedeval_cross2.sh
    ├── tedeval_debug.jar
    ├── tedeval_seg.sh
    ├── tedeval_simple.sh
    ├── tedeval_simple.sh.good
    ├── tedeval_simple.sh.old
    ├── tedeval_simple_polish.sh
    ├── validateFormat.py
    └── wc
├── data
    ├── core12map.txt
    ├── spmrl.seg.dev
    ├── spmrl.seg.test
    ├── spmrl.seg.train
    ├── spmrl.uni.map
    ├── tags-all.mod.txt
    ├── tags-mada2core12.txt
    ├── test.Arabic
    └── test0.mada.gz
├── decoder
    ├── ClassifierDecoder.cpp
    ├── ClassifierDecoder.h
    ├── DependencyDecoder.cpp
    ├── DependencyDecoder.h
    ├── DevelopmentThread.cpp
    ├── DevelopmentThread.h
    ├── HillClimbingDecoder.cpp
    └── HillClimbingDecoder.h
├── io
    ├── DependencyReader.cpp
    ├── DependencyReader.h
    ├── DependencyWriter.cpp
    └── DependencyWriter.h
├── runs
    └── .gitignore
├── spmrl_code_generator
    ├── MadaReader.java
    ├── SpmrlDataGenerator.java
    └── SpmrlReader.java
└── util
    ├── Alphabet.cpp
    ├── Alphabet.h
    ├── Constant.cpp
    ├── Constant.h
    ├── FeatureAlphabet.cpp
    ├── FeatureAlphabet.h
    ├── FeatureVector.cpp
    ├── FeatureVector.h
    ├── Logarithm.cpp
    ├── Logarithm.h
    ├── Random.h
    ├── SerializationUtils.cpp
    ├── SerializationUtils.h
    ├── StringUtils.cpp
    ├── StringUtils.h
    └── Timer.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files
 2 | *.slo
 3 | *.lo
 4 | *.o
 5 | *.obj
 6 | 
 7 | # Precompiled Headers
 8 | *.gch
 9 | *.pch
10 | 
11 | # Compiled Dynamic libraries
12 | *.so
13 | *.dylib
14 | *.dll
15 | 
16 | # Fortran module files
17 | *.mod
18 | 
19 | # Compiled Static libraries
20 | *.lai
21 | *.la
22 | *.a
23 | *.lib
24 | 
25 | # Executables
26 | *.exe
27 | *.out
28 | *.app
29 | 


--------------------------------------------------------------------------------
/DependencyInstance.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * DependencyInstance.h
  3 |  *
  4 |  *  Created on: Mar 21, 2014
  5 |  *      Author: yuanz
  6 |  */
  7 | 
  8 | #ifndef DEPENDENCYINSTANCE_H_
  9 | #define DEPENDENCYINSTANCE_H_
 10 | 
 11 | #include <vector>
 12 | #include <string>
 13 | #include <boost/shared_ptr.hpp>
 14 | #include <unordered_set>
 15 | #include "util/FeatureVector.h"
 16 | 
 17 | namespace segparser {
 18 | 
 19 | class DependencyPipe;
 20 | class Options;
 21 | 
 22 | using namespace std;
 23 | 
 24 | class HeadIndex {
 25 | public:
 26 | 	int hWord;
 27 | 	int hSeg;
 28 | 
 29 | 	HeadIndex(int _hWord, int _hSeg) : hWord(_hWord), hSeg(_hSeg) { }
 30 | 
 31 | 	HeadIndex() : hWord(-1), hSeg(0) { }
 32 | 
 33 | 	void setIndex(int _hWord, int _hSeg) {
 34 | 		hWord = _hWord; hSeg = _hSeg;
 35 | 	}
 36 | 
 37 | 	friend bool operator < (HeadIndex &id1, HeadIndex &id2) {
 38 | 		return id1.hWord < id2.hWord || (id1.hWord == id2.hWord && id1.hSeg < id2.hSeg);
 39 | 	}
 40 | 
 41 | 	friend bool operator != (HeadIndex &id1, HeadIndex &id2) {
 42 | 		return id1.hWord != id2.hWord || id1.hSeg != id2.hSeg;
 43 | 	}
 44 | 
 45 | 	friend bool operator == (HeadIndex &id1, HeadIndex &id2) {
 46 | 		return id1.hWord == id2.hWord && id1.hSeg == id2.hSeg;
 47 | 	}
 48 | 
 49 | 	friend ostream& operator << (ostream& os, const HeadIndex& id) {
 50 | 		os << id.hWord << "/" << id.hSeg;
 51 | 		return os;
 52 | 	}
 53 | };
 54 | 
 55 | class SegElement {
 56 | public:
 57 | 	string form;
 58 | 	int formid;
 59 | 	string lemma;
 60 | 	int lemmaid;
 61 | 
 62 | 	HeadIndex dep;
 63 | 	int labid;
 64 | 
 65 | 	vector<HeadIndex> child;
 66 | 
 67 | 	int currPosCandID;			// id of the pos in candidate list
 68 | 	vector<string> candPos;
 69 | 	vector<int> candPosid;
 70 | 	vector<int> candDetPosid;
 71 | 	vector<int> candSpecialPos;
 72 | 	vector<double> candProb;
 73 | 
 74 | 	// for chinese character
 75 | 	int st;
 76 | 	int en;
 77 | 
 78 | 	SegElement() : form(""), formid(-1), lemma(""), lemmaid(-1), labid(-1), currPosCandID(-1), st(-1), en(-1) {}
 79 | 
 80 | 	int candPosNum() {
 81 | 		return candPos.size();
 82 | 	}
 83 | 
 84 | 	bool isOptPos() {
 85 | 		return currPosCandID == 0;
 86 | 	}
 87 | 
 88 | 	int getCurrPos() {
 89 | 		return candPosid[currPosCandID];
 90 | 	}
 91 | 
 92 | 	int getCurrDetPos() {
 93 | 		return candDetPosid[currPosCandID];
 94 | 	}
 95 | 
 96 | 	int getCurrSpecialPos() {
 97 | 		return candSpecialPos[currPosCandID];
 98 | 	}
 99 | 
100 | 	friend ostream& operator << (ostream& os, const SegElement& ele) {
101 | 		int i = ele.currPosCandID;
102 | 		os << ele.form << "_" << ele.formid << " " << ele.dep << " " <<
103 | 				ele.candPos[i] << "_" << ele.candPosid[i] << "_" << ele.candProb[i] << endl;
104 | 		return os;
105 | 	}
106 | };
107 | 
108 | class SegInstance {
109 | public:
110 | 	vector<SegElement> element;
111 | 	string segStr;
112 | 	double prob;
113 | 
114 | 	// morphology features
115 | 	int AlIndex;
116 | 	int morphIndex;
117 | 	vector<string> morph;		//per/gen/num
118 | 	vector<int> morphid;
119 | 
120 | 	SegInstance() : segStr(""), prob(0.0), AlIndex(-1), morphIndex(-1) {}
121 | 
122 | 	int size() {
123 | 		return element.size();
124 | 	}
125 | 
126 | 	friend ostream& operator << (ostream& os, const SegInstance& inst) {
127 | 		os << "seg str: " << inst.segStr << "_" << inst.prob << endl;
128 | 		for (unsigned int i = 0; i < inst.element.size(); ++i)
129 | 			os << "element: " << i << " " << inst.element[i] << endl;
130 | 		return os;
131 | 	}
132 | };
133 | 
134 | class WordInstance {
135 | public:
136 | 	// form/pos/dep/lab are retrieved from the candidate and id
137 | 	// the following are just for temporarily record gold info
138 | 	vector<string> goldForm;
139 | 	vector<string> goldLemma;
140 | 	vector<string> goldPos;
141 | 
142 | 	int goldAlIndex;
143 | 	int goldMorphIndex;
144 | 	vector<string> goldMorph;
145 | 
146 | 	vector<HeadIndex> goldDep;
147 | 	vector<string> goldLab;
148 | 
149 | 	string wordStr;
150 | 	int wordid;
151 | 
152 | 	int currSegCandID;		// id of the seg in candidate list
153 | 	vector<SegInstance> candSeg;
154 | 
155 | 	int optPosCount;		// number of segs with optimal pos
156 | 
157 | 	vector< vector<int> > inMap;		// [a->b id][size of b], for each element of b, need a map to decide the head and POS
158 | 	vector< vector<int> > outMap;		// [a->b id][size of a], for each child of a, need a map to decide its new parent
159 | 
160 | 	WordInstance() {
161 | 		wordStr = "";
162 | 		wordid = -1;
163 | 		goldAlIndex = -1;
164 | 		goldMorphIndex = -1;
165 | 		optPosCount = 0;
166 | 		currSegCandID = 0;
167 | 	}
168 | 
169 | 	SegInstance& getCurrSeg() {
170 | 		return candSeg[currSegCandID];
171 | 	}
172 | 
173 | 	bool isOptSeg() {
174 | 		return currSegCandID == 0;
175 | 	}
176 | 
177 | 	void setOptPosCount() {
178 | 		optPosCount = 0;
179 | 		SegInstance& segInst = getCurrSeg();
180 | 		for (int j = 0; j < segInst.size(); ++j)
181 | 			optPosCount += (segInst.element[j].currPosCandID == 0);
182 | 	}
183 | };
184 | 
/**
 * A parsing instance (presumably one sentence -- confirm against the reader):
 * the list of words with their candidate segmentations, the feature vector
 * of the current tree, and helpers that convert between (word, segment)
 * positions and flat segment indices.
 *
 * Only declarations are given here; the member functions are defined
 * outside this header.
 */
class DependencyInstance {
public:
	DependencyInstance();
	virtual ~DependencyInstance();

	vector<WordInstance> word;
	int numWord;			// number of words

	vector<int> characterid;

	int optSegCount;		// number of words with optimal seg

	FeatureVector fv;		// feature vector of the current tree

	// word index and seg index conversion
	void constructConversionList();
	void setOptSegPosCount();
	int getNumSeg();
	int wordToSeg(HeadIndex& id);
	int wordToSeg(int hid, int segid);
	HeadIndex segToWord(int id);

	void setInstIds(DependencyPipe* pipe, Options* options);
	// NOTE(review): takes and returns the string by value
	string normalize(string s);

	int segDist(HeadIndex& head, HeadIndex& mod);
	// element lookup by (word index, segment index) or by HeadIndex
	SegElement& getElement(int hw, int hs);
	SegElement& getElement(HeadIndex id);

	void buildChild();
	void updateChildList(HeadIndex& newH, HeadIndex& oldH, HeadIndex& arg);

	void output();
private:
	vector<int> numSeg;		// total number of segs before this word, appending the total number in the end
							// size = number of words
	vector<int> seg2Word;	// word index for the segment, size = number of segs

	bool isPunc(string& w);
	bool isCoord(int lang, string& w);

	int computeOverlap(SegElement& e1, SegElement& e2);
	vector<int> buildInMap(WordInstance& w, int a, int b);
	vector<int> buildOutMap(WordInstance& w, int a, int b);
};
230 | 
231 | typedef boost::shared_ptr<DependencyInstance> inst_ptr;
232 | 
233 | class VariableInfo {
234 | public:
235 | 	vector<int> segID;
236 | 	vector<int> posID;
237 | 	vector<HeadIndex> dep;
238 | 
239 | 	VariableInfo() {
240 | 
241 | 	}
242 | 
243 | 	VariableInfo(DependencyInstance* inst) {
244 | 		copyInfoFromInst(inst);
245 | 	}
246 | 
247 | 	void copyInfoFromInst(DependencyInstance* inst) {
248 | 		if ((int)segID.size() != inst->numWord) {
249 | 			segID.resize(inst->numWord);
250 | 		}
251 | 
252 | 		int numSeg = inst->getNumSeg();
253 | 		if ((int)posID.size() != numSeg) {
254 | 			posID.resize(numSeg);
255 | 			dep.resize(numSeg);
256 | 		}
257 | 
258 | 		int p = 0;
259 | 		for (int i = 0; i < inst->numWord; ++i) {
260 | 			segID[i] = inst->word[i].currSegCandID;
261 | 			SegInstance& segInst = inst->word[i].getCurrSeg();
262 | 
263 | 			for (int j = 0; j < segInst.size(); ++j) {
264 | 				posID[p] = segInst.element[j].currPosCandID;
265 | 				dep[p] = segInst.element[j].dep;
266 | 				p++;
267 | 			}
268 | 		}
269 | 		assert(p == inst->getNumSeg());
270 | 	}
271 | 
272 | 	void loadInfoToInst(DependencyInstance* inst) {
273 | 		assert((int)segID.size() == inst->numWord);
274 | 
275 | 		int p = 0;
276 | 		for (int i = 0; i < inst->numWord; ++i) {
277 | 			inst->word[i].currSegCandID = segID[i];
278 | 			SegInstance& segInst = inst->word[i].getCurrSeg();
279 | 
280 | 			for (int j = 0; j < segInst.size(); ++j) {
281 | 				segInst.element[j].currPosCandID = posID[p];
282 | 				segInst.element[j].dep = dep[p];
283 | 				p++;
284 | 			}
285 | 		}
286 | 
287 | 		assert(p == (int)posID.size());
288 | 	}
289 | 
290 | 	bool isChanged(DependencyInstance* inst) {
291 | 		assert((int)segID.size() == inst->numWord);
292 | 
293 | 		int p = 0;
294 | 		for (int i = 0; i < inst->numWord; ++i) {
295 | 			if (inst->word[i].currSegCandID != segID[i])
296 | 				return true;
297 | 			SegInstance& segInst = inst->word[i].getCurrSeg();
298 | 
299 | 			for (int j = 0; j < segInst.size(); ++j) {
300 | 				if (segInst.element[j].currPosCandID != posID[p])
301 | 					return true;
302 | 				if (segInst.element[j].dep != dep[p])
303 | 					return true;
304 | 				p++;
305 | 			}
306 | 		}
307 | 
308 | 		assert(p == (int)posID.size());
309 | 		return false;
310 | 	}
311 | };
312 | 
313 | } /* namespace segparser */
314 | 
315 | #include "DependencyPipe.h"
316 | 
317 | #endif /* DEPENDENCYINSTANCE_H_ */
318 | 


--------------------------------------------------------------------------------
/DependencyPipe.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * DependencyPipe.h
 3 |  *
 4 |  *  Created on: Apr 4, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef DEPENDENCYPIPE_H_
 9 | #define DEPENDENCYPIPE_H_
10 | 
11 | #include "Options.h"
12 | #include "DependencyInstance.h"
13 | #include "util/Alphabet.h"
14 | #include "util/FeatureAlphabet.h"
15 | #include <unordered_set>
16 | 
17 | namespace segparser {
18 | 
19 | using namespace std;
20 | 
/**
 * Declares the dictionary/alphabet construction, instance creation and
 * feature-vector creation entry points. Only declarations are given here;
 * the member functions are defined outside this header.
 *
 * NOTE(review): this class uses unordered_map, FeatureEncoder and uint64_t,
 * but the header itself only includes <unordered_set>, Options.h,
 * DependencyInstance.h and the two alphabet headers -- presumably the
 * missing declarations arrive transitively; confirm.
 */
class DependencyPipe {
public:
	DependencyPipe(Options* options);
	virtual ~DependencyPipe();

	void loadCoarseMap(string& file);
	void setAndCheckOffset();
	void buildDictionary(string& goldfile);
	void buildDictionaryWithOOV(string& goldfile);
	void closeAlphabets();
	void createAlphabet(string& goldfile);
	vector<inst_ptr> createInstances(string goldFile);

	// lookups in a child list relative to the given index
	int findRightNearestChildID(vector<HeadIndex>& child, HeadIndex id);
	HeadIndex findRightNearestChild(vector<HeadIndex>& child, HeadIndex id);
	HeadIndex findLeftNearestChild(vector<HeadIndex>& child, HeadIndex id);
	vector<HeadIndex> findConjArg(DependencyInstance* s, HeadIndex& arg);

	// feature creation; each function receives the target vector as fv
	void createFeatureVector(DependencyInstance* inst, FeatureVector* fv);
	int getBinnedDistance(int x);
	void createArcFeatureVector(DependencyInstance* inst, HeadIndex& headIndex, HeadIndex& modIndex, FeatureVector* fv);
	void createTripsFeatureVector(DependencyInstance* inst, HeadIndex& par, HeadIndex& ch1, HeadIndex& ch2, FeatureVector* fv);
	void createSibsFeatureVector(DependencyInstance* inst, HeadIndex& ch1, HeadIndex& ch2, bool isST, FeatureVector* fv);
	void createGPCFeatureVector(DependencyInstance* inst, HeadIndex& gp, HeadIndex& par, HeadIndex& c, FeatureVector* fv);
	void createGPSibFeatureVector(DependencyInstance* inst, SegElement* gp, SegElement* par, SegElement* ch1, SegElement* ch2, FeatureVector* fv);
	void createTriSibFeatureVector(DependencyInstance* inst, SegElement* par, SegElement* ch1, SegElement* ch2, SegElement* ch3, FeatureVector* fv);
	void createPos1OFeatureVector(DependencyInstance* inst, HeadIndex& m, FeatureVector* fv);
	void createPosHOFeatureVector(DependencyInstance* inst, HeadIndex& m, bool unigram, FeatureVector* fv);
	void createSegFeatureVector(DependencyInstance* inst, int wordid, FeatureVector* fv);
	void createHighOrderFeatureVector(DependencyInstance* inst, FeatureVector* fv);
	void createPartialHighOrderFeatureVector(DependencyInstance* inst, HeadIndex& x, bool bigram, FeatureVector* fv);
	void createPartialPosHighOrderFeatureVector(DependencyInstance* inst, HeadIndex& x, FeatureVector* fv);
	void addCode(int type, uint64_t code, double val, FeatureVector* fv);
	void addCode(int type, uint64_t code, FeatureVector* fv);

	FeatureAlphabet* dataAlphabet;

	// dictionary
	Alphabet* typeAlphabet;
	Alphabet* posAlphabet;			// pos
	Alphabet* lexAlphabet;			// lemma, word
	unordered_set<string> suffixList;

	unordered_map<string, string> coarseMap;

	// encoder
	// NOTE(review): raw pointers above and below; ownership is not visible from this header
	FeatureEncoder* fe;
private:
	Options* options;

	void buildSuffixList();
};
73 | 
74 | } /* namespace segparser */
75 | #endif /* DEPENDENCYPIPE_H_ */
76 | 


--------------------------------------------------------------------------------
/FeatureEncoder.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * FeatureEncoder.cpp
  3 |  *
  4 |  *  Created on: Mar 29, 2014
  5 |  *      Author: yuanz
  6 |  */
  7 | 
  8 | #include "FeatureEncoder.h"
  9 | #include <iostream>
 10 | #include <stdlib.h>
 11 | #include <iomanip>
 12 | #include <bitset>
 13 | 
 14 | namespace segparser {
 15 | 
 16 | FeatureEncoder::FeatureEncoder() {
 17 | 	largeOff = 17;
 18 | 	midOff = 9;
 19 | 	flagOff = 4;
 20 | 	tempOff = 7;
 21 | }
 22 | 
 23 | FeatureEncoder::~FeatureEncoder() {
 24 | }
 25 | 
 26 | /*********************************
 27 |  * code generator
 28 |  * generally, the flag is added later, because the code without the flag is also needed
 29 |  */
 30 | 
 31 | int FeatureEncoder::getBits(uint64_t x) {
 32 | 	uint64_t y = 1;
 33 |     int i = 0;
 34 |     while (y < x) {
 35 |             y = y << 1;
 36 |             ++i;
 37 |     }
 38 |     return i;
 39 | }
 40 | 
 41 | uint64_t FeatureEncoder::genCodePF(uint64_t temp, uint64_t p1) {
 42 | 	return ((p1 << flagOff) << tempOff) | temp;
 43 | }
 44 | 
 45 | uint64_t FeatureEncoder::genCodePPF(uint64_t temp, uint64_t p1, uint64_t p2) {
 46 | 	return ((((p1 << midOff) | p2) << flagOff) << tempOff) | temp;
 47 | }
 48 | 
 49 | uint64_t FeatureEncoder::genCodePPPF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3) {
 50 | 	return ((((((p1 << midOff) | p2) << midOff) | p3) << flagOff) << tempOff) | temp;
 51 | }
 52 | 
 53 | uint64_t FeatureEncoder::genCodePPPPF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3, uint64_t p4) {
 54 | 	return ((((((((p1 << midOff) | p2) << midOff) | p3) << midOff) | p4) << flagOff) << tempOff) | temp;
 55 | }
 56 | 
 57 | uint64_t FeatureEncoder::genCodePPPPPF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3, uint64_t p4, uint64_t p5) {
 58 | 	return ((((((((((p1 << midOff) | p2) << midOff) | p3) << midOff) | p4) << midOff) | p5) << flagOff) << tempOff) | temp;
 59 | }
 60 | 
 61 | uint64_t FeatureEncoder::genCodeWF(uint64_t temp, uint64_t w1) {
 62 | 	return ((w1 << flagOff) << tempOff) | temp;
 63 | }
 64 | 
 65 | uint64_t FeatureEncoder::genCodePWF(uint64_t temp, uint64_t p1, uint64_t w1) {
 66 | 	return ((((w1 << midOff) | p1) << flagOff) << tempOff) | temp;
 67 | }
 68 | 
 69 | uint64_t FeatureEncoder::genCodeWWF(uint64_t temp, uint64_t w1, uint64_t w2) {
 70 | 	return ((((w1 << largeOff) | w2) << flagOff) << tempOff) | temp;
 71 | }
 72 | 
 73 | uint64_t FeatureEncoder::genCodeWWW(uint64_t temp, uint64_t w1, uint64_t w2, uint64_t w3) {
 74 | 	return (((((w1 << largeOff) | w2) << largeOff) | w3) << tempOff) | temp;
 75 | }
 76 | 
 77 | uint64_t FeatureEncoder::genCodePPWF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t w1) {
 78 | 	return ((((((w1 << midOff) | p1) << midOff) | p2) << flagOff) << tempOff) | temp;
 79 | }
 80 | 
 81 | uint64_t FeatureEncoder::genCodePPPWF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3, uint64_t w1) {
 82 | 	return ((((((((w1 << midOff) | p1) << midOff) | p2) << midOff) | p3) << flagOff) << tempOff) | temp;
 83 | }
 84 | 
 85 | uint64_t FeatureEncoder::genCodePWWF(uint64_t temp, uint64_t p1, uint64_t w1, uint64_t w2) {
 86 | 	return ((((((w1 << largeOff) | w2) << midOff) | p1) << flagOff) << tempOff) | temp;
 87 | }
 88 | 
 89 | uint64_t FeatureEncoder::genCodePPWWF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t w1, uint64_t w2) {
 90 | 	return ((((((((w1 << largeOff) | w2) << midOff) | p1) << midOff) | p2) << flagOff) << tempOff) | temp;
 91 | }
 92 | 
 93 | uint64_t FeatureEncoder::genCodeIIVF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1) {
 94 | 	return ((((((i1 << midOff) | i2) << midOff) | v1) << flagOff) << tempOff) | temp;
 95 | }
 96 | 
 97 | uint64_t FeatureEncoder::genCodeIIVVF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t v2) {
 98 | 	return ((((((((i1 << midOff) | i2) << midOff) | v1) << midOff) | v2) << flagOff) << tempOff) | temp;
 99 | }
100 | 
101 | uint64_t FeatureEncoder::genCodeIIVVPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t v2, uint64_t p1) {
102 | 	return ((((((((((i1 << midOff) | i2) << midOff) | v1) << midOff) | v2) << midOff) | p1) << flagOff) << tempOff) | temp;
103 | }
104 | 
105 | uint64_t FeatureEncoder::genCodeIIVPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t p1) {
106 | 	return ((((((((i1 << midOff) | i2) << midOff) | v1) << midOff) | p1) << flagOff) << tempOff) | temp;
107 | }
108 | 
109 | uint64_t FeatureEncoder::genCodeIIVPPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t p1, uint64_t p2) {
110 | 	return ((((((((((i1 << midOff) | i2) << midOff) | v1) << midOff) | p1) << midOff) | p2) << flagOff) << tempOff) | temp;
111 | }
112 | 
113 | uint64_t FeatureEncoder::genCodeIIVVPPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t v2, uint64_t p1, uint64_t p2) {
114 | 	return ((((((((((((i1 << midOff) | i2) << midOff) | v1) << midOff) | v2)
115 | 			<< midOff) | p1) << midOff) | p2) << flagOff) << tempOff) | temp;
116 | }
117 | 
118 | } /* namespace segparser */
119 | 


--------------------------------------------------------------------------------
/FeatureEncoder.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * FeatureEncoder.h
  3 |  *
  4 |  *  Created on: Mar 29, 2014
  5 |  *      Author: yuanz
  6 |  */
  7 | 
  8 | #ifndef FEATUREENCODER_H_
  9 | #define FEATUREENCODER_H_
 10 | 
 11 | #include <stdint.h>
 12 | 
 13 | namespace segparser {
 14 | 
 15 | /******************************
 16 |  * template type
 17 |  *****************************/
 18 | 
// Groups of feature templates. COUNT is the last enumerator and therefore
// equals the number of groups listed before it.
struct TemplateType {
	enum types {
		TArc, TSecondOrder, TThirdOrder, THighOrder, COUNT,
	};
};
 24 | 
 25 | /**********************************
 26 |  * notation
 27 |  * P: pos; W: word;
 28 |  **********************************/
// Template ids for arc features. Values are assigned implicitly by position
// (START is 0, COUNT is last), so inserting, removing or reordering
// enumerators renumbers every template after the change.
struct Arc {

	enum types {
		START,

		/*************************************************
		 * First-order dependency feature from MST
		 * ***********************************************/
		// feature posL posIn posR
		LP_MP_RP,

		// feature posL-1 posL posR posR+1
		pLP_LP_RP_nRP,
		LP_RP_nRP,
		pLP_RP_nRP,
		pLP_LP_nRP,
		pLP_LP_RP,

		// feature posL posL+1 posR-1 posR
		LP_nLP_pRP_RP,
		nLP_pRP_RP,
		LP_pRP_RP,
		LP_nLP_RP,
		LP_nLP_pRP,

		// feature posL-1 posL posR-1 posR
		// feature posL posL+1 posR posR+1
		pLP_LP_pRP_RP,
		LP_nLP_RP_nRP,

		// two obs (word, pos)
		HP,
		HW,
		//MP,
		//MW,
		HW_HP,
		//MW_MP,
		HP_MP,
		HP_MW,
		HW_MP,
		HW_MW,
		HP_MP_MW,
		HW_MP_MW,
		HP_HW_MP,
		HP_HW_MW,
		HP_HW_MP_MW,

		// lemma pos
		HL,
		//ML,
		HP_ML,
		HL_MP,
		HL_ML,
		HP_HL,
		//MP_ML,
		HP_MP_ML,
		HL_MP_ML,
		HP_HL_MP,
		HP_HL_ML,
		HP_HL_MP_ML,

		// morphology, id, val, pos
		FF_IDH_IDM_HV,
		FF_IDH_IDM_MV,
		FF_IDH_IDM_HV_MV,
		FF_IDH_IDM_HP_MV,
		FF_IDH_IDM_HV_MP,
		FF_IDH_IDM_HV_HP,
		FF_IDH_IDM_MV_MP,
		FF_IDH_IDM_HP_MP_MV,
		FF_IDH_IDM_HV_MP_MV,
		FF_IDH_IDM_HP_HV_MP,
		FF_IDH_IDM_HP_HV_MV,
		FF_IDH_IDM_HP_HV_MP_MV,

		// label
		LAB,
		LAB_W_WP,
		LAB_WP,
		LAB_pWP_WP,
		LAB_WP_nWP,
		LAB_pWP_WP_nWP,
		LAB_W,
		LAB_L_WP,
		LAB_L,
		LAB_HP_MP,
		LAB_HL_MP,
		LAB_HP_ML,
		LAB_HL_ML,

		// first order
		HD_BD_MD,

		pHD_HD_MD_nMD,
		HD_MD_nMD,
		pHD_MD_nMD,
		pHD_HD_nMD,
		pHD_HD_MD,

		HD_nHD_pMD_MD,
		nHD_pMD_MD,
		HD_pMD_MD,
		HD_nHD_MD,
		HD_nHD_pMD,

		pHD_HD_pMD_MD,
		HD_nHD_MD_nMD,

		HD,
		HD_MD,

		// contextual
		pHW,
		nHW,
		pMW,
		MW,
		nMW,

		// flag
		HP_MP_FLAG,
		HW_MW_FLAG,

		COUNT,
	};
};
// Template ids for second-order features (sibling, head-bigram and
// grandparent-parent-child groups, per the section comments below). Values
// are assigned implicitly by position (START is 0, COUNT is last), so the
// order of the enumerators must not change.
struct SecondOrder {
	enum types {
		START,

		HP_SP_MP,
		HC_SC_MC,

		pHC_HC_SC_MC,
		HC_nHC_SC_MC,
		HC_pSC_SC_MC,
		HC_SC_nSC_MC,
		HC_SC_pMC_MC,
		HC_SC_MC_nMC,

		pHC_HL_SC_MC,
		HL_nHC_SC_MC,
		HL_pSC_SC_MC,
		HL_SC_nSC_MC,
		HL_SC_pMC_MC,
		HL_SC_MC_nMC,

		pHC_HC_SL_MC,
		HC_nHC_SL_MC,
		HC_pSC_SL_MC,
		HC_SL_nSC_MC,
		HC_SL_pMC_MC,
		HC_SL_MC_nMC,

		pHC_HC_SC_ML,
		HC_nHC_SC_ML,
		HC_pSC_SC_ML,
		HC_SC_nSC_ML,
		HC_SC_pMC_ML,
		HC_SC_ML_nMC,

		HC_MC_SC_pHC_pMC,
		HC_MC_SC_pHC_pSC,
		HC_MC_SC_pMC_pSC,
		HC_MC_SC_nHC_nMC,
		HC_MC_SC_nHC_nSC,
		HC_MC_SC_nMC_nSC,
		HC_MC_SC_pHC_nMC,
		HC_MC_SC_pHC_nSC,
		HC_MC_SC_pMC_nSC,
		HC_MC_SC_nHC_pMC,
		HC_MC_SC_nHC_pSC,
		HC_MC_SC_nMC_pSC,

		SP_MP,
		SW_MW,
		SW_MP,
		SP_MW,
		SC_MC,
		SL_ML,
		SL_MC,
		SC_ML,

		// head bigram
		H1P_H2P_M1P_M2P,
		H1P_H2P_M1P_M2P_DIR,
		H1C_H2C_M1C_M2C,
		H1C_H2C_M1C_M2C_DIR,

		// gp-p-c
		GP_HP_MP,
		GC_HC_MC,
		GL_HC_MC,
		GC_HL_MC,
		GC_HC_ML,

		pGC_GC_HC_MC,
		GC_nGC_HC_MC,
		GC_pHC_HC_MC,
		GC_HC_nHC_MC,
		GC_HC_pMC_MC,
		GC_HC_MC_nMC,

		pGC_GL_HC_MC,
		GL_nGC_HC_MC,
		GL_pHC_HC_MC,
		GL_HC_nHC_MC,
		GL_HC_pMC_MC,
		GL_HC_MC_nMC,

		pGC_GC_HL_MC,
		GC_nGC_HL_MC,
		GC_pHC_HL_MC,
		GC_HL_nHC_MC,
		GC_HL_pMC_MC,
		GC_HL_MC_nMC,

		pGC_GC_HC_ML,
		GC_nGC_HC_ML,
		GC_pHC_HC_ML,
		GC_HC_nHC_ML,
		GC_HC_pMC_ML,
		GC_HC_ML_nMC,

		GC_HC_MC_pGC_pHC,
		GC_HC_MC_pGC_pMC,
		GC_HC_MC_pHC_pMC,
		GC_HC_MC_nGC_nHC,
		GC_HC_MC_nGC_nMC,
		GC_HC_MC_nHC_nMC,
		GC_HC_MC_pGC_nHC,
		GC_HC_MC_pGC_nMC,
		GC_HC_MC_pHC_nMC,
		GC_HC_MC_nGC_pHC,
		GC_HC_MC_nGC_pMC,
		GC_HC_MC_nHC_pMC,

		COUNT,
	};
};
268 | 
// Template ids for third-order features (ggpc, gp-sibling and tri-sibling
// groups, per the section comments below). Values are assigned implicitly
// by position (START is 0, COUNT is last), so the order of the enumerators
// must not change.
struct ThirdOrder {
	enum types {
		START,

		// move some gpc features here...
		GL_HL_MC,
		GL_HC_ML,
		GC_HL_ML,
		GL_HL_ML,

		GC_HC,
		GL_HC,
		GC_HL,
		GL_HL,

		// only cross with dir flag
		GC_MC,
		GL_MC,
		GC_ML,
		GL_ML,
		HC_MC,
		HL_MC,
		HC_ML,
		HL_ML,

		// ggpc
		GGC_GC_HC_MC,
		GGL_GC_HC_MC,
		GGC_GL_HC_MC,
		GGC_GC_HL_MC,
		GGC_GC_HC_ML,

		GGC_HC_MC,
		GGL_HC_MC,
		GGC_HL_MC,
		GGC_HC_ML,
		GGC_GC_MC,
		GGL_GC_MC,
		GGC_GL_MC,
		GGC_GC_ML,
		GGC_MC,
		GGL_MC,
		GGC_ML,
		GGL_ML,

		HC_MC_CC_SC,
		HL_MC_CC_SC,
		HC_ML_CC_SC,
		HC_MC_CL_SC,
		HC_MC_CC_SL,

		HC_CC_SC,
		HL_CC_SC,
		HC_CL_SC,
		HC_CC_SL,

		// gp sibling
		GC_HC_MC_SC,
		GL_HC_MC_SC,
		GC_HL_MC_SC,
		GC_HC_ML_SC,
		GC_HC_MC_SL,

		// tri-sibling
		HC_PC_MC_NC,
		HL_PC_MC_NC,
		HC_PL_MC_NC,
		HC_PC_ML_NC,
		HC_PC_MC_NL,

		HC_PC_NC,
		PC_MC_NC,
		HL_PC_NC,
		HC_PL_NC,
		HC_PC_NL,
		PL_MC_NC,
		PC_ML_NC,
		PC_MC_NL,

		PC_NC,
		PL_NC,
		PC_NL,

		COUNT,
	};
};
355 | 
// Template ids for high-order features plus the POS-tagging and
// segmentation templates listed at the end. Values are assigned implicitly
// by position (START is 0, COUNT is last), so the order of the enumerators
// must not change. The SEG_* entries inside the block comment are disabled
// and take no ids.
struct HighOrder {
	enum types {
		START,

		// pp attachment
		PP_HC_MC,
		PP_HL_MC,
		PP_HC_ML,
		PP_HL_ML,

		PP_PL_HC_MC,
		PP_PL_HL_MC,
		PP_PL_HC_ML,
		PP_PL_HL_ML,

		// conjunction
		CC_CP_LP_RP,
		CC_CP_LC_RC,
		CC_CW_LP_RP,
		CC_CW_LC_RC,

		CC_LC_RC_FID,

		CC_CP_HC_AC,
		CC_CP_HL_AL,
		CC_CW_HC_AC,
		CC_CW_HL_AL,

		CC_LP_RP_LENDIFF,
		CC_LC_RC_LENDIFF,
		CC_LENDIFF,

		CC_LP_RP_CHILDF,
		CC_LC_RC_CHILDF,

		// PNX
		PNX_MW,
		PNX_HP_MW,

		// right branch
		RB,

		// child num
		CN_HP_NUM,
		CN_HP_LNUM_RNUM,
		CN_STR,

		// heavy
		HV_HP,
		HV_HC,

		// neighbor
		NB_HP_LC_RC,
		NB_HC_LC_RC,
		NB_HL_LC_RC,
		NB_GC_HC_LC_RC,
		NB_GC_HL_LC_RC,
		NB_GL_HC_LC_RC,

		// non-proj
		NP,
		NP_MC,
		NP_HC,
		NP_HL,
		NP_ML,
		NP_HC_MC,
		NP_HL_MC,
		NP_HC_ML,
		NP_HL_ML,

		// pos tagging features
		ppP_P,
		pP_P,
		P_nP,		// duplicated...
		P_nnP,

		ppP_pP_P,
		pP_P_nP,
		P_nP_nnP,
		ppP_P_nP,
		pP_P_nnP,
		ppP_P_nnP,

		ppP_pP_P_nP,
		ppP_pP_P_nnP,
		pP_P_nP_nnP,
		ppP_P_nP_nnP,

		ppP_pP_P_nP_nnP,

		ppL_P,
		pL_P,
		L_P,
		P_nL,
		P_nnL,
		pP_L_P,
		L_P_nP,
		pP_L_P_nP,
		ppP_pP_L_P,
		L_P_nP_nnP,

		POS_PROB,
		P_POS_PROB,
		W_POS_PROB,
		SEG_PROB,
		W_SEG_PROB,

		SEG_P2,
/*		SEG_P1,
		SEG_U,
		SEG_N1,
		SEG_N2,
		SEG_P2_P1,
		SEG_P1_U,
		SEG_U_N1,
		SEG_N1_N2,
		SEG_IP2P1,
		SEG_IP1U,
		SEG_IUN1,
		SEG_IN1N2,
		SEG_IP3P1,
		SEG_IP2U,
		SEG_IP1N1,
		SEG_IUN2,
*/
		pL_P_L,
		P_L_nL,
		pL_P_nL,
		P_START_C_pC,
		P_MID_C_pC,
		P_C_C0,
		P_C0,
		pP_P_pC_C,
		P_PRE,
		P_SUF,
		P_LENGTH,

		SEG_W,

		COUNT,
	};
};
498 | 
/**
 * Packs a template id and a handful of small integer ids into one 64-bit
 * feature code by shifting and OR-ing.
 *
 * Naming of the generators: each letter before the final F names one packed
 * field (P, W, I, V); W fields are shifted by largeOff, all others by
 * midOff. A trailing F means flagOff empty bits are left between the fields
 * and the template id; genCodeWWW leaves none.
 *
 * None of the generators checks that a value fits its field or that the
 * total width stays within 64 bits; callers must choose the offsets and
 * values accordingly.
 */
class FeatureEncoder {
public:
	FeatureEncoder();
	virtual ~FeatureEncoder();

	/*******************************
	 *  offset
	 ******************************/

	int largeOff;		// word, lemma
	int midOff; 		// pos, cpos, type
	int flagOff;		// flag, children num, length diff etc.
	int tempOff;		// template

	// Smallest i such that 2^i >= x.
	int getBits(uint64_t x);

	uint64_t genCodePF(uint64_t temp, uint64_t p1);

	uint64_t genCodePPF(uint64_t temp, uint64_t p1, uint64_t p2);

	uint64_t genCodePPPF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3);

	uint64_t genCodePPPPF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3, uint64_t p4);

	uint64_t genCodePPPPPF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3, uint64_t p4, uint64_t p5);

	uint64_t genCodeWF(uint64_t temp, uint64_t w1);

	uint64_t genCodePWF(uint64_t temp, uint64_t p1, uint64_t w1);

	uint64_t genCodeWWF(uint64_t temp, uint64_t w1, uint64_t w2);

	uint64_t genCodeWWW(uint64_t temp, uint64_t w1, uint64_t w2, uint64_t w3);

	uint64_t genCodePPWF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t w1);

	uint64_t genCodePPPWF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3, uint64_t w1);

	uint64_t genCodePWWF(uint64_t temp, uint64_t p1, uint64_t w1, uint64_t w2);

	uint64_t genCodePPWWF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t w1, uint64_t w2);

	uint64_t genCodeIIVF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1);

	uint64_t genCodeIIVVF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t v2);

	uint64_t genCodeIIVVPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t v2, uint64_t p1);

	uint64_t genCodeIIVPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t p1);

	uint64_t genCodeIIVPPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t p1, uint64_t p2);

	uint64_t genCodeIIVVPPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t v2, uint64_t p1, uint64_t p2);
};
553 | 
554 | } /* namespace segparser */
555 | #endif /* FEATUREENCODER_H_ */
556 | 


--------------------------------------------------------------------------------
/FeatureExtractor.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * FeatureExtractor.h
  3 |  *
  4 |  *  Created on: Apr 2, 2014
  5 |  *      Author: yuanz
  6 |  */
  7 | 
  8 | #ifndef FEATUREEXTRACTOR_H_
  9 | #define FEATUREEXTRACTOR_H_
 10 | 
 11 | #include <unordered_map>
 12 | #include <string>
 13 | #include <vector>
 14 | #include <boost/shared_ptr.hpp>
 15 | #include "util/FeatureVector.h"
 16 | #include "DependencyInstance.h"
 17 | #include "SegParser.h"
 18 | #include "Options.h"
 19 | #include "DependencyPipe.h"
 20 | #include "Parameters.h"
 21 | 
 22 | namespace segparser {
 23 | 
 24 | using namespace std;
 25 | 
 26 | class SegParser;
 27 | class Parameters;
 28 | 
 29 | class CacheItem {
 30 | public:
 31 | 	FeatureVector fv;
 32 | 	double score;
 33 | 	int flag;
 34 | 
 35 | 	CacheItem() {
 36 | 		score = 0.0;
 37 | 		flag = 123;
 38 | 	}
 39 | };
 40 | 
 41 | typedef boost::shared_ptr<CacheItem> item_ptr;
 42 | 
 43 | /***
 44 |  * CacheTable always uses segIndex while FeatureExtractor always uses word/seg Index.
 45 |  * DependencyInstance is responsible for the conversion
 46 |  */
 47 | 
 48 | class PrunerFeatureExtractor;
 49 | 
 50 | class CacheTable {
 51 | public:
 52 | 	CacheTable();
 53 | 	virtual ~CacheTable();
 54 | 
 55 | 	void initCacheTable(int _type, DependencyInstance* inst, PrunerFeatureExtractor* pfe, Options* options);
 56 | 
 57 | 	bool isPruned(int h, int m);
 58 | 	int arc2ID(int h, int m);
 59 | 
 60 | 	int numSeg;			// length based on seg
 61 | 	int numWord;
 62 | 	int type;
 63 | 
 64 | 	int nuparcs;						// number of un-pruned arcs, include gold
 65 | 
 66 | 	vector<item_ptr> arc;		// first order cache [h][m]
 67 | 	vector<item_ptr> trips;		// second order [dep id][sib]
 68 | 	vector<item_ptr> sibs;		// [mod][sib][2]
 69 | 	vector<item_ptr> gpc;		// [dep id][child]
 70 | 	vector<item_ptr> posho;		// pos feature [hid]
 71 | 
 72 | private:
 73 | 	vector<int> arc2id;					// map (h->m) arc to an id in [0, nuparcs-1]
 74 | 	vector<bool> pruned;				// whether a (h->m) arc is pruned, not necessarily include gold
 75 | };
 76 | 
 77 | class FeatureExtractor {
 78 | public:
 79 | 	FeatureExtractor();
 80 | 	FeatureExtractor(DependencyInstance* inst, SegParser* parser, Parameters* params, int thread);
 81 | 	virtual ~FeatureExtractor();
 82 | 
 83 | 	CacheTable* getCacheTable(DependencyInstance* s);
 84 | 
 85 | 	double getPartialDepScore(DependencyInstance* s, HeadIndex& x, CacheTable* cache);
 86 | 	double getPartialBigramDepScore(DependencyInstance* s, HeadIndex& x, HeadIndex& y, CacheTable* cache);
 87 | 	double getPartialPosScore(DependencyInstance* s, HeadIndex& x, CacheTable* cache);
 88 | 	double getScore(DependencyInstance* s);
 89 | 	double getScore(DependencyInstance* s, CacheTable* cache);
 90 | 	void getPartialFv(DependencyInstance* s, HeadIndex& x, FeatureVector* fv);
 91 | 	void getFv(DependencyInstance* s, FeatureVector* fv);
 92 | 
 93 | 	vector<bool> isPruned(DependencyInstance* s, HeadIndex& m, CacheTable* cache);
 94 | 
 95 | 	int numWord;
 96 | 	int type;
 97 | 	int thread;
 98 | 
 99 | 	//DependencyInstance* inst;		so risky to add this variable in multi-thread scenario. Other variables are read-only
100 | 	DependencyPipe* pipe;
101 | 	Parameters* parameters;
102 | 	SegParser* pruner;
103 | 	boost::shared_ptr<PrunerFeatureExtractor> pfe;
104 | 
105 | 	void (*getArcFv)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, FeatureVector*, CacheTable*);
106 | 	double (*getArcScore)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, CacheTable*);
107 | 
108 | 	void (*getSibsFv)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, bool, FeatureVector*, CacheTable*);
109 | 	double (*getSibsScore)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, bool, CacheTable*);
110 | 
111 | 	void (*getTripsFv)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, HeadIndex&, FeatureVector*, CacheTable*);
112 | 	double (*getTripsScore)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, HeadIndex&, CacheTable*);
113 | 
114 | 	void (*getGPCFv)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, HeadIndex&, FeatureVector*, CacheTable*);
115 | 	double (*getGPCScore)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, HeadIndex&, CacheTable*);
116 | 
117 | 	void (*getPosHOFv)(FeatureExtractor*, DependencyInstance*, HeadIndex&, FeatureVector*, CacheTable*);
118 | 	double (*getPosHOScore)(FeatureExtractor*, DependencyInstance*, HeadIndex&, CacheTable*);
119 | 
120 | 	// pre-computed
121 | 	void getPos1OFv(DependencyInstance* inst, HeadIndex& m, FeatureVector* fv);
122 | 	double getPos1OScore(DependencyInstance* inst, HeadIndex& m);
123 | 	void getSegFv(DependencyInstance* inst, int wordid, FeatureVector* fv);
124 | 	double getSegScore(DependencyInstance* inst, int worid);
125 | 
126 | 	vector<CacheTable> optSegCacheMap;		// cache for optimal seg for every word with different POS
127 | 	vector<CacheTable> subOptSegCacheMap;	// cache for sub-optimal seg for one word with optimal POS
128 | 
129 | 	// cache not related to seg/pos choices
130 | 	vector<item_ptr> seg1o;		// seg feature [wordid]
131 | 	vector<item_ptr> pos1o;		// pos feature [segid]
132 | 
133 | protected:
134 | 	void constructCacheMap(DependencyInstance* s);
135 | 	void initCacheMap(DependencyInstance* s);
136 | 
137 | 	// feature functions and pointers
138 | 	static void getArcFvUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& h, HeadIndex& m, FeatureVector* fv, CacheTable* cache);
139 | 	static void getArcFvAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& h, HeadIndex& m, FeatureVector* fv, CacheTable* cache);
140 | 	static double getArcScoreUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& h, HeadIndex& m, CacheTable* cache);
141 | 	static double getArcScoreAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& h, HeadIndex& m, CacheTable* cache);
142 | 
143 | 	static void getSibsFvUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& ch1, HeadIndex& ch2, bool isSt, FeatureVector* fv, CacheTable* cache);
144 | 	static void getSibsFvAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& ch1, HeadIndex& ch2, bool isSt, FeatureVector* fv, CacheTable* cache);
145 | 	static double getSibsScoreUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& ch1, HeadIndex& ch2, bool isSt, CacheTable* cache);
146 | 	static double getSibsScoreAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& ch1, HeadIndex& ch2, bool isSt, CacheTable* cache);
147 | 
148 | 	static void getTripsFvUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& par, HeadIndex& ch1, HeadIndex& ch2, FeatureVector* fv, CacheTable* cache);
149 | 	static void getTripsFvAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& par, HeadIndex& ch1, HeadIndex& ch2, FeatureVector* fv, CacheTable* cache);
150 | 	static double getTripsScoreUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& par, HeadIndex& ch1, HeadIndex& ch2, CacheTable* cache);
151 | 	static double getTripsScoreAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& par, HeadIndex& ch1, HeadIndex& ch2, CacheTable* cache);
152 | 
153 | 	static void getGPCFvUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& gp, HeadIndex& par, HeadIndex& c, FeatureVector* fv, CacheTable* cache);
154 | 	static void getGPCFvAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& gp, HeadIndex& par, HeadIndex& c, FeatureVector* fv, CacheTable* cache);
155 | 	static double getGPCScoreUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& gp, HeadIndex& par, HeadIndex& c, CacheTable* cache);
156 | 	static double getGPCScoreAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& gp, HeadIndex& par, HeadIndex& c, CacheTable* cache);
157 | 
158 | 	static void getPosHOFvUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& m, FeatureVector* fv, CacheTable* cache);
159 | 	static void getPosHOFvAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& m, FeatureVector* fv, CacheTable* cache);
160 | 	static double getPosHOScoreUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& m, CacheTable* cache);
161 | 	static double getPosHOScoreAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& m, CacheTable* cache);
162 | 
163 | 	void setAtomic(int thread);
164 | 	bool atomic;							// whether the load/store need atomic operation
165 | 
166 | 	// cache map
167 | 	vector<int> optSegCacheStPos;			// start position in the cache map for each seg
168 | 
169 | 	vector<int> subOptSegCacheStPos;		// start position in the cache map for each word
170 | 
171 | 	vector<int> seg1oStPos;		// [word]->segcand
172 | 	vector<int> pos1oStPos2d;	// [word][segcand]->segid
173 | 	vector<int> pos1oStPos3d;	// [word][segcand][segid]->poscand
174 | 
175 | 	int getSeg1OCachePos(int wordid, int segCandID);
176 | 	int getPos1OCachePos(int wordid, int segCandID, int segid, int posCandID);
177 | 
178 | 	// others
179 | 	Options* options;
180 | };
181 | 
182 | class PrunerFeatureExtractor : public segparser::FeatureExtractor {
183 | public:
184 | 	CacheTable prunerCache;
185 | 
186 | 	PrunerFeatureExtractor();
187 | 	void init(DependencyInstance* inst, SegParser* pruner, int thread);
188 | 	void prune(DependencyInstance* inst, HeadIndex& m, vector<bool>& pruned);
189 | };
190 | 
191 | } /* namespace segparser */
192 | #endif /* FEATUREEXTRACTOR_H_ */
193 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 yuanzh
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/Options.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Options.cpp
  3 |  *
  4 |  *  Created on: Mar 27, 2014
  5 |  *      Author: yuanz
  6 |  */
  7 | 
  8 | #include "Options.h"
  9 | #include "util/Constant.h"
 10 | #include "util/StringUtils.h"
 11 | #include <string>
 12 | #include <vector>
 13 | #include <iostream>
 14 | 
 15 | namespace segparser {
 16 | 
 17 | using namespace std;
 18 | 
 19 | Options::Options() {
 20 | 	trainFile = "";
 21 | 	testFile = "";
 22 | 
 23 | 	outFile = "";
 24 | 	modelName = "";
 25 | 
 26 | 	lang = -1;
 27 | 
 28 | 	train = false;
 29 | 	test = false;
 30 | 
 31 | 	trainPruner = true;
 32 | 
 33 | 	learningMode = DecodingMode::HillClimb;
 34 | 	testingMode = DecodingMode::HillClimb;
 35 | 
 36 | 	// parameter
 37 | 	numIters = 10;
 38 | 	maxHead = 20;
 39 | 	pruneThresh = 0.05;
 40 | 
 41 | 	trainSentences = 1000000;
 42 | 	testSentences = 1000000;
 43 | 	maxLength = 100;
 44 | 
 45 | 	devThread = 5;
 46 | 	trainThread = 10;
 47 | 
 48 | 	seed = 0;
 49 | 	regC = 0.0001;
 50 | 
 51 | 	// feature;
 52 | 	useCS = true;			// consecutive sibling
 53 | 	useGP = true;			// grandparent
 54 | 	useHO = true;			// high order and global
 55 | 	useSP = true;			// seg/pos feature
 56 | 
 57 | 	trainConvergeIter = 200;
 58 | 	testConvergeIter = 200;
 59 | 
 60 | 	evalPunc = true;
 61 | 	useTedEval = false;
 62 | 	jointSegPos = true;
 63 | 	earlyStop = 40;
 64 | 
 65 | 	saveBestModel = true;
 66 | 	bestScore = -100;
 67 | }
 68 | 
 69 | Options::~Options() {
 70 | }
 71 | 
 72 | void Options::processArguments(int argc, char** argv) {
 73 | 	for(int i = 0; i < argc; ++i) {
 74 | 		string str(argv[i]);
 75 | 		vector<string> pair;
 76 | 		StringSplit(str, ":", &pair);
 77 | 		if(pair[0].compare("train") == 0) {
 78 | 			train = true;
 79 | 		}
 80 | 		if(pair[0].compare("test") == 0) {
 81 | 			test = true;
 82 | 		}
 83 | 		if(pair[0].compare("iters") == 0) {
 84 | 			numIters = atoi(pair[1].c_str());
 85 | 		}
 86 | 		if(pair[0].compare("output-file") == 0) {
 87 | 			outFile = pair[1];
 88 | 		}
 89 | 		if(pair[0].compare("train-file") == 0) {
 90 | 			trainFile = pair[1];
 91 | 		}
 92 | 		if(pair[0].compare("test-file") == 0) {
 93 | 			testFile = pair[1];
 94 | 			if (outFile.empty())
 95 | 				outFile = testFile + ".res";
 96 | 		}
 97 | 		if(pair[0].compare("model-name") == 0) {
 98 | 			modelName = pair[1];
 99 | 		}
100 | 		if (pair[0].compare("seed") == 0) {
101 | 			seed = atoi(pair[1].c_str());
102 | 		}
103 | 		if (pair[0].compare("devthread") == 0) {
104 | 			devThread = atoi(pair[1].c_str());
105 | 		}
106 | 		if (pair[0].compare("trainthread") == 0) {
107 | 			trainThread = atoi(pair[1].c_str());
108 | 		}
109 | 		if (pair[0].compare("max-sent") == 0) {
110 | 			trainSentences = atoi(pair[1].c_str());
111 | 		}
112 | 		if (pair[0].compare("max-test-sent") == 0) {
113 | 			testSentences = atoi(pair[1].c_str());
114 | 		}
115 | 		if (pair[0].compare("C") == 0) {
116 | 			regC = atof(pair[1].c_str());
117 | 		}
118 | 		if (pair[0].compare("train-converge") == 0) {
119 | 			trainConvergeIter = atoi(pair[1].c_str());
120 | 		}
121 | 		if (pair[0].compare("test-converge") == 0) {
122 | 			testConvergeIter = atoi(pair[1].c_str());
123 | 		}
124 | 		if (pair[0].compare("tedeval") == 0) {
125 | 			useTedEval = (pair[1] == "true" ? true : false);
126 | 		}
127 | 		if (pair[0].compare("joint") == 0) {
128 | 			jointSegPos = (pair[1] == "true" ? true : false);
129 | 		}
130 | 		if (pair[0].compare("evalpunc") == 0) {
131 | 			evalPunc = (pair[1] == "true" ? true : false);
132 | 		}
133 | 		if (pair[0].compare("earlystop") == 0) {
134 | 			earlyStop = atoi(pair[1].c_str());
135 | 		}
136 | 		if (pair[0].compare("savebest") == 0) {
137 | 			saveBestModel = (pair[1] == "true" ? true : false);
138 | 		}
139 | 		if (pair[0].compare("ho") == 0) {
140 | 			useHO = (pair[1] == "true" ? true : false);
141 | 		}
142 | 
143 | 		//TODO: add useHO option
144 | 	}
145 | 
146 | 
147 | 	string file = trainFile;
148 | 	if (file.empty())
149 | 		file = testFile;
150 | 
151 | 	lang = findLang(file);
152 | }
153 | 
154 | int Options::findLang(string file) {
155 | 	for (int i = 0; i < PossibleLang::Count; ++i)
156 | 		if (file.find(PossibleLang::langString[i]) != string::npos) {
157 | 			return i;
158 | 		}
159 | 	cout << "Warning: unknow language" << endl;
160 | 	return PossibleLang::Count;
161 | }
162 | 
163 | void Options::setPrunerOptions() {
164 | 	modelName = modelName + ".pruner";
165 | 
166 | 	test = false;
167 | 
168 | 	trainPruner = false;
169 | 
170 | 	learningMode = DecodingMode::Exact;
171 | 	testingMode = DecodingMode::Exact;
172 | 
173 | 	// parameter
174 | 	numIters = 10;
175 | 
176 | 	devThread = 1;
177 | 	trainThread = 1;
178 | 
179 | 	regC = 0.1;
180 | 
181 | 	// feature;
182 | 	useCS = false;			// consecutive sibling
183 | 	useGP = false;			// grandparent
184 | 	useHO = false;			// high order and global
185 | 	useSP = false;
186 | 
187 | 	saveBestModel = false;
188 | }
189 | 
190 | void Options::outputArg() {
191 | 	cout << "------\nFLAGS\n------" << endl;
192 | 	cout << "train-file: " << trainFile << endl;
193 | 	cout << "test-file: " << testFile << endl;
194 | 	cout << "out-file: " << outFile << endl;
195 | 	cout << "model-name: " << modelName << endl;
196 | 	cout << "train: " << train << endl;
197 | 	cout << "test: " << test << endl;
198 | 	cout << "training-iterations: " << numIters << endl;
199 | 	cout << "seed: " << seed << endl;
200 | 	cout << "use consecutive sibling: " << useCS << endl;
201 | 	cout << "use grandparent: " << useGP << endl;
202 | 	cout << "use grand sibling, tri-sibling and high order: " << useHO << endl;
203 | 	cout << "learning mode: " << learningMode << endl;
204 | 	cout << "testing mode: " << testingMode << endl;
205 | 	cout << "train thread: " << trainThread << endl;
206 | 	cout << "dev thread: " << devThread << endl;
207 | 	cout << "reg C: " << regC << endl;
208 | 	cout << "train converge iter: " << trainConvergeIter << endl;
209 | 	cout << "test converge iter: " << testConvergeIter << endl;
210 | 	cout << "early stop: " << earlyStop << endl;
211 | 	cout << "tedeval: " << useTedEval << endl;
212 | 	cout << "joint seg pos: " << jointSegPos << endl;
213 | 	cout << "prune: " << trainPruner << endl;
214 | 	cout << "save best model: " << saveBestModel << endl;
215 | 	cout << "------\n" << endl;
216 | }
217 | 
218 | } /* namespace segparser */
219 | 


--------------------------------------------------------------------------------
/Options.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Options.h
 3 |  *
 4 |  *  Created on: Mar 27, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef OPTIONS_H_
 9 | #define OPTIONS_H_
10 | 
11 | #include <string>
12 | 
13 | namespace segparser {
14 | 
15 | using namespace std;
16 | 
/**
 * Run-time configuration of the parser: file names, run mode, training
 * hyper-parameters, feature switches and evaluation settings.
 *
 * Defaults are assigned by the constructor; processArguments() overrides them
 * from "key:value" command-line tokens, and setPrunerOptions() rewrites the
 * object into the configuration used for the pruner model.
 */
class Options {
public:
	// file name
	string trainFile;
	string testFile;

	string outFile;
	string modelName;

	int lang;				// set by processArguments() from the data file name

	// model type
	bool train;
	bool test;

	bool trainPruner;

	int learningMode;
	int testingMode;

	// parameter
	int numIters;
	int maxHead;
	double pruneThresh;

	int trainSentences;
	int testSentences;
	int maxLength;			// maximum length of the sentences during *training*

	int devThread;
	int trainThread;		// only useful when hill climbing training

	int seed;
	double regC;

	// feature;
	bool useCS;			// consecutive sibling
	bool useGP;			// grandparent
	bool useHO;			// grand-sibling, tri-sibling and high order and global
	bool useSP;			// seg pos feature

	int trainConvergeIter;	// for hill climbing
	int testConvergeIter;

	bool evalPunc;
	bool useTedEval;
	bool jointSegPos;	// joint model or pipeline
	int earlyStop;		// early stop strategy in training

	bool saveBestModel;
	double bestScore;

	Options();
	virtual ~Options();

	// Parses "key" / "key:value" tokens and then fills in `lang`.
	void processArguments(int argc, char** argv);
	// Switches this object to the pruner-model configuration.
	void setPrunerOptions();
	// Prints the current values to standard output.
	void outputArg();

private:
	// Maps a file name to a language id by substring match.
	int findLang(string file);
};
80 | 
81 | } /* namespace segparser */
82 | #endif /* OPTIONS_H_ */
83 | 


--------------------------------------------------------------------------------
/Parameters.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Parameters.cpp
  3 |  *
  4 |  *  Created on: Apr 6, 2014
  5 |  *      Author: yuanz
  6 |  */
  7 | 
  8 | #include "Parameters.h"
  9 | #include <boost/multi_array.hpp>
 10 | #include "util/SerializationUtils.h"
 11 | 
 12 | namespace segparser {
 13 | 
 14 | Parameters::Parameters(int size, Options* options)
 15 | 	: size(size), options(options){
 16 | 	parameters.clear();
 17 | 	total.clear();
 18 | 	parameters.resize(size, 0.0);
 19 | 	total.resize(size, 0.0);
 20 | }
 21 | 
 22 | Parameters::~Parameters() {
 23 | }
 24 | 
 25 | void Parameters::copyParams(Parameters* param) {
 26 | 	parameters = param->parameters;
 27 | 	total = param->total;
 28 | 	size = param->size;
 29 | 	options = param->options;
 30 | }
 31 | 
 32 | void Parameters::averageParams(double avVal) {
 33 | 	std::cout << "update time: " << avVal << std::endl;
 34 | 	for (int j = 0; j < size; ++j)
 35 | 		parameters[j] -= (avVal == 0 ? 0 : total[j] / avVal);
 36 | }
 37 | 
 38 | double Parameters::numError(DependencyInstance* gold, DependencyInstance* pred) {
 39 | 	ThrowException("should not be here");
 40 | 	double e = 0.0;
 41 | 
 42 | 	for (int i = 1; i < gold->numWord; ++i) {
 43 | 		SegInstance& goldSeg = gold->word[i].getCurrSeg();
 44 | 		SegInstance& predSeg = pred->word[i].getCurrSeg();
 45 | 
 46 | 		if (gold->word[i].currSegCandID != pred->word[i].currSegCandID) {
 47 | 			e += 1.5 * predSeg.size();
 48 | 		}
 49 | 		else {
 50 | 			// compare match element
 51 | 			for (int j = 0; j < predSeg.size(); ++j) {
 52 | 				SegElement& goldEle = goldSeg.element[j];
 53 | 				SegElement& predEle = predSeg.element[j];
 54 | 
 55 | 				if (goldEle.currPosCandID != predEle.currPosCandID) {
 56 | 					e += 1.0;
 57 | 				}
 58 | 				else if (goldEle.dep != predEle.dep) {
 59 | 					e += 1.0;
 60 | 				}
 61 | 				else if (goldEle.labid != predEle.labid) {
 62 | 					e += 0.5;
 63 | 				}
 64 | 			}
 65 | 		}
 66 | 
 67 | 	}
 68 | 	return e;
 69 | }
 70 | 
 71 | double Parameters::elementError(WordInstance& gold, WordInstance& pred, int segid) {
 72 | 	double e = 0.0;
 73 | 
 74 | 	if (gold.currSegCandID != pred.currSegCandID) {
 75 | 		e += 2.0;		// this value should not matter...
 76 | 	}
 77 | 	else {
 78 | 		SegElement& goldEle = gold.getCurrSeg().element[segid];
 79 | 		SegElement& predEle = pred.getCurrSeg().element[segid];
 80 | 
 81 | 		if (goldEle.currPosCandID != predEle.currPosCandID) {
 82 | 			e += 1.0;		// this value should not matter...
 83 | 		}
 84 | 		else if (goldEle.dep != predEle.dep) {
 85 | 			e += 1.0;
 86 | 		}
 87 | 		else if (goldEle.labid != predEle.labid) {
 88 | 			e += 0.5;
 89 | 		}
 90 | 
 91 | 	}
 92 | 
 93 | 	return e;
 94 | }
 95 | 
 96 | double Parameters::wordError(WordInstance& gold, WordInstance& pred) {
 97 | 	double e = 0.0;
 98 | 
 99 | 	if (gold.currSegCandID != pred.currSegCandID) {
100 | 		e += 1.0 * (gold.getCurrSeg().size() + pred.getCurrSeg().size());
101 | 	}
102 | 	else {
103 | 		assert(gold.getCurrSeg().size() == pred.getCurrSeg().size());
104 | 
105 | 		for (int i = 0; i < gold.getCurrSeg().size(); ++i) {
106 | 			SegElement& goldEle = gold.getCurrSeg().element[i];
107 | 			SegElement& predEle = pred.getCurrSeg().element[i];
108 | 
109 | 			assert(goldEle.labid == predEle.labid);
110 | 
111 | 			if (goldEle.currPosCandID != predEle.currPosCandID) {
112 | 				e += 1.0;
113 | 			}
114 | 			else if (goldEle.dep != predEle.dep) {
115 | 				e += 1.0;
116 | 			}
117 | 			else if (goldEle.labid != predEle.labid) {
118 | 				e += 0.5;
119 | 			}
120 | 		}
121 | 	}
122 | 	return e;
123 | }
124 | 
125 | double Parameters::wordDepError(WordInstance& gold, WordInstance& pred) {
126 | 	double e = 0.0;
127 | 
128 | 	if (gold.currSegCandID != pred.currSegCandID) {
129 | 		e += 1.0 * (gold.getCurrSeg().size() + pred.getCurrSeg().size());
130 | 	}
131 | 	else {
132 | 		assert(gold.getCurrSeg().size() == pred.getCurrSeg().size());
133 | 
134 | 		for (int i = 0; i < gold.getCurrSeg().size(); ++i) {
135 | 			SegElement& goldEle = gold.getCurrSeg().element[i];
136 | 			SegElement& predEle = pred.getCurrSeg().element[i];
137 | 
138 | 			assert(goldEle.labid == predEle.labid);
139 | 
140 | 			if (goldEle.currPosCandID != predEle.currPosCandID) {
141 | 				e += 1.0;
142 | 			}
143 | 			else if (goldEle.dep != predEle.dep) {
144 | 				e += 1.0;
145 | 			}
146 | 			else if (goldEle.labid != predEle.labid) {
147 | 				e += 0.5;
148 | 			}
149 | 		}
150 | 	}
151 | 
152 | 	return e;
153 | }
154 | 
155 | void Parameters::update(DependencyInstance* target, DependencyInstance* curr,
156 | 		FeatureVector* diffFv, double loss, FeatureExtractor* fe, int upd) {
157 | 	// upd start from 0
158 | 
159 | 	//double e = numError(gold, pred);
160 | 	//double loss = e - diffScore;
161 | 
162 | 	if (loss < 1e-4)
163 | 		return;
164 | 
165 | 	double l2norm = diffFv->dotProduct(diffFv);
166 | 	if (l2norm <= 1e-6)
167 | 		return;
168 | 
169 | 	double alpha = loss/l2norm;
170 | 
171 | 	if (alpha > options->regC)
172 | 		alpha = options->regC;
173 | 
174 | 	if (alpha > 0) {
175 | 		// update theta
176 | 		for (unsigned int i = 0; i < diffFv->binaryIndex.size(); ++i) {
177 | 			parameters[diffFv->binaryIndex[i]] += alpha;
178 | 			total[diffFv->binaryIndex[i]] += upd * alpha;
179 | 		}
180 | 		for (unsigned int i = 0; i < diffFv->negBinaryIndex.size(); ++i) {
181 | 			parameters[diffFv->negBinaryIndex[i]] -= alpha;
182 | 			total[diffFv->negBinaryIndex[i]] -= upd * alpha;
183 | 		}
184 | 		for (unsigned int i = 0; i < diffFv->normalIndex.size(); ++i) {
185 | 			double val = min(2.0, max(-2.0, diffFv->normalValue[i]));
186 | 			parameters[diffFv->normalIndex[i]] += alpha * val;
187 | 			total[diffFv->normalIndex[i]] += upd * alpha * val;
188 | 		}
189 | 	}
190 | }
191 | 
192 | double Parameters::getScore(FeatureVector* fv) {
193 | 	double score = 0.0;
194 | 	for (unsigned int i = 0; i < fv->binaryIndex.size(); ++i) {
195 | 		score += parameters[fv->binaryIndex[i]];
196 | 	}
197 | 	for (unsigned int i = 0; i < fv->negBinaryIndex.size(); ++i) {
198 | 		score -= parameters[fv->negBinaryIndex[i]];
199 | 	}
200 | 	for (unsigned int i = 0; i < fv->normalIndex.size(); ++i) {
201 | 		score += parameters[fv->normalIndex[i]] * fv->normalValue[i];
202 | 	}
203 | 	return score;
204 | }
205 | 
206 | void Parameters::writeParams(FILE* fs) {
207 | 	CHECK(WriteInteger(fs, size));
208 | 	CHECK(WriteDoubleArray(fs, parameters));
209 | }
210 | 
211 | void Parameters::readParams(FILE* fs) {
212 | 	CHECK(ReadInteger(fs, &size));
213 | 	CHECK(ReadDoubleArray(fs, &parameters));
214 | }
215 | 
216 | } /* namespace segparser */
217 | 


--------------------------------------------------------------------------------
/Parameters.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Parameters.h
 3 |  *
 4 |  *  Created on: Apr 6, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef PARAMETERS_H_
 9 | #define PARAMETERS_H_
10 | 
11 | #include <vector>
12 | #include "Options.h"
13 | #include "DependencyInstance.h"
14 | #include "util/FeatureVector.h"
15 | #include "FeatureExtractor.h"
16 | 
17 | namespace segparser {
18 | 
19 | using namespace std;
20 | 
21 | class FeatureExtractor;
22 | 
23 | class Parameters {
24 | public:
25 | 	vector<double> parameters;
26 | 	vector<double> total;
27 | 	int size;
28 | 
29 | 	Parameters(int size, Options* options);
30 | 	virtual ~Parameters();
31 | 
32 | 	void copyParams(Parameters* param);
33 | 	void averageParams(double avVal);
34 | 	void update(DependencyInstance* gold, DependencyInstance* pred,
35 | 			FeatureVector* diffFv, double loss, FeatureExtractor* fe, int upd);
36 | 	double getScore(FeatureVector* fv);
37 | 
38 | 	void writeParams(FILE* fs);
39 | 	void readParams(FILE* fs);
40 | 
41 | 	double elementError(WordInstance& gold, WordInstance& pred, int segid);
42 | 	double wordError(WordInstance& gold, WordInstance& pred);
43 | 	double wordDepError(WordInstance& gold, WordInstance& pred);
44 | private:
45 | 	Options* options;
46 | 
47 | 	int maxMatch(SegInstance& gold, SegInstance& pred, vector<int>& match);
48 | 	double numError(DependencyInstance* gold, DependencyInstance* pred);
49 | };
50 | 
51 | } /* namespace segparser */
52 | #endif /* PARAMETERS_H_ */
53 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | #### SegParser
 2 | 
 3 | Randomized Greedy algorithm for joint segmentation, POS tagging and dependency parsing
 4 | 
 5 | =========
 6 | 
 7 | #### Usage
 8 | 
 9 | ##### 1. Compilation
10 | 
11 | To compile the project, first make sure you have installed boost and boost-regex on your machine. Next, go to the "Release" directory and run command "make all" to compile the code. Note that the implementation uses some c++0x/c++11 features. Please make sure your compiler supports them.
12 | 
13 | <br> 
14 | 
15 | ##### 2. Data Format
16 | 
17 | The data format for each sentence has two parts. The first part is similar to the one used in CoNLL-X shared task. The only difference is the index in the first column. Here the index format is "token index/segment index", where the token index starts from 1 (0 is for the root), while the segment index starts from 0.
18 | 
19 | The second part encodes the search space for segmentation and POS tagging. Each line contains a string for the lattice structure of each token. The format is as follows.
20 | 
21 | line := Token form\tCandidate1\tCandidate2\t...
22 | 
23 | Candidate := Segmentation||Al index||Morphology index||Morphology value||Candidate probability
24 | 
25 | Segmentation := Segment1&&Segment2&&...
26 | 
27 | Segment := Surface form@#Lemma form@#POS candidate1@#POS candidate2@#...
28 | 
29 | POS candidate := POS tag_probability
30 | 
 31 | The "data" directory includes sample data files for the SPMRL dataset.
32 | 
33 | ##### 3. Datasets
34 | 
 35 | Because of licensing restrictions, the datasets are not released here directly. You can find sample files in the "data" directory. Please contact me if you are interested in the full dataset.
36 | 
 37 | UPDATE: a data generator for the SPMRL dataset, together with the files needed to generate the test data, has been added in the directory spmrl_data_generator.
38 | 
39 | ##### 4. Usage
40 | 
41 | Take a look at the scripts "run_DATA.sh" and "run_DATA_test.sh" where DATA=spmrl|classical|chinese. For example, to train a model on the SPMRL dataset, you can simply run
42 | 
43 | run_spmrl.sh run1
44 | 
 45 | The model and development results will be saved in the directory "runs". Note that the model is evaluated on the development set (if one exists) after each epoch *in parallel* with the training. After the model is trained, you can evaluate it on the test set by running
46 | 
47 | run_spmrl_test.sh run1
48 | 
49 | 


--------------------------------------------------------------------------------
/Release/SharedTaskCommon.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/Release/SharedTaskCommon.pyc


--------------------------------------------------------------------------------
/Release/decoder/subdir.mk:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | # Add inputs and outputs from these tool invocations to the build variables 
 6 | CPP_SRCS += \
 7 | ../decoder/ClassifierDecoder.cpp \
 8 | ../decoder/DependencyDecoder.cpp \
 9 | ../decoder/DevelopmentThread.cpp \
10 | ../decoder/HillClimbingDecoder.cpp 
11 | 
12 | OBJS += \
13 | ./decoder/ClassifierDecoder.o \
14 | ./decoder/DependencyDecoder.o \
15 | ./decoder/DevelopmentThread.o \
16 | ./decoder/HillClimbingDecoder.o 
17 | 
18 | CPP_DEPS += \
19 | ./decoder/ClassifierDecoder.d \
20 | ./decoder/DependencyDecoder.d \
21 | ./decoder/DevelopmentThread.d \
22 | ./decoder/HillClimbingDecoder.d 
23 | 
24 | 
25 | # Each subdirectory must supply rules for building sources it contributes
26 | decoder/%.o: ../decoder/%.cpp
27 | 	@echo 'Building file: $<'
28 | 	@echo 'Invoking: GCC C++ Compiler'
29 | 	g++ -D__GXX_EXPERIMENTAL_CXX0X__ -D__cplusplus=201103L -O3 -Wall -c -fmessage-length=0 -std=c++0x -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o "$@" "$<"
30 | 	@echo 'Finished building: $<'
31 | 	@echo ' '
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/Release/io/subdir.mk:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | # Add inputs and outputs from these tool invocations to the build variables 
 6 | CPP_SRCS += \
 7 | ../io/DependencyReader.cpp \
 8 | ../io/DependencyWriter.cpp 
 9 | 
10 | OBJS += \
11 | ./io/DependencyReader.o \
12 | ./io/DependencyWriter.o 
13 | 
14 | CPP_DEPS += \
15 | ./io/DependencyReader.d \
16 | ./io/DependencyWriter.d 
17 | 
18 | 
19 | # Each subdirectory must supply rules for building sources it contributes
20 | io/%.o: ../io/%.cpp
21 | 	@echo 'Building file: $<'
22 | 	@echo 'Invoking: GCC C++ Compiler'
23 | 	g++ -D__GXX_EXPERIMENTAL_CXX0X__ -D__cplusplus=201103L -O3 -Wall -c -fmessage-length=0 -std=c++0x -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o "$@" "$<"
24 | 	@echo 'Finished building: $<'
25 | 	@echo ' '
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/Release/lattice_to_segmentation.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # Author : Reut Tsarfaty, July 2013
 3 | # ligth modifs: Djame Seddah
 4 | # +modif to support ptb's lattice files 
 5 | import sys
 6 | 
 7 | if sys.argv[1] == "-ptb":
 8 | 	ptb=1
 9 | else:
10 | 	ptb=0
11 | 
12 | 
13 | 
14 | 
15 | prev_tok = ""
16 | out_line = ""
17 | first=1
18 | for line in sys.stdin:
19 |    line = line.strip().split()
20 |    if not line:
21 |       #out_line +=  "\t".join([token,form])
22 |       if out_line:
23 |         print out_line
24 |       else:
25 |         print "\n"
26 |       prev_tok = ""
27 |       out_line = ""
28 |       #print "\n"
29 |       continue
30 |       
31 |    if ptb == -1:  #this code is bogus, the ptb hebrew files lacks the lemma field
32 |    		start, end, form, lemma, cpos, fpos, feats, token = line
33 |    else:
34 |    		start, end, form = line[0:3]
35 |    		token = line[-1]
36 |    	
37 |    if prev_tok == token:
38 |         out_line += "".join([":",form])
39 |         prev_tok = token
40 |    else:
41 |         if first==1: #lame modif to avoid first line void
42 |            first=0
43 |         else:
44 |            print out_line
45 |         out_line = ""
46 |         out_line +=  "\t".join([token,form])
47 |         prev_tok = token
48 | print "\n"
49 |           
50 |        
51 | 
52 | 
53 | 
54 | 


--------------------------------------------------------------------------------
/Release/makefile:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | -include ../makefile.init
 6 | 
 7 | RM := rm -rf
 8 | 
 9 | # All of the sources participating in the build are defined here
10 | -include sources.mk
11 | -include util/subdir.mk
12 | -include io/subdir.mk
13 | -include decoder/subdir.mk
14 | -include subdir.mk
15 | -include objects.mk
16 | 
17 | ifneq ($(MAKECMDGOALS),clean)
18 | ifneq ($(strip $(C++_DEPS)),)
19 | -include $(C++_DEPS)
20 | endif
21 | ifneq ($(strip $(C_DEPS)),)
22 | -include $(C_DEPS)
23 | endif
24 | ifneq ($(strip $(CC_DEPS)),)
25 | -include $(CC_DEPS)
26 | endif
27 | ifneq ($(strip $(CPP_DEPS)),)
28 | -include $(CPP_DEPS)
29 | endif
30 | ifneq ($(strip $(CXX_DEPS)),)
31 | -include $(CXX_DEPS)
32 | endif
33 | ifneq ($(strip $(C_UPPER_DEPS)),)
34 | -include $(C_UPPER_DEPS)
35 | endif
36 | endif
37 | 
38 | -include ../makefile.defs
39 | 
40 | # Add inputs and outputs from these tool invocations to the build variables 
41 | 
42 | # All Target
43 | all: SegParser
44 | 
45 | # Tool invocations
46 | SegParser: $(OBJS) $(USER_OBJS)
47 | 	@echo 'Building target: $@'
48 | 	@echo 'Invoking: GCC C++ Linker'
49 | 	g++  -o "SegParser" $(OBJS) $(USER_OBJS) $(LIBS)
50 | 	@echo 'Finished building target: $@'
51 | 	@echo ' '
52 | 
53 | # Other Targets
54 | clean:
55 | 	-$(RM) $(OBJS)$(C++_DEPS)$(C_DEPS)$(CC_DEPS)$(CPP_DEPS)$(EXECUTABLES)$(CXX_DEPS)$(C_UPPER_DEPS) SegParser
56 | 	-@echo ' '
57 | 
58 | .PHONY: all clean dependents
59 | .SECONDARY:
60 | 
61 | -include ../makefile.targets
62 | 


--------------------------------------------------------------------------------
/Release/objects.mk:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 | 
5 | USER_OBJS :=
6 | 
7 | LIBS := -lboost_regex -lpthread
8 | 
9 | 


--------------------------------------------------------------------------------
/Release/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Trains SegParser on DATASET's cv2 training split and tests on its cv2 test split.
 3 | # Usage: run.sh DATASET RUNID [extra SegParser options...]
 4 | # NOTE(review): "$@" still contains DATASET and RUNID themselves, exactly as in
 5 | # the original script; confirm that SegParser tolerates them.
 6 | args=${1:?usage: run.sh DATASET RUNID [extra SegParser options...]}
 7 | runid=${2:?usage: run.sh DATASET RUNID [extra SegParser options...]}
 8 | 
 9 | # Per-run symlinks to the data files; -f replaces a stale link left by an interrupted run.
10 | ln -sf "$args.seg.cv2.ascii.train" "../../data/$args/$args.train.$runid"
11 | ln -sf "$args.seg.cv2.ascii.test" "../../data/$args/$args.test.$runid"
12 | 
13 | ./SegParser train "train-file:../../data/$args/$args.train.$runid" "model-name:../../data/$args/$args.model.$runid" decode-type:non-proj test "test-file:../../data/$args/$args.test.$runid" "seed:${runid}" "$@"
14 | 
15 | # Remove the per-run symlinks again.
16 | rm -f "../../data/$args/$args.train.$runid"
17 | rm -f "../../data/$args/$args.test.$runid"
11 | 
12 | 


--------------------------------------------------------------------------------
/Release/run_chinese.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Trains SegParser on ctb.seg.train and evaluates on ctb.seg.dev.
 3 | # Usage: run_chinese.sh RUNID [extra SegParser options...]
 4 | # NOTE(review): "$@" still contains RUNID itself, exactly as in the original
 5 | # script; confirm that SegParser tolerates it.
 6 | runid=${1:?usage: run_chinese.sh RUNID [extra SegParser options...]}
 7 | 
 8 | # Model, output and log files are written below ../runs.
 9 | mkdir -p ../runs
10 | 
11 | # Per-run symlinks to the data files; -f replaces a stale link left by an interrupted run.
12 | ln -sf ctb.seg.train "../data/ctb.train.$runid"
13 | ln -sf ctb.seg.dev "../data/ctb.test.$runid"
14 | 
15 | ./SegParser train test "train-file:../data/ctb.train.$runid" "model-name:../runs/ctb.model.$runid" "test-file:../data/ctb.test.$runid" "output-file:../runs/ctb.out.$runid" seed:14 earlystop:40 evalpunc:false C:0.001 train-converge:300 test-converge:300 "$@" | tee "../runs/ctb.log.$runid"
16 | 
17 | # Remove the per-run symlinks again.
18 | rm -f "../data/ctb.train.$runid"
19 | rm -f "../data/ctb.test.$runid"
10 | 
11 | 


--------------------------------------------------------------------------------
/Release/run_chinese_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Evaluates the model ../runs/ctb.model.RUNID on ctb.seg.test.
3 | # Usage: run_chinese_test.sh RUNID [extra SegParser options...]
4 | # NOTE(review): "$@" still contains RUNID itself, exactly as in the original script; confirm that SegParser tolerates it.
5 | runid=${1:?usage: run_chinese_test.sh RUNID [extra SegParser options...]}
6 | 
7 | # Per-run symlink to the test file; -f replaces a stale link left by an interrupted run.
8 | ln -sf ctb.seg.test "../data/ctb.test.$runid"
9 | 
10 | ./SegParser "model-name:../runs/ctb.model.$runid" test "test-file:../data/ctb.test.$runid" "output-file:../runs/ctb.out.$runid" seed:14 evalpunc:false test-converge:300 devthread:10 "$@"
11 | 
12 | rm -f "../data/ctb.test.$runid"
8 | 
9 | 


--------------------------------------------------------------------------------
/Release/run_classical.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Trains SegParser on qatar.seg.train and evaluates on qatar.seg.test.
 3 | # Usage: run_classical.sh RUNID [extra SegParser options...]
 4 | # NOTE(review): "$@" still contains RUNID itself, exactly as in the original
 5 | # script; confirm that SegParser tolerates it.
 6 | runid=${1:?usage: run_classical.sh RUNID [extra SegParser options...]}
 7 | 
 8 | # Model, output and log files are written below ../runs.
 9 | mkdir -p ../runs
10 | 
11 | # Per-run symlinks to the data files; -f replaces a stale link left by an interrupted run.
12 | ln -sf qatar.seg.train "../data/qatar.train.$runid"
13 | ln -sf qatar.seg.test "../data/qatar.test.$runid"
14 | 
15 | ./SegParser train test "train-file:../data/qatar.train.$runid" "model-name:../runs/qatar.model.$runid" "test-file:../data/qatar.test.$runid" "output-file:../runs/qatar.out.$runid" seed:1 ho:false earlystop:20 evalpunc:true C:0.0001 train-converge:200 test-converge:200 savebest:false iters:5 "$@" | tee "../runs/qatar.log.$runid"
16 | 
17 | # Remove the per-run symlinks again.
18 | rm -f "../data/qatar.train.$runid"
19 | rm -f "../data/qatar.test.$runid"
10 | 
11 | 


--------------------------------------------------------------------------------
/Release/run_classical_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Evaluates the model ../runs/qatar.model.RUNID on qatar.seg.test.
3 | # Usage: run_classical_test.sh RUNID [extra SegParser options...]
4 | # NOTE(review): "$@" still contains RUNID itself, exactly as in the original script; confirm that SegParser tolerates it.
5 | runid=${1:?usage: run_classical_test.sh RUNID [extra SegParser options...]}
6 | 
7 | # Per-run symlink to the test file; -f replaces a stale link left by an interrupted run.
8 | ln -sf qatar.seg.test "../data/qatar.test.$runid"
9 | 
10 | ./SegParser "model-name:../runs/qatar.model.$runid" test "test-file:../data/qatar.test.$runid" "output-file:../runs/qatar.out.$runid" seed:1 ho:false evalpunc:true test-converge:200 devthread:10 "$@"
11 | 
12 | rm -f "../data/qatar.test.$runid"
8 | 
9 | 


--------------------------------------------------------------------------------
/Release/run_spmrl.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Trains SegParser on spmrl.seg.train and evaluates on spmrl.seg.dev.
 3 | # Usage: run_spmrl.sh RUNID [extra SegParser options...]
 4 | # NOTE(review): "$@" still contains RUNID itself, exactly as in the original
 5 | # script; confirm that SegParser tolerates it.
 6 | runid=${1:?usage: run_spmrl.sh RUNID [extra SegParser options...]}
 7 | 
 8 | # Model, output and log files are written below ../runs.
 9 | mkdir -p ../runs
10 | 
11 | # Per-run symlinks to the data files; -f replaces a stale link left by an interrupted run.
12 | ln -sf spmrl.seg.train "../data/spmrl.train.$runid"
13 | ln -sf spmrl.seg.dev "../data/spmrl.test.$runid"
14 | 
15 | ./SegParser train test "train-file:../data/spmrl.train.$runid" "model-name:../runs/spmrl.model.$runid" "test-file:../data/spmrl.test.$runid" "output-file:../runs/spmrl.out.$runid" seed:2 earlystop:20 evalpunc:true C:0.01 train-converge:200 test-converge:200 "$@" | tee "../runs/spmrl.log.$runid"
16 | 
17 | # Remove the per-run symlinks again.
18 | rm -f "../data/spmrl.train.$runid"
19 | rm -f "../data/spmrl.test.$runid"
10 | 
11 | 


--------------------------------------------------------------------------------
/Release/run_spmrl_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Evaluates the model ../runs/spmrl.model.RUNID on spmrl.seg.test (with tedeval:true).
3 | # Usage: run_spmrl_test.sh RUNID [extra SegParser options...]
4 | # NOTE(review): "$@" still contains RUNID itself, exactly as in the original script; confirm that SegParser tolerates it.
5 | runid=${1:?usage: run_spmrl_test.sh RUNID [extra SegParser options...]}
6 | 
7 | # Per-run symlink to the test file; -f replaces a stale link left by an interrupted run.
8 | ln -sf spmrl.seg.test "../data/spmrl.test.$runid"
9 | 
10 | ./SegParser "model-name:../runs/spmrl.model.$runid" test "test-file:../data/spmrl.test.$runid" "output-file:../runs/spmrl.out.$runid" seed:2 evalpunc:true test-converge:200 tedeval:true devthread:10 "$@"
11 | 
12 | rm -f "../data/spmrl.test.$runid"
8 | 
9 | 


--------------------------------------------------------------------------------
/Release/run_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Tests the model ../../data/DATASET/DATASET.model.RUNID on DATASET.seg.test.
 3 | # Usage: run_test.sh DATASET RUNID [extra SegParser options...]
 4 | # NOTE(review): "$@" still contains DATASET and RUNID themselves, exactly as in
 5 | # the original script; confirm that SegParser tolerates them.
 6 | args=${1:?usage: run_test.sh DATASET RUNID [extra SegParser options...]}
 7 | runid=${2:?usage: run_test.sh DATASET RUNID [extra SegParser options...]}
 8 | 
 9 | # Per-run symlink to the test file; -f replaces a stale link left by an interrupted run.
10 | ln -sf "$args.seg.test" "../../data/$args/$args.test.$runid"
11 | 
12 | ./SegParser "model-name:../../data/$args/$args.model.$runid" decode-type:non-proj test "test-file:../../data/$args/$args.test.$runid" "seed:${runid}" "$@"
13 | 
14 | rm -f "../../data/$args/$args.test.$runid"
 9 | 
10 | 


--------------------------------------------------------------------------------
/Release/sources.mk:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | O_SRCS := 
 6 | CPP_SRCS := 
 7 | C_UPPER_SRCS := 
 8 | C_SRCS := 
 9 | S_UPPER_SRCS := 
10 | OBJ_SRCS := 
11 | ASM_SRCS := 
12 | CXX_SRCS := 
13 | C++_SRCS := 
14 | CC_SRCS := 
15 | OBJS := 
16 | C++_DEPS := 
17 | C_DEPS := 
18 | CC_DEPS := 
19 | CPP_DEPS := 
20 | EXECUTABLES := 
21 | CXX_DEPS := 
22 | C_UPPER_DEPS := 
23 | 
24 | # Every subdirectory with source files must be described here
25 | SUBDIRS := \
26 | util \
27 | io \
28 | decoder \
29 | . \
30 | 
31 | 


--------------------------------------------------------------------------------
/Release/subdir.mk:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | # Add inputs and outputs from these tool invocations to the build variables 
 6 | CPP_SRCS += \
 7 | ../DependencyInstance.cpp \
 8 | ../DependencyPipe.cpp \
 9 | ../FeatureEncoder.cpp \
10 | ../FeatureExtractor.cpp \
11 | ../Options.cpp \
12 | ../Parameters.cpp \
13 | ../SegParser.cpp 
14 | 
15 | OBJS += \
16 | ./DependencyInstance.o \
17 | ./DependencyPipe.o \
18 | ./FeatureEncoder.o \
19 | ./FeatureExtractor.o \
20 | ./Options.o \
21 | ./Parameters.o \
22 | ./SegParser.o 
23 | 
24 | CPP_DEPS += \
25 | ./DependencyInstance.d \
26 | ./DependencyPipe.d \
27 | ./FeatureEncoder.d \
28 | ./FeatureExtractor.d \
29 | ./Options.d \
30 | ./Parameters.d \
31 | ./SegParser.d 
32 | 
33 | 
34 | # Each subdirectory must supply rules for building sources it contributes
35 | %.o: ../%.cpp
36 | 	@echo 'Building file: $<'
37 | 	@echo 'Invoking: GCC C++ Compiler'
38 | 	g++ -D__GXX_EXPERIMENTAL_CXX0X__ -D__cplusplus=201103L -O3 -Wall -c -fmessage-length=0 -std=c++0x -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o "$@" "$<"
39 | 	@echo 'Finished building: $<'
40 | 	@echo ' '
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/Release/test.txt:
--------------------------------------------------------------------------------
1 | 班汉·西巴阿差
2 | 


--------------------------------------------------------------------------------
/Release/util/subdir.mk:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | # Add inputs and outputs from these tool invocations to the build variables 
 6 | CPP_SRCS += \
 7 | ../util/Alphabet.cpp \
 8 | ../util/Constant.cpp \
 9 | ../util/FeatureAlphabet.cpp \
10 | ../util/FeatureVector.cpp \
11 | ../util/Logarithm.cpp \
12 | ../util/SerializationUtils.cpp \
13 | ../util/StringUtils.cpp 
14 | 
15 | OBJS += \
16 | ./util/Alphabet.o \
17 | ./util/Constant.o \
18 | ./util/FeatureAlphabet.o \
19 | ./util/FeatureVector.o \
20 | ./util/Logarithm.o \
21 | ./util/SerializationUtils.o \
22 | ./util/StringUtils.o 
23 | 
24 | CPP_DEPS += \
25 | ./util/Alphabet.d \
26 | ./util/Constant.d \
27 | ./util/FeatureAlphabet.d \
28 | ./util/FeatureVector.d \
29 | ./util/Logarithm.d \
30 | ./util/SerializationUtils.d \
31 | ./util/StringUtils.d 
32 | 
33 | 
34 | # Each subdirectory must supply rules for building sources it contributes
35 | util/%.o: ../util/%.cpp
36 | 	@echo 'Building file: $<'
37 | 	@echo 'Invoking: GCC C++ Compiler'
38 | 	g++ -D__GXX_EXPERIMENTAL_CXX0X__ -D__cplusplus=201103L -O3 -Wall -c -fmessage-length=0 -std=c++0x -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o "$@" "$<"
39 | 	@echo 'Finished building: $<'
40 | 	@echo ' '
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/SegParser.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SegParser.cpp
  3 |  *
  4 |  *  Created on: Mar 19, 2014
  5 |  *      Author: yuanz
  6 |  */
  7 | 
  8 | #include "SegParser.h"
  9 | #include <float.h>
 10 | #include <pthread.h>
 11 | #include <cstdio>
 12 | #include <cstdlib>
 13 | #include <fstream>
 14 | #include <iostream>
 15 | #include <set>
 16 | #include "util/Random.h"
 17 | #include "util/SerializationUtils.h"
 18 | #include "util/Timer.h"
 16 | 
 17 | namespace segparser {
 18 | 
 19 | // Builds a parser around the given pipe and options. Allocates the training
 20 | // and dev parameter sets (sized by the pipe's data alphabet), the development
 21 | // thread, and -- only when options->train is set -- an initialized decoder.
 22 | SegParser::SegParser(DependencyPipe* pipe, Options* options)
 23 | 	: pipe(pipe), decoder(NULL), options(options), pruner(NULL), devTimes(0) {
 24 | 	// Parameter storage for training and for the dev-time snapshot
 25 | 	parameters = new Parameters(pipe->dataAlphabet->size(), options);
 26 | 	devParams = new Parameters(pipe->dataAlphabet->size(), options);
 27 | 
 28 | 	// A decoder is only created in training mode; otherwise it stays NULL
 29 | 	if (options->train) {
 30 | 		decoder = DependencyDecoder::createDependencyDecoder(
 31 | 				options, options->learningMode, options->trainThread, true);
 32 | 		decoder->initialize();
 33 | 	}
 34 | 
 35 | 	dt = new DevelopmentThread();
 36 | 	FeatureVector::initVec(pipe->dataAlphabet->size());
 37 | }
 35 | 
 36 | // Shuts the decoder down; a no-op when no decoder was created.
 37 | void SegParser::closeDecoder() {
 38 | 	if (!decoder) {
 39 | 		return;
 40 | 	}
 41 | 	decoder->shutdown();
 42 | }
 40 | 
 41 | SegParser::~SegParser() {  // releases every object this parser owns; pipe and options are not deleted here
 42 | 	delete parameters;
 43 | 	delete devParams;
 44 | 	delete decoder;  // may be NULL (no decoder is created outside training mode); deleting NULL is a no-op
 45 | 	delete dt;
 46 | 	// the pruner parser, when one was attached, is deleted together with this parser
 47 | 	delete pruner;
 48 | }
 49 | 
 50 | // Runs options->numIters training passes over `il`, averages the parameters,
 51 | // and -- when testing is enabled -- waits for a dev evaluation that is still
 52 | // running before returning.
 53 | void SegParser::train(vector<inst_ptr>& il) {
 54 | 
 55 | 	cout << "About to train" << endl;
 56 | 
 57 | 	devTimes = 0;
 58 | 
 59 | 	// every training instance gets its own copy, handed to trainingIter as
 60 | 	// the prediction list
 61 | 	const unsigned int numInst = il.size();
 62 | 	vector<inst_ptr> pred(numInst);
 63 | 	for (unsigned int k = 0; k < numInst; ++k) {
 64 | 		pred[k] = inst_ptr(new DependencyInstance());
 65 | 		*pred[k] = *il[k];
 66 | 	}
 67 | 
 68 | 	int epoch = 0;
 69 | 	while (epoch < options->numIters) {
 70 | 
 71 | 		cout << "========================" << endl;
 72 | 		cout << "Iteration: " << epoch << endl;
 73 | 		cout << "========================" << endl;
 74 | 		cout << "Processed: ";
 75 | 		cout.flush();
 76 | 
 77 | 		Timer timer;
 78 | 
 79 | 		// iterations are reported 0-based above but passed on 1-based
 80 | 		trainingIter(il, pred, epoch + 1);
 81 | 
 82 | 		const double elapsed = timer.stop();
 83 | 		cout << "Training iter took: " << elapsed / 1000 << " secs." << endl;
 84 | 
 85 | 		++epoch;
 86 | 	}
 87 | 
 88 | 	parameters->averageParams(decoder->getUpdateTimes());
 89 | 
 90 | 	// wait until dev finish
 91 | 	if (options->test && dt->isDevTesting) {
 92 | 		pthread_join(dt->workThread, NULL);
 93 | 	}
 94 | 
 95 | 	if (options->saveBestModel) {
 96 | 		cout << "Best model performance: " << options->bestScore << endl;
 97 | 	}
 98 | }
 92 | 
 93 | // Runs one training pass over all instances.
 94 | //   goldList  gold instances; each must already carry a non-empty feature vector
 95 | //   predList  per-instance working copies, parallel to goldList
 96 | //   iter      1-based iteration number, forwarded to the decoder and to
 97 | //             checkDevStatus
 98 | // After the pass, a dev evaluation is started when options->test is set.
 99 | void SegParser::trainingIter(vector<inst_ptr>& goldList, vector<inst_ptr>& predList, int iter) {
100 | 
101 | 	Timer timer;
102 | 
103 | 	for(unsigned int i = 0; i < goldList.size(); ++i) {
104 | 		// progress report every 100 instances
105 | 		if((i+1) % 100 == 0) {
106 | 			cout << "  " << (i+1);
107 | 			double diff = timer.stop();
108 | 			cout << " (time=" << (int)(diff / 1000) << "s)";
109 | 			cout.flush();
110 | 		}
111 | 
112 | 		inst_ptr gold = goldList[i];
113 | 		inst_ptr pred = predList[i];
114 | 
115 | 		FeatureExtractor fe(pred.get(), this, parameters, options->trainThread);
116 | 
117 | 		assert(gold->fv.binaryIndex.size() > 0);
118 | 
119 | 		decoder->train(gold.get(), pred.get(), &fe, iter);
120 | 
121 | 		if (options->useSP) {
122 | 			// keep the weight of the SEG_PROB feature from going negative
123 | 			uint64_t code = pipe->fe->genCodePF(HighOrder::SEG_PROB, 0);
124 | 			int index = pipe->dataAlphabet->lookupIndex(TemplateType::THighOrder, code, false);
125 | 			if (index > 0 && parameters->parameters[index] < 0.0) {
126 | 				parameters->parameters[index] = 0.0;
127 | 			}
128 | 		}
129 | 
130 | 	}
131 | 
132 | 	cout << endl;
133 | 
134 | 	cout << "  " << goldList.size() << " instances" << endl;
135 | 
136 | 	if (options->test)
137 | 		checkDevStatus(iter);
138 | }
133 | 
134 | void SegParser::checkDevStatus(int iter) {  // waits for a running dev evaluation, then starts a new one on the current averaged parameters
135 | 	if (dt->isDevTesting) {
136 | 		cout << "processing sentences: ";
137 | 		// report how far the running evaluation has got; each counter is read under its own mutex
138 | 		pthread_mutex_lock(&dt->finishMutex);
139 | 		cout << dt->currFinishID << " to ";
140 | 		pthread_mutex_unlock(&dt->finishMutex);
141 | 
142 | 		pthread_mutex_lock(&dt->processMutex);
143 | 		cout << dt->currProcessID << endl;
144 | 		pthread_mutex_unlock(&dt->processMutex);
145 | 
146 | 		cout << "Wait for testing to finish." << endl;
147 | 		pthread_join(dt->workThread, NULL);  // block until the previous evaluation thread has ended
148 | 	}
149 | 
150 | 	// start new thread for dev
151 | 	string devfile = options->testFile;
152 | 	string devoutfile = options->outFile;
153 | 
154 | 	cout << "build dev params" << endl;
155 | 	devParams->copyParams(parameters);  // snapshot into devParams, so `parameters` itself is not averaged here
156 | 	devParams->averageParams(decoder->getUpdateTimes());
157 | 
158 | 	cout << "start new dev " << devTimes << endl;
159 | 	dt->start(devfile, devoutfile, this, false);
160 | 	devTimes++;  // counts the dev evaluations started so far
161 | }
162 | 
163 | ///////////////////////////////////////////////////////
164 | // Saving and loading models
165 | ///////////////////////////////////////////////////////
166 | // Writes one line "<feature code>\t<weight>\t<total>" to `fout` for every
167 | // entry of template `type` in the data alphabet whose index is positive.
168 | //   fout    destination stream
169 | //   type    template type whose alphabet map is dumped
170 | //   params  parameter set the weights and totals are read from
171 | void SegParser::outputWeight(ofstream& fout, int type, Parameters* params) {
172 | 	unordered_map<uint64_t, int>* intmap = pipe->dataAlphabet->getMap(type);
173 | 	// iterate by const reference: no per-entry copy
174 | 	for (const auto& kv : (*intmap)) {
175 | 		const uint64_t s = kv.first;
176 | 		const int index = kv.second;
177 | 		if (index > 0) {
178 | 			// read from the parameter set passed in, not from the member
179 | 			// `parameters`, so that callers can dump any Parameters object
180 | 			fout << s << "\t" << params->parameters[index] << "\t" << params->total[index] << '\n';
181 | 		}
182 | 	}
183 | }
176 | 
177 | // Dumps the weights of the TArc template, read from `parameters`, to the
178 | // file named by `fStr`.
179 | void SegParser::outputWeight(string fStr) {
180 | 	cout << "output feature weight to " << fStr << endl;
181 | 
182 | 	ofstream weightFile(fStr.c_str());
183 | 	outputWeight(weightFile, TemplateType::TArc, parameters);
184 | 	weightFile.close();
185 | }
185 | 
186 | // Writes `params` followed by the data, type, POS and lexical alphabets to
187 | // the binary file `file`. If the file cannot be opened, an error is printed
188 | // to stderr and nothing is written.
189 | void SegParser::saveModel(string file, Parameters* params) {
190 | 	FILE *fs = fopen(file.c_str(), "wb");
191 | 	if (fs == NULL) {
192 | 		// fopen fails e.g. when the target directory does not exist; the
193 | 		// write calls below must not be handed a NULL stream
194 | 		cerr << "Cannot open model file for writing: " << file << endl;
195 | 		return;
196 | 	}
197 | 	params->writeParams(fs);
198 | 	pipe->dataAlphabet->writeObject(fs);
199 | 	pipe->typeAlphabet->writeObject(fs);
200 | 	pipe->posAlphabet->writeObject(fs);
201 | 	pipe->lexAlphabet->writeObject(fs);
202 | 	fclose(fs);
203 | }
195 | 
196 | // Reads the parameters and the data, type, POS and lexical alphabets from
197 | // the binary file `file` (same order as saveModel writes them), closes the
198 | // alphabets and resets the running totals. If the file cannot be opened, an
199 | // error is printed to stderr and the process exits with status 1, since no
200 | // usable model is available.
201 | void SegParser::loadModel(string file) {
202 | 	FILE *fs = fopen(file.c_str(), "rb");
203 | 	if (fs == NULL) {
204 | 		cerr << "Cannot open model file for reading: " << file << endl;
205 | 		exit(1);
206 | 	}
207 | 	parameters->readParams(fs);
208 | 	pipe->dataAlphabet->readObject(fs);
209 | 	pipe->typeAlphabet->readObject(fs);
210 | 	pipe->posAlphabet->readObject(fs);
211 | 	pipe->lexAlphabet->readObject(fs);
212 | 	fclose(fs);
213 | 
214 | 	pipe->closeAlphabets();
215 | 	pipe->setAndCheckOffset();
216 | 
217 | 	// totals are not taken from the file: start from zeroes, sized like the
218 | 	// loaded weight vector
219 | 	parameters->total.clear();
220 | 	parameters->total.resize(parameters->parameters.size());
221 | 	parameters->size = parameters->parameters.size();
222 | }
212 | 
213 | // Measures pruning recall on options->testFile: for every segment of every
214 | // word (word 0 excluded), checks whether the gold head survives pruning, and
215 | // prints the fraction of segments for which it does.
216 | void SegParser::evaluatePruning() {
217 | 	cout << "Evaluate pruning quality..." << endl;
218 | 	DependencyReader reader(options, options->testFile);
219 | 
220 | 	int numSeg = 0;
221 | 	double oracle = 0.0;
222 | 
223 | 	for (inst_ptr gold = reader.nextInstance(); gold; gold = reader.nextInstance()) {
224 | 		gold->setInstIds(pipe, options);
225 | 		DependencyInstance pred;
226 | 		pred = *gold;
227 | 
228 | 		PrunerFeatureExtractor pfe;
229 | 		pfe.init(&pred, this, 1);
230 | 
231 | 		for (int w = 1; w < pred.numWord; ++w) {
232 | 			WordInstance& word = pred.word[w];
233 | 			for (int s = 0; s < word.getCurrSeg().size(); ++s) {
234 | 				++numSeg;
235 | 
236 | 				// pruning decisions for every candidate head except the
237 | 				// modifier itself
238 | 				vector<bool> candPruned;
239 | 				HeadIndex m(w, s);
240 | 				pfe.prune(&pred, m, candPruned);
241 | 
242 | 				HeadIndex& goldDep = gold->getElement(w, s).dep;
243 | 				int goldDepIndex = gold->wordToSeg(goldDep);
244 | 
245 | 				// expand to one flag per segment of the sentence: the
246 | 				// modifier's own slot is always marked pruned, every other
247 | 				// slot takes the next decision from candPruned
248 | 				vector<bool> pruned;
249 | 				int next = 0;
250 | 				for (int hw = 0; hw < pred.numWord; ++hw) {
251 | 					SegInstance& headSeg = pred.word[hw].getCurrSeg();
252 | 					for (int hs = 0; hs < headSeg.size(); ++hs) {
253 | 						const bool isSelf = (hw == m.hWord && hs == m.hSeg);
254 | 						if (isSelf) {
255 | 							pruned.push_back(true);
256 | 						}
257 | 						else {
258 | 							pruned.push_back(candPruned[next]);
259 | 							++next;
260 | 						}
261 | 					}
262 | 				}
263 | 
264 | 				if (!pruned[goldDepIndex]) {
265 | 					oracle += 1.0;
266 | 				}
267 | 			}
268 | 		}
269 | 	}
270 | 
271 | 	cout << "Pruning recall: " << oracle / numSeg << endl;
272 | }
270 | 
271 | } /* namespace segparser */
272 | 
273 | using namespace segparser;
274 | 
275 | int main(int argc, char** argv) {  // entry point: trains (optional pruner, then the model) and/or tests, as selected by the parsed options
276 | 	//test1();
277 | 
278 | 	Options options;
279 | 	options.processArguments(argc, argv);
280 | 
281 | 	Options prunerOptions = options;  // the pruner uses a copy of the options, adjusted by setPrunerOptions()
282 | 	prunerOptions.setPrunerOptions();
283 | 
284 | 	SegParser* pruner = NULL;  // handed to a SegParser's `pruner` member below; that parser's destructor deletes it
285 | 	DependencyPipe prunerPipe(&prunerOptions);
286 | 
287 |     DependencyPipe pipe(&options);
288 | 
289 | 	if (options.train) {
290 | 
291 | 		if (options.trainPruner) {
292 | 			// train the pruner first, on the same training file
293 | 			cout << "Pruner flags:" << endl;
294 | 			prunerOptions.outputArg();
295 | 
296 | 			prunerPipe.loadCoarseMap(prunerOptions.trainFile);
297 | 
298 | 			vector<inst_ptr> trainingData = prunerPipe.createInstances(prunerOptions.trainFile);
299 | 
300 | 			pruner = new SegParser(&prunerPipe, &prunerOptions);
301 | 			pruner->pruner = NULL;
302 | 
303 | 			int numFeats = prunerPipe.dataAlphabet->size() - 1;
304 | 			int numTypes = prunerPipe.typeAlphabet->size() - 1;
305 | 			cout << "Pruner Num Feats: " << numFeats << endl;
306 | 			cout << "Pruner Num Edge Labels: " << numTypes << endl;
307 | 
308 | 			pruner->train(trainingData);
309 | 			pruner->closeDecoder();
310 | 
311 | 			pruner->evaluatePruning();
312 | 		}
313 | 
314 | 	    cout << "Model flags:" << endl;
315 |     	options.outputArg();
316 | 
317 | 	    pipe.loadCoarseMap(options.trainFile);
318 | 
319 | 	    vector<inst_ptr> trainingData = pipe.createInstances(options.trainFile);
320 | 
321 | 	    //pipe.closeAlphabets();
322 | 
323 | 	    SegParser sp(&pipe, &options);
324 | 	    sp.pruner = pruner;  // sp now owns the pruner (NULL when no pruner was trained)
325 | 
326 | 	    int numFeats = pipe.dataAlphabet->size() - 1;
327 | 	    int numTypes = pipe.typeAlphabet->size() - 1;
328 | 	    cout << "Num Feats: " << numFeats << endl;
329 | 	    cout << "Num Edge Labels: " << numTypes << endl;
330 | 
331 | 	    sp.train(trainingData);
332 | 	    sp.closeDecoder();
333 | 	}  // sp is destroyed here, which also deletes the pruner attached to it
334 | 
335 | 	if (options.test) {
336 | 	    DependencyPipe testPipe(&options);
337 | 	    testPipe.loadCoarseMap(options.testFile);
338 | 
339 | 	    SegParser testSp(&testPipe, &options);
340 | 
341 | 	    cout << "\nLoading model ... ";
342 | 	    cout.flush();
343 | 	    pruner = NULL;  // any pruner from the training phase was already deleted together with sp
344 | 		if (options.trainPruner) {
345 | 			// build a fresh pruner and load its model from "<model-name>.pruner"
346 | 			prunerPipe.loadCoarseMap(prunerOptions.testFile);
347 | 
348 | 			pruner = new SegParser(&prunerPipe, &prunerOptions);
349 | 			pruner->pruner = NULL;
350 | 			pruner->loadModel(options.modelName + ".pruner");
351 | 
352 | 			int numFeats = prunerPipe.dataAlphabet->size() - 1;
353 | 			int numTypes = prunerPipe.typeAlphabet->size() - 1;
354 | 			cout << "Pruner Num Feats: " << numFeats << endl;
355 | 			cout << "Pruner Num Edge Labels: " << numTypes << endl;
356 | 		}
357 | 		testSp.pruner = pruner;  // testSp now owns the pruner and deletes it in its destructor
358 | 	    testSp.loadModel(options.modelName);
359 | 	    cout << "done." << endl;
360 | 
361 | 	    int numFeats = testPipe.dataAlphabet->size() - 1;
362 | 	    int numTypes = testPipe.typeAlphabet->size() - 1;
363 | 	    cout << "Num Feats: " << numFeats << endl;
364 | 	    cout << "Num Edge Labels: " << numTypes << endl;
365 | 
366 | 	    //pipe.closeAlphabets();
367 | 
368 | 	    // run multi-thread to test
369 | 		string devfile = options.testFile;
370 | 		string devoutfile = options.outFile;
371 | 		cout << "build dev params" << endl;
372 | 		testSp.devParams->copyParams(testSp.parameters);  // loaded weights are copied as-is; no averaging is applied here
373 | 	    testSp.dt->start(devfile, devoutfile, &testSp, true);
374 | 
375 | 	    // wait until all finishes
376 | 	    pthread_join(testSp.dt->workThread, NULL);
377 | 	    testSp.closeDecoder();
378 | 	}
379 | 
380 | 	return 0;
381 | }
382 | 
383 | 
384 | 


--------------------------------------------------------------------------------
/SegParser.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SegParser.h
 3 |  *
 4 |  *  Created on: Mar 19, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef SEGPARSER_H_
 9 | #define SEGPARSER_H_
10 | 
11 | #include <boost/multi_array.hpp>
12 | #include <boost/shared_ptr.hpp>
13 | #include <vector>
14 | #include "DependencyPipe.h"
15 | #include "decoder/DevelopmentThread.h"
16 | #include "Parameters.h"
17 | #include "Options.h"
18 | #include "decoder/DependencyDecoder.h"
19 | 
20 | namespace segparser {
21 | 
22 | using namespace std;
23 | using namespace boost;
24 | 
25 | class Parameters;
26 | class DependencyDecoder;
27 | class DevelopmentThread;
28 | 
29 | class SegParser {  // ties a DependencyPipe, its parameters and a decoder together for training, dev evaluation and model I/O
30 | public:
31 | 	SegParser(DependencyPipe* pipe, Options* options);  // allocates parameters, devParams and dt; a decoder only when options->train is set
32 | 	virtual ~SegParser();  // deletes parameters, devParams, decoder, dt and pruner
33 | 	void train(vector<inst_ptr>& il);  // runs options->numIters training iterations over il
34 | 	void trainingIter(vector<inst_ptr>& goldList, vector<inst_ptr>& predList, int iter);  // one pass over all instances
35 | 	void checkDevStatus(int iter);  // waits for a running dev evaluation, then starts a new one
36 | 
37 | 	void outputWeight(ofstream& fout, int type, Parameters* params);  // dumps feature code, weight and total for one template type
38 | 	void outputWeight(string fStr);  // dumps the TArc weights to the file fStr
39 | 	void loadModel(string file);  // reads parameters and alphabets from a binary model file
40 | 	void saveModel(string file, Parameters* params);  // writes params and alphabets to a binary model file
41 | 
42 | 	void closeDecoder();  // shuts the decoder down when one exists
43 | 
44 | 	void evaluatePruning();  // prints pruning recall measured on options->testFile
45 | 
46 | 	DependencyPipe* pipe;  // not deleted by ~SegParser
47 | 	DependencyDecoder* decoder;  // NULL unless constructed with options->train set; deleted by ~SegParser
48 | 	Parameters* parameters;  // deleted by ~SegParser
49 | 	Parameters* devParams;  // snapshot used for dev/test evaluation; deleted by ~SegParser
50 | 	DevelopmentThread* dt;  // deleted by ~SegParser
51 | 	Options* options;  // not deleted by ~SegParser
52 | 	SegParser* pruner;  // set by the caller after construction; deleted by ~SegParser, so never attach one pruner to two parsers
53 | 
54 | private:
55 | 	int devTimes;  // number of dev evaluations started since the last train() call
56 | };
57 | 
58 | } /* namespace segparser */
59 | #endif /* SEGPARSER_H_ */
60 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/SharedTaskCommon.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/TedWrappers_20131015/SharedTaskCommon.pyc


--------------------------------------------------------------------------------
/TedWrappers_20131015/TedEvalApps.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/TedWrappers_20131015/TedEvalApps.jar


--------------------------------------------------------------------------------
/TedWrappers_20131015/TedPart.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/TedWrappers_20131015/TedPart.jar


--------------------------------------------------------------------------------
/TedWrappers_20131015/cleanconll.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # script that cleans treebank and treebank output
 4 | # for use with tedeval
 5 | # Djame Seddah
 6 | #
 7 | # Usage: cleanconll.pl [-pass] [file ...]      (reads stdin when no file is given)
 8 | #   -pass   copy the input through unchanged (debugging aid)
 9 | 
10 | use strict;
11 | 
12 | 
13 | use constant {
14 | 		# for conll data
15 |         ID      => 0,
16 |         FORM => 1,
17 |         LEMMA => 2,
18 |         CPOS => 3,
19 |         FPOS => 4,
20 |         FEAT => 5,
21 |         HEAD => 6,
22 |         DEPREL => 7,
23 |         PHEAD => 8,
24 |         PDEPREL => 9,
25 |         SOURCETOKEN => 10,
26 |         # for morfette data
27 |         FORMM =>0,
28 |         LEMMAM=> 1,
29 |         FEATM=> 2
30 | };
31 | 
32 | # The flag has to be removed from @ARGV: otherwise <> below would try to open
33 | # a file literally named "-pass" and never read stdin.
34 | my $kk=0;
35 | if (@ARGV && $ARGV[0] eq "-pass"){
36 |         $kk=1;
37 |         shift @ARGV;
38 | }
39 | 
40 | 
41 | while(<>){
42 | 		chomp;
43 | 		my $line=$_;
44 | 		if ($line=~/^\s*$/){ print "\n"; next;}
45 | 		if ($kk ==1){ print "$line\n"; next;} # just for debugging's sake (do nothing)
46 | 		my @FC=split(/\t/,$line);
47 | 		foreach my $field (LEMMA,CPOS,FPOS,FEAT,DEPREL){
48 | 			# beware, destructive operation. Tree won't be able to deprojectivize (we strip some information)
49 | 			# NOTE: this runs on every pass of the loop and each run drops one trailing "|..." part,
50 | 			# so it is deliberately left inside the loop to keep the output unchanged.
51 | 			$FC[DEPREL]=~s/^(.+)\|.+/$1/;
52 | 			$FC[$field]=~s/^[_|-]+$/dummy/;
53 | 			$FC[$field]=~s/\:/<column>/g; # ":" is replaced by a placeholder
54 | 			$FC[$field]=~s/-(..)B-/$1B/;
55 | 		}
56 | 		$FC[FEAT]="_";
57 | 		$FC[FORM]=~s/\:/<column>/g; # same ":" replacement for the form
58 | 		$FC[FORM]=~s/-(..)B-/$1B/;
59 | 		# we print the first 8 columns, then HEAD and DEPREL again in place of PHEAD and PDEPREL
60 | 		print join("\t",@FC[0..7]),"\t";
61 | 		print join("\t",@FC[6..7]);
62 | 		if (defined $FC[SOURCETOKEN]){
63 |                  print "\t",$FC[SOURCETOKEN];
64 | 		}
65 | 		print "\n";
66 | }
58 | 
59 | 
60 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/cleanptb.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # script that clean treebank and treebank output
 4 | # for use with tedeval
 5 | # Djame Seddah
 6 | 
 7 | use strict;
 8 | 
 9 | 
# Pass-through switch: with "-pass" as first argument the trees are only given
# their TOP label and otherwise printed unchanged.  The flag is removed from
# @ARGV, otherwise the <> loop below would try to open a file literally named
# "-pass" instead of reading the remaining files / standard input.
my $kk=0;
if (defined $ARGV[0] && $ARGV[0] eq "-pass"){
        $kk=1;
        shift @ARGV;
}
14 | 
15 | 
# magical regexp from releaf.pl:
# matches a preterminal such as (DT The) or (NC samere_en_short)
my $preterm = '\(([^() \t]+)[ \t]+([^() \t]+)\)';

# One bracketed tree per input line: label the root, strip the ##...## feature
# blocks and clean the tag and the word of every preterminal.
while (defined(my $tree = <>)) {
	chomp $tree;
	$tree =~ s/^\( /(TOP /;            # a line starting with "( " gets a TOP label

	# Pass-through mode: nothing else is touched.
	if ($kk == 1) {
		print "$tree\n";
		next;
	}

	$tree =~ s/##[^#]+##//g;           # removing all features
	$tree =~ s/$preterm/"(".&clean_all($1)." ".&clean_all($2).")"/ge;
	print "$tree\n";
}
28 | 
29 | 
# clean_all(STRING) -> STRING
# Returns a copy of STRING in which every ":" is replaced by "<column>" and the
# first "-xxB-" sequence (e.g. -LRB-) loses its surrounding dashes.
sub clean_all{
        my ($text) = @_;
        $text =~ s/\:/<column>/g;
        $text =~ s/-(..)B-/$1B/;
        return $text;
}
37 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/debug/check_sourceid.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | 
 4 | 
 5 | use strict;
 6 | 
 7 | my $sent=0;
 8 | my $i++;
 9 | my $j=0;
10 | while(<>){
11 |     chomp;
12 |     my $line=$_;
13 |     $j++;
14 |     if ($line=~/^\s*$/){$sent++; $i=0; next;}
15 |     my @FC=split(/\t/,$line);
16 |     $i++;
17 |     if ($FC[$#FC] !~m/^[0-9]+$/){ print "sentence $sent, token $i (line $j)\n"; exit;};
18 | 
19 | }


--------------------------------------------------------------------------------
/TedWrappers_20131015/debug/do_check.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
# Run check_sourceid.pl on every *.lattices file of the current directory.
# The glob is expanded by the shell itself (no parsing of `ls` output), so
# file names containing blanks are handled as one word.
for file in *.lattices; do
	# When nothing matches, the pattern stays literal: skip it.
	[ -e "$file" ] || continue
	echo "processing $file"
	./check_sourceid.pl "$file"
done
7 | 
8 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/genere_tfm_tedeval.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | 
 4 | use strict;
# For every input line (used as the path of a system output file) print one
# shell command line that runs tedeval.sh on it and extracts the "AVG:" row.
# $LANG and $TYPE are declared outside the loop, so a line that matches none of
# the patterns below reuses the values set by an earlier line.
my $LANG;
my $GOLD;
my $TYPE;
while(<>){
	chomp;
	my $line=$_;
	# Language is guessed from the path (case-insensitive match).
	if($line=~/(HEBREW)/i) {
		$LANG="HEBREW";
	}elsif($line=~/(ARABIC)/i) {
		$LANG="ARABIC";
	}
	
	# Treebank format is guessed from the path as well (case-sensitive match).
	if ($line=~/ptb/){
		$TYPE="ptb";
	}elsif ($line=~/conll/){
		$TYPE="conll";
	}
	# Capitalised form of the language name, used in the gold file name.
	my $Lang=ucfirst lc $LANG;
	#print "$Lang\n"; next;
	 my $file=$line;
	 # Gold file path, relative to the directory the generated command runs in.
	 $GOLD="../READY_TO_SHIP_FINAL/${LANG}_SPMRL/gold/$TYPE/test/test.$Lang.gold.$TYPE";	
	 # Generated command: run tedeval.sh, keep a log via tee, then print the
	 # AVG: line of the result file followed by a tab and the file name.
	 my $TFM="tedeval.sh  --unlabeled --$TYPE --any $LANG -k  -g $GOLD -s $file | tee $file.djam_log ;  cat $file.4tedeval.evalted.res.ted-unlabeled |grep \"AVG:\"| perl -ne 'chomp ; print \"\$_\\t$file\\n\"'" ;
	 #print STDERR `eval $CMD | tee $file.log`;
	 print $TFM."\n";
}
30 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/get_cutoffed_sent.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | # Copyright (c) 2001 by David Chiang. All rights reserved.
 3 | # modif to cope with conll files by DjamSeddah (2013)
 4 | # usage: lines 2 3 < test.mrg  or lines -c 2 3 < test.conll
 5 |  
 6 | use strict;
 7 | my $KK="\n";
 8 | if ($ARGV[0] eq "-c"){
 9 | 	$KK="\n\n";
10 | 	shift @ARGV;
11 | }elsif($ARGV[0] eq '-mada'){
12 |        $KK="--------------\nSENTENCE BREAK\n--------------\n";
13 |        shift @ARGV;
14 | }
15 | 
16 | my $CUTOFF;
17 | if ($ARGV[0] eq "-K"){
18 | 	$CUTOFF=$ARGV[1] or die "cut-off lenght not given.\n";
19 | 	shift @ARGV; #lame I know, but that case was inserted way after the rest..
20 | 	shift @ARGV;  
21 | }
22 | 
23 | 
# The next argument must name a readable file; the script stops otherwise.
# Its content used to be loaded into a hash that nothing in this script ever
# consulted, so only the readability check is kept.  The three-argument open
# with a lexical handle treats the name literally; the explicit defined() test
# is needed because open() with an undefined name would succeed on an
# anonymous temporary file instead of failing.
defined $ARGV[0] && open(my $skipfile_fh, '<', $ARGV[0])
	or die "[get_cutoffed_sent.pl] problem with $ARGV[0] or no file given\n";
close $skipfile_fh;

# From here on, input is read record by record with the selected separator.
$/=$KK;
33 | 
34 | 
# Print (one per line, on standard output) the 1-based index of every record
# read from standard input whose length exceeds $CUTOFF; statistics go to
# standard error.
my $i = 1;        # index of the record currently being read
my $skipped=0;    # records above the cut-off
my $total=0;      # summed length of the records that were kept
while (<STDIN>) {
	my $len=&get_lenght($_);
	if ($len>$CUTOFF) {
		print "$i\n";
		$skipped++;
	}else{
		$total=$total+$len;
	}
	$i++;
}

# $i is one past the last record: the statistics previously divided by $i and
# were therefore computed over one record too many.
my $nb_records=$i-1;
print STDERR "$skipped sentences skipped\n";
my $perc=$nb_records ? ($skipped/$nb_records)*100 : 0;
print STDERR "$perc \% of sentences removed ($skipped / $nb_records)\n";
# Summed length of the kept records divided by the number of records read.
my $avg_lenght=$nb_records ? $total/$nb_records : 0;
print STDERR "Avg = ".$avg_lenght."\n";
59 | 
# get_lenght(RECORD) -> number of newline characters contained in RECORD.
sub get_lenght{
	my ($record) = @_;
	# tr/// without a replacement list only counts the matching characters.
	return ($record =~ tr/\n//);
}
65 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/get_ted_res.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | 
 4 | #use strict;
 5 | 
 6 | 
 7 | #------------------------------------------------------------
 8 | # Sentence        TED       Exact              #Spans                              TED
 9 | # ID   Length    Accuracy    match       test    gold    gen           Distance           Normalization
10 | #                         gold   gen                              L1      L2   L1 - L2
11 | #_____________________________________________________________________________________________________
# Column names, in the order the values appear on a line once "AVG:" is removed.
my @td=qw(LEN ACC EX_gold Ex_gen Spans_test Spans_gold Spans_gen Dist_L1 Dist_L2 Dist_L1-L2 Norm file);
# Order in which the values are reported.
my @report_order=qw(ACC EX_gold Ex_gen Norm Spans_test Spans_gold Spans_gen Dist_L1 Dist_L2 Dist_L1-L2);

# Each input line is rewritten as "name: value<TAB>..." pairs followed by the
# file name.
while (defined(my $line = <>)) {
      chomp $line;
      $line =~ s/AVG:\s+//g;
      $line =~ s/,/./g;               # commas are turned into dots
      my @res = split(/\s+/, $line);

      # Pair every value with the column name found at the same position.
      my %Hres = ();
      for my $idx (0 .. $#res) {
          $Hres{ $td[$idx] } = $res[$idx];
      }

      my $report = "";
      $report .= "$_: $Hres{$_}\t" for @report_order;
      print $report, "file: $Hres{file}\n";
}
33 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/lattice_to_segmentation.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # Author : Reut Tsarfaty, July 2013
 3 | # ligth modifs: Djame Seddah
 4 | # +modif to support ptb's lattice files 
 5 | import sys
 6 | 
 7 | if sys.argv[1] == "-ptb":
 8 | 	ptb=1
 9 | else:
10 | 	ptb=0
11 | 
12 | 
13 | 
14 | 
# Turn lattice rows read from standard input into "token<TAB>form:form:..."
# lines: consecutive rows whose last column (the source token) is identical
# are merged into a single output line.
prev_tok = ""
out_line = ""
first = 1
for raw in sys.stdin:
    fields = raw.strip().split()
    if not fields:
        # Blank input line: flush the pending output line (or print "\n" when
        # there is none) and start afresh.
        print(out_line if out_line else "\n")
        prev_tok = ""
        out_line = ""
        continue

    if ptb == -1:  # this code is bogus, the ptb hebrew files lack the lemma field
        start, end, form, lemma, cpos, fpos, feats, token = fields
    else:
        start, end, form = fields[0:3]
        token = fields[-1]

    if prev_tok == token:
        # Same source token as the previous row: append the form.
        out_line += ":" + form
    else:
        # New source token: emit the previous line, except for the very first
        # row of the input (avoids an empty first line).
        if first == 1:
            first = 0
        else:
            print(out_line)
        out_line = token + "\t" + form
    prev_tok = token
print("\n")
49 |           
50 |        
51 | 
52 | 
53 | 
54 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/lines:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | # Copyright (c) 2001 by David Chiang. All rights reserved.
 3 | # modif to cope with conll files by DjamSeddah (2013)
 4 | # usage: lines 2 3 < test.mrg  or lines -c 2 3 < test.conll
 5 |  
 6 | 
 7 | # added by djame
 8 | if (($ARGV[0] eq "-c")||($ARGV[0] eq "-L")){
 9 | 	$/="\n\n";
10 | 	shift @ARGV;
11 | }elsif($ARGV[0] eq '-mada'){
12 |        $/="--------------\nSENTENCE BREAK\n--------------\n";
13 |        shift @ARGV;
14 | }
15 | 
16 | 
17 | if ($ARGV[0] eq "-ptb"){
18 | 	shift @ARGV;
19 | 	#default mode for the sake of being compatible with one script
20 | }
21 | 		
22 | 
23 | if ($ARGV[0] eq "-p"){
24 | 	$DISPLAYNUM=1;
25 | 	shift @ARGV;
26 | }
27 | 
28 | #if ($#ARGV <2 ) {
29 | #	printf "Usage: lines <start> <stop+1>\n";
30 | #	die;
31 | #}
32 | 
# Records $start .. $stop-1 (1-based) are printed; with a single argument only
# record $start is selected.
$start = $ARGV[0];
$stop  = defined($ARGV[1]) ? $ARGV[1] : $start + 1;

# Skip the records located before $start.
for ($i = 1; $i < $start && <STDIN>; $i++) {
}

# Copy records until $stop is reached or the input is exhausted.
until ($i < $start || $i >= $stop) {
	last unless defined($_ = <STDIN>);
	print $_ unless ($DISPLAYNUM == 1);
	$i++;
}

# With -p only the counter reached at the end is printed.
print  $i if ($DISPLAYNUM == 1);
54 | 
55 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/pproj_24934/conllx.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <dataformat name="conllx">
 3 | 	<column name="ID" category="INPUT" type="INTEGER"/>
 4 | 	<column name="FORM" category="INPUT" type="STRING"/>
 5 | 	<column name="LEMMA" category="INPUT" type="STRING"/>
 6 | 	<column name="CPOSTAG" category="INPUT" type="STRING"/>
 7 | 	<column name="POSTAG" category="INPUT" type="STRING"/>
 8 | 	<column name="FEATS" category="INPUT" type="STRING"/>
 9 | 	<column name="HEAD" category="HEAD" type="INTEGER"/>
10 | 	<column name="DEPREL" category="DEPENDENCY_EDGE_LABEL" type="STRING"/>
11 | 	<column name="PHEAD" category="IGNORE" type="INTEGER" default="_"/>
12 | 	<column name="PDEPREL" category="IGNORE" type="STRING" default="_"/>
13 | </dataformat>


--------------------------------------------------------------------------------
/TedWrappers_20131015/pproj_24934/pproj_24934_pseudo.info:
--------------------------------------------------------------------------------
 1 | CONFIGURATION
 2 | Configuration name:   pproj_24934
 3 | Configuration type:   pseudo
 4 | Created:              Fri Sep 05 22:19:36 EDT 2014
 5 | 
 6 | SYSTEM
 7 | Operating system architecture: amd64
 8 | Operating system name:         Linux
 9 | JRE vendor name:               Oracle Corporation
10 | JRE version number:            1.8.0_05
11 | 
12 | MALTPARSER
13 | Version:                       1.7.2
14 | Build date:                    September 25 2012
15 | 
16 | SETTINGS
17 | 2planar
18 |   reduceonswitch (-2pr)                 false
19 | config
20 |   logfile (-lfi)                        stdout
21 |   workingdir (  -w)                     user.dir
22 |   name (  -c)                           pproj_24934
23 |   logging ( -cl)                        info
24 |   flowchart (  -m)                      proj
25 |   type (  -t)                           singlemalt
26 |   url (  -u)                            
27 | covington
28 |   allow_shift ( -cs)                    false
29 |   allow_root ( -cr)                     true
30 | graph
31 |   max_sentence_length (-gsl)            256
32 |   root_label (-grl)                     ROOT
33 |   head_rules (-ghr)                     
34 | guide
35 |   features (  -F)                       
36 |   data_split_threshold (  -T)           50
37 |   kbest_type ( -kt)                     rank
38 |   data_split_structure (  -s)           
39 |   data_split_column (  -d)              
40 |   learner (  -l)                        liblinear
41 |   decision_settings (-gds)              T.TRANS+A.DEPREL
42 |   classitem_separator (-gcs)            ~
43 |   kbest (  -k)                          -1
44 | input
45 |   charset ( -ic)                        UTF-8
46 |   reader ( -ir)                         tab
47 |   reader_options (-iro)                 
48 |   format ( -if)                         /appdata/dataformat/conllx.xml
49 |   infile (  -i)                         /dev/stdin
50 |   iterations ( -it)                     1
51 | lib
52 |   external ( -lx)                       
53 |   save_instance_files ( -li)            false
54 |   options ( -lo)                        
55 |   verbosity ( -lv)                      silent
56 | multiplanar
57 |   planar_root_handling (-prh)           normal
58 | nivre
59 |   allow_reduce ( -ne)                   false
60 |   allow_root ( -nr)                     true
61 | output
62 |   charset ( -oc)                        UTF-8
63 |   outfile (  -o)                        /dev/stdout
64 |   format ( -of)                         
65 |   writer_options (-owo)                 
66 |   writer ( -ow)                         tab
67 | planar
68 |   no_covered_roots (-pcov)               false
69 |   acyclicity (-pacy)                     true
70 |   connectedness (-pcon)                  none
71 | pproj
72 |   marking_strategy ( -pp)               head
73 |   lifting_order (-plo)                  shortest
74 |   covered_root (-pcr)                   none
75 | singlemalt
76 |   mode ( -sm)                           parse
77 |   diagnostics ( -di)                    false
78 |   use_partial_tree ( -up)               false
79 |   propagation ( -fp)                    
80 |   parsing_algorithm (  -a)              nivreeager
81 |   guide_model ( -gm)                    single
82 |   null_value ( -nv)                     one
83 |   diafile (-dif)                        stdout
84 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/reprojectivize.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #MALTHOME=$SHARED/TEDEVALSTUFF/maltparser-1.7.2  # to change to fit your own install
 3 | MALTHOME=~/public/workspace/Code/tedeval/maltparser-1.7.2
 4 | 
 5 | if test "$1" = "-ptb" ; then # do nothing if const. file
 6 | 	echo "ptb file, doing nothing"  > /dev/stderr
 7 | 	cat
 8 | else # reprojectivize the data
 9 |  java -jar $MALTHOME/maltparser-1.7.2.jar -c pproj_$$ -m proj -pp head -i /dev/stdin -o /dev/stdout
10 |  rm -f pproj_$$.mco
11 |  
12 | fi
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/skip_lines.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | # Copyright (c) 2001 by David Chiang. All rights reserved.
 3 | # modif to cope with conll files by DjamSeddah (2013)
 4 | # usage: lines 2 3 < test.mrg  or lines -c 2 3 < test.conll
 5 |  
 6 | use strict;
 7 | use Data::Dumper;
 8 | use Getopt::Long;
 9 | 
10 | # to generate the small test sets version
11 | # will add the option later
# Sentence counts per language, listed in the same order as @lang below.
# The *noskip lists are not read anywhere in this script.
my @test5knoskip=qw/145 436 149 285 223 291 409 486 319/;
my @dev5knoskip=qw/126 337 166 338 203 157 388 493 238/;

# These are the lists actually used (the author marks them as the right ones).
# NOTE(review): @test5k holds 8 values for 9 languages, so POLISH receives 319
# and SWEDISH ends up without a test count -- confirm the intended numbers.
my @test5k=qw/153 436 152 285 223 291 409 319 /;
my @dev5k=qw/130 337 169 338 209 161 388 493 256 /;


# ARABIC= 145  HEBREW 223
my @lang=qw/ARABIC BASQUE FRENCH GERMAN HEBREW HUNGARIAN KOREAN POLISH SWEDISH/;

# %data maps LANGUAGE => { dev => count, test => count }.
my %data=();
for my $pos (0 .. $#lang){
	$data{ $lang[$pos] } = { dev => $dev5k[$pos], test => $test5k[$pos] };
}
29 | #print Dumper(\%data);
30 | 
31 | 
32 | 
# Record separator selected by an optional leading switch:
#   -c     records are separated by an empty line
#   -mada  records are separated by the three-line "SENTENCE BREAK" banner
# Without a switch each line is one record.
my %separator_for = (
	"-c"    => "\n\n",
	'-mada' => "--------------\nSENTENCE BREAK\n--------------\n",
);
my $KK = "\n";
if (exists $separator_for{$ARGV[0]}) {
	$KK = $separator_for{ shift(@ARGV) };
}

# Optional "-5k LANGUAGE": limit the output to the count stored for that
# language.  $pref selects the "test" counts.
my ($fiveK, $pref, $lang) = (0, "test", "");
if ($ARGV[0] eq "-5k"){
	defined $ARGV[1]
		or die "-5k must be followed by a language (Arabic,French..)\n";
	shift @ARGV;                 # the switch itself
	$lang  = uc shift(@ARGV);    # language name, upper-cased
	$fiveK = 1;
}
54 | 
55 | 
56 | 
# Load the list of record numbers to skip (one per line) into %H.
open FICIN,"<$ARGV[0]" or die "[skip_lines.pl] problem with $ARGV[0] or no file given\n";
my %H;
while (defined(my $entry = <FICIN>)) {
	chomp $entry;
	# A purely numeric entry loses its surrounding blanks.
	$entry =~ s/^\s*([0-9]+)\s*$/$1/;
	$H{$entry} = 1;
}

# From here on, input is read record by record with the selected separator.
$/=$KK;
66 | 
67 | 
# In -5k mode a sentence count must exist for the requested language.  Without
# one the loop below compared against an undefined limit and silently produced
# no output at all.
if ($fiveK == 1 && !defined $data{$lang}{$pref}) {
	die "[skip_lines.pl] no $pref sentence count known for language '$lang'\n";
}

# Copy the records of standard input to standard output, leaving out those
# whose 1-based number is listed in %H.
my $i = 1; #line read, even if skipped
my $j=1;  #line effectively output
my $skipped=0;
while (<STDIN>) {
	if (exists $H{$i}){
		$skipped++;
	}else{
		# -5k mode: stop as soon as the allowed number of records is out.
		last if ( ($fiveK == 1) && ($j >$data{$lang}{$pref}) );
		print "$_";
		$j++;
	}
	$i++;
}
print STDERR "$skipped sentences skipped\n";
print STDERR "$lang = $data{$lang}{$pref} sentences\n" if ($fiveK == 1);
88 |              
89 | __END__
90 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/tedeval-2.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/TedWrappers_20131015/tedeval-2.2.jar


--------------------------------------------------------------------------------
/TedWrappers_20131015/tedeval.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/TedWrappers_20131015/tedeval.jar


--------------------------------------------------------------------------------
/TedWrappers_20131015/tedeval_cross2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | 
 4 | PROGDIR=`dirname $0`
 5 | TEDEVALJAR=$PROGDIR/tedeval-2.2.jar
 6 | TEDEVALJARAPP=$PROGDIR/TedEvalApps.jar
 7 | 
 8 | 
 9 | PREF=$FINAL/READY_TO_SHIP_FINAL/FRENCH_SPMRL/
10 | GOLDCONLL=$PREF/gold/conll/test/test.French.gold.conll
11 | GOLDPTB=$PREF/gold/ptb/test/test.French.gold.ptb
12 | TESTCONLL=$1
13 | TESTPTB=$2
14 | 
15 | #java -Xmx768m   -cop /Archive/workspace/unipar/bin/  applications.Dtreebank2Ftreebank
16 | 
17 | 
18 | cat $GOLDPTB | perl -pe 's/^\( /(TOP /' > $GOLDPTB.4tedeval
19 | cat $TESTPTB | perl -pe 's/^\( /(TOP /' > $TESTPTB.4tedeval.noeval
20 | GOLDPTB=$GOLDPTB.4tedeval
21 | TESTPTB=$TESTPTB.4tedeval.noeval
22 | 
23 | java -Xmx768m   -cp $TEDEVALJARAPP:$TEDEVALJAR:. applications.Dtreebank2Ftreebank $GOLDCONLL $GOLDCONLL.ftrees
24 | 
25 | java -Xmx768m   -cp $TEDEVALJARAPP:$TEDEVALJAR:. applications.Dtreebank2Ftreebank $TESTCONLL $TESTCONLL.ftrees
26 | 
27 | java -Xmx768m   -jar $TEDEVALJAR -p1 $TESTCONLL.ftrees -g1 $GOLDCONLL.ftrees -o1 $TESTCONLL.tedeval-res -p2 $TESTPTB -g2 $GOLDPTB -o2 $TESTPTB.tedeval-crossfram.res
28 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/tedeval_debug.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/TedWrappers_20131015/tedeval_debug.jar


--------------------------------------------------------------------------------
/TedWrappers_20131015/tedeval_seg.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | 
  3 | # wrapper script to make tedeval work on SPMRL Shared task data set
  4 | # Djame Seddah
  5 | 
  6 | # version  August 19, 03:49
  7 | # fixing java.nullPointer exception (removing sentence 908 and 1889 from dev arabic pred+pred)
  8 | # version  August 18, 02:14
  9 | # fixing java.nullPointer exception (removing sentence 799 from test test arabic pred+pred)
 10 | 
 11 | 
 12 | 
 13 | #  see
 14 | # http://stackoverflow.com/questions/402377/using-getopts-in-bash-shell-script-to-get-long-and-short-command-line-options/7680682#7680682
 15 | # options  
 16 | # 	-d (debug version), -n  
# 	-labeled, -unlabeled (*)
 18 | #	-ptb, -conll (*)
 19 | # 	-ar  (for arabic, default hebrew)
 20 | #   -test  (test set gold file used, default dev)
#   -cut   (cut-off length + bad sentences removed, fixed?)
 22 | #  -predfile FILE   (predicted parsed file)
 23 | #  -predmap  FILE  (predicted mapping file) # if not given, calculated
 24 | #  -gold FILE  
 25 | # -begin  starting line to be evaluated, 1 if nothing
 26 | # -end  end+1 line to be evaluated	+1000000 if nothing 
 27 | 
 28 | 
 29 | set -x
 30 | 
 31 | # VARIABLES
 32 | PROGDIR=`dirname $0`
 33 | TEDEVALJAR=$PROGDIR/tedeval-2.2.jar
 34 | #TEDEVALJAR=$PROGDIR/tedeval.jar
 35 | LABEL="-unlabeled"
 36 | TYPE="conll"
 37 | TYPENAME=conll
 38 | ARG="-c"
 39 | LANGUAGE=""
 40 | SKIPFILE=/dev/null
 41 | PREF=test
 42 | DOCUT=1
 43 | CUTOFF=700  #should be a parameter, later...
 44 | PREDFILE=""
 45 | PREDMAP=""
 46 | PREDLAT=""
 47 | GOLDFILE=""
 48 | GOLDLAT=""
 49 | LATSUF=tobeparsed.gold_tagged+gold_token.lattices
 50 | START=1
 51 | END=100000  # let's hope that no file will ever be that long (linewise)
 52 | SPMRLDATA_ROOTDIR=$SHARED/SPMRL_FINAL/READY_TO_SHIP_FINAL/
 53 | FIVEK=""
 54 | GOLDSEG=""
 55 | PREDSEG=""
 56 | 
 57 | if [ -z $1 ] ; then
 58 | 	echo "tedeval.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 
 59 |     -D | --debug 	 	use tedeval + debug outputs
 60 |     -n | --new 	   		use latest tedeval-2.2.jar (default is 2.1	
 61 | 	-u | --unlabeled 	unlabeled evaluation  (default)	 
 62 | 	-l | --labeled 	 	labeled evaluation
 63 | 	-p | --ptb 	 		evaluate const. files
 64 | 	-c | --conll 	 	evaluate conll files (default)
 65 | 	-a | --arabic 	 	dev mode for Arabic, do not use 
 66 | 	-h | --hebrew 	 	dev mode for Hebrew, do not use 
 67 | 	-y | --any	LANGUAGE	LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...)
 68 | 	-t | --test 			dev mode, test file, do not use 
 69 | 	-d | --dev 	  			dev mode, test file, do not use 
 70 | 	-k | --cut 	 			CUTOFF mode (tedeval is really slow for long sentences	, length cutoff hardcoded to 70 
 71 | 	-s | --system FILE		test file to be evaluated 
 72 | 	-g | --gold FILE		gold standard file ;;
 73 | 	-P | --P FILE		pred seg
 74 | 	-G | --G FILE		gold seg 
 75 | 	-L | --predlat FILE		predicted lattice files as provided by  SPMRL. If not given, spmrl one will be used
 76 | 	-m | --predmap			mapping for predicted files (use it only when with non-spmrl predicted file	. Generated from predlat file otherwise
 77 | 	-b | --begin 			line ID to start evaluate
 78 | 	-e | --end 	 			line ID+1 to stop the evaluation 
 79 | 	--spmrldata_rootdir 	root directory to the SPMRL FINAL DATA SET (default \$FINAL/SPMRL_FINAL/READY_TO_SHIP_FINAL/	
 80 | 	--help 	
 81 | 	" ; 
 82 | 	exit
 83 | fi
 84 | 
 85 | echo "###########################"
 86 | echo "Running\: tedeval.sh $@"
 87 | echo "###########################"
 88 | echo "\n"
 89 | TEMP=`getopt -o DnlupcAHy:tdks:m:L:g:b:e:P:G: --long help,debug,new,labeled,unlabeled,ptb,conll,Arabic,Hebrew,any:,test,dev,cut,predfile:,predmap:,predlat:,gold:,begin:,end:,spmrldata_rootdir,fivek,P:,G: -- "$@"`
 90 | 
 91 | if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi
 92 | 
 93 | # Note the quotes around `$TEMP': they are essential!
 94 | eval set -- "$TEMP"
 95 | 
 96 | 
 97 | while true; do
 98 |   case "$1" in
 99 |     -D | --debug ) TEDEVALJAR=$PROGDIR/tedeval_debug.jar; shift ;;
100 |     -n | --new )   TEDEVALJAR=$PROGDIR/tedeval-2.2.jar; shift ;;
101 |         -P | --P ) PREDSEG="$2" ; shift 2;;
102 |         -G | --G ) GOLDSEG="$2" ; shift 2;;
103 | 	-u | --unlabeled ) LABEL="-unlabeled" ; shift ;;
104 | 	-l | --labeled ) LABEL="" ; shift ;;
105 | 	-p | --ptb ) TYPE=ptb ; TYPENAME=bracketed ; ARG="" ; shift   ;;
106 | 	-c | --conll ) TYPE=conll ; TYPENAME=conll ; ARG="-c" ; shift ;;
107 | 	-A | --arabic ) LANGUAGE=ARABIC ;SUF=""; SKIPFILE="" ; exit ; shift ;; 
108 | 	-H | --hebrew ) LANGUAGE=HEBREW  ; shift ;;
109 | 	-y | --any	) LANGUAGE="$2" ; shift 2 ;;
110 | 	-t | --test ) PREF="test" ; shift ;;
111 | 	-d | --dev )  PREF="dev"  ; shift ;;
112 | 	-k | --cut ) DOCUT=1 ; shift ;;
113 | 	-s | --system ) PREDFILE="$2" ; shift 2;;
114 | 	-g | --gold ) GOLDFILE="$2" ; shift 2;;
115 | 	-m | --predmap ) PREDMAP="$2" ; shift 2;;
116 | 	-L | --predlat ) PREDLAT="$2" ; shift 2;;
117 | 	-b | --begin )	START="$2"  ; shift 2;;
118 | 	-e | --end ) END="$2"  ; shift 2;;
119 | 	-R | --spmrldata_rootdir ) SPMRLDATA_ROOTDIR="$2" ; shift 2;;
120 | 	--fivek ) FIVEK="-5k" ; shift ;;
121 | 	--help )
122 | 	echo "tedeval_simple.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 
123 |     -D | --debug 	 	use tedeval + debug outputs
124 |     -n | --new 	   		use latest tedeval-2.2.jar (default is 2.1	
125 | 	-u | --unlabeled 	unlabeled evaluation  (default)	 
126 | 	-l | --labeled 	 	labeled evaluation
127 | 	-p | --ptb 	 		evaluate const. files
128 | 	-c | --conll 	 	evaluate conll files (default)
129 | 	-a | --arabic 	 	dev mode for Arabic, do not use 
130 | 	-h | --hebrew 	 	dev mode for Hebrew, do not use 
131 | 	-y | --any	LANGUAGE	LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...)
132 | 	-t | --test 			dev mode, test file, do not use 
133 | 	-d | --dev 	  			dev mode, test file, do not use 
134 | 	-k | --cut 	 			CUTOFF mode (tedeval is really slow for long sentences	, length cutoff hardcoded to 70 
135 | 	-s | --system FILE		test file to be evaluated 
136 | 	-g | --gold FILE		gold standard file ;;
137 |         -P | --P FILE         pred seg
138 |         -G | --G FILE         gold seg 
139 | 	-b | --begin 			line ID to start evaluate
140 | 	-e | --end 	 			line ID+1 to stop the evaluation 
141 | 	--spmrldata_rootdir 	root directory to the SPMRL FINAL DATA SET (default \$FINAL/SPMRL_FINAL/READY_TO_SHIP_FINAL/	
142 | 	--help 	
143 | 	" ; 
144 | 	shift ; exit 1;;
145 |     -- ) shift; break ;;
146 |     * ) break ;;
147 |   esac
148 | done
149 | 
150 | 
151 | # *** INIT
152 | # arabic lines bug
153 | #START=699 # should be 799 but 100 sentences > 70 were removed
154 | #START=167 # same bug as 699 but in labeled 
155 | 
156 | #END=700
157 | #END=168
# Derive the three spellings of the language name:
#   LANGUPPED  upper case          LDIR  upper case + "_SPMRL"
#   LANGUAGE   first letter upper case, rest lower case
LANGUPPED=$(echo $LANGUAGE | perl -ne 'print uc $_')
LDIR=${LANGUPPED}_SPMRL
LANGUAGE=$(echo $LANGUAGE|perl -p -ne '$_ = ucfirst lc $_ ;')

# --fivek: complete the switch with the upper-cased language name.
if [ -n "$FIVEK" ] ; then
	FIVEK="-5k $LANGUPPED"
fi


# Default name of the prediction file when none was given.
[ -n "$PREDFILE" ] || PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed$SUF

# Nothing to evaluate without a prediction file.
if [ ! -f "$PREDFILE" ] ; then
	echo "PREDFILE: $PREDFILE not found"
	exit 0
fi

echo "LDIR = $LDIR   LANGUAGE $LANGUAGE  PREDLAT $PREDLAT"
176 | #exit
177 | 
178 | 
179 | 
180 | 
181 | # we're always picking the 
# Default gold file, taken from the SPMRL data tree, when none was given.
[ -n "$GOLDFILE" ] || GOLDFILE=${SPMRLDATA_ROOTDIR}/${LDIR}/gold/$TYPE/${PREF}/${PREF}.${LANGUAGE}.gold.$TYPE



# checking all files


if [ ! -f "$GOLDFILE" ] ; then
	wc -L $GOLDFILE
	echo "gold: $GOLDFILE not found"
	exit
fi
echo "gold: $GOLDFILE found"



if [ ! -f "$PREDFILE" ] ; then
	echo "pred file: $PREDFILE not found"
	exit
fi
echo "pred file: $PREDFILE found"
207 | 
208 | 
209 | 
210 | #exit
211 | 
212 | 	
213 | #PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices
214 | 
215 | #  dev.Hebrew.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.parsed.lattices.5k
216 | # test.Arabic.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed.full
217 | 
218 | 
219 | # fixing the blank line bug
220 | #perl -i.bak -pe 's/^\s*$/\n/g' $GOLFILE $PREDFILE $PREDLAT $GOLDLAT
221 | 
222 | ####################################################
223 | ##  real workd done here
224 | ######################################################
225 | 
# Cut-off mode: write the numbers of the gold sentences longer than $CUTOFF to
# <gold>.tobeskipped and use that file as skip list; otherwise skip nothing.
if [ $DOCUT = 1 ] ; then
	echo "generating lines to be skipped"
	cat $GOLDFILE | perl -pe 's/^\s*$/\n/' | $PROGDIR/get_cutoffed_sent.pl -c -K $CUTOFF /dev/null > $GOLDFILE.tobeskipped
	case "$PREF.$LANGUAGE" in
		test.Arabic | dev.Arabic )
			# The buggy Arabic sentence numbers that used to be appended
			# here are disabled; only this filler is written.
			echo "\n" >>  $GOLDFILE.tobeskipped
			;;
	esac
	SKIPFILE=$GOLDFILE.tobeskipped
else
	SKIPFILE=/dev/null
fi
241 | 
242 | 
243 | #exit
244 |  
# TedEval is called with -sg / -sp further down, so both segmentation files
# are mandatory.  Without this check an empty $GOLDSEG or $PREDSEG turned the
# copy below into `cat > .4tedeval.<pid>`, which blocks reading standard input.
for seg in "$GOLDSEG" "$PREDSEG" ; do
	if [ ! -f "$seg" ] ; then
		echo "segmentation file '$seg' not found (use -G GOLDSEG and -P PREDSEG)" >&2
		exit 1
	fi
done

echo "generating normalized files"
# printf instead of `echo -e`: the latter is not understood by every /bin/sh.
printf '\t==> gold\n'
wc $ARG "$GOLDFILE"
# blank-line normalisation -> skip listed sentences -> keep START..END ->
# reprojectivize -> clean -> blank-line normalisation -> collapse repeated lines.
# $ARG and $FIVEK stay unquoted on purpose: they may be empty or hold two words.
cat "$GOLDFILE" | perl -pe 's/^\s*$/\n/' | "$PROGDIR"/skip_lines.pl $ARG $FIVEK "$SKIPFILE" | "$PROGDIR"/lines $ARG $START $END | "$PROGDIR"/reprojectivize.sh -$TYPE | "$PROGDIR"/clean$TYPE.pl | perl -pe 's/^\s*$/\n/' | uniq > "$GOLDFILE.4tedeval.$$"
wc $ARG "$GOLDFILE.4tedeval.$$"
cat "$GOLDSEG" > "$GOLDSEG.4tedeval.$$"

printf '\t==> pred\n'
wc $ARG "$PREDFILE"
cat "$PREDFILE" | perl -pe 's/^\s*$/\n/' | "$PROGDIR"/skip_lines.pl $ARG $FIVEK "$SKIPFILE" | "$PROGDIR"/lines $ARG $START $END | "$PROGDIR"/reprojectivize.sh -$TYPE | "$PROGDIR"/clean$TYPE.pl | perl -pe 's/^\s*$/\n/' | uniq > "$PREDFILE.4tedeval.$$" #no idea why twice
wc $ARG "$PREDFILE.4tedeval.$$"
cat "$PREDSEG" > "$PREDSEG.4tedeval.$$"
257 | #exit
258 | 
259 | # that was to generate a fake arabic parsed file
260 | #cat $PREDLAT | lines $ARG 1 $END | cut -f2-7 | add_fake_col.pl | clean$TYPE.pl | perl -pe 's/^\s*$/\n/' |uniq> $PREDLAT.fake
261 | 
262 | # normal  gold vs pred (LABELED)
263 | 
# Suffix of the result file.  "$LABEL" is quoted: it is empty in labeled mode
# and the unquoted test then failed with a syntax error (ending up in the
# else branch only by accident).
if test "$LABEL" = "-unlabeled" ; then
	SUF="-unlabeled"
else
	SUF="-labeled"
fi

#java  -Xmx768m -jar $TEDEVALJAR $LABEL -g  $GOLDFILE.4tedeval.$$  -p $PREDFILE.4tedeval.$$ -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF
# $LABEL stays unquoted on purpose: when empty it must vanish from the command.
java  -Xmx768m -jar "$TEDEVALJAR" $LABEL -g  "$GOLDFILE.4tedeval.$$" -sg "$GOLDSEG.4tedeval.$$" -p "$PREDFILE.4tedeval.$$" -sp "$PREDSEG.4tedeval.$$" -format $TYPENAME -o "$PREDFILE.4tedeval.simple_tedeval.res$SUF"
file="$PREDFILE.4tedeval.simple_tedeval.res$SUF.ted"



#echo "java  -Xmx768m -jar $TEDEVALJAR $LABEL -g  $GOLDFILE.4tedeval.$$  -p $PREDFILE.4tedeval.$$ -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF" > /dev/stderr
# Echo the command that was run on standard error (>&2 does not depend on a
# /dev/stderr device being available).
echo "java  -Xmx768m -jar $TEDEVALJAR $LABEL -g  $GOLDFILE.4tedeval.$$ -sg $GOLDSEG.4tedeval.$$ -p $PREDFILE.4tedeval.$$ -sp $PREDSEG.4tedeval.$$ -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF" >&2
echo " "
# Keep the AVG: row, append the result file name and reformat it.
grep "AVG:" "$file" |  perl -p -s -e 'chomp ; s/(.)$/\1\t$file \n/' -- -file="$file" | "$PROGDIR"/get_ted_res.pl


# Three newlines, as `echo -e "\n\n"` prints in shells that support -e.
printf '\n\n\n'

# Remove the per-run temporary copies.
rm -f "$GOLDFILE.4tedeval.$$" "$PREDFILE.4tedeval.$$" "$GOLDSEG.4tedeval.$$" "$PREDSEG.4tedeval.$$"

#eval gold vs gold => 100%
#java -Xmx3g -jar $TEDEVALJAR -g  $GOLDFILE.4tedeval -sg $GOLDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE

# eval pred vs gold
#java -server -Xmx3g -jar $TEDEVALJAR -g  $PREDFILE.4tedeval.$$ -sg $PREDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE

# eval pred vs pred => 100%
#java -server -Xmx3g -jar $TEDEVALJAR -g  $PREDFILE.4tedeval.$$ -sg $PREDLAT.4tedeval.mapping -p $PREDFILE.4tedeval.$$ -sp $PREDLAT.4tedeval.mapping -format $TYPE

exit
296 | 
297 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/tedeval_simple.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | 
  3 | # wrapper script to make tedeval work on SPMRL Shared task data set
  4 | # Djame Seddah
  5 | 
  6 | # version  August 19, 03:49
  7 | # fixing java.nullPointer exception (removing sentence 908 and 1889 from dev arabic pred+pred)
  8 | # version  August 18, 02:14
  9 | # fixing java.nullPointer exception (removing sentence 799 from test test arabic pred+pred)
 10 | 
 11 | 
 12 | 
 13 | #  see
 14 | # http://stackoverflow.com/questions/402377/using-getopts-in-bash-shell-script-to-get-long-and-short-command-line-options/7680682#7680682
 15 | # options  
 16 | # 	-d (debug version), -n  
 17 | # 	-labeled, -unlabeled (*)
 18 | #	-ptb, -conll (*)
 19 | # 	-ar  (for arabic, default hebrew)
 20 | #   -test  (test set gold file used, default dev)
 21 | #   -cut   (cut-off length + bad sentences removed, fixed?)
 22 | #  -predfile FILE   (predicted parsed file)
 23 | #  -predmap  FILE  (predicted mapping file) # if not given, calculated
 24 | #  -gold FILE  
 25 | # -begin  starting line to be evaluated, 1 if nothing
 26 | # -end  end+1 line to be evaluated	+1000000 if nothing 
 27 | 
 28 | 
 29 | set -x  # trace each command as it is executed
 30 | 
 31 | # VARIABLES
 32 | PROGDIR=`dirname $0`  # directory holding this script; the tedeval jars are looked up here
 33 | TEDEVALJAR=$PROGDIR/tedeval-2.2.jar  # default jar; -D/--debug switches to tedeval_debug.jar
 34 | #TEDEVALJAR=$PROGDIR/tedeval.jar
 35 | LABEL="-unlabeled"  # flag handed to the jar; -l/--labeled resets it to the empty string
 36 | TYPE="conll"  # selects clean$TYPE.pl and the -$TYPE flag of reprojectivize.sh; -p switches to ptb
 37 | TYPENAME=conll  # value given to the jar's -format option; -p switches to bracketed
 38 | ARG="-c"  # option forwarded to wc, skip_lines.pl and lines; emptied by -p
 39 | LANGUAGE=""  # set by -A, -H or -y; used in default file names
 40 | SKIPFILE=/dev/null  # file handed to skip_lines.pl; replaced by $GOLDFILE.tobeskipped when DOCUT is 1
 41 | PREF=test  # data-set prefix (test or dev) used in default file names
 42 | DOCUT=1  # 1 enables the cutoff step; -k/--cut also sets it to 1
 43 | CUTOFF=70  #should be a parameter, later...
 44 | PREDFILE=""  # system output to evaluate (-s); a default name is derived when empty
 45 | PREDMAP=""  # set by -m; not referenced anywhere else in this script
 46 | PREDLAT=""  # set by -L; only echoed for information in this script
 47 | GOLDFILE=""  # gold file (-g); a default path is derived when empty
 48 | GOLDLAT=""  # appears only in commented-out commands in this script
 49 | LATSUF=tobeparsed.gold_tagged+gold_token.lattices  # not referenced anywhere else in this script
 50 | START=1  # forwarded to the lines helper (-b/--begin)
 51 | END=100000  # let's hope that no file will ever be that long (linewise)
 52 | SPMRLDATA_ROOTDIR=$SHARED/SPMRL_FINAL/READY_TO_SHIP_FINAL/  # built from the SHARED environment variable; base of the default gold path
 53 | FIVEK=""  # becomes "-5k" with --fivek and is then forwarded to skip_lines.pl
 54 | 
 55 | if [ -z "$1" ] ; then  # no (or empty) first argument: print the usage text and stop
 56 | 	echo "tedeval_simple.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 
 57 |     -D | --debug 	 	use tedeval + debug outputs
 58 |     -n | --new 	   		use latest tedeval-2.2.jar (already the default)
 59 | 	-u | --unlabeled 	unlabeled evaluation  (default)	 
 60 | 	-l | --labeled 	 	labeled evaluation
 61 | 	-p | --ptb 	 		evaluate const. files
 62 | 	-c | --conll 	 	evaluate conll files (default)
 63 | 	-A | --arabic 	 	dev mode for Arabic, do not use 
 64 | 	-H | --hebrew 	 	dev mode for Hebrew, do not use 
 65 | 	-y | --any	LANGUAGE	LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...)
 66 | 	-t | --test 			dev mode, test file, do not use 
 67 | 	-d | --dev 	  			dev mode, dev file, do not use 
 68 | 	-k | --cut 	 			CUTOFF mode (tedeval is really slow for long sentences), length cutoff hardcoded to 70 
 69 | 	-s | --system FILE		test file to be evaluated 
 70 | 	-g | --gold FILE		gold standard file
 71 | 	-L | --predlat FILE		predicted lattice files as provided by  SPMRL. If not given, spmrl one will be used
 72 | 	-m | --predmap			mapping for predicted files (use it only when with non-spmrl predicted file). Generated from predlat file otherwise
 73 | 	-b | --begin 			line ID to start evaluate
 74 | 	-e | --end 	 			line ID+1 to stop the evaluation 
 75 | 	--spmrldata_rootdir 	root directory to the SPMRL FINAL DATA SET (default \$SHARED/SPMRL_FINAL/READY_TO_SHIP_FINAL/)
 76 | 	--help 	
 77 | 	" ; 
 78 | 	exit
 79 | fi
 80 | 
 81 | echo "###########################"
 82 | echo "Running: tedeval_simple.sh $*"
 83 | echo "###########################"
 84 | echo ""
 85 | TEMP=`getopt -o DnlupcAHy:tdks:m:L:g:b:e:R: --long help,debug,new,labeled,unlabeled,ptb,conll,arabic,Arabic,hebrew,Hebrew,any:,test,dev,cut,system:,predfile:,predmap:,predlat:,gold:,begin:,end:,spmrldata_rootdir:,fivek -- "$@"`
 86 | 
 87 | if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi
 88 | 
 89 | # Note the quotes around `$TEMP': they are essential!
 90 | eval set -- "$TEMP"
 91 | 
 92 | # Every option name declared to getopt above must have a matching pattern below; an unmatched one hits "*" and silently ends parsing.
 93 | while true; do
 94 |   case "$1" in
 95 |     -D | --debug ) TEDEVALJAR=$PROGDIR/tedeval_debug.jar; shift ;;
 96 |     -n | --new )   TEDEVALJAR=$PROGDIR/tedeval-2.2.jar; shift ;;
 97 | 	-u | --unlabeled ) LABEL="-unlabeled" ; shift ;;
 98 | 	-l | --labeled ) LABEL="" ; shift ;;
 99 | 	-p | --ptb ) TYPE=ptb ; TYPENAME=bracketed ; ARG="" ; shift   ;;
100 | 	-c | --conll ) TYPE=conll ; TYPENAME=conll ; ARG="-c" ; shift ;;
101 | 	-A | --arabic | --Arabic ) LANGUAGE=ARABIC ;SUF=""; SKIPFILE="" ; exit ; shift ;; # NOTE(review): the bare exit ends the script here; looks like a deliberately disabled dev option - confirm
102 | 	-H | --hebrew | --Hebrew ) LANGUAGE=HEBREW  ; shift ;;
103 | 	-y | --any	) LANGUAGE="$2" ; shift 2 ;;
104 | 	-t | --test ) PREF="test" ; shift ;;
105 | 	-d | --dev )  PREF="dev"  ; shift ;;
106 | 	-k | --cut ) DOCUT=1 ; shift ;;
107 | 	-s | --system | --predfile ) PREDFILE="$2" ; shift 2;;
108 | 	-g | --gold ) GOLDFILE="$2" ; shift 2;;
109 | 	-m | --predmap ) PREDMAP="$2" ; shift 2;;
110 | 	-L | --predlat ) PREDLAT="$2" ; shift 2;;
111 | 	-b | --begin )	START="$2"  ; shift 2;;
112 | 	-e | --end ) END="$2"  ; shift 2;;
113 | 	-R | --spmrldata_rootdir ) SPMRLDATA_ROOTDIR="$2" ; shift 2;;
114 | 	--fivek ) FIVEK="-5k" ; shift ;;
115 | 	--help )
116 | 	echo "tedeval_simple.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 
117 |     -D | --debug 	 	use tedeval + debug outputs
118 |     -n | --new 	   		use latest tedeval-2.2.jar (already the default)
119 | 	-u | --unlabeled 	unlabeled evaluation  (default)	 
120 | 	-l | --labeled 	 	labeled evaluation
121 | 	-p | --ptb 	 		evaluate const. files
122 | 	-c | --conll 	 	evaluate conll files (default)
123 | 	-A | --arabic 	 	dev mode for Arabic, do not use 
124 | 	-H | --hebrew 	 	dev mode for Hebrew, do not use 
125 | 	-y | --any	LANGUAGE	LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...)
126 | 	-t | --test 			dev mode, test file, do not use 
127 | 	-d | --dev 	  			dev mode, dev file, do not use 
128 | 	-k | --cut 	 			CUTOFF mode (tedeval is really slow for long sentences), length cutoff hardcoded to 70 
129 | 	-s | --system FILE		test file to be evaluated 
130 | 	-g | --gold FILE		gold standard file
131 | 	-b | --begin 			line ID to start evaluate
132 | 	-e | --end 	 			line ID+1 to stop the evaluation 
133 | 	-R | --spmrldata_rootdir DIR 	root directory to the SPMRL FINAL DATA SET (default \$SHARED/SPMRL_FINAL/READY_TO_SHIP_FINAL/)
134 | 	--help 	
135 | 	" ; 
136 | 	shift ; exit 1;;
137 |     -- ) shift; break ;;
138 |     * ) break ;;
139 |   esac
140 | done
141 | 
142 | 
143 | # *** INIT
144 | # arabic lines bug
145 | #START=699 # should be 799 but 100 sentences > 70 were removed
146 | #START=167 # same bug as 699 but in labeled 
147 | 
148 | #END=700
149 | #END=168
150 | LANGUPPED=`echo $LANGUAGE | perl -ne 'print uc $_'`  # upper-cased language name, e.g. ARABIC
151 | LDIR=${LANGUPPED}_SPMRL  # per-language directory name used in the default gold path
152 | LANGUAGE=`echo $LANGUAGE|perl -p -ne '$_ = ucfirst lc $_ ;'`  # capitalised form (Arabic) used in file names
153 | if [ -n "$FIVEK" ] ; then  # --fivek was given: the flag also carries the upper-cased language
154 | 	FIVEK="-5k $LANGUPPED"
155 | fi
156 | 
157 | 
158 | if [ -z "$PREDFILE" ]; then  # no -s given: derive the default predicted-file name
159 | 		PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed$SUF
160 | fi
161 | # A missing predicted file ends the run, but with exit status 0.
162 | if [ ! -f "$PREDFILE" ] ; then
163 | 	echo "PREDFILE: $PREDFILE not found"
164 | 	exit 0
165 | fi
166 | 
167 | echo "LDIR = $LDIR   LANGUAGE $LANGUAGE  PREDLAT $PREDLAT"
168 | #exit
169 | 
170 | 
171 | 
172 | 
173 | # No gold file given: fall back to the SPMRL data layout under $SPMRLDATA_ROOTDIR.
174 | if [ -z "$GOLDFILE" ] ; then
175 | 	GOLDFILE=${SPMRLDATA_ROOTDIR}/${LDIR}/gold/$TYPE/${PREF}/${PREF}.${LANGUAGE}.gold.$TYPE
176 | fi
177 | 
178 | 	
179 | 
180 | # checking all files
181 | # The bare "exit" statements below return the status of the preceding echo.
182 | 
183 | if [ -f "$GOLDFILE" ] ; then
184 | 	echo "gold: $GOLDFILE found"
185 | else
186 | 	wc -L $GOLDFILE
187 | 	echo "gold: $GOLDFILE not found"
188 | 	exit
189 | fi
190 | 
191 | 
192 | # PREDFILE was already checked above, so the else branch here is never taken.
193 | if [ -f "$PREDFILE" ] ; then
194 | 	echo "pred file: $PREDFILE found"
195 | else
196 | 	echo "pred file: $PREDFILE not found"
197 | 	exit
198 | fi
199 | 
200 | 
201 | 
202 | #exit
203 | 
204 | 	
205 | #PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices
206 | 
207 | #  dev.Hebrew.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.parsed.lattices.5k
208 | # test.Arabic.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed.full
209 | 
210 | 
211 | # fixing the blank line bug
212 | #perl -i.bak -pe 's/^\s*$/\n/g' $GOLFILE $PREDFILE $PREDLAT $GOLDLAT
213 | 
214 | ####################################################
215 | ##  real work done here
216 | ######################################################
217 | 
218 | if test "$DOCUT" = 1 ; then  # cutoff mode: get_cutoffed_sent.pl writes the skip list next to the gold file
219 | 	echo "generating lines to be skipped"
220 | 	perl -pe 's/^\s*$/\n/' < "$GOLDFILE" | "$PROGDIR"/get_cutoffed_sent.pl -c -K "$CUTOFF" /dev/null > "$GOLDFILE.tobeskipped"
221 | 	if test "$PREF.$LANGUAGE" = "test.Arabic" ; then
222 | 		#echo -e "\n799" >> $GOLDFILELAT.tobeskipped # buggy sentence for unlabeld evaluation on arabic test
223 | 		echo "\n" >>  "$GOLDFILE.tobeskipped"
224 | 	elif test "$PREF.$LANGUAGE" = "dev.Arabic" ; then
225 | 		#echo -e "\n904\n1889" >>  $GOLDFILELAT.tobeskipped #buggy sentence for unlabeld evaluation on arabic dev
226 | 		echo "\n" >>  "$GOLDFILE.tobeskipped"
227 | 
228 | 	fi
229 | 	SKIPFILE=$GOLDFILE.tobeskipped
230 | else
231 | 	SKIPFILE=/dev/null  # nothing to skip
232 | fi
233 | 
234 | 
235 | #exit
236 | # Run gold and pred through the same helper pipeline (helpers taken from $PROGDIR); $ARG and $FIVEK stay unquoted because they may be empty or hold two words.
237 | echo "generating normalized files"
238 | printf '\t==> gold\n'
239 | wc $ARG "$GOLDFILE"
240 | cat "$GOLDFILE"|perl -pe 's/^\s*$/\n/'| "$PROGDIR"/skip_lines.pl $ARG $FIVEK  "$SKIPFILE"  | "$PROGDIR"/lines $ARG $START $END | "$PROGDIR"/reprojectivize.sh -$TYPE|   "$PROGDIR"/clean$TYPE.pl| perl -pe 's/^\s*$/\n/' |uniq > "$GOLDFILE.4tedeval.$$"
241 | wc $ARG "$GOLDFILE.4tedeval.$$"
242 | 
243 | printf '\t==> pred\n'
244 | wc $ARG "$PREDFILE"
245 | cat "$PREDFILE"|perl -pe 's/^\s*$/\n/'|"$PROGDIR"/skip_lines.pl $ARG $FIVEK "$SKIPFILE"  | "$PROGDIR"/lines $ARG $START $END | "$PROGDIR"/reprojectivize.sh -$TYPE|   "$PROGDIR"/clean$TYPE.pl| perl -pe 's/^\s*$/\n/' |uniq > "$PREDFILE.4tedeval.$$" #no idea why twice
246 | wc $ARG "$PREDFILE.4tedeval.$$"
247 | #exit
248 | 
249 | # that was to generate a fake arabic parsed file
250 | #cat $PREDLAT | lines $ARG 1 $END | cut -f2-7 | add_fake_col.pl | clean$TYPE.pl | perl -pe 's/^\s*$/\n/' |uniq> $PREDLAT.fake
251 | 
252 | # normal  gold vs pred (LABELED)
253 | 
254 | if test "$LABEL" = "-unlabeled" ; then  # quoted: -l/--labeled sets LABEL to the empty string
255 | 	SUF="-unlabeled"
256 | else
257 | 	SUF="-labeled"
258 | fi
259 | # $LABEL stays unquoted in the jar call so that an empty value adds no argument.
260 | java  -Xmx768m -jar "$TEDEVALJAR" $LABEL -g  "$GOLDFILE.4tedeval.$$"  -p "$PREDFILE.4tedeval.$$" -format $TYPENAME -o "$PREDFILE.4tedeval.simple_tedeval.res$SUF"
261 | file="$PREDFILE.4tedeval.simple_tedeval.res$SUF.ted"
262 | 
263 | 
264 | # Echo the command that was run on stderr, then print the AVG: line of the result file via get_ted_res.pl.
265 | echo "java  -Xmx768m -jar $TEDEVALJAR $LABEL -g  $GOLDFILE.4tedeval.$$  -p $PREDFILE.4tedeval.$$ -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF" >&2
266 | echo " "
267 | grep "AVG:" "$file" |  perl -p -s -e 'chomp ; s/(.)$/\1\t$file \n/' -- -file="$file" | "$PROGDIR"/get_ted_res.pl
268 | 
269 | 
270 | printf '\n\n\n'
271 | 
272 | rm -f "$GOLDFILE.4tedeval.$$" "$PREDFILE.4tedeval.$$"
273 | 
274 | #eval gold vs gold => 100%
275 | #java -Xmx3g -jar $TEDEVALJAR -g  $GOLDFILE.4tedeval -sg $GOLDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE
276 | 
277 | # eval pred vs gold
278 | #java -server -Xmx3g -jar $TEDEVALJAR -g  $PREDFILE.4tedeval.$$ -sg $PREDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE
279 | 
280 | # eval pred vs pred => 100%
281 | #java -server -Xmx3g -jar $TEDEVALJAR -g  $PREDFILE.4tedeval.$$ -sg $PREDLAT.4tedeval.mapping -p $PREDFILE.4tedeval.$$ -sp $PREDLAT.4tedeval.mapping -format $TYPE
282 | 
283 | exit
284 | 
285 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/tedeval_simple.sh.good:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | 
  3 | # wrapper script to make tedeval work on SPMRL Shared task data set
  4 | # Djame Seddah
  5 | 
  6 | # version  August 19, 03:49
  7 | # fixing java.nullPointer exception (removing sentence 908 and 1889 from dev arabic pred+pred)
  8 | # version  August 18, 02:14
  9 | # fixing java.nullPointer exception (removing sentence 799 from test test arabic pred+pred)
 10 | 
 11 | 
 12 | 
 13 | #  see
 14 | # http://stackoverflow.com/questions/402377/using-getopts-in-bash-shell-script-to-get-long-and-short-command-line-options/7680682#7680682
 15 | # options  
 16 | # 	-d (debug version), -n  
 17 | # 	-labeled, -unlabeled (*)
 18 | #	-ptb, -conll (*)
 19 | # 	-ar  (for arabic, default hebrew)
 20 | #   -test  (test set gold file used, default dev)
 21 | #   -cut   (cut-off length + bad sentences removed, fixed?)
 22 | #  -predfile FILE   (predicted parsed file)
 23 | #  -predmap  FILE  (predicted mapping file) # if not given, calculated
 24 | #  -gold FILE  
 25 | # -begin  starting line to be evaluated, 1 if nothing
 26 | # -end  end+1 line to be evaluated	+1000000 if nothing 
 27 | 
 28 | 
 29 | #set -x
 30 | 
 31 | # VARIABLES
 32 | PROGDIR=`dirname $0`
 33 | TEDEVALJAR=$PROGDIR/tedeval-2.2.jar
 34 | #TEDEVALJAR=$PROGDIR/tedeval.jar
 35 | LABEL="-unlabeled"
 36 | TYPE="conll"
 37 | TYPENAME=conll
 38 | ARG="-c"
 39 | LANGUAGE=""
 40 | SKIPFILE=/dev/null
 41 | PREF=test
 42 | DOCUT=1
 43 | CUTOFF=70  #should be a parameter, later...
 44 | PREDFILE=""
 45 | PREDMAP=""
 46 | PREDLAT=""
 47 | GOLDFILE=""
 48 | GOLDLAT=""
 49 | LATSUF=tobeparsed.gold_tagged+gold_token.lattices
 50 | START=1
 51 | END=100000  # let's hope that no file will ever be that long (linewise)
 52 | SPMRLDATA_ROOTDIR=$SHARED/SPMRL_FINAL/READY_TO_SHIP_FINAL/
 53 | 
 54 | 
 55 | if [ -z $1 ] ; then
 56 | 	echo "tedeval.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 
 57 |     -D | --debug 	 	use tedeval + debug outputs
 58 |     -n | --new 	   		use latest tedeval-2.2.jar (default is 2.1	
 59 | 	-u | --unlabeled 	unlabeled evaluation  (default)	 
 60 | 	-l | --labeled 	 	labeled evaluation
 61 | 	-p | --ptb 	 		evaluate const. files
 62 | 	-c | --conll 	 	evaluate conll files (default)
 63 | 	-a | --arabic 	 	dev mode for Arabic, do not use 
 64 | 	-h | --hebrew 	 	dev mode for Hebrew, do not use 
 65 | 	-y | --any	LANGUAGE	LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...)
 66 | 	-t | --test 			dev mode, test file, do not use 
 67 | 	-d | --dev 	  			dev mode, test file, do not use 
 68 | 	-k | --cut 	 			CUTOFF mode (tedeval is really slow for long sentences	, length cutoff hardcoded to 70 
 69 | 	-s | --system FILE		test file to be evaluated 
 70 | 	-g | --gold FILE		gold standard file ;;
 71 | 	-L | --predlat FILE		predicted lattice files as provided by  SPMRL. If not given, spmrl one will be used
 72 | 	-m | --predmap			mapping for predicted files (use it only when with non-spmrl predicted file	. Generated from predlat file otherwise
 73 | 	-b | --begin 			line ID to start evaluate
 74 | 	-e | --end 	 			line ID+1 to stop the evaluation 
 75 | 	--spmrldata_rootdir 	root directory to the SPMRL FINAL DATA SET (default \$FINAL/SPMRL_FINAL/READY_TO_SHIP_FINAL/	
 76 | 	--help 	
 77 | 	" ; 
 78 | 	exit
 79 | fi
 80 | 
 81 | echo "###########################"
 82 | echo "Running\: tedeval.sh $@"
 83 | echo "###########################"
 84 | echo "\n"
 85 | TEMP=`/sw/bin/getopt -o DnlupcAHy:tdks:m:L:g:b:e: --long help,debug,new,labeled,unlabeled,ptb,conll,Arabic,Hebrew,any:,test,dev,cut,predfile:,predmap:,predlat:,gold:,begin:,end:,spmrldata_rootdir -- "$@"`
 86 | 
 87 | if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi
 88 | 
 89 | # Note the quotes around `$TEMP': they are essential!
 90 | eval set -- "$TEMP"
 91 | 
 92 | 
 93 | while true; do
 94 |   case "$1" in
 95 |     -D | --debug ) TEDEVALJAR=$PROGDIR/tedeval_debug.jar; shift ;;
 96 |     -n | --new )   TEDEVALJAR=$PROGDIR/tedeval-2.2.jar; shift ;;
 97 | 	-u | --unlabeled ) LABEL="-unlabeled" ; shift ;;
 98 | 	-l | --labeled ) LABEL="" ; shift ;;
 99 | 	-p | --ptb ) TYPE=ptb ; TYPENAME=bracketed ; ARG="" ; shift   ;;
100 | 	-c | --conll ) TYPE=conll ; TYPENAME=conll ; ARG="-c" ; shift ;;
101 | 	-A | --arabic ) LANGUAGE=ARABIC ;SUF=""; SKIPFILE=""; echo "ici arabic" ; exit ; shift ;; 
102 | 	-H | --hebrew ) LANGUAGE=HEBREW  ; shift ;;
103 | 	-y | --any	) LANGUAGE="$2" ; shift 2 ;;
104 | 	-t | --test ) PREF="test" ; shift ;;
105 | 	-d | --dev )  PREF="dev"  ; shift ;;
106 | 	-k | --cut ) DOCUT=1 ; shift ;;
107 | 	-s | --system ) PREDFILE="$2" ; shift 2;;
108 | 	-g | --gold ) GOLDFILE="$2" ; shift 2;;
109 | 	-m | --predmap ) PREDMAP="$2" ; shift 2;;
110 | 	-L | --predlat ) PREDLAT="$2" ; shift 2;;
111 | 	-b | --begin )	START="$2"  ; shift 2;;
112 | 	-e | --end ) END="$2"  ; shift 2;;
113 | 	-R | --spmrldata_rootdir ) SPMRLDATA_ROOTDIR="$2" ; shift 2;;
114 | 	--help )
115 | 	echo "tedeval_simple.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 
116 |     -D | --debug 	 	use tedeval + debug outputs
117 |     -n | --new 	   		use latest tedeval-2.2.jar (default is 2.1	
118 | 	-u | --unlabeled 	unlabeled evaluation  (default)	 
119 | 	-l | --labeled 	 	labeled evaluation
120 | 	-p | --ptb 	 		evaluate const. files
121 | 	-c | --conll 	 	evaluate conll files (default)
122 | 	-a | --arabic 	 	dev mode for Arabic, do not use 
123 | 	-h | --hebrew 	 	dev mode for Hebrew, do not use 
124 | 	-y | --any	LANGUAGE	LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...)
125 | 	-t | --test 			dev mode, test file, do not use 
126 | 	-d | --dev 	  			dev mode, test file, do not use 
127 | 	-k | --cut 	 			CUTOFF mode (tedeval is really slow for long sentences	, length cutoff hardcoded to 70 
128 | 	-s | --system FILE		test file to be evaluated 
129 | 	-g | --gold FILE		gold standard file ;;
130 | 	-b | --begin 			line ID to start evaluate
131 | 	-e | --end 	 			line ID+1 to stop the evaluation 
132 | 	--spmrldata_rootdir 	root directory to the SPMRL FINAL DATA SET (default \$FINAL/SPMRL_FINAL/READY_TO_SHIP_FINAL/	
133 | 	--help 	
134 | 	" ; 
135 | 	shift ; exit 1;;
136 |     -- ) shift; break ;;
137 |     * ) break ;;
138 |   esac
139 | done
140 | 
141 | # arabic lines bug
142 | #START=699 # should be 799 but 100 sentences > 70 were removed
143 | #START=167 # same bug as 699 but in labeled 
144 | 
145 | #END=700
146 | #END=168
147 | LDIR=`echo $LANGUAGE | perl -ne 'print uc $_'`_SPMRL
148 | LANGUAGE=`echo $LANGUAGE|perl -p -ne '$_ = ucfirst lc $_ ;'`
149 | 
150 | if [ -z "$PREDFILE" ]; then
151 | 		PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed$SUF
152 | fi
153 | 
154 | if [ ! -f "$PREDFILE" ] ; then
155 | 	echo "PREDFILE: $PREDFILE not found"
156 | 	exit 0
157 | fi
158 | 
159 | echo "LDIR = $LDIR   LANGUAGE $LANGUAGE  PREDLAT $PREDLAT"
160 | #exit
161 | 
162 | 
163 | 
164 | 
165 | # we're always picking the 
166 | if [ -z "$GOLDFILE" ] ; then
167 | 	GOLDFILE=${PREF}.${LANGUAGE}.gold.$TYPE
168 | fi
169 | 
170 | 	
171 | 
172 | # checking all files
173 | 
174 | 
175 | if [ -f "$GOLDFILE" ] ; then
176 | 	echo "gold: $GOLDFILE found"
177 | else
178 | 	wc -L $GOLDFILE
179 | 	echo "gold: $GOLDFILE not found"
180 | 	exit
181 | fi
182 | 
183 | 
184 | 
185 | if [ -f "$PREDFILE" ] ; then
186 | 	echo "pred file: $PREDFILE found"
187 | else
188 | 	echo "pred file: $PREDFILE not found"
189 | 	exit
190 | fi
191 | 
192 | 
193 | 
194 | #exit
195 | 
196 | 	
197 | #PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices
198 | 
199 | #  dev.Hebrew.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.parsed.lattices.5k
200 | # test.Arabic.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed.full
201 | 
202 | 
203 | # fixing the blank line bug
204 | #perl -i.bak -pe 's/^\s*$/\n/g' $GOLFILE $PREDFILE $PREDLAT $GOLDLAT
205 | 
206 | ####################################################
207 | ##  real work done here
208 | ######################################################
209 | 
210 | if test $DOCUT = 1 ; then
211 | 	echo "generating lines to be skipped"
212 | 	cat $GOLDFILE | perl -pe 's/^\s*$/\n/' |get_cutoffed_sent.pl -c -K 70 /dev/null > $GOLDFILE.tobeskipped
213 | 	if test "$PREF.$LANGUAGE" = "test.Arabic" ; then
214 | 		#echo -e "\n799" >> $GOLDFILELAT.tobeskipped # buggy sentence for unlabeld evaluation on arabic test
215 | 		echo "\n" >>  $GOLDFILE.tobeskipped
216 | 	elif test "$PREF.$LANGUAGE" = "dev.Arabic" ; then
217 | 		#echo -e "\n904\n1889" >>  $GOLDFILELAT.tobeskipped #buggy sentence for unlabeld evaluation on arabic dev
218 | 		echo "\n" >>  $GOLDFILE.tobeskipped
219 | 
220 | 	fi
221 | 	SKIPFILE=$GOLDFILE.tobeskipped
222 | else
223 | 	SKIPFILE=/dev/null
224 | fi
225 | 
226 | 
227 | #exit
228 |  
229 | echo "generating normalized files"
230 | echo -e "\t==> gold"
231 | wc $ARG $GOLDFILE
232 | cat $GOLDFILE|perl -pe 's/^\s*$/\n/'| skip_lines.pl $ARG $SKIPFILE  | lines $ARG $START $END | reprojectivize.sh -$TYPE|   clean$TYPE.pl| perl -pe 's/^\s*$/\n/' |uniq > $GOLDFILE.4tedeval.$$
233 | wc $ARG $GOLDFILE.4tedeval.$$
234 | 
235 | echo -e "\t==> pred"
236 | wc $ARG $PREDFILE
237 | cat $PREDFILE|perl -pe 's/^\s*$/\n/'|skip_lines.pl $ARG $SKIPFILE  | lines $ARG $START $END | reprojectivize.sh -$TYPE|   clean$TYPE.pl| perl -pe 's/^\s*$/\n/' |uniq > $PREDFILE.4tedeval #no idea why twice
238 | wc $ARG $PREDFILE.4tedeval
239 | #exit
240 | 
241 | # that was to generate a fake arabic parsed file
242 | #cat $PREDLAT | lines $ARG 1 $END | cut -f2-7 | add_fake_col.pl | clean$TYPE.pl | perl -pe 's/^\s*$/\n/' |uniq> $PREDLAT.fake
243 | 
244 | # normal  gold vs pred (LABELED)
245 | 
246 | if test "$LABEL" = "-unlabeled" ; then  # quoted: -l/--labeled sets LABEL to the empty string
247 | 	SUF="-unlabeled"
248 | else
249 | 	SUF="-labeled"
250 | fi
251 | # SUF is computed before the jar runs because it is part of the -o result name; -format is a separate argument from the -p file.
252 | java -server -Xmx768m -jar $TEDEVALJAR $LABEL -g  $GOLDFILE.4tedeval.$$  -p $PREDFILE.4tedeval -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF
253 | echo "java -server -Xmx768m -jar $TEDEVALJAR $LABEL -g  $GOLDFILE.4tedeval.$$  -p $PREDFILE.4tedeval -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF"
254 | 
255 | echo "\n\n"
256 | #rm -f $GOLDFILE.4tedeval.$$ $GOLDLAT.4tedeval.mapping.$$ $PREDMAP
257 | 
258 | #eval gold vs gold => 100%
259 | #java -Xmx3g -jar $TEDEVALJAR -g  $GOLDFILE.4tedeval -sg $GOLDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE
260 | 
261 | # eval pred vs gold
262 | #java -server -Xmx3g -jar $TEDEVALJAR -g  $PREDFILE.4tedeval -sg $PREDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE
263 | 
264 | # eval pred vs pred => 100%
265 | #java -server -Xmx3g -jar $TEDEVALJAR -g  $PREDFILE.4tedeval -sg $PREDLAT.4tedeval.mapping -p $PREDFILE.4tedeval -sp $PREDLAT.4tedeval.mapping -format $TYPE
266 | 
267 | exit
268 | 
269 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/tedeval_simple.sh.old:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | 
  3 | # wrapper script to make tedeval work on SPMRL Shared task data set
  4 | # Djame Seddah
  5 | 
  6 | # version  August 19, 03:49
  7 | # fixing java.nullPointer exception (removing sentence 908 and 1889 from dev arabic pred+pred)
  8 | # version  August 18, 02:14
  9 | # fixing java.nullPointer exception (removing sentence 799 from test test arabic pred+pred)
 10 | 
 11 | 
 12 | 
 13 | #  see
 14 | # http://stackoverflow.com/questions/402377/using-getopts-in-bash-shell-script-to-get-long-and-short-command-line-options/7680682#7680682
 15 | # options  
 16 | # 	-d (debug version), -n  
 17 | # 	-labeled, -unlabeled (*)
 18 | #	-ptb, -conll (*)
 19 | # 	-ar  (for arabic, default hebrew)
 20 | #   -test  (test set gold file used, default dev)
 21 | #   -cut   (cut-off length + bad sentences removed, fixed?)
 22 | #  -predfile FILE   (predicted parsed file)
 23 | #  -predmap  FILE  (predicted mapping file) # if not given, calculated
 24 | #  -gold FILE  
 25 | # -begin  starting line to be evaluated, 1 if nothing
 26 | # -end  end+1 line to be evaluated	+1000000 if nothing 
 27 | 
 28 | 
 29 | #set -x
 30 | 
 31 | # VARIABLES
 32 | PROGDIR=`dirname $0`
 33 | TEDEVALJAR=$PROGDIR/tedeval-2.2.jar
 34 | #TEDEVALJAR=$PROGDIR/tedeval.jar
 35 | LABEL="-unlabeled"
 36 | TYPE="conll"
 37 | TYPENAME=conll
 38 | ARG="-c"
 39 | LANGUAGE=""
 40 | SKIPFILE=/dev/null
 41 | PREF=test
 42 | DOCUT=1
 43 | CUTOFF=70  #should be a parameter, later...
 44 | PREDFILE=""
 45 | PREDMAP=""
 46 | PREDLAT=""
 47 | GOLDFILE=""
 48 | GOLDLAT=""
 49 | LATSUF=tobeparsed.gold_tagged+gold_token.lattices
 50 | START=1
 51 | END=100000  # let's hope that no file will ever be that long (linewise)
 52 | SPMRLDATA_ROOTDIR=$SHARED/SPMRL_FINAL/READY_TO_SHIP_FINAL/
 53 | FIVEK=""
 54 | 
 55 | if [ -z $1 ] ; then
 56 | 	echo "tedeval.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 
 57 |     -D | --debug 	 	use tedeval + debug outputs
 58 |     -n | --new 	   		use latest tedeval-2.2.jar (default is 2.1	
 59 | 	-u | --unlabeled 	unlabeled evaluation  (default)	 
 60 | 	-l | --labeled 	 	labeled evaluation
 61 | 	-p | --ptb 	 		evaluate const. files
 62 | 	-c | --conll 	 	evaluate conll files (default)
 63 | 	-a | --arabic 	 	dev mode for Arabic, do not use 
 64 | 	-h | --hebrew 	 	dev mode for Hebrew, do not use 
 65 | 	-y | --any	LANGUAGE	LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...)
 66 | 	-t | --test 			dev mode, test file, do not use 
 67 | 	-d | --dev 	  			dev mode, test file, do not use 
 68 | 	-k | --cut 	 			CUTOFF mode (tedeval is really slow for long sentences	, length cutoff hardcoded to 70 
 69 | 	-s | --system FILE		test file to be evaluated 
 70 | 	-g | --gold FILE		gold standard file ;;
 71 | 	-L | --predlat FILE		predicted lattice files as provided by  SPMRL. If not given, spmrl one will be used
 72 | 	-m | --predmap			mapping for predicted files (use it only when with non-spmrl predicted file	. Generated from predlat file otherwise
 73 | 	-b | --begin 			line ID to start evaluate
 74 | 	-e | --end 	 			line ID+1 to stop the evaluation 
 75 | 	--spmrldata_rootdir 	root directory to the SPMRL FINAL DATA SET (default \$FINAL/SPMRL_FINAL/READY_TO_SHIP_FINAL/	
 76 | 	--help 	
 77 | 	" ; 
 78 | 	exit
 79 | fi
 80 | 
 81 | echo "###########################"
 82 | echo "Running\: tedeval.sh $@"
 83 | echo "###########################"
 84 | echo "\n"
 85 | TEMP=`/sw/bin/getopt -o DnlupcAHy:tdks:m:L:g:b:e: --long help,debug,new,labeled,unlabeled,ptb,conll,Arabic,Hebrew,any:,test,dev,cut,predfile:,predmap:,predlat:,gold:,begin:,end:,spmrldata_rootdir,fivek -- "$@"`
 86 | 
 87 | if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi
 88 | 
 89 | # Note the quotes around `$TEMP': they are essential!
 90 | eval set -- "$TEMP"
 91 | 
 92 | 
 93 | while true; do
 94 |   case "$1" in
 95 |     -D | --debug ) TEDEVALJAR=$PROGDIR/tedeval_debug.jar; shift ;;
 96 |     -n | --new )   TEDEVALJAR=$PROGDIR/tedeval-2.2.jar; shift ;;
 97 | 	-u | --unlabeled ) LABEL="-unlabeled" ; shift ;;
 98 | 	-l | --labeled ) LABEL="" ; shift ;;
 99 | 	-p | --ptb ) TYPE=ptb ; TYPENAME=bracketed ; ARG="" ; shift   ;;
100 | 	-c | --conll ) TYPE=conll ; TYPENAME=conll ; ARG="-c" ; shift ;;
101 | 	-A | --arabic ) LANGUAGE=ARABIC ;SUF=""; SKIPFILE="" ; exit ; shift ;; 
102 | 	-H | --hebrew ) LANGUAGE=HEBREW  ; shift ;;
103 | 	-y | --any	) LANGUAGE="$2" ; shift 2 ;;
104 | 	-t | --test ) PREF="test" ; shift ;;
105 | 	-d | --dev )  PREF="dev"  ; shift ;;
106 | 	-k | --cut ) DOCUT=1 ; shift ;;
107 | 	-s | --system ) PREDFILE="$2" ; shift 2;;
108 | 	-g | --gold ) GOLDFILE="$2" ; shift 2;;
109 | 	-m | --predmap ) PREDMAP="$2" ; shift 2;;
110 | 	-L | --predlat ) PREDLAT="$2" ; shift 2;;
111 | 	-b | --begin )	START="$2"  ; shift 2;;
112 | 	-e | --end ) END="$2"  ; shift 2;;
113 | 	-R | --spmrldata_rootdir ) SPMRLDATA_ROOTDIR="$2" ; shift 2;;
114 | 	--fivek ) FIVEK="-5k" ; shift ;;
115 | 	--help )
116 | 	echo "tedeval_simple.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 
117 |     -D | --debug 	 	use tedeval + debug outputs
118 |     -n | --new 	   		use latest tedeval-2.2.jar (default is 2.1	
119 | 	-u | --unlabeled 	unlabeled evaluation  (default)	 
120 | 	-l | --labeled 	 	labeled evaluation
121 | 	-p | --ptb 	 		evaluate const. files
122 | 	-c | --conll 	 	evaluate conll files (default)
123 | 	-a | --arabic 	 	dev mode for Arabic, do not use 
124 | 	-h | --hebrew 	 	dev mode for Hebrew, do not use 
125 | 	-y | --any	LANGUAGE	LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...)
126 | 	-t | --test 			dev mode, test file, do not use 
127 | 	-d | --dev 	  			dev mode, test file, do not use 
128 | 	-k | --cut 	 			CUTOFF mode (tedeval is really slow for long sentences	, length cutoff hardcoded to 70 
129 | 	-s | --system FILE		test file to be evaluated 
130 | 	-g | --gold FILE		gold standard file ;;
131 | 	-b | --begin 			line ID to start evaluate
132 | 	-e | --end 	 			line ID+1 to stop the evaluation 
133 | 	--spmrldata_rootdir 	root directory to the SPMRL FINAL DATA SET (default \$FINAL/SPMRL_FINAL/READY_TO_SHIP_FINAL/	
134 | 	--help 	
135 | 	" ; 
136 | 	shift ; exit 1;;
137 |     -- ) shift; break ;;
138 |     * ) break ;;
139 |   esac
140 | done
141 | 
142 | 
143 | # *** INIT
144 | # arabic lines bug
145 | #START=699 # should be 799 but 100 sentences > 70 were removed
146 | #START=167 # same bug as 699 but in labeled 
147 | 
148 | #END=700
149 | #END=168
150 | LDIR=`echo $LANGUAGE | perl -ne 'print uc $_'`_SPMRL
151 | LANGUPPED=`echo $LANGUAGE | perl -ne 'print uc $_'`
152 | LANGUAGE=`echo $LANGUAGE|perl -p -ne '$_ = ucfirst lc $_ ;'`
153 | if [ ! -z "$FIVEK" ] ; then
154 | 	FIVEK="-5k $LANGUPPED"
155 | fi
156 | 
157 | 
158 | if [ -z "$PREDFILE" ]; then
159 | 		PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed$SUF
160 | fi
161 | 
162 | if [ ! -f "$PREDFILE" ] ; then
163 | 	echo "PREDFILE: $PREDFILE not found"
164 | 	exit 0
165 | fi
166 | 
167 | echo "LDIR = $LDIR   LANGUAGE $LANGUAGE  PREDLAT $PREDLAT"
168 | #exit
169 | 
170 | 
171 | 
172 | 
173 | # we're always picking the 
174 | if [ -z "$GOLDFILE" ] ; then
175 | 	GOLDFILE=${SPMRLDATA_ROOTDIR}/${LDIR}/gold/$TYPE/${PREF}/${PREF}.${LANGUAGE}.gold.$TYPE
176 | fi
177 | 
178 | 	
179 | 
180 | # checking all files
181 | 
182 | 
183 | if [ -f "$GOLDFILE" ] ; then
184 | 	echo "gold: $GOLDFILE found"
185 | else
186 | 	wc -L $GOLDFILE
187 | 	echo "gold: $GOLDFILE not found"
188 | 	exit
189 | fi
190 | 
191 | 
192 | 
193 | if [ -f "$PREDFILE" ] ; then
194 | 	echo "pred file: $PREDFILE found"
195 | else
196 | 	echo "pred file: $PREDFILE not found"
197 | 	exit
198 | fi
199 | 
200 | 
201 | 
202 | #exit
203 | 
204 | 	
205 | #PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices
206 | 
207 | #  dev.Hebrew.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.parsed.lattices.5k
208 | # test.Arabic.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed.full
209 | 
210 | 
211 | # fixing the blank line bug
212 | #perl -i.bak -pe 's/^\s*$/\n/g' $GOLFILE $PREDFILE $PREDLAT $GOLDLAT
213 | 
214 | ####################################################
215 | ##  real work done here
216 | ######################################################
217 | 
218 | if test $DOCUT = 1 ; then
219 | 	echo "generating lines to be skipped"
220 | 	cat $GOLDFILE | perl -pe 's/^\s*$/\n/' |get_cutoffed_sent.pl -c -K 70 /dev/null > $GOLDFILE.tobeskipped
221 | 	if test "$PREF.$LANGUAGE" = "test.Arabic" ; then
222 | 		#echo -e "\n799" >> $GOLDFILELAT.tobeskipped # buggy sentence for unlabeld evaluation on arabic test
223 | 		echo "\n" >>  $GOLDFILE.tobeskipped
224 | 	elif test "$PREF.$LANGUAGE" = "dev.Arabic" ; then
225 | 		#echo -e "\n904\n1889" >>  $GOLDFILELAT.tobeskipped #buggy sentence for unlabeld evaluation on arabic dev
226 | 		echo "\n" >>  $GOLDFILE.tobeskipped
227 | 
228 | 	fi
229 | 	SKIPFILE=$GOLDFILE.tobeskipped
230 | else
231 | 	SKIPFILE=/dev/null
232 | fi
233 | 
234 | 
235 | #exit
236 |  
237 | echo "generating normalized files"
238 | echo -e "\t==> gold"
239 | wc $ARG $GOLDFILE
240 | cat $GOLDFILE|perl -pe 's/^\s*$/\n/'| skip_lines.pl $ARG $FIVEK  $SKIPFILE  | lines $ARG $START $END | reprojectivize.sh -$TYPE|   clean$TYPE.pl| perl -pe 's/^\s*$/\n/' |uniq > $GOLDFILE.4tedeval.$$
241 | wc $ARG $GOLDFILE.4tedeval.$$
242 | 
243 | echo -e "\t==> pred"
244 | wc $ARG $PREDFILE
245 | cat $PREDFILE|perl -pe 's/^\s*$/\n/'|skip_lines.pl $ARG $FIVEK $SKIPFILE  | lines $ARG $START $END | reprojectivize.sh -$TYPE|   clean$TYPE.pl| perl -pe 's/^\s*$/\n/' |uniq > $PREDFILE.4tedeval.$$ #no idea why twice
246 | wc $ARG $PREDFILE.4tedeval.$$
247 | #exit
248 | 
249 | # that was to generate a fake arabic parsed file
250 | #cat $PREDLAT | lines $ARG 1 $END | cut -f2-7 | add_fake_col.pl | clean$TYPE.pl | perl -pe 's/^\s*$/\n/' |uniq> $PREDLAT.fake
251 | 
252 | # normal  gold vs pred (LABELED)
253 | 
254 | if test $LABEL = "-unlabeled" ; then
255 | 	SUF="-unlabeled"
256 | else
257 | 	SUF="-labeled"
258 | fi
259 | 
260 | java -server -Xmx768m -jar $TEDEVALJAR $LABEL -g  $GOLDFILE.4tedeval.$$  -p $PREDFILE.4tedeval.$$ -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF
261 | file="$PREDFILE.4tedeval.simple_tedeval.res$SUF.ted"
262 | echo "cat $file|grep "AVG:"|perl -p -s -e 'chomp ; s/(.)$/\1\t$file \n/' -- -file=$file|get_ted_res.pl"
263 | 
264 | 
265 | echo "java -server -Xmx768m -jar $TEDEVALJAR $LABEL -g  $GOLDFILE.4tedeval.$$  -p $PREDFILE.4tedeval.$$ -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF"
266 | 
267 | echo "\n\n"
268 | #rm -f $GOLDFILE.4tedeval.$$ $GOLDLAT.4tedeval.mapping.$$ $PREDMAP
269 | 
270 | #eval gold vs gold => 100%
271 | #java -Xmx3g -jar $TEDEVALJAR -g  $GOLDFILE.4tedeval -sg $GOLDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE
272 | 
273 | # eval pred vs gold
274 | #java -server -Xmx3g -jar $TEDEVALJAR -g  $PREDFILE.4tedeval.$$ -sg $PREDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE
275 | 
276 | # eval pred vs pred => 100%
277 | #java -server -Xmx3g -jar $TEDEVALJAR -g  $PREDFILE.4tedeval.$$ -sg $PREDLAT.4tedeval.mapping -p $PREDFILE.4tedeval.$$ -sp $PREDLAT.4tedeval.mapping -format $TYPE
278 | 
279 | exit
280 | 
281 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/tedeval_simple_polish.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | 
  3 | # wrapper script to make tedeval work on SPMRL Shared task data set
  4 | # Djame Seddah
  5 | 
  6 | # version  August 19, 03:49
  7 | # fixing java.nullPointer exception (removing sentence 908 and 1889 from dev arabic pred+pred)
  8 | # version  August 18, 02:14
  9 | # fixing java.nullPointer exception (removing sentence 799 from test test arabic pred+pred)
 10 | 
 11 | 
 12 | 
 13 | #  see
 14 | # http://stackoverflow.com/questions/402377/using-getopts-in-bash-shell-script-to-get-long-and-short-command-line-options/7680682#7680682
 15 | # options  
 16 | # 	-d (debug version), -n  
# 	-labeled, -unlabeled (*)
 18 | #	-ptb, -conll (*)
 19 | # 	-ar  (for arabic, default hebrew)
 20 | #   -test  (test set gold file used, default dev)
#   -cut   (cut-off length + bad sentences removed, fixed?)
 22 | #  -predfile FILE   (predicted parsed file)
 23 | #  -predmap  FILE  (predicted mapping file) # if not given, calculated
 24 | #  -gold FILE  
 25 | # -begin  starting line to be evaluated, 1 if nothing
 26 | # -end  end+1 line to be evaluated	+1000000 if nothing 
 27 | 
 28 | 
 29 | set -x
 30 | 
 31 | # VARIABLES
 32 | PROGDIR=`dirname $0`
 33 | TEDEVALJAR=$PROGDIR/tedeval-2.2.jar
 34 | #TEDEVALJAR=$PROGDIR/tedeval.jar
 35 | LABEL="-unlabeled"
 36 | TYPE="conll"
 37 | TYPENAME=conll
 38 | ARG="-c"
 39 | LANGUAGE=""
 40 | SKIPFILE=/dev/null
 41 | PREF=test
 42 | DOCUT=1
 43 | CUTOFF=70  #should be a parameter, later...
 44 | PREDFILE=""
 45 | PREDMAP=""
 46 | PREDLAT=""
 47 | GOLDFILE=""
 48 | GOLDLAT=""
 49 | LATSUF=tobeparsed.gold_tagged+gold_token.lattices
 50 | START=1
 51 | END=100000  # let's hope that no file will ever be that long (linewise)
 52 | SPMRLDATA_ROOTDIR=$SHARED/SPMRL_FINAL/READY_TO_SHIP_FINAL/
 53 | FIVEK=""
 54 | 
 55 | if [ -z $1 ] ; then
 56 | 	echo "tedeval.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 
 57 |     -D | --debug 	 	use tedeval + debug outputs
 58 |     -n | --new 	   		use latest tedeval-2.2.jar (default is 2.1	
 59 | 	-u | --unlabeled 	unlabeled evaluation  (default)	 
 60 | 	-l | --labeled 	 	labeled evaluation
 61 | 	-p | --ptb 	 		evaluate const. files
 62 | 	-c | --conll 	 	evaluate conll files (default)
 63 | 	-a | --arabic 	 	dev mode for Arabic, do not use 
 64 | 	-h | --hebrew 	 	dev mode for Hebrew, do not use 
 65 | 	-y | --any	LANGUAGE	LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...)
 66 | 	-t | --test 			dev mode, test file, do not use 
 67 | 	-d | --dev 	  			dev mode, test file, do not use 
 68 | 	-k | --cut 	 			CUTOFF mode (tedeval is really slow for long sentences	, length cutoff hardcoded to 70 
 69 | 	-s | --system FILE		test file to be evaluated 
 70 | 	-g | --gold FILE		gold standard file ;;
 71 | 	-L | --predlat FILE		predicted lattice files as provided by  SPMRL. If not given, spmrl one will be used
 72 | 	-m | --predmap			mapping for predicted files (use it only when with non-spmrl predicted file	. Generated from predlat file otherwise
 73 | 	-b | --begin 			line ID to start evaluate
 74 | 	-e | --end 	 			line ID+1 to stop the evaluation 
 75 | 	--spmrldata_rootdir 	root directory to the SPMRL FINAL DATA SET (default \$FINAL/SPMRL_FINAL/READY_TO_SHIP_FINAL/	
 76 | 	--help 	
 77 | 	" ; 
 78 | 	exit
 79 | fi
 80 | 
 81 | echo "###########################"
 82 | echo "Running\: tedeval.sh $@"
 83 | echo "###########################"
 84 | echo "\n"
 85 | TEMP=`/sw/bin/getopt -o DnlupcAHy:tdks:m:L:g:b:e: --long help,debug,new,labeled,unlabeled,ptb,conll,Arabic,Hebrew,any:,test,dev,cut,predfile:,predmap:,predlat:,gold:,begin:,end:,spmrldata_rootdir,fivek -- "$@"`
 86 | 
 87 | if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi
 88 | 
 89 | # Note the quotes around `$TEMP': they are essential!
 90 | eval set -- "$TEMP"
 91 | 
 92 | 
 93 | while true; do
 94 |   case "$1" in
 95 |     -D | --debug ) TEDEVALJAR=$PROGDIR/tedeval_debug.jar; shift ;;
 96 |     -n | --new )   TEDEVALJAR=$PROGDIR/tedeval-2.2.jar; shift ;;
 97 | 	-u | --unlabeled ) LABEL="-unlabeled" ; shift ;;
 98 | 	-l | --labeled ) LABEL="" ; shift ;;
 99 | 	-p | --ptb ) TYPE=ptb ; TYPENAME=bracketed ; ARG="" ; shift   ;;
100 | 	-c | --conll ) TYPE=conll ; TYPENAME=conll ; ARG="-c" ; shift ;;
101 | 	-A | --arabic ) LANGUAGE=ARABIC ;SUF=""; SKIPFILE="" ; exit ; shift ;; 
102 | 	-H | --hebrew ) LANGUAGE=HEBREW  ; shift ;;
103 | 	-y | --any	) LANGUAGE="$2" ; shift 2 ;;
104 | 	-t | --test ) PREF="test" ; shift ;;
105 | 	-d | --dev )  PREF="dev"  ; shift ;;
106 | 	-k | --cut ) DOCUT=1 ; shift ;;
107 | 	-s | --system ) PREDFILE="$2" ; shift 2;;
108 | 	-g | --gold ) GOLDFILE="$2" ; shift 2;;
109 | 	-m | --predmap ) PREDMAP="$2" ; shift 2;;
110 | 	-L | --predlat ) PREDLAT="$2" ; shift 2;;
111 | 	-b | --begin )	START="$2"  ; shift 2;;
112 | 	-e | --end ) END="$2"  ; shift 2;;
113 | 	-R | --spmrldata_rootdir ) SPMRLDATA_ROOTDIR="$2" ; shift 2;;
114 | 	--fivek ) FIVEK="-5k" ; shift ;;
115 | 	--help )
116 | 	echo "tedeval_simple.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 
117 |     -D | --debug 	 	use tedeval + debug outputs
118 |     -n | --new 	   		use latest tedeval-2.2.jar (default is 2.1	
119 | 	-u | --unlabeled 	unlabeled evaluation  (default)	 
120 | 	-l | --labeled 	 	labeled evaluation
121 | 	-p | --ptb 	 		evaluate const. files
122 | 	-c | --conll 	 	evaluate conll files (default)
123 | 	-a | --arabic 	 	dev mode for Arabic, do not use 
124 | 	-h | --hebrew 	 	dev mode for Hebrew, do not use 
125 | 	-y | --any	LANGUAGE	LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...)
126 | 	-t | --test 			dev mode, test file, do not use 
127 | 	-d | --dev 	  			dev mode, test file, do not use 
128 | 	-k | --cut 	 			CUTOFF mode (tedeval is really slow for long sentences	, length cutoff hardcoded to 70 
129 | 	-s | --system FILE		test file to be evaluated 
130 | 	-g | --gold FILE		gold standard file ;;
131 | 	-b | --begin 			line ID to start evaluate
132 | 	-e | --end 	 			line ID+1 to stop the evaluation 
133 | 	--spmrldata_rootdir 	root directory to the SPMRL FINAL DATA SET (default \$FINAL/SPMRL_FINAL/READY_TO_SHIP_FINAL/	
134 | 	--help 	
135 | 	" ; 
136 | 	shift ; exit 1;;
137 |     -- ) shift; break ;;
138 |     * ) break ;;
139 |   esac
140 | done
141 | 
142 | 
143 | # *** INIT
144 | # arabic lines bug
145 | #START=699 # should be 799 but 100 sentences > 70 were removed
146 | #START=167 # same bug as 699 but in labeled 
147 | 
148 | #END=700
149 | #END=168
150 | LDIR=`echo $LANGUAGE | perl -ne 'print uc $_'`_SPMRL
151 | LANGUPPED=`echo $LANGUAGE | perl -ne 'print uc $_'`
152 | LANGUAGE=`echo $LANGUAGE|perl -p -ne '$_ = ucfirst lc $_ ;'`
153 | if [ ! -z "$FIVEK" ] ; then
154 | 	FIVEK="-5k $LANGUPPED"
155 | fi
156 | 
157 | 
158 | if [ -z "$PREDFILE" ]; then
159 | 		PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed$SUF
160 | fi
161 | 
162 | if [ ! -f "$PREDFILE" ] ; then
163 | 	echo "PREDFILE: $PREDFILE not found"
164 | 	exit 0
165 | fi
166 | 
167 | echo "LDIR = $LDIR   LANGUAGE $LANGUAGE  PREDLAT $PREDLAT"
168 | #exit
169 | 
170 | 
171 | 
172 | 
# we're always picking the SPMRL gold file unless one was given with -g
174 | if [ -z "$GOLDFILE" ] ; then
175 | 	GOLDFILE=${SPMRLDATA_ROOTDIR}/${LDIR}/gold/$TYPE/${PREF}/${PREF}.${LANGUAGE}.gold.$TYPE
176 | fi
177 | 
178 | 	
179 | 
180 | # checking all files
181 | 
182 | 
183 | if [ -f "$GOLDFILE" ] ; then
184 | 	echo "gold: $GOLDFILE found"
185 | else
186 | 	wc -L $GOLDFILE
187 | 	echo "gold: $GOLDFILE not found"
188 | 	exit
189 | fi
190 | 
191 | 
192 | 
193 | if [ -f "$PREDFILE" ] ; then
194 | 	echo "pred file: $PREDFILE found"
195 | else
196 | 	echo "pred file: $PREDFILE not found"
197 | 	exit
198 | fi
199 | 
200 | 
201 | 
202 | #exit
203 | 
204 | 	
205 | #PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices
206 | 
207 | #  dev.Hebrew.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.parsed.lattices.5k
208 | # test.Arabic.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed.full
209 | 
210 | 
211 | # fixing the blank line bug
212 | #perl -i.bak -pe 's/^\s*$/\n/g' $GOLFILE $PREDFILE $PREDLAT $GOLDLAT
213 | 
214 | ####################################################
##  real work done here
216 | ######################################################
217 | 
218 | if test $DOCUT = 1 ; then
219 | 	echo "generating lines to be skipped"
220 | 	cat $GOLDFILE | perl -pe 's/^\s*$/\n/' |get_cutoffed_sent.pl -c -K 70 /dev/null > $GOLDFILE.tobeskipped
221 | 	if test "$PREF.$LANGUAGE" = "test.Arabic" ; then
222 | 		#echo -e "\n799" >> $GOLDFILELAT.tobeskipped # buggy sentence for unlabeld evaluation on arabic test
223 | 		echo "\n" >>  $GOLDFILE.tobeskipped
224 | 	elif test "$PREF.$LANGUAGE" = "dev.Arabic" ; then
225 | 		#echo -e "\n904\n1889" >>  $GOLDFILELAT.tobeskipped #buggy sentence for unlabeld evaluation on arabic dev
226 | 		echo "\n" >>  $GOLDFILE.tobeskipped
227 | 
228 | 	fi
229 | 	SKIPFILE=$GOLDFILE.tobeskipped
230 | else
231 | 	SKIPFILE=/dev/null
232 | fi
233 | 
234 | 
235 | #exit
236 |  
# Build the normalized gold and predicted files handed to tedeval.
# Both files go through the same chain: whitespace-only lines are reduced to
# empty lines, then skip_lines.pl / lines / reprojectivize.sh / clean$TYPE.pl
# are applied, and finally repeated lines are collapsed (cat -s, uniq).
# The result is written to <file>.4tedeval.<pid of this script>.
# printf is used for the tab-prefixed headers because "echo -e" is not
# portable under /bin/sh (some shells print the "-e" literally).
echo "generating normalized files"
printf '\t==> gold\n'
wc $ARG $GOLDFILE
cat $GOLDFILE|perl -pe 's/^\s*$/\n/'| skip_lines.pl $ARG $FIVEK  $SKIPFILE  | lines $ARG $START $END | reprojectivize.sh -$TYPE|clean$TYPE.pl| perl -pe 's/^\s*$/\n/'|cat -s |uniq > $GOLDFILE.4tedeval.$$
wc $ARG $GOLDFILE.4tedeval.$$

printf '\t==> pred\n'
wc $ARG $PREDFILE
cat $PREDFILE|perl -pe 's/^\s*$/\n/'|skip_lines.pl $ARG $FIVEK $SKIPFILE  | lines $ARG $START $END | reprojectivize.sh -$TYPE|clean$TYPE.pl| perl -pe 's/^\s*$/\n/'|cat -s |uniq > $PREDFILE.4tedeval.$$ #no idea why twice
wc $ARG $PREDFILE.4tedeval.$$
247 | #exit
248 | 
249 | # that was to generate a fake arabic parsed file
250 | #cat $PREDLAT | lines $ARG 1 $END | cut -f2-7 | add_fake_col.pl | clean$TYPE.pl | perl -pe 's/^\s*$/\n/' |uniq> $PREDLAT.fake
251 | 
252 | # normal  gold vs pred (LABELED)
253 | 
# Suffix of the result file name. $LABEL is empty after -l/--labeled, so it
# has to be quoted: unquoted, test would only see `= -unlabeled`, which is a
# malformed expression.
if test "$LABEL" = "-unlabeled" ; then
	SUF="-unlabeled"
else
	SUF="-labeled"
fi
259 | 
260 | java  -Xmx768m -jar $TEDEVALJAR $LABEL -g  $GOLDFILE.4tedeval.$$  -p $PREDFILE.4tedeval.$$ -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF
261 | file="$PREDFILE.4tedeval.simple_tedeval.res$SUF.ted"
262 | 
263 | 
264 | 
265 | echo "java  -Xmx768m -jar $TEDEVALJAR $LABEL -g  $GOLDFILE.4tedeval.$$  -p $PREDFILE.4tedeval.$$ -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF" > /dev/stderr
266 | echo " "
267 | cat $file | grep "AVG:"|  perl -p -s -e 'chomp ; s/(.)$/\1\t$file \n/' -- -file=$file | get_ted_res.pl
268 | 
269 | 
# Three newlines, as `echo -e "\n\n"` prints in shells that honour -e;
# printf behaves the same in every /bin/sh.
printf '\n\n\n'
271 | #rm -f $GOLDFILE.4tedeval.$$ $GOLDLAT.4tedeval.mapping.$$ $PREDMAP
272 | 
273 | #eval gold vs gold => 100%
274 | #java -Xmx3g -jar $TEDEVALJAR -g  $GOLDFILE.4tedeval -sg $GOLDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE
275 | 
276 | # eval pred vs gold
277 | #java -server -Xmx3g -jar $TEDEVALJAR -g  $PREDFILE.4tedeval.$$ -sg $PREDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE
278 | 
279 | # eval pred vs pred => 100%
280 | #java -server -Xmx3g -jar $TEDEVALJAR -g  $PREDFILE.4tedeval.$$ -sg $PREDLAT.4tedeval.mapping -p $PREDFILE.4tedeval.$$ -sp $PREDLAT.4tedeval.mapping -format $TYPE
281 | 
282 | exit
283 | 
284 | 


--------------------------------------------------------------------------------
/TedWrappers_20131015/wc:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | if test "$1" = "-c" -o "$1" = "-L" ; then
 4 | 	ARG=$1
 5 | 	for file in `eval ls $@` ; do
 6 | #		echo "=== $file"
 7 | #		echo "cat $file | lines $ARG -p 1 100000000"
 8 | 		NB=`cat $file | lines $ARG -p 1 100000000`
 9 | 		echo -e  "\t$NB $file"
10 | 	done
11 | 	
12 | else
13 | 	/usr/bin/wc -l $@ 
14 | fi
15 | 


--------------------------------------------------------------------------------
/data/core12map.txt:
--------------------------------------------------------------------------------
 1 | li_PRT	PRT
 2 | fa_PRT	PRT
 3 | 2mp_pron	PRO
 4 | PRT	PRT
 5 | 2ms_pron	PRO
 6 | PNX	PNX
 7 | 2fs_PRO	PRO
 8 | C	C
 9 | 2ms_PRO	PRO
10 | N	N
11 | la_PRT	PRT
12 | 2mp_PRO	PRO
13 | V	V
14 | REL	REL
15 | P	P
16 | fa_C	C
17 | PRO	PRO
18 | 3ms_pron	PRO
19 | 1s_PRO	PRO
20 | 3d_PRO	PRO
21 | 3ms_PRO	PRO
22 | wa_prep	P
23 | 3fs_PRO	PRO
24 | wa_C	C
25 | AV	AV	
26 | 1p_PRO	PRO
27 | 3mp_PRO	PRO
28 | 1p_pron	PRO
29 | 1s_pron	PRO
30 | PN	PN
31 | AB	ABBREV
32 | sa_PRT	PRT
33 | 3fs_pron	PRO
34 | AJ	AJ
35 | ma_PRO	PRO
36 | ma_REL	REL
37 | 2d_PRO	PRO
38 | 2fp_pron	PRO
39 | 3fp_PRO	PRO
40 | 2fs_pron	PRO
41 | 


--------------------------------------------------------------------------------
/data/spmrl.uni.map:
--------------------------------------------------------------------------------
1 | C	CONJ
2 | P	ADP
3 | PNX	PNX
4 | V	VERB
5 | 


--------------------------------------------------------------------------------
/data/tags-all.mod.txt:
--------------------------------------------------------------------------------
  1 | ABBREV
  2 | ADJ
  3 | ADJ.VN	ADJ
  4 | ADJ_COMP	ADJ
  5 | ADJ_NUM	NUM
  6 | ADV
  7 | INTERROG_ADV	ADV
  8 | CASE
  9 | CASE_INDEF_ACC	CASE
 10 | CONJ
 11 | CV	V
 12 | CVSUFF	VSUFF
 13 | CVSUFF_DO:1S	PRON
 14 | CVSUFF_DO:3MS	PRON
 15 | CVSUFF_SUBJ:2MP	VSUFF
 16 | CVSUFF_SUBJ:2MS	VSUFF
 17 | CVSUFF_DO:3MP	PRON
 18 | CVSUFF_DO:1P	PRON
 19 | CVSUFF_DO:3FS	PRON
 20 | CVSUFF_DO:3D	PRON
 21 | CVSUFF_DO:3FP	PRON
 22 | CVSUFF_SUBJ:2FS	VSUFF
 23 | DEM_PRON_FD	PRON
 24 | DEM_PRON_FS	PRON
 25 | DEM_PRON_MD	PRON
 26 | DEM_PRON_MP	PRON
 27 | DEM_PRON_MS	PRON
 28 | DEM_PRON	PRON
 29 | DEM_PRON_F	PRON
 30 | DEM_PRON_P	PRON
 31 | DET
 32 | FOREIGN
 33 | INTERJ
 34 | IV	V
 35 | IV1P	V
 36 | IV1S	V
 37 | IV2D	V
 38 | IV2FP	V
 39 | IV2FS	V
 40 | IV2MP	V
 41 | IV2MS	V
 42 | IV3FD	V
 43 | IV3FP	V
 44 | IV3FS	V
 45 | IV3MD	V
 46 | IV3MP	V
 47 | IV3MS	V
 48 | IVSUFF_DO:1P	PRON
 49 | IVSUFF_DO:1S	PRON
 50 | IVSUFF_DO:2FS	PRON
 51 | IVSUFF_DO:2D	PRON
 52 | IVSUFF_DO:2MP	PRON
 53 | IVSUFF_DO:2MS	PRON
 54 | IVSUFF_DO:3D	PRON
 55 | IVSUFF_DO:3FS	PRON
 56 | IVSUFF_DO:3MP	PRON
 57 | IVSUFF_DO:3MS	PRON
 58 | IVSUFF_DO:2FP	PRON
 59 | IVSUFF_DO:3FP	PRON
 60 | IVSUFF_MOOD	VSUFF
 61 | IVSUFF_SUBJ:2FS	VSUFF
 62 | IVSUFF_SUBJ:D	VSUFF
 63 | IVSUFF_SUBJ:FP	VSUFF
 64 | IVSUFF_SUBJ:MP	VSUFF
 65 | PVSUFF_SUBJ:2FP	VSUFF
 66 | IVSUFF_SUBJ:3FP	VSUFF
 67 | IVSUFF_SUBJ:3D	VSUFF
 68 | IVSUFF_MOOD:I	VSUFF
 69 | IVSUFF_MOOD:J	VSUFF
 70 | IVSUFF_MOOD:S	VSUFF
 71 | IVSUFF_SUBJ:2FS_MOOD:I	VSUFF
 72 | IVSUFF_SUBJ:D_MOOD:I	VSUFF
 73 | IVSUFF_SUBJ:D_MOOD:SJ	VSUFF
 74 | IVSUFF_SUBJ:MP_MOOD:I	VSUFF
 75 | IVSUFF_SUBJ:MP_MOOD:SJ	VSUFF
 76 | IVSUFF_SUBJ:2FS_MOOD:SJ	VSUFF
 77 | IVSUFF_SUBJ:3MP_MOOD:SJ	VSUFF
 78 | IV_PASS	V
 79 | LATIN	FOREIGN
 80 | NEG_PART	PART
 81 | NOUN
 82 | NOUN_PROP	NOUN_PROP
 83 | NOUN_NUM	NUM
 84 | NOUN_QUANT	NOUN
 85 | NOUN.VN	NOUN
 86 | NSUFF_FEM_DU	NSUFF
 87 | NSUFF_FEM_PL	NSUFF
 88 | NSUFF_FEM_SG	NSUFF
 89 | NSUFF_MASC_DU	NSUFF
 90 | NSUFF_MASC_PL	NSUFF
 91 | NSUFF_FEM_DU_ACC	NSUFF
 92 | NSUFF_FEM_DU_ACC_POSS	NSUFF
 93 | NSUFF_FEM_DU_GEN	NSUFF
 94 | NSUFF_FEM_DU_GEN_POSS	NSUFF
 95 | NSUFF_FEM_DU_NOM	NSUFF
 96 | NSUFF_FEM_DU_NOM_POSS	NSUFF
 97 | NSUFF_MASC_DU_ACC	NSUFF
 98 | NSUFF_MASC_DU_ACC_POSS	NSUFF
 99 | NSUFF_MASC_DU_GEN	NSUFF
100 | NSUFF_MASC_DU_GEN_POSS	NSUFF
101 | NSUFF_MASC_DU_NOM	NSUFF
102 | NSUFF_MASC_DU_NOM_POSS	NSUFF
103 | NSUFF_MASC_PL_ACC	NSUFF
104 | NSUFF_MASC_PL_ACC_POSS	NSUFF
105 | NSUFF_MASC_PL_GEN	NSUFF
106 | NSUFF_MASC_PL_GEN_POSS	NSUFF
107 | NSUFF_MASC_PL_NOM	NSUFF
108 | NSUFF_MASC_PL_NOM_POSS	NSUFF
109 | NUM
110 | O
111 | PART
112 | FUT	PART
113 | CONNEC_PART	PART
114 | FOCUS_PART	PART
115 | RESTRIC_PART	PART
116 | EMPHATIC_PART	PART
117 | PSEUDO_VERB	V
118 | VOC_PART	PART
119 | INTERROG_PART	PART
120 | INTERROG_PRON	PART
121 | INTERJ	PART
122 | JUS_PART	PART
123 | EMPH_PART	PART
124 | EXCEPT_PART	PART
125 | POSS_PRON_1P	PRON
126 | POSS_PRON_1S	PRON
127 | POSS_PRON_2FP	PRON
128 | POSS_PRON_2FS	PRON
129 | POSS_PRON_2MP	PRON
130 | POSS_PRON_2MS	PRON
131 | POSS_PRON_3D	PRON
132 | POSS_PRON_3FP	PRON
133 | POSS_PRON_3FS	PRON
134 | POSS_PRON_3MP	PRON
135 | POSS_PRON_3MS	PRON
136 | POSS_PRON_2D	PRON
137 | PREP
138 | PRON
139 | PRON_1P	PRON
140 | PRON_1S	PRON
141 | PRON_2D	PRON
142 | PRON_2FP	PRON
143 | PRON_2FS	PRON
144 | PRON_2MP	PRON
145 | PRON_2MS	PRON
146 | PRON_3D	PRON
147 | PRON_3FP	PRON
148 | PRON_3FS	PRON
149 | PRON_3MP	PRON
150 | PRON_3MS	PRON
151 | EXCLAM_PRON	PRON
152 | PUNC
153 | PV	V
154 | PVSUFF_DO:1P	PRON
155 | PVSUFF_DO:1S	PRON
156 | PVSUFF_DO:2MP	PRON
157 | PVSUFF_DO:2MS	PRON
158 | PVSUFF_DO:3D	PRON
159 | PVSUFF_DO:2D	PRON
160 | PVSUFF_DO:3FS	PRON
161 | PVSUFF_DO:3MP	PRON
162 | PVSUFF_DO:3MS	PRON
163 | PVSUFF_DO:2FS	PRON
164 | PVSUFF_DO:2FP	PRON
165 | PVSUFF_DO:3FP	PRON
166 | PVSUFF_SUBJ:1P	VSUFF
167 | PVSUFF_SUBJ:1S	VSUFF
168 | PVSUFF_SUBJ:2FS	VSUFF
169 | PVSUFF_SUBJ:2MP	VSUFF
170 | PVSUFF_SUBJ:2MS	VSUFF
171 | PVSUFF_SUBJ:3FD	VSUFF
172 | PVSUFF_SUBJ:3FP	VSUFF
173 | PVSUFF_SUBJ:3FS	VSUFF
174 | PVSUFF_SUBJ:3MD	VSUFF
175 | PVSUFF_SUBJ:3MP	VSUFF
176 | PVSUFF_SUBJ:3MS	VSUFF
177 | PVSUFF_SUBJ:2D	VSUFF
178 | PV_PASS	V
179 | NUMERIC_COMMA	PUNC
180 | RC_PART	PART
181 | REL_ADV	REL
182 | REL_PRON	REL
183 | SUB_CONJ	CONJ
184 | VERB_PART	PART
185 | TYPO	FOREIGN
186 | DIALECT	FOREIGN
187 | FUT_PART	PART
188 | VERB	V
189 | NO_FUNC	NOUN
190 | DEM	NOUN
191 | JUS	JUS
192 | FUNC_WORD	FUNC
193 | 


--------------------------------------------------------------------------------
/data/tags-mada2core12.txt:
--------------------------------------------------------------------------------
 1 | PREP	P
 2 | NOUN	N
 3 | PUNC	PNX
 4 | FUNC	N
 5 | ADV	AV
 6 | PART	PRT
 7 | DET	PRT
 8 | V	V
 9 | CONJ	C
10 | REL	REL
11 | FOREIGN	N
12 | ADJ	AJ
13 | ABBREV	ABBREV
14 | PRON	PRO
15 | NUM	N
16 | NOUN_PROP	PN
17 | adj	AJ
18 | adj_comp	AJ
19 | adj_num	N
20 | noun	N
21 | noun_num	N
22 | noun_quant	N
23 | noun_prop	PN
24 | verb	V
25 | verb_pseudo	V
26 | pron_rel	REL
27 | adv_rel	REL
28 | part_restrict	PRT
29 | prep	P
30 | 


--------------------------------------------------------------------------------
/data/test0.mada.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/data/test0.mada.gz


--------------------------------------------------------------------------------
/decoder/ClassifierDecoder.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ClassifierDecoder.cpp
  3 |  *
  4 |  *  Created on: May 7, 2014
  5 |  *      Author: yuanz
  6 |  */
  7 | 
  8 | #include "ClassifierDecoder.h"
  9 | #include <float.h>
 10 | 
 11 | namespace segparser {
 12 | 
 13 | ClassifierDecoder::ClassifierDecoder(Options* options) : DependencyDecoder(options) {
 14 | }
 15 | 
 16 | ClassifierDecoder::~ClassifierDecoder() {
 17 | }
 18 | 
 19 | void ClassifierDecoder::decode(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe) {
 20 | 	ThrowException("no need to decode");
 21 | }
 22 | 
 23 | void ClassifierDecoder::train(DependencyInstance* gold, DependencyInstance* pred, FeatureExtractor* fe, int trainintIter) {
 24 | 	// check the gold and pred have same seg/pos
 25 | 	for (int i = 1; i < pred->numWord; ++i) {
 26 | 		assert(gold->word[i].currSegCandID == pred->word[i].currSegCandID);
 27 | 
 28 | 		SegInstance& goldInst = gold->word[i].getCurrSeg();
 29 | 		SegInstance& predInst = pred->word[i].getCurrSeg();
 30 | 
 31 | 		assert(goldInst.size() == predInst.size());
 32 | 		for (int j = 0; j < predInst.size(); ++j) {
 33 | 			assert(goldInst.element[j].currPosCandID == predInst.element[j].currPosCandID);
 34 | 		}
 35 | 	}
 36 | 
 37 | 	CacheTable* cache = fe->getCacheTable(pred);
 38 | 	boost::shared_ptr<CacheTable> tmpCache = boost::shared_ptr<CacheTable>(new CacheTable());
 39 | 	if (!cache) {
 40 | 		cache = tmpCache.get();		// temporary cache for this run
 41 | 		tmpCache->initCacheTable(fe->type, pred, NULL, options);
 42 | 	}
 43 | 
 44 | 	for (int mw = 1; mw < pred->numWord; ++mw) {
 45 | 		SegInstance& segInst = pred->word[mw].getCurrSeg();
 46 | 		SegInstance& goldInst = gold->word[mw].getCurrSeg();
 47 | 
 48 | 		for (int ms = 0; ms < segInst.size(); ++ms) {
 49 | 			HeadIndex m(mw, ms);
 50 | 			findOptHead(pred, gold, m, fe, cache);
 51 | 
 52 | 			if (segInst.element[ms].dep != goldInst.element[ms].dep) {
 53 | 				FeatureVector newFV;
 54 | 				fe->getArcFv(fe, gold, goldInst.element[ms].dep, m, &newFV, cache);
 55 | 				double newScore = fe->parameters->getScore(&newFV);
 56 | 
 57 | 				FeatureVector oldFV;
 58 | 				fe->getArcFv(fe, pred, segInst.element[ms].dep, m, &oldFV, cache);
 59 | 				double oldScore = fe->parameters->getScore(&oldFV);
 60 | 
 61 | 				newFV.concatNeg(&oldFV);
 62 | 
 63 | 				double err = fe->parameters->wordDepError(gold->word[mw], pred->word[mw]);
 64 | 
 65 | 				if (err - (newScore - oldScore) > 1e-4) {
 66 | 					fe->parameters->update(gold, pred, &newFV, err - (newScore - oldScore), fe, updateTimes);
 67 | 				}
 68 | 			}
 69 | 			updateTimes++;
 70 | 		}
 71 | 	}
 72 | }
 73 | 
 74 | void ClassifierDecoder::findOptHead(DependencyInstance* pred, DependencyInstance* gold, HeadIndex& m, FeatureExtractor* fe, CacheTable* cache) {
 75 | 
 76 | 	assert(cache && cache->numSeg == pred->getNumSeg());
 77 | 
 78 | 	// get pruned list
 79 | 	vector<bool> isPruned = move(fe->isPruned(pred, m, cache));
 80 | 	int segID = -1;
 81 | 
 82 | 	SegElement& predSegEle = pred->getElement(m);
 83 | 	double bestScore = -DBL_MAX;
 84 | 	HeadIndex bestDep(-1, 0);
 85 | 
 86 | 	for (int hw = 0; hw < pred->numWord; ++hw) {
 87 | 		SegInstance& segInst = pred->word[hw].getCurrSeg();
 88 | 
 89 | 		for (int hs = 0; hs < segInst.size(); ++hs) {
 90 | 			segID++;
 91 | 			if (isPruned[segID]) {
 92 | 				continue;
 93 | 			}
 94 | 
 95 | 			assert(hw != m.hWord || hs != m.hSeg);
 96 | 
 97 | 			HeadIndex h(hw, hs);
 98 | 
 99 | 			predSegEle.dep = h;
100 | 			FeatureVector fv;
101 | 			fe->getArcFv(fe, pred, h, m, &fv, cache);
102 | 			double score = fe->parameters->getScore(&fv);
103 | 			if (gold) {
104 | 				// add loss
105 | 				score += fe->parameters->wordDepError(gold->word[m.hWord], pred->word[m.hWord]);
106 | 			}
107 | 
108 | 			if (score > bestScore + 1e-6) {
109 | 				bestScore = score;
110 | 				bestDep = h;
111 | 			}
112 | 		}
113 | 	}
114 | 	assert(segID == (int)isPruned.size() - 1);
115 | 	predSegEle.dep = bestDep;
116 | }
117 | 
118 | } /* namespace segparser */
119 | 


--------------------------------------------------------------------------------
/decoder/ClassifierDecoder.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * ClassifierDecoder.h
 3 |  *
 4 |  *  Created on: May 7, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef CLASSIFIERDECODER_H_
 9 | #define CLASSIFIERDECODER_H_
10 | 
11 | #include "DependencyDecoder.h"
12 | 
13 | namespace segparser {
14 | 
/*
 * Decoder that is only meant to be trained: train() updates the parameters
 * one segment at a time using the best head found by findOptHead(), while
 * decode() always throws ("no need to decode").
 */
class ClassifierDecoder: public segparser::DependencyDecoder {
public:
	// The options pointer is forwarded to the DependencyDecoder base.
	ClassifierDecoder(Options* options);
	virtual ~ClassifierDecoder();

	// Not supported; every call raises an exception.
	void decode(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe);
	// One training pass over a sentence; the heads stored in pred are
	// overwritten. trainintIter is not used by the implementation.
	void train(DependencyInstance* gold, DependencyInstance* pred, FeatureExtractor* fe, int trainintIter);

private:
	// Writes the best-scoring unpruned head for segment m into pred; when gold
	// is non-null the word-level dependency error is added to each score.
	void findOptHead(DependencyInstance* pred, DependencyInstance* gold, HeadIndex& m, FeatureExtractor* fe, CacheTable* cache);

};
27 | 
28 | } /* namespace segparser */
29 | #endif /* CLASSIFIERDECODER_H_ */
30 | 


--------------------------------------------------------------------------------
/decoder/DependencyDecoder.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * DependencyDecoder.h
 3 |  *
 4 |  *  Created on: Apr 8, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef DEPENDENCYDECODER_H_
 9 | #define DEPENDENCYDECODER_H_
10 | 
11 | #include "../Options.h"
12 | #include "../DependencyInstance.h"
13 | #include "../SegParser.h"
14 | #include "../util/StringUtils.h"
15 | #include "../util/Random.h"
16 | #include "../FeatureExtractor.h"
17 | 
18 | namespace segparser {
19 | 
20 | class FeatureExtractor;
21 | class CacheTable;
22 | 
/*
 * Common base of the decoders. It offers overridable life-cycle hooks
 * (initialize / shutdown), decode / train entry points that throw unless a
 * subclass overrides them, an update counter, and a set of sampling and tree
 * helpers whose definitions are not part of this header.
 */
class DependencyDecoder {
public:
	DependencyDecoder(Options* options);
	virtual ~DependencyDecoder();

	// Factory for a concrete decoder. NOTE(review): the meaning of mode and
	// thread is defined by the implementation, which is not visible here.
	static DependencyDecoder* createDependencyDecoder(Options* options, int mode, int thread, bool isTrain);

	// Life-cycle hook; does nothing unless overridden.
	virtual void initialize() {

	}

	// Life-cycle hook; does nothing unless overridden.
	virtual void shutdown() {

	}

	// Default implementation throws: subclasses are expected to override it.
	virtual void decode(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe) {
		ThrowException("should not go virtual decode function");
	}

	// Default implementation throws: subclasses are expected to override it.
	virtual void train(DependencyInstance* gold, DependencyInstance* pred, FeatureExtractor* fe, int trainintIter) {
		ThrowException("should not go virtual train function");
	}

	void prune(DependencyInstance* inst, HeadIndex& m, FeatureExtractor* fe, vector<bool>& pruned);

	// Current value of the update counter.
	int getUpdateTimes() {
		return updateTimes;
	}

	// Sets the update counter back to zero.
	void resetUpdateTimes() {
		updateTimes = 0;
	}

	void initInst(DependencyInstance* inst, FeatureExtractor* fe);
	void removeGoldInfo(DependencyInstance* inst);

	// Public state. NOTE(review): seed is presumably the random seed used by
	// the samplers below -- confirm in DependencyDecoder.cpp.
	int seed;
	Options* options;

	// Helpers defined in the .cpp file; names suggest tree ordering and
	// first-order segmentation / POS sampling -- confirm there.
	int getBottomUpOrder(DependencyInstance* inst, HeadIndex& arg, vector<HeadIndex>& idx, int id);
	double sampleSeg1O(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe, int wordID, Random& r);
	double samplePos1O(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe, int wordID, Random& r);
	bool randomWalkSampler(DependencyInstance* pred, DependencyInstance* gold, FeatureExtractor* fe, CacheTable* cache,
			vector<bool>& toBeSampled, Random& r, double T);

protected:
	// Counter exposed through getUpdateTimes() / resetUpdateTimes();
	// subclasses advance it themselves.
	int updateTimes;

	// Protected helpers, all defined in the .cpp file.
	bool isAncestor(DependencyInstance* s, HeadIndex& h, HeadIndex m);
	bool isProj(DependencyInstance* s, HeadIndex& h, HeadIndex& m);
	int samplePoint(vector<double>& prob, Random& r);
	void convertScoreToProb(vector<double>& score);
	double getSegPosProb(WordInstance& word);
	void getFirstOrderVec(DependencyInstance* inst, DependencyInstance* gold,
			FeatureExtractor* fe, HeadIndex& m, CacheTable* cache, bool treeConstraint, vector<HeadIndex>& candH, vector<double>& score);
	double getMHDepProb(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe, int wordID);
	double sampleSegPos(DependencyInstance* inst, DependencyInstance* gold, int wordID, Random& r);
	double sampleMHDepProb(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe, int wordID, Random& r);
	void updateSeg(DependencyInstance* inst, WordInstance& word, int newSeg);
	void updatePos(WordInstance& word, SegElement& ele, int newPos);

	void cycleErase(DependencyInstance* inst, HeadIndex i, vector<bool>& toBeSampled);
	void updateSeg(DependencyInstance* pred, DependencyInstance* gold, HeadIndex& m,
			int newSeg, int oldSeg, int baseOptSeg, int baseOptPos, vector<int>& oldPos, vector<HeadIndex>& oldHeadIndex,
			vector<HeadIndex>& relatedChildren, vector<int>& relatedOldParent);
	void setGoldSegAndPos(DependencyInstance* pred, DependencyInstance* gold);
};
90 | 
91 | } /* namespace segparser */
92 | #endif /* DEPENDENCYDECODER_H_ */
93 | 


--------------------------------------------------------------------------------
/decoder/DevelopmentThread.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * DevelopmentThread.h
 3 |  *
 4 |  *  Created on: Apr 16, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef DEVELOPMENTTHREAD_H_
 9 | #define DEVELOPMENTTHREAD_H_
10 | 
11 | #include <string>
12 | #include <vector>
13 | #include <unordered_map>
14 | #include "../io/DependencyReader.h"
15 | #include "../SegParser.h"
16 | 
17 | namespace segparser {
18 | 
19 | using namespace std;
20 | 
21 | class DependencyReader;
22 | class SegParser;
23 | 
/*
 * State and entry points for evaluating a parser on a development file.
 * Only declarations are visible here; judging by the pthread handles and
 * mutexes the work is spread over several threads -- the exact protocol is
 * in DevelopmentThread.cpp.
 */
class DevelopmentThread {
public:
	DevelopmentThread();
	virtual ~DevelopmentThread();

	// Begins an evaluation of devfile with parser sp, writing to devoutfile.
	// NOTE(review): whether this call blocks until completion is not visible
	// from the header.
	void start(string devfile, string devoutfile, SegParser* sp, bool verbal);
	// Compares one predicted instance with its gold counterpart; presumably
	// feeds the counters below -- confirm in the .cpp file.
	void evaluate(DependencyInstance* inst, DependencyInstance* gold);
	double computeTedEval();

	bool isDevTesting;

	string devfile;
	string devoutfile;

	DependencyReader reader;

	int currProcessID;
	int currFinishID;

	// NOTE(review): presumably guard currProcessID / currFinishID -- confirm.
	pthread_mutex_t processMutex;
	pthread_mutex_t finishMutex;

	// Evaluation tallies, kept as doubles. Names suggest word, segmentation,
	// POS and dependency counts (gold / predicted / correct) -- confirm
	// against evaluate().
	double wordNum;
	double corrWordSegNum;
	double goldSegNum;
	double predSegNum;
	double corrSegNum;
	double corrPosNum;
	double goldDepNum;
	double predDepNum;
	double corrDepNum;

	pthread_t workThread;
	pthread_t outputThread;
	vector<pthread_t> decodeThread;
	int decodeThreadNum;

	// Predicted instances keyed by an integer id.
	unordered_map<int, inst_ptr> id2Pred;
	int finishThreadNum;

	SegParser* sp;
	Options* options;
	bool verbal;

private:
	bool isPunc(string& pos);
	int numSegWithoutPunc(DependencyInstance* inst);
	string normalize(string form);
};
73 | 
74 | } /* namespace segparser */
75 | #endif /* DEVELOPMENTTHREAD_H_ */
76 | 


--------------------------------------------------------------------------------
/decoder/HillClimbingDecoder.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * HillClimbingDecoder.h
 3 |  *
 4 |  *  Created on: May 1, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef HILLCLIMBINGDECODER_H_
 9 | #define HILLCLIMBINGDECODER_H_
10 | 
11 | #include "DependencyDecoder.h"
12 | #include <pthread.h>
13 | 
14 | namespace segparser {
15 | 
/*
 * DependencyDecoder subclass that declares pthread-based worker state
 * (per-task mutexes and condition variables) together with the shared
 * "best so far" result. Only declarations are visible in this header.
 */
class HillClimbingDecoder: public segparser::DependencyDecoder {
public:
	HillClimbingDecoder(Options* options, int thread, int convergeIter);
	virtual ~HillClimbingDecoder();

	// Lifecycle and task hand-off (defined outside this header).
	void initialize();
	void shutdown();
	void startTask(DependencyInstance* pred, DependencyInstance* gold, FeatureExtractor* fe);
	void waitAndGetResult(DependencyInstance* inst);
	void decode(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe);
	void train(DependencyInstance* gold, DependencyInstance* pred, FeatureExtractor* fe, int trainintIter);
	// Per-variable search steps; each returns a double.
	// NOTE(review): presumably the score of the chosen assignment -- confirm.
	double findOptHead(DependencyInstance* pred, DependencyInstance* gold, HeadIndex& m, FeatureExtractor* fe, CacheTable* cache);
	double findOptBigramHead(DependencyInstance* pred, DependencyInstance* gold, HeadIndex& m, HeadIndex& n, FeatureExtractor* fe, CacheTable* cache);
	double findOptPos(DependencyInstance* pred, DependencyInstance* gold, HeadIndex& m, FeatureExtractor* fe, CacheTable* cache);
	double findOptSeg(DependencyInstance* pred, DependencyInstance* gold, HeadIndex& m, FeatureExtractor* fe, CacheTable* cache);

	void debug(string msg, int id);

	// Parallel vectors of per-worker synchronisation state.
	// NOTE(review): presumably all sized to `thread` -- confirm in initialize().
	vector<pthread_t> threadID;
	vector<pthread_mutex_t> taskMutex;
	vector<pthread_cond_t> taskStartCond;
	vector<pthread_cond_t> taskDoneCond;
	vector<bool> taskDone;
	vector<bool> threadExit;

	double bestScore;
	VariableInfo best;
	int unChangeIter;		// converge criteria
	pthread_mutex_t updateMutex;
	pthread_mutex_t debugMutex;

	// Raw pointers to the current task's inputs; ownership is not expressed
	// by the types.
	DependencyInstance* pred;
	DependencyInstance* gold;
	FeatureExtractor* fe;

	int thread;

	int convergeIter;

	int earlyStopIter;
	bool samplePos;
	bool sampleSeg;
};
59 | 
60 | } /* namespace segparser */
61 | #endif /* HILLCLIMBINGDECODER_H_ */
62 | 


--------------------------------------------------------------------------------
/io/DependencyReader.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * DependencyReader.cpp
  3 |  *
  4 |  *  Created on: Mar 27, 2014
  5 |  *      Author: yuanz
  6 |  */
  7 | 
  8 | #include "DependencyReader.h"
  9 | #include "../util/StringUtils.h"
 10 | #include <assert.h>
 11 | #include <utility>
 12 | #include <boost/regex.hpp>
 13 | #include "../util/Constant.h"
 14 | 
 15 | namespace segparser {
 16 | 
 17 | DependencyReader::DependencyReader(Options* options, string file) : options(options) {
 18 | 	hasCandidate = true;
 19 | 	isTrain = false;
 20 | 	startReading(file);
 21 | }
 22 | 
 23 | DependencyReader::DependencyReader() : options(NULL) {
 24 | 	hasCandidate = true;
 25 | 	isTrain = false;
 26 | }
 27 | 
// Nothing to release explicitly: the body is empty and `fin` is a value
// member.
DependencyReader::~DependencyReader() {
}
 30 | 
 31 | void DependencyReader::startReading(Options* options, string file) {
 32 | 	this->options = options;
 33 | 	fin.open(file.c_str());
 34 | }
 35 | 
// Opens `file` on the internal input stream. The result of open() is not
// checked here.
void DependencyReader::startReading(string file) {
	fin.open(file.c_str());
}
 39 | 
 40 | void DependencyReader::close() {
 41 | 	if (fin.is_open())
 42 | 		fin.close();
 43 | }
 44 | 
 45 | HeadIndex DependencyReader::parseHeadIndex(string str) {
 46 | 	unsigned int pos = str.find("/");
 47 | 	assert(pos != string::npos);
 48 | 	int word = atoi(str.substr(0, pos).c_str());
 49 | 	int seg = atoi(str.substr(pos + 1).c_str());
 50 | 	return HeadIndex(word, seg);
 51 | }
 52 | 
// Appends one gold segment (form, lemma, POS, head, label) to `word` and
// derives the word's gold morphology / "Al" indices from `morphStr`.
//
// morphStr is either "_" (no morphology) or a "|"-separated list that is
// asserted to have exactly four fields; fields 1..3 contribute the text after
// their last '='.
void DependencyReader::addGoldSegElement(WordInstance* word, string form, string lemma, string pos,
		string morphStr, int segid, int hwordid, int hsegid, string lab) {
	word->goldForm.push_back(form);
	word->goldLemma.push_back(lemma);
	word->goldPos.push_back(pos);
	word->goldDep.push_back(HeadIndex(hwordid, hsegid));
	word->goldLab.push_back(lab);
	// NOTE(review): both indices are reset on every call, so the "== -1"
	// guards below are always true and the indices reflect only the most
	// recently added segment (a later "_" segment clears them). Confirm this
	// is intended rather than "first matching segment wins".
	word->goldMorphIndex = -1;
	word->goldAlIndex = -1;

	if (morphStr != "_") {
		vector<string> data;
		StringSplit(morphStr, "|", &data);

		// A first field ending in 'y' marks this segment as the "Al" segment.
		// NOTE(review): data[0] is read before the size assertion below and
		// assumes a non-empty first field.
		if (word->goldAlIndex == -1 && data[0][data[0].length() - 1] == 'y') {
			word->goldAlIndex = segid;
		}

		if (word->goldMorphIndex == -1) {
			assert(data.size() == 4);
			// Morphology counts only if at least one of the three values is
			// something other than "na"/"NA".
			bool hasMorphInfo = false;
			for (int i = 1; i < 4; ++i) {
				string val = data[i].substr(data[i].find_last_of("=") + 1, string::npos);
				if (val != "na" && val != "NA")
					hasMorphInfo = true;
			}
			if (hasMorphInfo) {
				word->goldMorph.clear();
				for (int i = 1; i < 4; ++i) {
					string val = data[i].substr(data[i].find_last_of("=") + 1, string::npos);
					word->goldMorph.push_back(val);
				}
				word->goldMorphIndex = segid;
			}
		}
	}
}
 90 | 
 91 | void DependencyReader::normalizeProb(WordInstance* word) {
 92 | 	// normalize seg/pos candidate probability
 93 | 
 94 | 	double sumSegProb = 0.0;
 95 | 	for (unsigned int i = 0; i < word->candSeg.size(); ++i) {
 96 | 		for (unsigned int j = 0; j < word->candSeg[i].element.size(); ++j) {
 97 | 			SegElement& ele = word->candSeg[i].element[j];
 98 | 			double sumPosProb = 0.0;
 99 | 			for (unsigned int k = 0; k < ele.candPos.size(); ++k) {
100 | 				sumPosProb += ele.candProb[k];
101 | 
102 | 				//if (k > 0) {
103 | 				//	assert(ele.candProb[k - 1] >= ele.candProb[k]);
104 | 				//}
105 | 			}
106 | 			assert(sumPosProb > 0.0);
107 | 			for (unsigned int k = 0; k < ele.candPos.size(); ++k) {
108 | 				ele.candProb[k] /= sumPosProb;
109 | 				if (ele.candProb[k] > 1e-6)
110 | 					ele.candProb[k] = log(ele.candProb[k]);
111 | 				else
112 | 					ele.candProb[k] = -1000000;
113 | 			}
114 | 		}
115 | 		sumSegProb += word->candSeg[i].prob;
116 | 	}
117 | 
118 | 	assert(sumSegProb > 0.0);
119 | 	for (unsigned int i = 0; i < word->candSeg.size(); ++i) {
120 | 		word->candSeg[i].prob /= sumSegProb;
121 | 		if (word->candSeg[i].prob > 1e-6)
122 | 			word->candSeg[i].prob = log(word->candSeg[i].prob);
123 | 		else
124 | 			word->candSeg[i].prob = -1000000;
125 | 	}
126 | }
127 | 
// Makes sure the gold segmentation and gold POS tags of `word` are present
// among its candidates, selects them as the current candidates, and copies
// the gold heads onto the selected segmentation.
void DependencyReader::addGoldSegToCand(WordInstance* word) {
	// Probability given to a gold entry that has to be added: with candidates
	// in the input it is 0.3 when training and 0.0 otherwise; without
	// candidates it is 1.0.
	double prob = hasCandidate ? (isTrain ? 0.3 : 0.0) : 1.0;

	// Gold segmentation key: the gold forms joined by '+'.
	string goldSegStr = word->goldForm[0];
	for (unsigned int i = 1; i < word->goldForm.size(); ++i)
		goldSegStr += "+" + word->goldForm[i];

	// Linear search for a candidate with the same key; goldSegID ends up
	// equal to candSeg.size() when there is none.
	unsigned int goldSegID = 0;
	for (; goldSegID < word->candSeg.size(); ++goldSegID) {
		if (goldSegStr.compare(word->candSeg[goldSegID].segStr) == 0) {
			break;
		}
	}

	if (goldSegID == word->candSeg.size()) {
		// new cand
		//cout << "gold seg not exist" << endl;

		SegInstance segInst;
		segInst.prob = prob;
		segInst.segStr = goldSegStr;
		segInst.morph = word->goldMorph;
		segInst.morphIndex = word->goldMorphIndex;
		segInst.AlIndex = word->goldAlIndex;

		// One element per gold segment, each with a single POS candidate
		// (the gold tag) at raw probability 1.0.
		segInst.element.resize(word->goldForm.size());
		for (unsigned int i = 0; i < word->goldForm.size(); ++i) {
			SegElement& curr = segInst.element[i];
			curr.currPosCandID = 0;
			curr.form = word->goldForm[i];
			curr.lemma = word->goldLemma[i];
			curr.candPos.resize(1);
			curr.candPosid.resize(1);
			curr.candDetPosid.resize(1);
			curr.candProb.resize(1);
			curr.candSpecialPos.resize(1);

			curr.candPos[0] = word->goldPos[i];
			curr.candProb[0] = 1.0;
		}

		// goldSegID == old size, i.e. the index the pushed candidate gets.
		word->currSegCandID = goldSegID;
		word->candSeg.push_back(segInst);
	}
	else {
		// old cand, check pos
		// The matching candidate's morphology fields are overwritten with
		// the gold values.
		SegInstance& segInst = word->candSeg[goldSegID];
		word->currSegCandID = goldSegID;
		segInst.morph = word->goldMorph;
		segInst.morphIndex = word->goldMorphIndex;
		segInst.AlIndex = word->goldAlIndex;

		assert(word->goldForm.size() == segInst.element.size());
		for (unsigned int i = 0; i < word->goldForm.size(); ++i) {
			assert(word->goldForm[i].compare(segInst.element[i].form) == 0);
			segInst.element[i].lemma = word->goldLemma[i];
			string goldPos = word->goldPos[i];
			unsigned int goldPosID = 0;
			SegElement& ele = segInst.element[i];
			// Linear search for the gold tag among this element's POS
			// candidates.
			for (; goldPosID < ele.candPos.size(); ++goldPosID) {
				if (goldPos.compare(ele.candPos[goldPosID]) == 0) {
					break;
				}
			}

			if (goldPosID == ele.candPos.size()) {
				// new pos
				//cout << "gold pos not exist" << endl;
				// Grow all parallel per-candidate vectors by one slot and
				// fill in the tag and its probability.
				ele.candPos.resize(goldPosID + 1);
				ele.candPosid.resize(goldPosID + 1);
				ele.candDetPosid.resize(goldPosID + 1);
				ele.candProb.resize(goldPosID + 1);
				ele.candSpecialPos.resize(goldPosID + 1);

				ele.candPos[goldPosID] = goldPos;
				ele.candProb[goldPosID] = prob;

				ele.currPosCandID = goldPosID;
			}
			else {
				// old pos
				ele.currPosCandID = goldPosID;
			}
		}
	}

	// add dep, lab will be set in segInstID
	SegInstance& segInst = word->getCurrSeg();
	for (int i = 0; i < segInst.size(); ++i) {
		segInst.element[i].dep = word->goldDep[i];
	}
}
221 | 
222 | void DependencyReader::addSegCand(WordInstance* word, string str) {
223 | 	vector<string> dataList;
224 | 	StringSplit(str, "||", &dataList);
225 | 	if (dataList.size() != 5)
226 | 		cout << str << endl;
227 | 	assert(dataList.size() == 5);
228 | 	SegInstance segInst;
229 | 
230 | 	double segProb = atof(dataList[4].c_str());
231 | 	segInst.prob = segProb;
232 | 
233 | 	int AlIndex = atoi(dataList[1].c_str());
234 | 	segInst.AlIndex = AlIndex;
235 | 
236 | 	int morphIndex = atoi(dataList[2].c_str());
237 | 	segInst.morphIndex = morphIndex;
238 | 
239 | 	StringSplit(dataList[3], "/", &segInst.morph);
240 | 
241 | 	bool hasMorphValue = false;
242 | 	for (unsigned int i = 0; i < segInst.morph.size(); ++i) {
243 | 		if (segInst.morph[i] != "na" && segInst.morph[i] != "NA") {
244 | 			hasMorphValue = true;
245 | 			break;
246 | 		}
247 | 	}
248 | 	if (!hasMorphValue) {
249 | 		segInst.morphIndex = -1;
250 | 		segInst.morph.clear();
251 | 	}
252 | 
253 | 	string segStr = dataList[0];
254 | 	vector<string> segList;
255 | 	StringSplit(segStr, "&&", &segList);
256 | 
257 | 	segInst.element.resize(segList.size());
258 | 	for (unsigned int i = 0; i < segList.size(); ++i) {
259 | 		SegElement& curr = segInst.element[i];
260 | 		vector<string> posList;
261 | 		StringSplit(segList[i], "@#", &posList);
262 | 		curr.form = posList[0];		// normalize is done when set inst ids
263 | 		curr.lemma = posList[1];
264 | 		curr.candPos.resize(posList.size() - 2);
265 | 		curr.candPosid.resize(posList.size() - 2);
266 | 		curr.candDetPosid.resize(posList.size() - 2);
267 | 		curr.candProb.resize(posList.size() - 2);
268 | 		curr.candSpecialPos.resize(posList.size() - 2);
269 | 
270 | 		for (unsigned int j = 2; j < posList.size(); ++j) {
271 | 			int pos = posList[j].find_last_of("_");
272 | 			curr.candPos[j - 2] = posList[j].substr(0, pos);
273 | 			curr.candProb[j - 2] = atof(posList[j].substr(pos + 1).c_str());
274 | 		}
275 | 
276 | 		if (i > 0)
277 | 			segInst.segStr += "+";
278 | 		segInst.segStr += curr.form;
279 | 	}
280 | 
281 | 	word->candSeg.push_back(move(segInst));
282 | }
283 | 
284 | void DependencyReader::concatSegStr(WordInstance* word) {
285 | 	word->wordStr = "";
286 | 	for (unsigned int i = 0; i < word->goldForm.size(); ++i) {
287 | 		word->wordStr += word->goldForm[i];
288 | 	}
289 | }
290 | 
291 | inst_ptr DependencyReader::nextInstance() {
292 | 
293 | 	if (fin.eof()) {
294 | 		return inst_ptr((DependencyInstance*)NULL);
295 | 	}
296 | 
297 | 	string str;
298 | 	getline(fin, str);
299 | 	if (str.empty()) {
300 | 		return inst_ptr((DependencyInstance*)NULL);
301 | 	}
302 | 
303 | 	inst_ptr s(new DependencyInstance());
304 | 	vector<string> data;
305 | 	while (!str.empty()) {
306 | 		data.push_back(str);
307 | 		getline(fin, str);
308 | 	}
309 | 
310 | 	// get sentence length and seg counts
311 | 	// word id starts from 1, seg starts from 0
312 | 	int len = 0;
313 | 	for (unsigned int i = 0; i < data.size(); ++i) {
314 | 		int pos = data[i].find("\t");
315 | 		HeadIndex hi = parseHeadIndex(data[i].substr(0, pos));
316 | 		len = hi.hWord + 1;	// include root
317 | 	}
318 | 
319 | 	s->numWord = len;
320 | 	s->word.resize(len);
321 | 
322 | 	// add root information
323 | 	addGoldSegElement(&s->word[0], "<root>", "<root>", "<root-POS>", "_", 0, -1, 0, "<no-type>");
324 | 	concatSegStr(&s->word[0]);
325 | 	addSegCand(&s->word[0], "<root>@#<root>@#<root-POS>_1.0||-1||0||_||1.0");
326 | 
327 | 	// process each line
328 | 	for (unsigned int i = 0; i < data.size(); ++i) {
329 | 		vector<string> line;
330 | 		StringSplit(data[i], "\t", &line);
331 | 
332 | 		HeadIndex id = parseHeadIndex(line[0]);
333 | 		string word = line[1];
334 | 		string lemma = line[2];
335 | 		string pos = line[3];
336 | 		string morphStr = line[5];
337 | 		HeadIndex head = parseHeadIndex(line[6]);
338 | 		string lab = "<no-type>";
339 | 
340 | 		addGoldSegElement(&s->word[id.hWord], word, lemma, pos, morphStr, id.hSeg, head.hWord, head.hSeg, lab);
341 | 		assert((unsigned int)id.hSeg + 1 == s->word[id.hWord].goldForm.size());
342 | 	}
343 | 
344 | 	// complete information
345 | 	for (int i = 1; i < len; ++i) {
346 | 		concatSegStr(&s->word[i]);
347 | 	}
348 | 
349 | 	// process segmentation candidate
350 | 	if (hasCandidate) {
351 | 		for (int i = 1; i < len; ++i) {
352 | 			getline(fin, str);
353 | 			vector<string> segCand;
354 | 			StringSplit(str, "\t", &segCand);
355 | 			if (s->word[i].wordStr != segCand[0]) {
356 | 				cout << str << endl;
357 | 				cout << s->word[i].wordStr << " " << segCand[0] << endl;
358 | 			}
359 | 			assert(s->word[i].wordStr.compare(segCand[0]) == 0);
360 | 			for (unsigned int j = 1; j < segCand.size(); ++j) {
361 | 				addSegCand(&s->word[i], segCand[j]);
362 | 			}
363 | 		}
364 | 		getline(fin, str);
365 | 		assert(str.empty());
366 | 	}
367 | 
368 | 	// complete information
369 | 	for (int i = 0; i < len; ++i) {
370 | 		addGoldSegToCand(&s->word[i]);
371 | 		normalizeProb(&s->word[i]);
372 | 	}
373 | 
374 | 	s->constructConversionList();
375 |    	s->setOptSegPosCount();
376 |   	s->buildChild();
377 | 
378 |    	return s;
379 | }
380 | 
381 | } /* namespace segparser */
382 | 


--------------------------------------------------------------------------------
/io/DependencyReader.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * DependencyReader.h
 3 |  *
 4 |  *  Created on: Mar 27, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef DEPENDENCYREADER_H_
 9 | #define DEPENDENCYREADER_H_
10 | 
11 | #include <fstream>
12 | #include "../Options.h"
13 | #include "../DependencyInstance.h"
14 | 
15 | namespace segparser {
16 | 
17 | using namespace std;
18 | 
/*
 * Reads dependency instances, one sentence at a time, from a text file.
 * A reader is either constructed with a file or opened later through
 * startReading().
 */
class DependencyReader {
public:
	DependencyReader();
	DependencyReader(Options* options, string file);
	virtual ~DependencyReader();

	void startReading(Options* options, string file);
	void startReading(string file);
	void close();
	// Returns the next sentence (see the definition for the end-of-input
	// behaviour).
	inst_ptr nextInstance();

	// Both flags are set by the constructors and are public so callers can
	// change them before reading.
	bool hasCandidate;
	bool isTrain;

private:
	ifstream fin;
	// Raw pointer; ownership is not expressed by the type.
	Options* options;

	HeadIndex parseHeadIndex(string str);
	void addGoldSegElement(WordInstance* word, string form, string lemma, string pos,
			string morphStr, int segid, int hwordid, int hsegid, string lab);
	void addGoldSegToCand(WordInstance* word);
	void normalizeProb(WordInstance* word);
	void addSegCand(WordInstance* word, string str);
	// NOTE(review): no definition of normalize() appears in the visible part
	// of DependencyReader.cpp -- confirm it is defined or remove it.
	string normalize(string s);
	void concatSegStr(WordInstance* word);
};
46 | 
47 | } /* namespace segparser */
48 | #endif /* DEPENDENCYREADER_H_ */
49 | 


--------------------------------------------------------------------------------
/io/DependencyWriter.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * DependencyWriter.cpp
 3 |  *
 4 |  *  Created on: Apr 16, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #include "DependencyWriter.h"
 9 | 
10 | namespace segparser {
11 | 
// Stores `options`; no output file is opened.
DependencyWriter::DependencyWriter(Options* options) : options(options) {
}
14 | 
// Stores `options` and opens `file` for writing.
DependencyWriter::DependencyWriter(Options* options, string file) : options(options) {
	startWriting(file);
}
18 | 
// Nothing to release explicitly: the body is empty and `fout` is a value
// member.
DependencyWriter::~DependencyWriter() {
}
21 | 
// Opens `file` on the internal output stream. The result of open() is not
// checked here.
void DependencyWriter::startWriting(string file) {
	fout.open(file.c_str());
}
25 | 
26 | void DependencyWriter::close() {
27 | 	if (fout.is_open())
28 | 		fout.close();
29 | }
30 | 
31 | void DependencyWriter::writeInstance(DependencyInstance* inst) {
32 | 	for (int i = 1; i < inst->numWord; ++i) {
33 | 		WordInstance& word = inst->word[i];
34 | 		SegInstance& segInst = word.getCurrSeg();
35 | 
36 | 		for (int j = 0; j < segInst.size(); ++j) {
37 | 			fout << i << "/" << j << "\t" << segInst.element[j].form << "\t" << segInst.element[j].form << "\t";
38 | 			string pos = segInst.element[j].candPos[segInst.element[j].currPosCandID];
39 | 			fout << pos << "\t" << pos << "\t_\t";
40 | 			fout << segInst.element[j].dep << "\t" << word.currSegCandID << "\t" << segInst.element[j].currPosCandID << "\t_\n";
41 | 		}
42 | 	}
43 | 	fout << endl;
44 | 	fout.flush();
45 | }
46 | 
47 | } /* namespace segparser */
48 | 


--------------------------------------------------------------------------------
/io/DependencyWriter.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * DependencyWriter.h
 3 |  *
 4 |  *  Created on: Apr 16, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef DEPENDENCYWRITER_H_
 9 | #define DEPENDENCYWRITER_H_
10 | 
11 | #include <fstream>
12 | #include "../DependencyInstance.h"
13 | #include "../Options.h"
14 | 
15 | namespace segparser {
16 | 
/*
 * Writes dependency instances to a text file, one sentence per call to
 * writeInstance().
 *
 * NOTE(review): `string` and `ofstream` are used unqualified although this
 * header has no using-directive of its own -- presumably one of the included
 * project headers provides it inside namespace segparser; confirm.
 */
class DependencyWriter {
public:
	DependencyWriter(Options* options);
	DependencyWriter(Options* options, string file);
	virtual ~DependencyWriter();

	void startWriting(string file);
	void close();
	void writeInstance(DependencyInstance* inst);

private:
	ofstream fout;
	// Raw pointer; ownership is not expressed by the type.
	Options* options;

};
32 | 
33 | } /* namespace segparser */
34 | #endif /* DEPENDENCYWRITER_H_ */
35 | 


--------------------------------------------------------------------------------
/runs/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 | 


--------------------------------------------------------------------------------
/spmrl_code_generator/SpmrlDataGenerator.java:
--------------------------------------------------------------------------------
  1 | import java.io.*;
  2 | import java.util.*;
  3 | 
  4 | public class SpmrlDataGenerator {
  5 | 
  6 | 	public static boolean sameSentence(SpmrlSentence s1, MadaSentence s2) {
  7 | 		String x1 = "";
  8 | 		for (int i = 0; i < s1.words.length; ++i)
  9 | 			x1 += s1.words[i].word + " ";
 10 | 		
 11 | 		String x2 = "";
 12 | 		for (int i = 0; i < s2.words.length; ++i) 
 13 | 			x2 += s2.words[i].word + " ";
 14 | 		
 15 | 		if (!x1.equals(x2)) {
 16 | 			System.out.println(x1);
 17 | 			System.out.println(x2);	
 18 | 		}
 19 | 		return x1.equals(x2);
 20 | 	}
 21 | 	
 22 | 	public static void generateFile(int split, String fileName) throws IOException {
 23 | 		int maxLength = 70;
 24 | 		
 25 | 		SpmrlReader reader = new SpmrlReader();
 26 | 		reader.open("../data/" + fileName + ".Arabic");
 27 | 		if (reader.useGoldSegDict)
 28 | 			reader.buildGoldSegDict();
 29 | 		
 30 | 		MadaReader[] madaReader = new MadaReader[split];
 31 | 		int maxLine = -1;
 32 | 		for (int i = 0; i < split; ++i) {
 33 | 			madaReader[i] = new MadaReader();
 34 | 			madaReader[i].open("../data/" + fileName + i + ".mada");
 35 | 			madaReader[i].maxLine = maxLine;
 36 | 		}
 37 | 		
 38 | 		BufferedWriter bw = new BufferedWriter(new FileWriter("../data/spmrl/spmrl.seg.full." + fileName));
 39 | 		
 40 | 		SpmrlSentence sent = null;
 41 | 		int id = 0;
 42 | 		int cnt = 0;
 43 | 		while ((sent = reader.readNextSentence()) != null) {
 44 | 			MadaSentence madaSent = madaReader[id].readNextSentence();
 45 | 
 46 | 			MadaReader.Assert(sameSentence(sent, madaSent));
 47 | 			
 48 | 			int segNum = 0;
 49 | 			for (int i = 0; i < sent.words.length; ++i)
 50 | 				segNum += sent.words[i].segs.length;
 51 | 			if (segNum <= maxLength) {
 52 | 				reader.outputGoldSentence(sent, bw);
 53 | 				SegStruct[] segStruct =  madaReader[id].generateCandidate(madaSent);
 54 | 				for (int z = 0; z < segStruct.length; ++z) {
 55 | 					if (segStruct[z].word.equals("l<TlAq")) {
 56 | 						System.out.println("aaaa");
 57 | 						for (int y = 0; y < segStruct[z].segData.size(); ++y)
 58 | 							System.out.println(segStruct[z].segData.get(y).morphInfo);
 59 | 						System.out.println();
 60 | 					}
 61 | 				}
 62 | 				reader.evaluateMadaPredict(segStruct, sent);
 63 | 				if (maxLine == -1 || maxLine >= 5) {
 64 | 					reader.addGoldSegDict(segStruct);
 65 | //					for (int z = 0; z < segStruct.length; ++z) {
 66 | //						if (segStruct[z].word.equals("l<TlAq")) {
 67 | //							System.out.println("bbbb");
 68 | //							for (int y = 0; y < segStruct[z].segData.size(); ++y)
 69 | //								System.out.println(segStruct[z].segData.get(y).morphInfo);
 70 | //							System.out.println();
 71 | //						}
 72 | //					}
 73 | 				}
 74 | 				reader.outputCandidate(segStruct, bw);
 75 | 				reader.evaluateOracle(segStruct, sent);
 76 | 			}
 77 | 			
 78 | 			id = (id + 1) % split;
 79 | 			cnt++;
 80 | 		}
 81 | 		
 82 | 		reader.close();
 83 | 		for (int i = 0; i < split; ++i)
 84 | 			madaReader[i].close();
 85 | 		
 86 | 		bw.close();
 87 | 		
 88 | 		System.out.println("sentence: " + cnt);
 89 | 		System.out.println("Total word: " + reader.numWord + " Correct seg: " + reader.corrSegment + " Oracle: " + reader.oracleCorrSegment);
 90 | 		System.out.println("Total segment: " + reader.numSegment + " Correct pos: " + reader.corrPos + " Oracle: " + reader.oracleCorrPos);
 91 | 
 92 | 		
 93 | 		double segPre = reader.corrSeg / reader.predSeg;
 94 | 		double segRec = reader.corrSeg / reader.goldSeg;
 95 | 		System.out.println("Seg pre/rec/f1: " + segPre + " " + segRec + " " + (2 * segPre * segRec) / (segPre + segRec));
 96 | 		double posPre = reader.corrP/ reader.predSeg;
 97 | 		double posRec = reader.corrP / reader.goldSeg;
 98 | 		System.out.println("pos pre/rec/f1: " + posPre + " " + posRec + " " + (2 * posPre * posRec) / (posPre + posRec));
 99 | 	}
100 | 
101 | 	/**
102 | 	 * @param args
103 | 	 */
104 | 	public static void main(String[] args) {
105 | 		try {
106 | 			generateFile(10, "train");
107 | 			generateFile(1, "dev");
108 | 			generateFile(1, "test");
109 | 		} catch (Exception e) {
110 | 			e.printStackTrace();
111 | 		}
112 | 
113 | 	}
114 | 
115 | }
116 | 


--------------------------------------------------------------------------------
/util/Alphabet.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Alphabet.cpp
 3 |  *
 4 |  *  Created on: Mar 28, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #include "Alphabet.h"
 9 | #include "SerializationUtils.h"
10 | 
11 | namespace segparser {
12 | 
13 | Alphabet::Alphabet(int capacity) {
14 | 	map.reserve(capacity);
15 | 	numEntries = 0;
16 | 	growthStopped = false;
17 | }
18 | 
// Default construction delegates to Alphabet(int) with a capacity of 10000.
Alphabet::Alphabet() : Alphabet(10000) {
	//Alphabet(10000);
}
22 | 
// Nothing to release explicitly; the body is empty.
Alphabet::~Alphabet() {
}
25 | 
26 | int Alphabet::lookupIndex (const string& entry, bool addIfNotPresent) {
27 | 	int ret = 0;
28 | 	if (map.find(entry) == map.end()) {
29 | 		if (!growthStopped && addIfNotPresent) {
30 | 			ret = numEntries + 1;		// index start from 1
31 | 			numEntries++;
32 | 			map[entry] = ret;
33 | 		}
34 | 	}
35 | 	else {
36 | 		ret = map[entry];
37 | 	}
38 | 	return ret;
39 | }
40 | 
// Convenience overload: looks `entry` up with addIfNotPresent = true.
int Alphabet::lookupIndex (const string& entry) {
	return lookupIndex (entry, true);
}
44 | 
45 | void Alphabet::toArray(vector<string>& key) {
46 | 	key.reserve(numEntries);
47 | 	for (auto kv : map) {
48 | 		key.push_back(kv.first);
49 | 	}
50 | }
51 | 
// Returns numEntries + 1. Indices handed out by lookupIndex() start at 1 and
// 0 is its "not found" result, so this is one past the largest index.
int Alphabet::size() {
	return numEntries + 1;
}
55 | 
// Freezes the alphabet: after this, lookupIndex() no longer adds entries.
void Alphabet::stopGrowth() {
	growthStopped = true;
}
59 | 
// Serialises the alphabet to `fs`: entry count, the string->index map, then
// the growth flag. readObject() reads the same fields in the same order.
// NOTE(review): CHECK is defined outside this file -- presumably it aborts
// when a write fails; confirm.
void Alphabet::writeObject (FILE* fs) {
	CHECK(WriteInteger(fs, numEntries));
	CHECK(WriteStringIntegerMap(fs, map));
	CHECK(WriteBool(fs, growthStopped));
}
65 | 
// Restores the alphabet from `fs`: entry count, the string->index map, then
// the growth flag, i.e. the order written by writeObject().
// NOTE(review): whether ReadStringIntegerMap clears `map` first is not
// visible here -- confirm before calling this on a non-empty alphabet.
void Alphabet::readObject (FILE* fs) {
	CHECK(ReadInteger(fs, &numEntries));
	CHECK(ReadStringIntegerMap(fs, &map));
	CHECK(ReadBool(fs, &growthStopped));
}
71 | 
72 | } /* namespace segparser */
73 | 


--------------------------------------------------------------------------------
/util/Alphabet.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Alphabet.h
 3 |  *
 4 |  *  Created on: Mar 28, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef ALPHABET_H_
 9 | #define ALPHABET_H_
10 | 
11 | #include <string>
12 | #include <vector>
13 | #include <unordered_map>
14 | 
15 | namespace segparser {
16 | 
17 | using namespace std;
18 | 
/*
 * Maps strings to integer indices, with optional growth on lookup and
 * (de)serialisation to a FILE*.
 *
 * NOTE(review): FILE is used below but this header includes only <string>,
 * <vector> and <unordered_map> -- presumably <cstdio> arrives transitively;
 * confirm.
 */
class Alphabet {
public:
	Alphabet();
	Alphabet(int capacity);
	virtual ~Alphabet();

	int lookupIndex (const string& entry, bool addIfNotPresent);
	int lookupIndex (const string& entry);
	void toArray(vector<string>& key);
	int size();
	void stopGrowth();

	void writeObject (FILE* fs);
	void readObject (FILE* fs);

private:
	// NOTE: this member is named `map`; inside the class it hides std::map,
	// which is otherwise visible through the using-directive above.
	unordered_map<string, int> map;
	int numEntries;
	bool growthStopped;
};
39 | 
40 | } /* namespace segparser */
41 | #endif /* ALPHABET_H_ */
42 | 


--------------------------------------------------------------------------------
/util/Constant.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Constant.cpp
 3 |  *
 4 |  *  Created on: Mar 27, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #include "Constant.h"
 9 | 
10 | namespace segparser {
11 | 
// Definition of the static array declared in Constant.h with
// PossibleLang::Count elements; three names are provided.
// NOTE(review): presumably indexed by PossibleLang::types, i.e.
// Arabic -> "qatar", SPMRL -> "spmrl", Chinese -> "ctb" -- confirm at the
// call sites.
const string PossibleLang::langString[] = {"qatar", "spmrl", "ctb"};
13 | 
14 | } /* namespace segparser */
15 | 


--------------------------------------------------------------------------------
/util/Constant.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Constant.h
 3 |  *
 4 |  *  Created on: Mar 27, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef CONSTANT_H_
 9 | #define CONSTANT_H_
10 | 
11 | #include <string>
12 | 
13 | namespace segparser {
14 | 
15 | class DependencyInstance;
16 | 
17 | using namespace std;
18 | 
// Scoped holder for the decoding-mode enumerators (HillClimb = 0, Exact = 1).
struct DecodingMode {
	enum types {
		HillClimb = 0,
		Exact,
	};
};
25 | 
// Scoped holder for the language enumerators. Count is the last enumerator
// and therefore equals the number of languages listed before it; it also
// sizes langString.
struct PossibleLang {
	enum types {
		Arabic = 0,
		SPMRL,
		Chinese,
		Count,
	};

	// One string per language; defined in Constant.cpp.
	static const string langString[PossibleLang::Count];
};
36 | 
// Scoped holder for the special-POS enumerators, numbered from 0; COUNT is
// the number of enumerators listed before it.
// NOTE(review): the abbreviations are not explained in this header --
// confirm their meaning where they are assigned.
struct SpecialPos {
	enum types {
		C = 0, P, PNX, V, AJ, N, OTHER, COUNT,
	};
};
42 | 
// Scoped holder for a set of fixed ids, numbered from 0 (UNSEEN) to 6 (RRB).
// NOTE(review): presumably reserved POS/lexical ids shared with the feature
// code -- confirm where they are used.
struct ConstPosLex {
	enum types {
		UNSEEN = 0,
		START,
		MID,
		END,
		QUOTE,
		LRB,
		RRB,
	};
};
54 | 
// Scoped holder for two fixed label ids: UNSEEN = 0 and NOTYPE = 1.
// NOTE(review): NOTYPE presumably corresponds to the "<no-type>" label used
// by the reader -- confirm.
struct ConstLab {
	enum types {
		UNSEEN = 0,
		NOTYPE,
	};
};
61 | 
// Project-wide numeric limits. Their consumers are not visible in this
// header.
// NOTE(review): by their names these cap binning, child counts, span length,
// length difference and feature counts -- confirm against the feature code
// before changing any value.
#define BINNED_BUCKET 8
#define MAX_CHILD_NUM 5
#define MAX_SPAN_LENGTH 5
#define MAX_LEN_DIFF 4
#define MAX_FEATURE_NUM 7
67 | 
68 | } /* namespace segparser */
69 | #endif /* CONSTANT_H_ */
70 | 


--------------------------------------------------------------------------------
/util/FeatureAlphabet.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * FeatureAlphabet.cpp
  3 |  *
  4 |  *  Created on: Mar 28, 2014
  5 |  *      Author: yuanz
  6 |  */
  7 | 
  8 | #include "FeatureAlphabet.h"
  9 | #include "SerializationUtils.h"
 10 | #include "../FeatureEncoder.h"
 11 | 
 12 | namespace segparser {
 13 | 
 14 | FeatureAlphabet::FeatureAlphabet (int capacity) {
 15 | 	arcMap.reserve(capacity);
 16 | 	secondOrderMap.reserve(capacity);
 17 | 	thirdOrderMap.reserve(capacity);
 18 | 	highOrderMap.reserve(capacity);
 19 | 	numEntries = 0;
 20 | 	growthStopped = false;
 21 | 
 22 | 	table[TemplateType::TArc] = &arcMap;
 23 | 	table[TemplateType::TSecondOrder] = &secondOrderMap;
 24 | 	table[TemplateType::TThirdOrder] = &thirdOrderMap;
 25 | 	table[TemplateType::THighOrder] = &highOrderMap;
 26 | }
 27 | 
// Default construction delegates to FeatureAlphabet(int) with a capacity of
// 10000 per map.
FeatureAlphabet::FeatureAlphabet() : FeatureAlphabet(10000) {
}
 30 | 
// Nothing to release explicitly; `table` only points at the map members.
FeatureAlphabet::~FeatureAlphabet() {
}
 33 | 
 34 | unordered_map<uint64_t, int>* FeatureAlphabet::getMap(int type) {
 35 | 
 36 | 	unordered_map<uint64_t, int>* intmap = NULL;
 37 | 	if (type < TemplateType::COUNT)
 38 | 		intmap = table[type];
 39 | 	else
 40 | 		ThrowException("undefined template type");
 41 | 	return intmap;
 42 | }
 43 | 
 44 | int FeatureAlphabet::lookupIndex(const int type, const uint64_t entry, bool addIfNotPresent) {
 45 | 	unordered_map<uint64_t, int>* intmap = getMap(type);
 46 | 
 47 | 	int ret = 0;
 48 | 	if (intmap->find(entry) == intmap->end()) {
 49 | 		if (!growthStopped && addIfNotPresent) {
 50 | 			ret = numEntries + 1;
 51 | 			numEntries++;
 52 | 			(*intmap)[entry] = ret;
 53 | 		}
 54 | 	}
 55 | 	else {
 56 | 		ret = (*intmap)[entry];
 57 | 	}
 58 | 	return ret;
 59 | }
 60 | 
 61 | int FeatureAlphabet::lookupIndex(unordered_map<uint64_t, int>& intmap, const uint64_t entry, bool addIfNotPresent) {
 62 | 	int ret = 0;
 63 | 	if (intmap.find(entry) == intmap.end()) {
 64 | 		if (!growthStopped && addIfNotPresent) {
 65 | 			ret = numEntries + 1;
 66 | 			numEntries++;
 67 | 			intmap[entry] = ret;
 68 | 		}
 69 | 	}
 70 | 	else {
 71 | 		ret = intmap[entry];
 72 | 	}
 73 | 	return ret;
 74 | }
 75 | 
// Convenience overload: looks `entry` up in the map of template `type` with
// addIfNotPresent = true.
int FeatureAlphabet::lookupIndex(const int type, const uint64_t entry) {
	return lookupIndex (type, entry, true);
}
 79 | 
// Returns numEntries + 1. Indices handed out by lookupIndex() start at 1 and
// 0 is its "not found" result, so this is one past the largest index.
int FeatureAlphabet::size() {
	return numEntries + 1;
}
 83 | 
// Freezes the alphabet: after this, lookupIndex() no longer adds entries.
void FeatureAlphabet::stopGrowth() {
	growthStopped = true;
}
 87 | 
// Serialises the alphabet to `fs`: entry count, the four maps (arc, second
// order, third order, high order), then the growth flag. readObject() reads
// the same fields in the same order.
// NOTE(review): CHECK is defined outside this file -- presumably it aborts
// when a write fails; confirm.
void FeatureAlphabet::writeObject (FILE* fs) {
	CHECK(WriteInteger(fs, numEntries));
	CHECK(WriteUINT64IntegerMap(fs, arcMap));
	CHECK(WriteUINT64IntegerMap(fs, secondOrderMap));
	CHECK(WriteUINT64IntegerMap(fs, thirdOrderMap));
	CHECK(WriteUINT64IntegerMap(fs, highOrderMap));
	CHECK(WriteBool(fs, growthStopped));
}
 96 | 
 97 | void FeatureAlphabet::readObject (FILE* fs) {
 98 | 	CHECK(ReadInteger(fs, &numEntries));
 99 | 	CHECK(ReadUINT64IntegerMap(fs, &arcMap));
100 | 	CHECK(ReadUINT64IntegerMap(fs, &secondOrderMap));
101 | 	CHECK(ReadUINT64IntegerMap(fs, &thirdOrderMap));
102 | 	CHECK(ReadUINT64IntegerMap(fs, &highOrderMap));
103 | 	CHECK(ReadBool(fs, &growthStopped));
104 | }
105 | 
106 | } /* namespace segparser */
107 | 


--------------------------------------------------------------------------------
/util/FeatureAlphabet.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * FeatureAlphabet.h
 3 |  *
 4 |  *  Created on: Mar 28, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef FEATUREALPHABET_H_
 9 | #define FEATUREALPHABET_H_
10 | 
11 | #include <unordered_map>
12 | #include <string>
13 | #include "../FeatureEncoder.h"
14 | 
15 | namespace segparser {
16 | 
17 | using namespace std;
18 | 
// Maps 64-bit feature codes to integer indices, with one map per template
// type. Indices are assigned by lookupIndex starting at 1; 0 is what
// lookupIndex returns for an entry that is absent and not added.
class FeatureAlphabet {
public:
	FeatureAlphabet(int capacity);
	// Delegates to FeatureAlphabet(10000).
	FeatureAlphabet();
	virtual ~FeatureAlphabet();

	// NOTE(review): no definition of the two string overloads is visible in
	// the part of FeatureAlphabet.cpp reviewed here -- confirm they exist.
	int lookupIndex (const string& entry, bool addIfNotPresent);
	int lookupIndex (const string& entry);
	// Returns table[type]; reports "undefined template type" through
	// ThrowException when type is not below TemplateType::COUNT.
	unordered_map<uint64_t, int>* getMap(int type);
	// Returns the index of `entry` in the map for `type`; assigns a new index
	// when absent, if addIfNotPresent is set and growth is not stopped,
	// otherwise returns 0.
	int lookupIndex(const int type, const uint64_t entry, bool addIfNotPresent);
	// Same as above, but operating directly on the supplied map.
	int lookupIndex(unordered_map<uint64_t, int>& intmap, const uint64_t entry, bool addIfNotPresent);
	// Same as the three-argument form with addIfNotPresent = true.
	int lookupIndex(const int type, const uint64_t entry);
	// Returns numEntries + 1.
	int size();
	// After this call lookupIndex never assigns new indices.
	void stopGrowth();
	// Binary (de)serialization: numEntries, the four maps below in declaration
	// order, then growthStopped.
	void writeObject (FILE* fs);
	void readObject (FILE* fs);

	unordered_map<uint64_t, int> arcMap;
	unordered_map<uint64_t, int> secondOrderMap;
	unordered_map<uint64_t, int> thirdOrderMap;
	unordered_map<uint64_t, int> highOrderMap;

private:
	// Running count of indices assigned so far (shared by all maps).
	int numEntries;
	// Set by stopGrowth().
	bool growthStopped;

	// Per-template-type map pointers used by getMap().
	// NOTE(review): presumably these point at the four maps above (set up in
	// the capacity constructor, not visible here); if so, a compiler-generated
	// copy would alias the source object's maps -- confirm.
	unordered_map<uint64_t, int>* table[TemplateType::COUNT];
};
47 | 
48 | } /* namespace segparser */
49 | #endif /* FEATUREALPHABET_H_ */
50 | 


--------------------------------------------------------------------------------
/util/FeatureVector.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * FeatureVector.cpp
  3 |  *
  4 |  *  Created on: Mar 28, 2014
  5 |  *      Author: yuanz
  6 |  */
  7 | 
  8 | #include "FeatureVector.h"
  9 | #include <assert.h>
 10 | #include <iostream>
 11 | 
 12 | namespace segparser {
 13 | 
// Scratch buffer shared by every FeatureVector: dotProduct(FeatureVector*)
// scatters values into it and resets the slots it touched to 0.0 before
// returning. Sized by initVec().
vector<double> FeatureVector::dpVec;
// Size most recently passed to initVec().
unsigned int FeatureVector::rows;
 16 | 
 17 | FeatureVector::FeatureVector() {
 18 | }
 19 | 
 20 | FeatureVector::~FeatureVector() {
 21 | }
 22 | 
 23 | void FeatureVector::clear() {
 24 | 	binaryIndex.clear();
 25 | 	negBinaryIndex.clear();
 26 | 	normalIndex.clear();
 27 | 	normalValue.clear();
 28 | }
 29 | 
 30 | void FeatureVector::add(int index, double value) {
 31 | 	normalIndex.push_back(index);
 32 | 	normalValue.push_back(value);
 33 | }
 34 | 
 35 | void FeatureVector::addBinary(int index) {
 36 | 	binaryIndex.push_back(index);
 37 | }
 38 | 
 39 | void FeatureVector::addNegBinary(int index) {
 40 | 	negBinaryIndex.push_back(index);
 41 | }
 42 | 
 43 | void FeatureVector::concat(FeatureVector* fv) {
 44 | 	for(unsigned int i = 0; i < fv->binaryIndex.size(); ++i) {
 45 | 		binaryIndex.push_back(fv->binaryIndex[i]);
 46 | 	}
 47 | 	for(unsigned int i = 0; i < fv->negBinaryIndex.size(); ++i) {
 48 | 		negBinaryIndex.push_back(fv->negBinaryIndex[i]);
 49 | 	}
 50 | 	for(unsigned int i = 0; i < fv->normalIndex.size(); ++i) {
 51 | 		normalIndex.push_back(fv->normalIndex[i]);
 52 | 		normalValue.push_back(fv->normalValue[i]);
 53 | 	}
 54 | }
 55 | 
 56 | void FeatureVector::concatNeg(FeatureVector* fv) {
 57 | 	for(unsigned int i = 0; i < fv->binaryIndex.size(); ++i) {
 58 | 		negBinaryIndex.push_back(fv->binaryIndex[i]);
 59 | 	}
 60 | 	for(unsigned int i = 0; i < fv->negBinaryIndex.size(); ++i) {
 61 | 		binaryIndex.push_back(fv->negBinaryIndex[i]);
 62 | 	}
 63 | 	for(unsigned int i = 0; i < fv->normalIndex.size(); ++i) {
 64 | 		normalIndex.push_back(fv->normalIndex[i]);
 65 | 		normalValue.push_back(-fv->normalValue[i]);
 66 | 	}
 67 | }
 68 | 
 69 | double FeatureVector::dotProduct(FeatureVector* fv) {
 70 | 	double result = 0.0;
 71 | 
 72 | 	double b = 2.0;
 73 | 
 74 | 	for(unsigned int i = 0; i < binaryIndex.size(); ++i) {
 75 | 		dpVec[binaryIndex[i]] += 1.0;
 76 | 	}
 77 | 	for(unsigned int i = 0; i < negBinaryIndex.size(); ++i) {
 78 | 		dpVec[negBinaryIndex[i]] -= 1.0;
 79 | 	}
 80 | 	for(unsigned int i = 0; i < normalIndex.size(); ++i) {
 81 | 		double val = min(b, max(-b, normalValue[i]));
 82 | 		dpVec[normalIndex[i]] += val;
 83 | 	}
 84 | 
 85 | 	for(unsigned int i = 0; i < fv->binaryIndex.size(); ++i) {
 86 | 		result += dpVec[fv->binaryIndex[i]];
 87 | 	}
 88 | 	for(unsigned int i = 0; i < fv->negBinaryIndex.size(); ++i) {
 89 | 		result -= dpVec[fv->negBinaryIndex[i]];
 90 | 	}
 91 | 	for(unsigned int i = 0; i < fv->normalIndex.size(); ++i) {
 92 | 		double val = min(b, max(-b, fv->normalValue[i]));
 93 | 		result += dpVec[fv->normalIndex[i]] * val;
 94 | 	}
 95 | 
 96 | 	for(unsigned int i = 0; i < binaryIndex.size(); ++i) {
 97 | 		dpVec[binaryIndex[i]] = 0.0;
 98 | 	}
 99 | 	for(unsigned int i = 0; i < negBinaryIndex.size(); ++i) {
100 | 		dpVec[negBinaryIndex[i]] = 0.0;
101 | 	}
102 | 	for(unsigned int i = 0; i < normalIndex.size(); ++i) {
103 | 		dpVec[normalIndex[i]] = 0.0;
104 | 	}
105 | 
106 | 	return result;
107 | }
108 | 
109 | double FeatureVector::dotProduct(vector<double>& param) {
110 | 	// get score
111 | 	double score = 0.0;
112 | 	for(unsigned int i = 0; i < binaryIndex.size(); ++i) {
113 | 		score += param[binaryIndex[i]];
114 | 	}
115 | 	for(unsigned int i = 0; i < negBinaryIndex.size(); ++i) {
116 | 		score -= param[negBinaryIndex[i]];
117 | 	}
118 | 	for(unsigned int i = 0; i < normalIndex.size(); ++i) {
119 | 		score += param[normalIndex[i]] * normalValue[i];
120 | 	}
121 | 	return score;
122 | }
123 | 
124 | void FeatureVector::initVec(unsigned int _rows) {
125 | 	rows = _rows;
126 | 	dpVec.resize(rows);
127 | }
128 | 
129 | void FeatureVector::output() {
130 | 	cout << "bi: ";
131 | 	for(unsigned int i = 0; i < binaryIndex.size(); ++i) {
132 | 		cout << binaryIndex[i] << " ";
133 | 	}
134 | 	cout << endl;
135 | 	cout << "nbi: ";
136 | 	for(unsigned int i = 0; i < negBinaryIndex.size(); ++i) {
137 | 		cout << negBinaryIndex[i] << " ";
138 | 	}
139 | 	cout << endl;
140 | 	cout << "ni: ";
141 | 	for(unsigned int i = 0; i < normalIndex.size(); ++i) {
142 | 		cout << normalIndex[i] << " ";
143 | 	}
144 | 	cout << endl;
145 | 
146 | 	int x;
147 | 	cin >> x;
148 | }
149 | 
150 | } /* namespace segparser */
151 | 


--------------------------------------------------------------------------------
/util/FeatureVector.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * FeatureVector.h
 3 |  *
 4 |  *  Created on: Mar 28, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef FEATUREVECTOR_H_
 9 | #define FEATUREVECTOR_H_
10 | 
11 | #include <vector>
12 | 
13 | namespace segparser {
14 | 
15 | using namespace std;
16 | 
// Sparse feature vector stored as three parts: binary features (implicit
// value +1), negated binary features (implicit value -1) and real-valued
// features (index/value pairs held in two parallel lists).
class FeatureVector {
public:
	// Indices of features with implicit value +1.
	vector<int> binaryIndex;
	// Indices of features with implicit value -1.
	vector<int> negBinaryIndex;
	// normalIndex[k] carries the value normalValue[k].
	vector<int> normalIndex;
	vector<double> normalValue;

	FeatureVector();
	virtual ~FeatureVector();

	// Empties all four lists.
	void clear();
	// Appends a real-valued feature.
	void add(int index, double value);
	// Appends a +1 feature.
	void addBinary(int index);
	// Appends a -1 feature.
	void addNegBinary(int index);
	// Appends every feature of fv.
	void concat(FeatureVector* fv);
	// Appends every feature of fv with its sign flipped.
	void concatNeg(FeatureVector* fv);
	// Dot product with another sparse vector; real values of both operands are
	// clipped to [-2, 2]. Uses the shared scratch buffer, so initVec() must
	// have sized it to cover every index.
	double dotProduct(FeatureVector* fv);
	// Dot product with a dense weight vector (no clipping).
	double dotProduct(vector<double>& param);

	// Resizes the shared scratch buffer to _rows slots.
	static void initVec(unsigned int _rows);
	// Debug dump of the index lists to stdout; then waits for an integer on stdin.
	void output();
private:
	// Scratch buffer shared by all instances (see dotProduct(FeatureVector*)).
	static vector<double> dpVec;
	// Size last passed to initVec().
	static unsigned int rows;
};
42 | 
43 | } /* namespace segparser */
44 | #endif /* FEATUREVECTOR_H_ */
45 | 


--------------------------------------------------------------------------------
/util/Logarithm.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Logarithm.cpp
 3 |  *
 4 |  *  Created on: Apr 15, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | 
 9 | #include "Logarithm.h"
10 | #include <math.h>
11 | 
// Returns log(exp(num1) + exp(num2)), evaluated relative to the larger
// argument so the exponential cannot overflow.
//
// Equal arguments are handled up front: the answer is num1 + log(2), and
// this also covers num1 == num2 == +/-infinity, where the general formula
// would compute exp(inf - inf) = exp(NaN) and return NaN instead of +/-inf.
// For finite inputs the result is identical to the general formula.
double logsumexp(double num1, double num2) {
	if (num1 == num2) {
		return num1 + log(2.0);
	}

	const bool secondIsLarger = num2 > num1;
	const double larger = secondIsLarger ? num2 : num1;
	const double smaller = secondIsLarger ? num1 : num2;
	return log(1.0 + exp(smaller - larger)) + larger;
}
26 | 
27 | 
28 | 
29 | 


--------------------------------------------------------------------------------
/util/Logarithm.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Logarithm.h
 3 |  *
 4 |  *  Created on: Apr 15, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef LOGARITHM_H_
 9 | #define LOGARITHM_H_
10 | 
// Returns log(exp(num1) + exp(num2)), computed relative to the larger of the
// two arguments.
extern double logsumexp(double num1, double num2);
12 | 
13 | #endif /* LOGARITHM_H_ */
14 | 


--------------------------------------------------------------------------------
/util/Random.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Random.h
 3 |  *
 4 |  *  Created on: Jan 21, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef RANDOM_H_
 9 | #define RANDOM_H_
10 | 
11 | #include <time.h>
12 | #include <random>
13 | 
// Small pseudo-random number helper around std::default_random_engine.
class Random {
public:
	// Seeds the engine from the current wall-clock time.
	Random() {
		eng.seed(time(NULL));
	}

	// Seeds the engine with `seed`, giving a reproducible sequence.
	Random(int seed) {
		eng.seed(seed);
	}

	// Re-seeds the engine; the sequence restarts as if freshly constructed
	// with `seed`.
	void setSeed(int seed) {
		eng.seed(seed);
	}

	// Returns a uniformly distributed integer in [0, n - 1].
	// For n <= 0 the range is empty; constructing the distribution with
	// a > b is undefined behaviour, so 0 is returned instead and the engine
	// is left untouched.
	int nextInt(int n) {
		if (n <= 0) {
			return 0;
		}
		std::uniform_int_distribution<int> dist(0, n - 1);
		return dist(eng);
	}

	// Returns a uniformly distributed double in [0, 1).
	double nextDouble() {
		std::uniform_real_distribution<double> dist(0, 1);
		return dist(eng);
	}

private:
	std::default_random_engine eng;
};
41 | 
42 | #endif /* RANDOM_H_ */
43 | 


--------------------------------------------------------------------------------
/util/SerializationUtils.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SerializationUtils.cpp
  3 |  *
  4 |  *  Created on: Jan 21, 2014
  5 |  *      Author: yuanz
  6 |  */
  7 | 
  8 | #include "SerializationUtils.h"
  9 | #include <cstring>
 10 | 
 11 | bool WriteString(FILE *fs, const std::string& data) {
 12 | 	const char *buffer = data.c_str();
 13 | 	unsigned int length = strlen(buffer);
 14 | 	if (1 != fwrite(&length, sizeof(int), 1, fs)) return false;
 15 | 	if (length != fwrite(buffer, sizeof(char), length, fs)) return false;
 16 | 	return true;
 17 | }
 18 | 
 19 | bool WriteBool(FILE *fs, bool value) {
 20 | 	if (1 != fwrite(&value, sizeof(bool), 1, fs)) return false;
 21 | 	return true;
 22 | }
 23 | 
 24 | bool WriteInteger(FILE *fs, int value) {
 25 | 	if (1 != fwrite(&value, sizeof(int), 1, fs)) return false;
 26 | 	return true;
 27 | }
 28 | 
 29 | bool WriteUINT64(FILE *fs, uint64_t value) {
 30 | 	if (1 != fwrite(&value, sizeof(uint64_t), 1, fs)) return false;
 31 | 	return true;
 32 | }
 33 | 
 34 | bool WriteDouble(FILE *fs, double value) {
 35 | 	if (1 != fwrite(&value, sizeof(double), 1, fs)) return false;
 36 | 	return true;
 37 | }
 38 | 
 39 | bool WriteStringIntegerMap(FILE *fs, const std::unordered_map<std::string, int>& map) {
 40 | 	if (1 != WriteInteger(fs, map.size()))
 41 | 		return false;
 42 | 	for (auto kv : map) {
 43 | 		if (1 != WriteString(fs, kv.first))
 44 | 			return false;
 45 | 		if (1 != WriteInteger(fs, kv.second))
 46 | 			return false;
 47 | 	}
 48 | 	return true;
 49 | }
 50 | 
 51 | bool WriteUINT64IntegerMap(FILE *fs, const std::unordered_map<uint64_t, int>& map) {
 52 | 	if (1 != WriteInteger(fs, map.size()))
 53 | 		return false;
 54 | 	for (auto kv : map) {
 55 | 		if (1 != WriteUINT64(fs, kv.first))
 56 | 			return false;
 57 | 		if (1 != WriteInteger(fs, kv.second))
 58 | 			return false;
 59 | 	}
 60 | 	return true;
 61 | }
 62 | 
 63 | bool WriteDoubleArray(FILE *fs, const std::vector<double>& arr) {
 64 | 	if (1 != WriteInteger(fs, arr.size()))
 65 | 		return false;
 66 | 	for (unsigned int i = 0; i < arr.size(); ++i) {
 67 | 		if (1 != WriteDouble(fs, arr[i]))
 68 | 			return false;
 69 | 	}
 70 | 	return true;
 71 | }
 72 | 
 73 | bool ReadString(FILE *fs, std::string *data) {
 74 | 	unsigned int length;
 75 | 	if (1 != fread(&length, sizeof(int), 1, fs)) return false;
 76 | 	char *buffer = new char[length + 1];
 77 | 	if (length != fread(buffer, sizeof(char), length, fs)) return false;
 78 | 	buffer[length] = '\0';
 79 | 	*data = buffer;
 80 | 	delete[] buffer;
 81 | 	return true;
 82 | }
 83 | 
 84 | bool ReadBool(FILE *fs, bool *value) {
 85 | 	if (1 != fread(value, sizeof(bool), 1, fs)) return false;
 86 | 	return true;
 87 | }
 88 | 
 89 | bool ReadInteger(FILE *fs, int *value) {
 90 | 	if (1 != fread(value, sizeof(int), 1, fs)) return false;
 91 | 	return true;
 92 | }
 93 | 
 94 | bool ReadUINT64(FILE *fs, uint64_t *value) {
 95 | 	if (1 != fread(value, sizeof(uint64_t), 1, fs)) return false;
 96 | 	return true;
 97 | }
 98 | 
 99 | bool ReadDouble(FILE *fs, double *value) {
100 | 	if (1 != fread(value, sizeof(double), 1, fs)) return false;
101 | 	return true;
102 | }
103 | 
104 | bool ReadStringIntegerMap(FILE *fs, std::unordered_map<std::string, int>* map) {
105 | 	int size = 0;
106 | 	if (1 != ReadInteger(fs, &size))
107 | 		return false;
108 | 	for (int i = 0; i < size; ++i) {
109 | 		std::string key;
110 | 		int value;
111 | 		if (1 != ReadString(fs, &key))
112 | 			return false;
113 | 		if (1 != ReadInteger(fs, &value))
114 | 			return false;
115 | 		(*map)[key] = value;
116 | 	}
117 | 	return true;
118 | }
119 | 
120 | bool ReadUINT64IntegerMap(FILE *fs, std::unordered_map<uint64_t, int>* map) {
121 | 	int size = 0;
122 | 	if (1 != ReadInteger(fs, &size))
123 | 		return false;
124 | 	for (int i = 0; i < size; ++i) {
125 | 		uint64_t key;
126 | 		int value;
127 | 		if (1 != ReadUINT64(fs, &key))
128 | 			return false;
129 | 		if (1 != ReadInteger(fs, &value))
130 | 			return false;
131 | 		(*map)[key] = value;
132 | 	}
133 | 	return true;
134 | }
135 | 
136 | bool ReadDoubleArray(FILE* fs, std::vector<double>* arr) {
137 | 	int size = 0;
138 | 	if (1 != ReadInteger(fs, &size))
139 | 		return false;
140 | 	arr->clear();
141 | 	arr->reserve(size);
142 | 	for (int i = 0; i < size; ++i) {
143 | 		double value;
144 | 		if (1 != ReadDouble(fs, &value))
145 | 			return false;
146 | 		arr->push_back(value);
147 | 	}
148 | 	return true;
149 | }
150 | 
151 | 
152 | 


--------------------------------------------------------------------------------
/util/SerializationUtils.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SerializationUtils.h
 3 |  *
 4 |  *  Created on: Jan 21, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef SERIALIZATIONUTILS_H_
 9 | #define SERIALIZATIONUTILS_H_
10 | 
11 | #include <stdio.h>
12 | #include <string>
13 | #include <stdint.h>
14 | #include <unordered_map>
15 | #include <vector>
16 | #include "StringUtils.h"
17 | 
// Binary writers. Each returns true on success and false when a write comes
// up short. Scalars are written as their raw in-memory bytes.

// Writes an unsigned int length followed by the characters (no terminator).
extern bool WriteString(FILE *fs, const std::string& data);
extern bool WriteBool(FILE *fs, bool value);
extern bool WriteInteger(FILE *fs, int value);
extern bool WriteUINT64(FILE *fs, uint64_t value);
extern bool WriteDouble(FILE *fs, double value);
// Containers are written as an int element count followed by the elements
// (key then value for maps).
extern bool WriteStringIntegerMap(FILE *fs, const std::unordered_map<std::string, int>& map);
extern bool WriteUINT64IntegerMap(FILE *fs, const std::unordered_map<uint64_t, int>& map);
extern bool WriteDoubleArray(FILE *fs, const std::vector<double>& arr);

// Binary readers mirroring the writers above. Each returns true on success
// and false when a read comes up short.
extern bool ReadString(FILE *fs, std::string *data);
extern bool ReadBool(FILE *fs, bool *value);
extern bool ReadInteger(FILE *fs, int *value);
extern bool ReadUINT64(FILE *fs, uint64_t *value);
extern bool ReadDouble(FILE *fs, double *value);
// The map readers store entries into the existing map without clearing it.
extern bool ReadStringIntegerMap(FILE *fs, std::unordered_map<std::string, int>* map);
extern bool ReadUINT64IntegerMap(FILE *fs, std::unordered_map<uint64_t, int>* map);
// Clears *arr before reading into it.
extern bool ReadDoubleArray(FILE *fs, std::vector<double>* arr);
35 | 
// Evaluates `x` once and calls ThrowException("check bug") when it is false.
// The argument is parenthesised so that compound expressions are negated as
// a whole: without the parentheses CHECK(a && b) expanded to `!a && b`.
#define CHECK(x) { if (!(x)) ThrowException("check bug"); }
37 | 
38 | #endif /* SERIALIZATIONUTILS_H_ */
39 | 


--------------------------------------------------------------------------------
/util/StringUtils.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * StringUtils.cpp
 3 |  *
 4 |  *  Created on: Jan 21, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #include <iostream>
 9 | #include "StringUtils.h"
10 | #include <assert.h>
11 | 
// Splits `str` at every occurrence of the complete delimiter string `delim`
// (not at each of its characters) and appends the non-empty pieces to
// `*results`, in order. Existing contents of `*results` are kept.
//
// An empty `delim` cannot split anything: `str` is appended as a single
// piece (if non-empty). Searching for an empty delimiter always matches at
// offset 0, which previously made the loop spin forever.
void StringSplit(const std::string &str,
		const std::string &delim,
		std::vector<std::string> *results) {
	if (delim.empty()) {
		if (!str.empty()) results->push_back(str);
		return;
	}

	// Walk the input with an offset instead of re-copying the tail after
	// every match.
	const size_t len = delim.size();
	size_t start = 0;
	size_t cutAt;
	while ((cutAt = str.find(delim, start)) != std::string::npos) {
		if (cutAt > start) {
			results->push_back(str.substr(start, cutAt - start));
		}
		start = cutAt + len;
	}
	if (start < str.length()) results->push_back(str.substr(start));
}
29 | 
// Removes from the front of `*line` every leading character that appears in
// `delim` (e.g. whitespace). A line made up only of such characters becomes
// empty.
void TrimLeft(const std::string &delim, std::string *line) {
	const size_t firstKept = line->find_first_not_of(delim);
	// When nothing is kept firstKept is npos, and erase(0, npos) removes the
	// whole string.
	line->erase(0, firstKept);
}
40 | 
// Removes from the end of `*line` every trailing character that appears in
// `delim` (e.g. whitespace). A line made up only of such characters becomes
// empty.
void TrimRight(const std::string &delim, std::string *line) {
	const size_t lastKept = line->find_last_not_of(delim);
	if (lastKept == std::string::npos) {
		line->clear();
		return;
	}
	line->erase(lastKept + 1);
}
51 | 
// Strips characters found in `delim` from both ends of `*line`; the result
// is the same as trimming the left side and then the right side.
void Trim(const std::string &delim, std::string *line) {
	const size_t first = line->find_first_not_of(delim);
	if (first == std::string::npos) {
		// Nothing but delimiter characters (or already empty).
		line->clear();
		return;
	}
	const size_t last = line->find_last_not_of(delim);
	*line = line->substr(first, last - first + 1);
}
57 | 
// Reports a fatal error: prints `msg` on stderr and terminates the process
// with exit status -1. Despite the name, no C++ exception is thrown and the
// function does not return.
void ThrowException(const std::string& msg) {
	std::cerr << msg << std::endl;
	exit(-1);
}
62 | 
// Counts the non-overlapping occurrences of the marker "ASC/" in `str`,
// scanning left to right.
int ChineseStringLength(const std::string& str) {
	const char* const marker = "ASC/";
	int count = 0;
	size_t at = str.find(marker);
	while (at != std::string::npos) {
		++count;
		// Resume right after the 4-character marker.
		at = str.find(marker, at + 4);
	}
	return count;
}
72 | 
// Returns the k-th (0-based) "ASC/"-delimited unit of `str`: the substring
// that starts at the k-th "ASC/" marker and runs up to, but not including,
// the next marker (or to the end of the string for the last unit).
//
// NOTE(review): k is expected to be smaller than ChineseStringLength(str).
// Skipping past a missing marker trips the assert in debug builds, and a
// missing k-th marker makes substr() throw std::out_of_range.
std::string GetChineseChar(const std::string& str, int k) {
	const char* const marker = "ASC/";

	// Step over the first k units.
	size_t cursor = 0;
	for (int skipped = 0; skipped < k; ++skipped) {
		const size_t hit = str.find(marker, cursor);
		assert(hit != std::string::npos);
		cursor = hit + 4;
	}

	const size_t st = str.find(marker, cursor);
	const size_t en = str.find(marker, st + 4);
	const size_t count = (en == std::string::npos) ? std::string::npos : en - st;
	return str.substr(st, count);
}
85 | 
86 | 
87 | 


--------------------------------------------------------------------------------
/util/StringUtils.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * StringUtils.h
 3 |  *
 4 |  *  Created on: Jan 21, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef STRINGUTILS_H_
 9 | #define STRINGUTILS_H_
10 | 
11 | #include <string>
12 | #include <vector>
13 | 
14 | using namespace std;
15 | 
// Splits str at each occurrence of the whole string delim and appends the
// non-empty pieces to *results.
extern void StringSplit(const string &str,
                        const string &delim,
                        vector<string> *results);

// Removes leading characters of *line that appear in delim.
extern void TrimLeft(const string &delim, string *line);

// Removes trailing characters of *line that appear in delim.
extern void TrimRight(const string &delim, string *line);

// TrimLeft followed by TrimRight.
extern void Trim(const string &delim, string *line);

// Prints msg to stderr and terminates the process with exit(-1); it does not
// throw a C++ exception.
extern void ThrowException(const string& msg);

// Number of non-overlapping "ASC/" markers in str.
extern int ChineseStringLength(const string& str);

// Substring from the k-th (0-based) "ASC/" marker up to the next marker or
// the end of str.
extern string GetChineseChar(const string& str, int k);
31 | 
32 | #endif /* STRINGUTILS_H_ */
33 | 


--------------------------------------------------------------------------------
/util/Timer.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Timer.h
 3 |  *
 4 |  *  Created on: Jan 21, 2014
 5 |  *      Author: yuanz
 6 |  */
 7 | 
 8 | #ifndef TIMER_H_
 9 | #define TIMER_H_
10 | 
11 | #include <sys/time.h>
12 | #include <iostream>
13 | 
14 | //#define CLOCKS_PER_SEC  1000000l
15 | 
// Wall-clock stopwatch based on gettimeofday(). Timing starts when the
// object is constructed.
class Timer {
public:

	// Records the start time.
	Timer() {
		gettimeofday(&begin, NULL);
	}

	// Returns the time elapsed since construction, in milliseconds. The timer
	// keeps running, so stop() may be called repeatedly.
	//
	// The difference is computed in floating point: the earlier integer
	// expression dropped the sub-millisecond part and could overflow
	// tv_sec * 1000000 where the field is 32 bits wide.
	double stop() {
		timeval end;
		gettimeofday(&end, NULL);
		const double seconds = static_cast<double>(end.tv_sec - begin.tv_sec);
		const double micros = static_cast<double>(end.tv_usec - begin.tv_usec);
		return seconds * 1000.0 + micros / 1000.0;
	}
private:
	timeval begin;
};
32 | 
33 | 
34 | #endif /* TIMER_H_ */
35 | 


--------------------------------------------------------------------------------