├── .gitignore ├── DependencyInstance.cpp ├── DependencyInstance.h ├── DependencyPipe.cpp ├── DependencyPipe.h ├── FeatureEncoder.cpp ├── FeatureEncoder.h ├── FeatureExtractor.cpp ├── FeatureExtractor.h ├── LICENSE ├── Options.cpp ├── Options.h ├── Parameters.cpp ├── Parameters.h ├── README.md ├── Release ├── SharedTaskCommon.py ├── SharedTaskCommon.pyc ├── decoder │ └── subdir.mk ├── io │ └── subdir.mk ├── lattice_to_segmentation.py ├── makefile ├── objects.mk ├── run.sh ├── run_chinese.sh ├── run_chinese_test.sh ├── run_classical.sh ├── run_classical_test.sh ├── run_spmrl.sh ├── run_spmrl_test.sh ├── run_test.sh ├── sources.mk ├── subdir.mk ├── test.txt ├── util │ └── subdir.mk └── validateFormat.py ├── SegParser.cpp ├── SegParser.h ├── TedWrappers_20131015 ├── SharedTaskCommon.py ├── SharedTaskCommon.pyc ├── TedEvalApps.jar ├── TedPart.jar ├── cleanconll.pl ├── cleanptb.pl ├── debug │ ├── check_sourceid.pl │ └── do_check.sh ├── genere_tfm_tedeval.pl ├── get_cutoffed_sent.pl ├── get_ted_res.pl ├── lattice_to_segmentation.py ├── lines ├── pproj_24934 │ ├── conllx.xml │ └── pproj_24934_pseudo.info ├── reprojectivize.sh ├── skip_lines.pl ├── tedeval-2.2.jar ├── tedeval.jar ├── tedeval.sh ├── tedeval_cross.sh ├── tedeval_cross2.sh ├── tedeval_debug.jar ├── tedeval_seg.sh ├── tedeval_simple.sh ├── tedeval_simple.sh.good ├── tedeval_simple.sh.old ├── tedeval_simple_polish.sh ├── validateFormat.py └── wc ├── data ├── core12map.txt ├── spmrl.seg.dev ├── spmrl.seg.test ├── spmrl.seg.train ├── spmrl.uni.map ├── tags-all.mod.txt ├── tags-mada2core12.txt ├── test.Arabic └── test0.mada.gz ├── decoder ├── ClassifierDecoder.cpp ├── ClassifierDecoder.h ├── DependencyDecoder.cpp ├── DependencyDecoder.h ├── DevelopmentThread.cpp ├── DevelopmentThread.h ├── HillClimbingDecoder.cpp └── HillClimbingDecoder.h ├── io ├── DependencyReader.cpp ├── DependencyReader.h ├── DependencyWriter.cpp └── DependencyWriter.h ├── runs └── .gitignore ├── spmrl_code_generator ├── MadaReader.java ├── 
/**
 * A (word, segment) index pair.
 *
 * hWord is a word index and hSeg a segment index; judging by the names and
 * by DependencyInstance::getElement(int hw, int hs), hSeg is presumably the
 * position of the segment inside word hWord -- confirm against the callers.
 *
 * Ordering is lexicographic on (hWord, hSeg). All comparison operators take
 * const references so that const objects and temporaries can be compared.
 */
class HeadIndex {
public:
	int hWord;
	int hSeg;

	HeadIndex(int _hWord, int _hSeg) : hWord(_hWord), hSeg(_hSeg) { }

	// Default-constructed index is (-1, 0).
	HeadIndex() : hWord(-1), hSeg(0) { }

	// Overwrites both components at once.
	void setIndex(int _hWord, int _hSeg) {
		hWord = _hWord;
		hSeg = _hSeg;
	}

	// Lexicographic "less than": first by word, then by segment.
	friend bool operator < (const HeadIndex &id1, const HeadIndex &id2) {
		return id1.hWord < id2.hWord || (id1.hWord == id2.hWord && id1.hSeg < id2.hSeg);
	}

	friend bool operator != (const HeadIndex &id1, const HeadIndex &id2) {
		return id1.hWord != id2.hWord || id1.hSeg != id2.hSeg;
	}

	friend bool operator == (const HeadIndex &id1, const HeadIndex &id2) {
		return id1.hWord == id2.hWord && id1.hSeg == id2.hSeg;
	}

	// Prints the index as "hWord/hSeg".
	friend std::ostream& operator << (std::ostream& os, const HeadIndex& id) {
		os << id.hWord << "/" << id.hSeg;
		return os;
	}
};
endl; 128 | for (unsigned int i = 0; i < inst.element.size(); ++i) 129 | os << "element: " << i << " " << inst.element[i] << endl; 130 | return os; 131 | } 132 | }; 133 | 134 | class WordInstance { 135 | public: 136 | // form/pos/dep/lab are retrieved from the candidate and id 137 | // the following are just for temporarily record gold info 138 | vector goldForm; 139 | vector goldLemma; 140 | vector goldPos; 141 | 142 | int goldAlIndex; 143 | int goldMorphIndex; 144 | vector goldMorph; 145 | 146 | vector goldDep; 147 | vector goldLab; 148 | 149 | string wordStr; 150 | int wordid; 151 | 152 | int currSegCandID; // id of the seg in candidate list 153 | vector candSeg; 154 | 155 | int optPosCount; // number of segs with optimal pos 156 | 157 | vector< vector > inMap; // [a->b id][size of b], for each element of b, need a map to decide the head and POS 158 | vector< vector > outMap; // [a->b id][size of a], for each child of a, need a map to decide its new parent 159 | 160 | WordInstance() { 161 | wordStr = ""; 162 | wordid = -1; 163 | goldAlIndex = -1; 164 | goldMorphIndex = -1; 165 | optPosCount = 0; 166 | currSegCandID = 0; 167 | } 168 | 169 | SegInstance& getCurrSeg() { 170 | return candSeg[currSegCandID]; 171 | } 172 | 173 | bool isOptSeg() { 174 | return currSegCandID == 0; 175 | } 176 | 177 | void setOptPosCount() { 178 | optPosCount = 0; 179 | SegInstance& segInst = getCurrSeg(); 180 | for (int j = 0; j < segInst.size(); ++j) 181 | optPosCount += (segInst.element[j].currPosCandID == 0); 182 | } 183 | }; 184 | 185 | class DependencyInstance { 186 | public: 187 | DependencyInstance(); 188 | virtual ~DependencyInstance(); 189 | 190 | vector word; 191 | int numWord; // number of words 192 | 193 | vector characterid; 194 | 195 | int optSegCount; // number of words with optimal seg 196 | 197 | FeatureVector fv; // feature vector of the current tree 198 | 199 | // word index and seg index conversion 200 | void constructConversionList(); 201 | void setOptSegPosCount(); 
202 | int getNumSeg(); 203 | int wordToSeg(HeadIndex& id); 204 | int wordToSeg(int hid, int segid); 205 | HeadIndex segToWord(int id); 206 | 207 | void setInstIds(DependencyPipe* pipe, Options* options); 208 | string normalize(string s); 209 | 210 | int segDist(HeadIndex& head, HeadIndex& mod); 211 | SegElement& getElement(int hw, int hs); 212 | SegElement& getElement(HeadIndex id); 213 | 214 | void buildChild(); 215 | void updateChildList(HeadIndex& newH, HeadIndex& oldH, HeadIndex& arg); 216 | 217 | void output(); 218 | private: 219 | vector numSeg; // total number of segs before this word, appending the total number in the end 220 | // size = number of words 221 | vector seg2Word; // word index for the segment, size = number of segs 222 | 223 | bool isPunc(string& w); 224 | bool isCoord(int lang, string& w); 225 | 226 | int computeOverlap(SegElement& e1, SegElement& e2); 227 | vector buildInMap(WordInstance& w, int a, int b); 228 | vector buildOutMap(WordInstance& w, int a, int b); 229 | }; 230 | 231 | typedef boost::shared_ptr inst_ptr; 232 | 233 | class VariableInfo { 234 | public: 235 | vector segID; 236 | vector posID; 237 | vector dep; 238 | 239 | VariableInfo() { 240 | 241 | } 242 | 243 | VariableInfo(DependencyInstance* inst) { 244 | copyInfoFromInst(inst); 245 | } 246 | 247 | void copyInfoFromInst(DependencyInstance* inst) { 248 | if ((int)segID.size() != inst->numWord) { 249 | segID.resize(inst->numWord); 250 | } 251 | 252 | int numSeg = inst->getNumSeg(); 253 | if ((int)posID.size() != numSeg) { 254 | posID.resize(numSeg); 255 | dep.resize(numSeg); 256 | } 257 | 258 | int p = 0; 259 | for (int i = 0; i < inst->numWord; ++i) { 260 | segID[i] = inst->word[i].currSegCandID; 261 | SegInstance& segInst = inst->word[i].getCurrSeg(); 262 | 263 | for (int j = 0; j < segInst.size(); ++j) { 264 | posID[p] = segInst.element[j].currPosCandID; 265 | dep[p] = segInst.element[j].dep; 266 | p++; 267 | } 268 | } 269 | assert(p == inst->getNumSeg()); 270 | } 271 | 272 | 
void loadInfoToInst(DependencyInstance* inst) { 273 | assert((int)segID.size() == inst->numWord); 274 | 275 | int p = 0; 276 | for (int i = 0; i < inst->numWord; ++i) { 277 | inst->word[i].currSegCandID = segID[i]; 278 | SegInstance& segInst = inst->word[i].getCurrSeg(); 279 | 280 | for (int j = 0; j < segInst.size(); ++j) { 281 | segInst.element[j].currPosCandID = posID[p]; 282 | segInst.element[j].dep = dep[p]; 283 | p++; 284 | } 285 | } 286 | 287 | assert(p == (int)posID.size()); 288 | } 289 | 290 | bool isChanged(DependencyInstance* inst) { 291 | assert((int)segID.size() == inst->numWord); 292 | 293 | int p = 0; 294 | for (int i = 0; i < inst->numWord; ++i) { 295 | if (inst->word[i].currSegCandID != segID[i]) 296 | return true; 297 | SegInstance& segInst = inst->word[i].getCurrSeg(); 298 | 299 | for (int j = 0; j < segInst.size(); ++j) { 300 | if (segInst.element[j].currPosCandID != posID[p]) 301 | return true; 302 | if (segInst.element[j].dep != dep[p]) 303 | return true; 304 | p++; 305 | } 306 | } 307 | 308 | assert(p == (int)posID.size()); 309 | return false; 310 | } 311 | }; 312 | 313 | } /* namespace segparser */ 314 | 315 | #include "DependencyPipe.h" 316 | 317 | #endif /* DEPENDENCYINSTANCE_H_ */ 318 | -------------------------------------------------------------------------------- /DependencyPipe.h: -------------------------------------------------------------------------------- 1 | /* 2 | * DependencyPipe.h 3 | * 4 | * Created on: Apr 4, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #ifndef DEPENDENCYPIPE_H_ 9 | #define DEPENDENCYPIPE_H_ 10 | 11 | #include "Options.h" 12 | #include "DependencyInstance.h" 13 | #include "util/Alphabet.h" 14 | #include "util/FeatureAlphabet.h" 15 | #include 16 | 17 | namespace segparser { 18 | 19 | using namespace std; 20 | 21 | class DependencyPipe { 22 | public: 23 | DependencyPipe(Options* options); 24 | virtual ~DependencyPipe(); 25 | 26 | void loadCoarseMap(string& file); 27 | void setAndCheckOffset(); 28 | void 
buildDictionary(string& goldfile); 29 | void buildDictionaryWithOOV(string& goldfile); 30 | void closeAlphabets(); 31 | void createAlphabet(string& goldfile); 32 | vector createInstances(string goldFile); 33 | 34 | int findRightNearestChildID(vector& child, HeadIndex id); 35 | HeadIndex findRightNearestChild(vector& child, HeadIndex id); 36 | HeadIndex findLeftNearestChild(vector& child, HeadIndex id); 37 | vector findConjArg(DependencyInstance* s, HeadIndex& arg); 38 | 39 | void createFeatureVector(DependencyInstance* inst, FeatureVector* fv); 40 | int getBinnedDistance(int x); 41 | void createArcFeatureVector(DependencyInstance* inst, HeadIndex& headIndex, HeadIndex& modIndex, FeatureVector* fv); 42 | void createTripsFeatureVector(DependencyInstance* inst, HeadIndex& par, HeadIndex& ch1, HeadIndex& ch2, FeatureVector* fv); 43 | void createSibsFeatureVector(DependencyInstance* inst, HeadIndex& ch1, HeadIndex& ch2, bool isST, FeatureVector* fv); 44 | void createGPCFeatureVector(DependencyInstance* inst, HeadIndex& gp, HeadIndex& par, HeadIndex& c, FeatureVector* fv); 45 | void createGPSibFeatureVector(DependencyInstance* inst, SegElement* gp, SegElement* par, SegElement* ch1, SegElement* ch2, FeatureVector* fv); 46 | void createTriSibFeatureVector(DependencyInstance* inst, SegElement* par, SegElement* ch1, SegElement* ch2, SegElement* ch3, FeatureVector* fv); 47 | void createPos1OFeatureVector(DependencyInstance* inst, HeadIndex& m, FeatureVector* fv); 48 | void createPosHOFeatureVector(DependencyInstance* inst, HeadIndex& m, bool unigram, FeatureVector* fv); 49 | void createSegFeatureVector(DependencyInstance* inst, int wordid, FeatureVector* fv); 50 | void createHighOrderFeatureVector(DependencyInstance* inst, FeatureVector* fv); 51 | void createPartialHighOrderFeatureVector(DependencyInstance* inst, HeadIndex& x, bool bigram, FeatureVector* fv); 52 | void createPartialPosHighOrderFeatureVector(DependencyInstance* inst, HeadIndex& x, FeatureVector* fv); 53 | void 
addCode(int type, uint64_t code, double val, FeatureVector* fv); 54 | void addCode(int type, uint64_t code, FeatureVector* fv); 55 | 56 | FeatureAlphabet* dataAlphabet; 57 | 58 | // dictionary 59 | Alphabet* typeAlphabet; 60 | Alphabet* posAlphabet; // pos 61 | Alphabet* lexAlphabet; // lemma, word 62 | unordered_set suffixList; 63 | 64 | unordered_map coarseMap; 65 | 66 | // encoder 67 | FeatureEncoder* fe; 68 | private: 69 | Options* options; 70 | 71 | void buildSuffixList(); 72 | }; 73 | 74 | } /* namespace segparser */ 75 | #endif /* DEPENDENCYPIPE_H_ */ 76 | -------------------------------------------------------------------------------- /FeatureEncoder.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * FeatureEncoder.cpp 3 | * 4 | * Created on: Mar 29, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #include "FeatureEncoder.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | namespace segparser { 15 | 16 | FeatureEncoder::FeatureEncoder() { 17 | largeOff = 17; 18 | midOff = 9; 19 | flagOff = 4; 20 | tempOff = 7; 21 | } 22 | 23 | FeatureEncoder::~FeatureEncoder() { 24 | } 25 | 26 | /********************************* 27 | * code generator 28 | * generally flag will be added lately, because code without flag is also needed 29 | */ 30 | 31 | int FeatureEncoder::getBits(uint64_t x) { 32 | uint64_t y = 1; 33 | int i = 0; 34 | while (y < x) { 35 | y = y << 1; 36 | ++i; 37 | } 38 | return i; 39 | } 40 | 41 | uint64_t FeatureEncoder::genCodePF(uint64_t temp, uint64_t p1) { 42 | return ((p1 << flagOff) << tempOff) | temp; 43 | } 44 | 45 | uint64_t FeatureEncoder::genCodePPF(uint64_t temp, uint64_t p1, uint64_t p2) { 46 | return ((((p1 << midOff) | p2) << flagOff) << tempOff) | temp; 47 | } 48 | 49 | uint64_t FeatureEncoder::genCodePPPF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3) { 50 | return ((((((p1 << midOff) | p2) << midOff) | p3) << flagOff) << tempOff) | temp; 51 | } 52 | 53 | uint64_t 
FeatureEncoder::genCodePPPPF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3, uint64_t p4) { 54 | return ((((((((p1 << midOff) | p2) << midOff) | p3) << midOff) | p4) << flagOff) << tempOff) | temp; 55 | } 56 | 57 | uint64_t FeatureEncoder::genCodePPPPPF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3, uint64_t p4, uint64_t p5) { 58 | return ((((((((((p1 << midOff) | p2) << midOff) | p3) << midOff) | p4) << midOff) | p5) << flagOff) << tempOff) | temp; 59 | } 60 | 61 | uint64_t FeatureEncoder::genCodeWF(uint64_t temp, uint64_t w1) { 62 | return ((w1 << flagOff) << tempOff) | temp; 63 | } 64 | 65 | uint64_t FeatureEncoder::genCodePWF(uint64_t temp, uint64_t p1, uint64_t w1) { 66 | return ((((w1 << midOff) | p1) << flagOff) << tempOff) | temp; 67 | } 68 | 69 | uint64_t FeatureEncoder::genCodeWWF(uint64_t temp, uint64_t w1, uint64_t w2) { 70 | return ((((w1 << largeOff) | w2) << flagOff) << tempOff) | temp; 71 | } 72 | 73 | uint64_t FeatureEncoder::genCodeWWW(uint64_t temp, uint64_t w1, uint64_t w2, uint64_t w3) { 74 | return (((((w1 << largeOff) | w2) << largeOff) | w3) << tempOff) | temp; 75 | } 76 | 77 | uint64_t FeatureEncoder::genCodePPWF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t w1) { 78 | return ((((((w1 << midOff) | p1) << midOff) | p2) << flagOff) << tempOff) | temp; 79 | } 80 | 81 | uint64_t FeatureEncoder::genCodePPPWF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3, uint64_t w1) { 82 | return ((((((((w1 << midOff) | p1) << midOff) | p2) << midOff) | p3) << flagOff) << tempOff) | temp; 83 | } 84 | 85 | uint64_t FeatureEncoder::genCodePWWF(uint64_t temp, uint64_t p1, uint64_t w1, uint64_t w2) { 86 | return ((((((w1 << largeOff) | w2) << midOff) | p1) << flagOff) << tempOff) | temp; 87 | } 88 | 89 | uint64_t FeatureEncoder::genCodePPWWF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t w1, uint64_t w2) { 90 | return ((((((((w1 << largeOff) | w2) << midOff) | p1) << midOff) | p2) << flagOff) << tempOff) | temp; 91 | } 92 | 93 | uint64_t 
FeatureEncoder::genCodeIIVF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1) { 94 | return ((((((i1 << midOff) | i2) << midOff) | v1) << flagOff) << tempOff) | temp; 95 | } 96 | 97 | uint64_t FeatureEncoder::genCodeIIVVF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t v2) { 98 | return ((((((((i1 << midOff) | i2) << midOff) | v1) << midOff) | v2) << flagOff) << tempOff) | temp; 99 | } 100 | 101 | uint64_t FeatureEncoder::genCodeIIVVPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t v2, uint64_t p1) { 102 | return ((((((((((i1 << midOff) | i2) << midOff) | v1) << midOff) | v2) << midOff) | p1) << flagOff) << tempOff) | temp; 103 | } 104 | 105 | uint64_t FeatureEncoder::genCodeIIVPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t p1) { 106 | return ((((((((i1 << midOff) | i2) << midOff) | v1) << midOff) | p1) << flagOff) << tempOff) | temp; 107 | } 108 | 109 | uint64_t FeatureEncoder::genCodeIIVPPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t p1, uint64_t p2) { 110 | return ((((((((((i1 << midOff) | i2) << midOff) | v1) << midOff) | p1) << midOff) | p2) << flagOff) << tempOff) | temp; 111 | } 112 | 113 | uint64_t FeatureEncoder::genCodeIIVVPPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t v2, uint64_t p1, uint64_t p2) { 114 | return ((((((((((((i1 << midOff) | i2) << midOff) | v1) << midOff) | v2) 115 | << midOff) | p1) << midOff) | p2) << flagOff) << tempOff) | temp; 116 | } 117 | 118 | } /* namespace segparser */ 119 | -------------------------------------------------------------------------------- /FeatureEncoder.h: -------------------------------------------------------------------------------- 1 | /* 2 | * FeatureEncoder.h 3 | * 4 | * Created on: Mar 29, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #ifndef FEATUREENCODER_H_ 9 | #define FEATUREENCODER_H_ 10 | 11 | #include 12 | 13 | namespace segparser { 14 | 15 | /****************************** 16 | * template type 17 | 
/**********************************
 * notation
 * P: pos; W: word;
 *
 * Template ids for the arc feature family (TemplateType::TArc by name).
 * Enumerator values are assigned by position, so inserting, removing or
 * reordering an entry renumbers every entry after it.
 *
 * Reading the names, as far as the section comments below establish it:
 *   - a "p" / "n" prefix marks the previous / next position (the comments
 *     pair pLP with "posL-1" and nRP with "posR+1");
 *   - L as a suffix is a lemma (section "lemma pos");
 *   - H and M are presumably head and modifier -- confirm against
 *     DependencyPipe::createArcFeatureVector.
 **********************************/
struct Arc {

	enum types {
		START,	// first enumerator, value 0

		/*************************************************
		 * First-order dependency feature from MST
		 * ***********************************************/
		// feature posL posIn posR
		LP_MP_RP,

		// feature posL-1 posL posR posR+1
		pLP_LP_RP_nRP,
		LP_RP_nRP,
		pLP_RP_nRP,
		pLP_LP_nRP,
		pLP_LP_RP,

		// feature posL posL+1 posR-1 posR
		LP_nLP_pRP_RP,
		nLP_pRP_RP,
		LP_pRP_RP,
		LP_nLP_RP,
		LP_nLP_pRP,

		// feature posL-1 posL posR-1 posR
		// feature posL posL+1 posR posR+1
		pLP_LP_pRP_RP,
		LP_nLP_RP_nRP,

		// two obs (word, pos)
		HP,
		HW,
		//MP,
		//MW,
		HW_HP,
		//MW_MP,
		HP_MP,
		HP_MW,
		HW_MP,
		HW_MW,
		HP_MP_MW,
		HW_MP_MW,
		HP_HW_MP,
		HP_HW_MW,
		HP_HW_MP_MW,

		// lemma pos
		HL,
		//ML,
		HP_ML,
		HL_MP,
		HL_ML,
		HP_HL,
		//MP_ML,
		HP_MP_ML,
		HL_MP_ML,
		HP_HL_MP,
		HP_HL_ML,
		HP_HL_MP_ML,

		// morphology, id, val, pos
		FF_IDH_IDM_HV,
		FF_IDH_IDM_MV,
		FF_IDH_IDM_HV_MV,
		FF_IDH_IDM_HP_MV,
		FF_IDH_IDM_HV_MP,
		FF_IDH_IDM_HV_HP,
		FF_IDH_IDM_MV_MP,
		FF_IDH_IDM_HP_MP_MV,
		FF_IDH_IDM_HV_MP_MV,
		FF_IDH_IDM_HP_HV_MP,
		FF_IDH_IDM_HP_HV_MV,
		FF_IDH_IDM_HP_HV_MP_MV,

		// label
		LAB,
		LAB_W_WP,
		LAB_WP,
		LAB_pWP_WP,
		LAB_WP_nWP,
		LAB_pWP_WP_nWP,
		LAB_W,
		LAB_L_WP,
		LAB_L,
		LAB_HP_MP,
		LAB_HL_MP,
		LAB_HP_ML,
		LAB_HL_ML,

		// first order
		HD_BD_MD,

		pHD_HD_MD_nMD,
		HD_MD_nMD,
		pHD_MD_nMD,
		pHD_HD_nMD,
		pHD_HD_MD,

		HD_nHD_pMD_MD,
		nHD_pMD_MD,
		HD_pMD_MD,
		HD_nHD_MD,
		HD_nHD_pMD,

		pHD_HD_pMD_MD,
		HD_nHD_MD_nMD,

		HD,
		HD_MD,

		// contextual
		pHW,
		nHW,
		pMW,
		MW,
		nMW,

		// flag
		HP_MP_FLAG,
		HW_MW_FLAG,

		COUNT,	// last enumerator: equals the number of entries above it
	};
};
/**
 * Template ids for the third-order feature family (TemplateType::TThirdOrder
 * by name). Values are assigned by position, so inserting, removing or
 * reordering an entry renumbers every entry after it.
 *
 * NOTE(review): the name suffixes appear to follow the rest of this header
 * (L = lemma; C presumably a coarse pos tag; G / GG presumably grandparent /
 * great-grandparent, going by the "gpc" and "ggpc" section comments) --
 * confirm against the feature extraction code.
 */
struct ThirdOrder {
	enum types {
		START,	// first enumerator, value 0

		// move some gpc features here...
		GL_HL_MC,
		GL_HC_ML,
		GC_HL_ML,
		GL_HL_ML,

		GC_HC,
		GL_HC,
		GC_HL,
		GL_HL,

		// only cross with dir flag
		GC_MC,
		GL_MC,
		GC_ML,
		GL_ML,
		HC_MC,
		HL_MC,
		HC_ML,
		HL_ML,

		// ggpc
		GGC_GC_HC_MC,
		GGL_GC_HC_MC,
		GGC_GL_HC_MC,
		GGC_GC_HL_MC,
		GGC_GC_HC_ML,

		GGC_HC_MC,
		GGL_HC_MC,
		GGC_HL_MC,
		GGC_HC_ML,
		GGC_GC_MC,
		GGL_GC_MC,
		GGC_GL_MC,
		GGC_GC_ML,
		GGC_MC,
		GGL_MC,
		GGC_ML,
		GGL_ML,

		HC_MC_CC_SC,
		HL_MC_CC_SC,
		HC_ML_CC_SC,
		HC_MC_CL_SC,
		HC_MC_CC_SL,

		HC_CC_SC,
		HL_CC_SC,
		HC_CL_SC,
		HC_CC_SL,

		// gp sibling
		GC_HC_MC_SC,
		GL_HC_MC_SC,
		GC_HL_MC_SC,
		GC_HC_ML_SC,
		GC_HC_MC_SL,

		// tri-sibling
		HC_PC_MC_NC,
		HL_PC_MC_NC,
		HC_PL_MC_NC,
		HC_PC_ML_NC,
		HC_PC_MC_NL,

		HC_PC_NC,
		PC_MC_NC,
		HL_PC_NC,
		HC_PL_NC,
		HC_PC_NL,
		PL_MC_NC,
		PC_ML_NC,
		PC_MC_NL,

		PC_NC,
		PL_NC,
		PC_NL,

		COUNT,	// last enumerator: equals the number of entries above it
	};
};
/**
 * Packs a feature template id and its arguments into a single 64-bit code.
 *
 * The genCode* members are named after the fields they take, in parameter
 * order: P (pos), W (word), I and V (presumably a morphology feature id and
 * value, going by the "morphology, id, val, pos" templates in Arc -- confirm
 * against the callers), plus a trailing F when room for a flag is reserved
 * above the template id. The flag itself is not a parameter of any
 * generator; callers add it afterwards.
 *
 * Fields are shifted in without masking, so each argument must fit in the
 * width configured for its kind (see the offsets below).
 */
class FeatureEncoder {
public:
	FeatureEncoder();
	virtual ~FeatureEncoder();

	/*******************************
	 * offset: width, in bits, reserved for each kind of field
	 ******************************/

	int largeOff;		// word, lemma
	int midOff;			// pos, cpos, type
	int flagOff;		// flag, children num, length diff etc.
	int tempOff;		// template

	// Number of bits needed to tell x distinct values apart.
	int getBits(uint64_t x);

	// pos-only templates
	uint64_t genCodePF(uint64_t temp, uint64_t p1);

	uint64_t genCodePPF(uint64_t temp, uint64_t p1, uint64_t p2);

	uint64_t genCodePPPF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3);

	uint64_t genCodePPPPF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3, uint64_t p4);

	uint64_t genCodePPPPPF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3, uint64_t p4, uint64_t p5);

	// word-only and mixed word/pos templates
	uint64_t genCodeWF(uint64_t temp, uint64_t w1);

	uint64_t genCodePWF(uint64_t temp, uint64_t p1, uint64_t w1);

	uint64_t genCodeWWF(uint64_t temp, uint64_t w1, uint64_t w2);

	// The only generator without a reserved flag field.
	uint64_t genCodeWWW(uint64_t temp, uint64_t w1, uint64_t w2, uint64_t w3);

	uint64_t genCodePPWF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t w1);

	uint64_t genCodePPPWF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t p3, uint64_t w1);

	uint64_t genCodePWWF(uint64_t temp, uint64_t p1, uint64_t w1, uint64_t w2);

	uint64_t genCodePPWWF(uint64_t temp, uint64_t p1, uint64_t p2, uint64_t w1, uint64_t w2);

	// id / value templates
	uint64_t genCodeIIVF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1);

	uint64_t genCodeIIVVF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t v2);

	uint64_t genCodeIIVVPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t v2, uint64_t p1);

	uint64_t genCodeIIVPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t p1);

	uint64_t genCodeIIVPPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t p1, uint64_t p2);

	uint64_t genCodeIIVVPPF(uint64_t temp, uint64_t i1, uint64_t i2, uint64_t v1, uint64_t v2, uint64_t p1, uint64_t p2);
};
45 | * DependencyInstance is responsible for the conversion 46 | */ 47 | 48 | class PrunerFeatureExtractor; 49 | 50 | class CacheTable { 51 | public: 52 | CacheTable(); 53 | virtual ~CacheTable(); 54 | 55 | void initCacheTable(int _type, DependencyInstance* inst, PrunerFeatureExtractor* pfe, Options* options); 56 | 57 | bool isPruned(int h, int m); 58 | int arc2ID(int h, int m); 59 | 60 | int numSeg; // length based on seg 61 | int numWord; 62 | int type; 63 | 64 | int nuparcs; // number of un-pruned arcs, include gold 65 | 66 | vector arc; // first order cache [h][m] 67 | vector trips; // second order [dep id][sib] 68 | vector sibs; // [mod][sib][2] 69 | vector gpc; // [dep id][child] 70 | vector posho; // pos feature [hid] 71 | 72 | private: 73 | vector arc2id; // map (h->m) arc to an id in [0, nuparcs-1] 74 | vector pruned; // whether a (h->m) arc is pruned, not necessarily include gold 75 | }; 76 | 77 | class FeatureExtractor { 78 | public: 79 | FeatureExtractor(); 80 | FeatureExtractor(DependencyInstance* inst, SegParser* parser, Parameters* params, int thread); 81 | virtual ~FeatureExtractor(); 82 | 83 | CacheTable* getCacheTable(DependencyInstance* s); 84 | 85 | double getPartialDepScore(DependencyInstance* s, HeadIndex& x, CacheTable* cache); 86 | double getPartialBigramDepScore(DependencyInstance* s, HeadIndex& x, HeadIndex& y, CacheTable* cache); 87 | double getPartialPosScore(DependencyInstance* s, HeadIndex& x, CacheTable* cache); 88 | double getScore(DependencyInstance* s); 89 | double getScore(DependencyInstance* s, CacheTable* cache); 90 | void getPartialFv(DependencyInstance* s, HeadIndex& x, FeatureVector* fv); 91 | void getFv(DependencyInstance* s, FeatureVector* fv); 92 | 93 | vector isPruned(DependencyInstance* s, HeadIndex& m, CacheTable* cache); 94 | 95 | int numWord; 96 | int type; 97 | int thread; 98 | 99 | //DependencyInstance* inst; so risky to add this variable in multi-thread scenario. 
Other variables are read-only 100 | DependencyPipe* pipe; 101 | Parameters* parameters; 102 | SegParser* pruner; 103 | boost::shared_ptr pfe; 104 | 105 | void (*getArcFv)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, FeatureVector*, CacheTable*); 106 | double (*getArcScore)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, CacheTable*); 107 | 108 | void (*getSibsFv)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, bool, FeatureVector*, CacheTable*); 109 | double (*getSibsScore)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, bool, CacheTable*); 110 | 111 | void (*getTripsFv)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, HeadIndex&, FeatureVector*, CacheTable*); 112 | double (*getTripsScore)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, HeadIndex&, CacheTable*); 113 | 114 | void (*getGPCFv)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, HeadIndex&, FeatureVector*, CacheTable*); 115 | double (*getGPCScore)(FeatureExtractor*, DependencyInstance*, HeadIndex&, HeadIndex&, HeadIndex&, CacheTable*); 116 | 117 | void (*getPosHOFv)(FeatureExtractor*, DependencyInstance*, HeadIndex&, FeatureVector*, CacheTable*); 118 | double (*getPosHOScore)(FeatureExtractor*, DependencyInstance*, HeadIndex&, CacheTable*); 119 | 120 | // pre-computed 121 | void getPos1OFv(DependencyInstance* inst, HeadIndex& m, FeatureVector* fv); 122 | double getPos1OScore(DependencyInstance* inst, HeadIndex& m); 123 | void getSegFv(DependencyInstance* inst, int wordid, FeatureVector* fv); 124 | double getSegScore(DependencyInstance* inst, int worid); 125 | 126 | vector optSegCacheMap; // cache for optimal seg for every word with different POS 127 | vector subOptSegCacheMap; // cache for sub-optimal seg for one word with optimal POS 128 | 129 | // cache not related to seg/pos choices 130 | vector seg1o; // seg feature [wordid] 131 | vector pos1o; // pos feature [segid] 132 | 133 | protected: 
134 | void constructCacheMap(DependencyInstance* s); 135 | void initCacheMap(DependencyInstance* s); 136 | 137 | // feature functions and pointers 138 | static void getArcFvUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& h, HeadIndex& m, FeatureVector* fv, CacheTable* cache); 139 | static void getArcFvAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& h, HeadIndex& m, FeatureVector* fv, CacheTable* cache); 140 | static double getArcScoreUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& h, HeadIndex& m, CacheTable* cache); 141 | static double getArcScoreAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& h, HeadIndex& m, CacheTable* cache); 142 | 143 | static void getSibsFvUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& ch1, HeadIndex& ch2, bool isSt, FeatureVector* fv, CacheTable* cache); 144 | static void getSibsFvAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& ch1, HeadIndex& ch2, bool isSt, FeatureVector* fv, CacheTable* cache); 145 | static double getSibsScoreUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& ch1, HeadIndex& ch2, bool isSt, CacheTable* cache); 146 | static double getSibsScoreAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& ch1, HeadIndex& ch2, bool isSt, CacheTable* cache); 147 | 148 | static void getTripsFvUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& par, HeadIndex& ch1, HeadIndex& ch2, FeatureVector* fv, CacheTable* cache); 149 | static void getTripsFvAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& par, HeadIndex& ch1, HeadIndex& ch2, FeatureVector* fv, CacheTable* cache); 150 | static double getTripsScoreUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& par, HeadIndex& ch1, HeadIndex& ch2, CacheTable* cache); 151 | static double getTripsScoreAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& par, HeadIndex& ch1, HeadIndex& ch2, CacheTable* 
cache); 152 | 153 | static void getGPCFvUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& gp, HeadIndex& par, HeadIndex& c, FeatureVector* fv, CacheTable* cache); 154 | static void getGPCFvAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& gp, HeadIndex& par, HeadIndex& c, FeatureVector* fv, CacheTable* cache); 155 | static double getGPCScoreUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& gp, HeadIndex& par, HeadIndex& c, CacheTable* cache); 156 | static double getGPCScoreAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& gp, HeadIndex& par, HeadIndex& c, CacheTable* cache); 157 | 158 | static void getPosHOFvUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& m, FeatureVector* fv, CacheTable* cache); 159 | static void getPosHOFvAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& m, FeatureVector* fv, CacheTable* cache); 160 | static double getPosHOScoreUnsafe(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& m, CacheTable* cache); 161 | static double getPosHOScoreAtomic(FeatureExtractor* fe, DependencyInstance* inst, HeadIndex& m, CacheTable* cache); 162 | 163 | void setAtomic(int thread); 164 | bool atomic; // whether the load/store need atomic operation 165 | 166 | // cache map 167 | vector optSegCacheStPos; // start position in the cache map for each seg 168 | 169 | vector subOptSegCacheStPos; // start position in the cache map for each word 170 | 171 | vector seg1oStPos; // [word]->segcand 172 | vector pos1oStPos2d; // [word][segcand]->segid 173 | vector pos1oStPos3d; // [word][segcand][segid]->poscand 174 | 175 | int getSeg1OCachePos(int wordid, int segCandID); 176 | int getPos1OCachePos(int wordid, int segCandID, int segid, int posCandID); 177 | 178 | // others 179 | Options* options; 180 | }; 181 | 182 | class PrunerFeatureExtractor : public segparser::FeatureExtractor { 183 | public: 184 | CacheTable prunerCache; 185 | 186 | PrunerFeatureExtractor(); 187 | 
void init(DependencyInstance* inst, SegParser* pruner, int thread); 188 | void prune(DependencyInstance* inst, HeadIndex& m, vector& pruned); 189 | }; 190 | 191 | } /* namespace segparser */ 192 | #endif /* FEATUREEXTRACTOR_H_ */ 193 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 yuanzh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /Options.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Options.cpp 3 | * 4 | * Created on: Mar 27, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #include "Options.h" 9 | #include "util/Constant.h" 10 | #include "util/StringUtils.h" 11 | #include 12 | #include 13 | #include 14 | 15 | namespace segparser { 16 | 17 | using namespace std; 18 | 19 | Options::Options() { 20 | trainFile = ""; 21 | testFile = ""; 22 | 23 | outFile = ""; 24 | modelName = ""; 25 | 26 | lang = -1; 27 | 28 | train = false; 29 | test = false; 30 | 31 | trainPruner = true; 32 | 33 | learningMode = DecodingMode::HillClimb; 34 | testingMode = DecodingMode::HillClimb; 35 | 36 | // parameter 37 | numIters = 10; 38 | maxHead = 20; 39 | pruneThresh = 0.05; 40 | 41 | trainSentences = 1000000; 42 | testSentences = 1000000; 43 | maxLength = 100; 44 | 45 | devThread = 5; 46 | trainThread = 10; 47 | 48 | seed = 0; 49 | regC = 0.0001; 50 | 51 | // feature; 52 | useCS = true; // consecutive sibling 53 | useGP = true; // grandparent 54 | useHO = true; // high order and global 55 | useSP = true; // seg/pos feature 56 | 57 | trainConvergeIter = 200; 58 | testConvergeIter = 200; 59 | 60 | evalPunc = true; 61 | useTedEval = false; 62 | jointSegPos = true; 63 | earlyStop = 40; 64 | 65 | saveBestModel = true; 66 | bestScore = -100; 67 | } 68 | 69 | Options::~Options() { 70 | } 71 | 72 | void Options::processArguments(int argc, char** argv) { // each argv token is "key" or "key:value"; a recognised key overwrites the default set by the constructor 73 | for(int i = 0; i < argc; ++i) { 74 | string str(argv[i]); 75 | vector<string> pair; 76 | StringSplit(str, ":", &pair); if (pair.empty() || (pair.size() < 2 && pair[0] != "train" && pair[0] != "test")) continue; // only the bare flags "train"/"test" may omit ":value"; every other key below reads pair[1], so skip tokens that have none 77 | if(pair[0].compare("train") == 0) { 78 | train = true; 79 | } 80 | if(pair[0].compare("test") == 0) { 81 | test = true; 82 | } 83 | if(pair[0].compare("iters") == 0) { 84 | numIters = atoi(pair[1].c_str()); 85 | } 86 | if(pair[0].compare("output-file") == 0) { 87 | outFile = pair[1]; 88 | } 89 | 
if(pair[0].compare("train-file") == 0) { 90 | trainFile = pair[1]; 91 | } 92 | if(pair[0].compare("test-file") == 0) { 93 | testFile = pair[1]; 94 | if (outFile.empty()) 95 | outFile = testFile + ".res"; 96 | } 97 | if(pair[0].compare("model-name") == 0) { 98 | modelName = pair[1]; 99 | } 100 | if (pair[0].compare("seed") == 0) { 101 | seed = atoi(pair[1].c_str()); 102 | } 103 | if (pair[0].compare("devthread") == 0) { 104 | devThread = atoi(pair[1].c_str()); 105 | } 106 | if (pair[0].compare("trainthread") == 0) { 107 | trainThread = atoi(pair[1].c_str()); 108 | } 109 | if (pair[0].compare("max-sent") == 0) { 110 | trainSentences = atoi(pair[1].c_str()); 111 | } 112 | if (pair[0].compare("max-test-sent") == 0) { 113 | testSentences = atoi(pair[1].c_str()); 114 | } 115 | if (pair[0].compare("C") == 0) { 116 | regC = atof(pair[1].c_str()); 117 | } 118 | if (pair[0].compare("train-converge") == 0) { 119 | trainConvergeIter = atoi(pair[1].c_str()); 120 | } 121 | if (pair[0].compare("test-converge") == 0) { 122 | testConvergeIter = atoi(pair[1].c_str()); 123 | } 124 | if (pair[0].compare("tedeval") == 0) { 125 | useTedEval = (pair[1] == "true" ? true : false); 126 | } 127 | if (pair[0].compare("joint") == 0) { 128 | jointSegPos = (pair[1] == "true" ? true : false); 129 | } 130 | if (pair[0].compare("evalpunc") == 0) { 131 | evalPunc = (pair[1] == "true" ? true : false); 132 | } 133 | if (pair[0].compare("earlystop") == 0) { 134 | earlyStop = atoi(pair[1].c_str()); 135 | } 136 | if (pair[0].compare("savebest") == 0) { 137 | saveBestModel = (pair[1] == "true" ? true : false); 138 | } 139 | if (pair[0].compare("ho") == 0) { 140 | useHO = (pair[1] == "true" ? 
true : false); 141 | } 142 | 143 | // useHO is set by the "ho" key handled just above 144 | } 145 | 146 | 147 | string file = trainFile; // the language is looked up from the train file name, falling back to the test file name 148 | if (file.empty()) 149 | file = testFile; 150 | 151 | lang = findLang(file); 152 | } 153 | 154 | int Options::findLang(string file) { 155 | for (int i = 0; i < PossibleLang::Count; ++i) 156 | if (file.find(PossibleLang::langString[i]) != string::npos) { 157 | return i; 158 | } 159 | cout << "Warning: unknow language" << endl; 160 | return PossibleLang::Count; 161 | } 162 | 163 | void Options::setPrunerOptions() { 164 | modelName = modelName + ".pruner"; 165 | 166 | test = false; 167 | 168 | trainPruner = false; 169 | 170 | learningMode = DecodingMode::Exact; 171 | testingMode = DecodingMode::Exact; 172 | 173 | // parameter 174 | numIters = 10; 175 | 176 | devThread = 1; 177 | trainThread = 1; 178 | 179 | regC = 0.1; 180 | 181 | // feature; 182 | useCS = false; // consecutive sibling 183 | useGP = false; // grandparent 184 | useHO = false; // high order and global 185 | useSP = false; 186 | 187 | saveBestModel = false; 188 | } 189 | 190 | void Options::outputArg() { 191 | cout << "------\nFLAGS\n------" << endl; 192 | cout << "train-file: " << trainFile << endl; 193 | cout << "test-file: " << testFile << endl; 194 | cout << "out-file: " << outFile << endl; 195 | cout << "model-name: " << modelName << endl; 196 | cout << "train: " << train << endl; 197 | cout << "test: " << test << endl; 198 | cout << "training-iterations: " << numIters << endl; 199 | cout << "seed: " << seed << endl; 200 | cout << "use consecutive sibling: " << useCS << endl; 201 | cout << "use grandparent: " << useGP << endl; 202 | cout << "use grand sibling, tri-sibling and high order: " << useHO << endl; 203 | cout << "learning mode: " << learningMode << endl; 204 | cout << "testing mode: " << testingMode << endl; 205 | cout << "train thread: " << trainThread << endl; 206 | cout << "dev thread: " << devThread << endl; 207 | cout << "reg C: " << regC << endl; 208 | cout << "train 
converge iter: " << trainConvergeIter << endl; 209 | cout << "test converge iter: " << testConvergeIter << endl; 210 | cout << "early stop: " << earlyStop << endl; 211 | cout << "tedeval: " << useTedEval << endl; 212 | cout << "joint seg pos: " << jointSegPos << endl; 213 | cout << "prune: " << trainPruner << endl; 214 | cout << "save best model: " << saveBestModel << endl; 215 | cout << "------\n" << endl; 216 | } 217 | 218 | } /* namespace segparser */ 219 | -------------------------------------------------------------------------------- /Options.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Options.h 3 | * 4 | * Created on: Mar 27, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #ifndef OPTIONS_H_ 9 | #define OPTIONS_H_ 10 | 11 | #include 12 | 13 | namespace segparser { 14 | 15 | using namespace std; 16 | 17 | class Options { 18 | public: 19 | public: 20 | // file name 21 | string trainFile; 22 | string testFile; 23 | 24 | string outFile; 25 | string modelName; 26 | 27 | int lang; 28 | 29 | // model type 30 | bool train; 31 | bool test; 32 | 33 | bool trainPruner; 34 | 35 | int learningMode; 36 | int testingMode; 37 | 38 | // parameter 39 | int numIters; 40 | int maxHead; 41 | double pruneThresh; 42 | 43 | int trainSentences; 44 | int testSentences; 45 | int maxLength; // maximum length of the sentences during *training* 46 | 47 | int devThread; 48 | int trainThread; // only useful when hill climbing training 49 | 50 | int seed; 51 | double regC; 52 | 53 | // feature; 54 | bool useCS; // consecutive sibling 55 | bool useGP; // grandparent 56 | bool useHO; // grand-sibling, tri-sibling and high order and global 57 | bool useSP; // seg pos feature 58 | 59 | int trainConvergeIter; // for hill climbing 60 | int testConvergeIter; 61 | 62 | bool evalPunc; 63 | bool useTedEval; 64 | bool jointSegPos; // joint model or pipeline 65 | int earlyStop; // early stop strategy in training 66 | 67 | bool saveBestModel; 68 | double 
bestScore; 69 | 70 | Options(); 71 | virtual ~Options(); 72 | 73 | void processArguments(int argc, char** argv); 74 | void setPrunerOptions(); 75 | void outputArg(); 76 | 77 | private: 78 | int findLang(string file); 79 | }; 80 | 81 | } /* namespace segparser */ 82 | #endif /* OPTIONS_H_ */ 83 | -------------------------------------------------------------------------------- /Parameters.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Parameters.cpp 3 | * 4 | * Created on: Apr 6, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #include "Parameters.h" 9 | #include 10 | #include "util/SerializationUtils.h" 11 | 12 | namespace segparser { 13 | 14 | Parameters::Parameters(int size, Options* options) 15 | : size(size), options(options){ 16 | parameters.clear(); 17 | total.clear(); 18 | parameters.resize(size, 0.0); 19 | total.resize(size, 0.0); 20 | } 21 | 22 | Parameters::~Parameters() { 23 | } 24 | 25 | void Parameters::copyParams(Parameters* param) { 26 | parameters = param->parameters; 27 | total = param->total; 28 | size = param->size; 29 | options = param->options; 30 | } 31 | 32 | void Parameters::averageParams(double avVal) { 33 | std::cout << "update time: " << avVal << std::endl; 34 | for (int j = 0; j < size; ++j) 35 | parameters[j] -= (avVal == 0 ? 
0 : total[j] / avVal); 36 | } 37 | 38 | double Parameters::numError(DependencyInstance* gold, DependencyInstance* pred) { 39 | ThrowException("should not be here"); 40 | double e = 0.0; 41 | 42 | for (int i = 1; i < gold->numWord; ++i) { 43 | SegInstance& goldSeg = gold->word[i].getCurrSeg(); 44 | SegInstance& predSeg = pred->word[i].getCurrSeg(); 45 | 46 | if (gold->word[i].currSegCandID != pred->word[i].currSegCandID) { 47 | e += 1.5 * predSeg.size(); 48 | } 49 | else { 50 | // compare match element 51 | for (int j = 0; j < predSeg.size(); ++j) { 52 | SegElement& goldEle = goldSeg.element[j]; 53 | SegElement& predEle = predSeg.element[j]; 54 | 55 | if (goldEle.currPosCandID != predEle.currPosCandID) { 56 | e += 1.0; 57 | } 58 | else if (goldEle.dep != predEle.dep) { 59 | e += 1.0; 60 | } 61 | else if (goldEle.labid != predEle.labid) { 62 | e += 0.5; 63 | } 64 | } 65 | } 66 | 67 | } 68 | return e; 69 | } 70 | 71 | double Parameters::elementError(WordInstance& gold, WordInstance& pred, int segid) { 72 | double e = 0.0; 73 | 74 | if (gold.currSegCandID != pred.currSegCandID) { 75 | e += 2.0; // this value should not matter... 76 | } 77 | else { 78 | SegElement& goldEle = gold.getCurrSeg().element[segid]; 79 | SegElement& predEle = pred.getCurrSeg().element[segid]; 80 | 81 | if (goldEle.currPosCandID != predEle.currPosCandID) { 82 | e += 1.0; // this value should not matter... 
83 | } 84 | else if (goldEle.dep != predEle.dep) { 85 | e += 1.0; 86 | } 87 | else if (goldEle.labid != predEle.labid) { 88 | e += 0.5; 89 | } 90 | 91 | } 92 | 93 | return e; 94 | } 95 | 96 | double Parameters::wordError(WordInstance& gold, WordInstance& pred) { 97 | double e = 0.0; 98 | 99 | if (gold.currSegCandID != pred.currSegCandID) { 100 | e += 1.0 * (gold.getCurrSeg().size() + pred.getCurrSeg().size()); 101 | } 102 | else { 103 | assert(gold.getCurrSeg().size() == pred.getCurrSeg().size()); 104 | 105 | for (int i = 0; i < gold.getCurrSeg().size(); ++i) { 106 | SegElement& goldEle = gold.getCurrSeg().element[i]; 107 | SegElement& predEle = pred.getCurrSeg().element[i]; 108 | 109 | assert(goldEle.labid == predEle.labid); 110 | 111 | if (goldEle.currPosCandID != predEle.currPosCandID) { 112 | e += 1.0; 113 | } 114 | else if (goldEle.dep != predEle.dep) { 115 | e += 1.0; 116 | } 117 | else if (goldEle.labid != predEle.labid) { 118 | e += 0.5; 119 | } 120 | } 121 | } 122 | return e; 123 | } 124 | 125 | double Parameters::wordDepError(WordInstance& gold, WordInstance& pred) { 126 | double e = 0.0; 127 | 128 | if (gold.currSegCandID != pred.currSegCandID) { 129 | e += 1.0 * (gold.getCurrSeg().size() + pred.getCurrSeg().size()); 130 | } 131 | else { 132 | assert(gold.getCurrSeg().size() == pred.getCurrSeg().size()); 133 | 134 | for (int i = 0; i < gold.getCurrSeg().size(); ++i) { 135 | SegElement& goldEle = gold.getCurrSeg().element[i]; 136 | SegElement& predEle = pred.getCurrSeg().element[i]; 137 | 138 | assert(goldEle.labid == predEle.labid); 139 | 140 | if (goldEle.currPosCandID != predEle.currPosCandID) { 141 | e += 1.0; 142 | } 143 | else if (goldEle.dep != predEle.dep) { 144 | e += 1.0; 145 | } 146 | else if (goldEle.labid != predEle.labid) { 147 | e += 0.5; 148 | } 149 | } 150 | } 151 | 152 | return e; 153 | } 154 | 155 | void Parameters::update(DependencyInstance* target, DependencyInstance* curr, 156 | FeatureVector* diffFv, double loss, FeatureExtractor* 
fe, int upd) { 157 | // upd start from 0 158 | 159 | //double e = numError(gold, pred); 160 | //double loss = e - diffScore; 161 | 162 | if (loss < 1e-4) 163 | return; 164 | 165 | double l2norm = diffFv->dotProduct(diffFv); 166 | if (l2norm <= 1e-6) 167 | return; 168 | 169 | double alpha = loss/l2norm; 170 | 171 | if (alpha > options->regC) 172 | alpha = options->regC; 173 | 174 | if (alpha > 0) { 175 | // update theta 176 | for (unsigned int i = 0; i < diffFv->binaryIndex.size(); ++i) { 177 | parameters[diffFv->binaryIndex[i]] += alpha; 178 | total[diffFv->binaryIndex[i]] += upd * alpha; 179 | } 180 | for (unsigned int i = 0; i < diffFv->negBinaryIndex.size(); ++i) { 181 | parameters[diffFv->negBinaryIndex[i]] -= alpha; 182 | total[diffFv->negBinaryIndex[i]] -= upd * alpha; 183 | } 184 | for (unsigned int i = 0; i < diffFv->normalIndex.size(); ++i) { 185 | double val = min(2.0, max(-2.0, diffFv->normalValue[i])); 186 | parameters[diffFv->normalIndex[i]] += alpha * val; 187 | total[diffFv->normalIndex[i]] += upd * alpha * val; 188 | } 189 | } 190 | } 191 | 192 | double Parameters::getScore(FeatureVector* fv) { // returns the sum of +w over binaryIndex, -w over negBinaryIndex and w*value over normalIndex 193 | double score = 0.0; 194 | for (unsigned int i = 0; i < fv->binaryIndex.size(); ++i) { 195 | score += parameters[fv->binaryIndex[i]]; 196 | } 197 | for (unsigned int i = 0; i < fv->negBinaryIndex.size(); ++i) { 198 | score -= parameters[fv->negBinaryIndex[i]]; 199 | } 200 | for (unsigned int i = 0; i < fv->normalIndex.size(); ++i) { 201 | score += parameters[fv->normalIndex[i]] * fv->normalValue[i]; 202 | } 203 | return score; 204 | } 205 | 206 | void Parameters::writeParams(FILE* fs) { // writes size, then the weight vector; total is not written 207 | CHECK(WriteInteger(fs, size)); 208 | CHECK(WriteDoubleArray(fs, parameters)); 209 | } 210 | 211 | void Parameters::readParams(FILE* fs) { // reads size, then the weight vector, in the order writeParams stores them; total is not touched 212 | CHECK(ReadInteger(fs, &size)); 213 | CHECK(ReadDoubleArray(fs, &parameters)); 214 | } 215 | 216 | } /* namespace segparser */ 217 | -------------------------------------------------------------------------------- /Parameters.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Parameters.h 3 | * 4 | * Created on: Apr 6, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #ifndef PARAMETERS_H_ 9 | #define PARAMETERS_H_ 10 | 11 | #include 12 | #include "Options.h" 13 | #include "DependencyInstance.h" 14 | #include "util/FeatureVector.h" 15 | #include "FeatureExtractor.h" 16 | 17 | namespace segparser { 18 | 19 | using namespace std; 20 | 21 | class FeatureExtractor; 22 | 23 | class Parameters { 24 | public: 25 | vector parameters; 26 | vector total; 27 | int size; 28 | 29 | Parameters(int size, Options* options); 30 | virtual ~Parameters(); 31 | 32 | void copyParams(Parameters* param); 33 | void averageParams(double avVal); 34 | void update(DependencyInstance* gold, DependencyInstance* pred, 35 | FeatureVector* diffFv, double loss, FeatureExtractor* fe, int upd); 36 | double getScore(FeatureVector* fv); 37 | 38 | void writeParams(FILE* fs); 39 | void readParams(FILE* fs); 40 | 41 | double elementError(WordInstance& gold, WordInstance& pred, int segid); 42 | double wordError(WordInstance& gold, WordInstance& pred); 43 | double wordDepError(WordInstance& gold, WordInstance& pred); 44 | private: 45 | Options* options; 46 | 47 | int maxMatch(SegInstance& gold, SegInstance& pred, vector& match); 48 | double numError(DependencyInstance* gold, DependencyInstance* pred); 49 | }; 50 | 51 | } /* namespace segparser */ 52 | #endif /* PARAMETERS_H_ */ 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #### SegParser 2 | 3 | Randomized Greedy algorithm for joint segmentation, POS tagging and dependency parsing 4 | 5 | ========= 6 | 7 | #### Usage 8 | 9 | ##### 1. Compilation 10 | 11 | To compile the project, first make sure you have installed boost and boost-regex on your machine. 
Next, go to the "Release" directory and run the command "make all" to compile the code. Note that the implementation uses some C++0x/C++11 features, so please make sure your compiler supports them. 12 | 13 |
14 | 15 | ##### 2. Data Format 16 | 17 | The data format for each sentence has two parts. The first part is similar to the one used in the CoNLL-X shared task. The only difference is the index in the first column. Here the index format is "token index/segment index", where the token index starts from 1 (0 is for the root), while the segment index starts from 0. 18 | 19 | The second part encodes the search space for segmentation and POS tagging. Each line contains a string describing the lattice structure of one token. The format is as follows. 20 | 21 | line := Token form\tCandidate1\tCandidate2\t... 22 | 23 | Candidate := Segmentation||Al index||Morphology index||Morphology value||Candidate probability 24 | 25 | Segmentation := Segment1&&Segment2&&... 26 | 27 | Segment := Surface form@#Lemma form@#POS candidate1@#POS candidate2@#... 28 | 29 | POS candidate := POS tag_probability 30 | 31 | The "data" directory includes sample data files for the SPMRL dataset. 32 | 33 | ##### 3. Datasets 34 | 35 | Because of licensing issues, the datasets are not directly released here. You can find sample files in the "data" directory. Please contact me for the full dataset if you are interested. 36 | 37 | UPDATE: a data generator for the SPMRL dataset and the files needed for generating testing data have been added to the directory spmrl_code_generator. 38 | 39 | ##### 4. Usage 40 | 41 | Take a look at the scripts "run_DATA.sh" and "run_DATA_test.sh" where DATA=spmrl|classical|chinese. For example, to train a model on the SPMRL dataset, you can simply run 42 | 43 | run_spmrl.sh run1 44 | 45 | The model and development results will be saved in the "runs" directory. Note that the model is evaluated on the development set (if one exists) after each epoch *in parallel* with the training. 
After the model is trained, you can evaluate it on the test set by running 46 | 47 | run_spmrl_test.sh run1 48 | 49 | -------------------------------------------------------------------------------- /Release/SharedTaskCommon.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/Release/SharedTaskCommon.pyc -------------------------------------------------------------------------------- /Release/decoder/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../decoder/ClassifierDecoder.cpp \ 8 | ../decoder/DependencyDecoder.cpp \ 9 | ../decoder/DevelopmentThread.cpp \ 10 | ../decoder/HillClimbingDecoder.cpp 11 | 12 | OBJS += \ 13 | ./decoder/ClassifierDecoder.o \ 14 | ./decoder/DependencyDecoder.o \ 15 | ./decoder/DevelopmentThread.o \ 16 | ./decoder/HillClimbingDecoder.o 17 | 18 | CPP_DEPS += \ 19 | ./decoder/ClassifierDecoder.d \ 20 | ./decoder/DependencyDecoder.d \ 21 | ./decoder/DevelopmentThread.d \ 22 | ./decoder/HillClimbingDecoder.d 23 | 24 | 25 | # Each subdirectory must supply rules for building sources it contributes 26 | decoder/%.o: ../decoder/%.cpp 27 | @echo 'Building file: $<' 28 | @echo 'Invoking: GCC C++ Compiler' 29 | g++ -D__GXX_EXPERIMENTAL_CXX0X__ -D__cplusplus=201103L -O3 -Wall -c -fmessage-length=0 -std=c++0x -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o "$@" "$<" 30 | @echo 'Finished building: $<' 31 | @echo ' ' 32 | 33 | 34 | -------------------------------------------------------------------------------- /Release/io/subdir.mk: 
-------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../io/DependencyReader.cpp \ 8 | ../io/DependencyWriter.cpp 9 | 10 | OBJS += \ 11 | ./io/DependencyReader.o \ 12 | ./io/DependencyWriter.o 13 | 14 | CPP_DEPS += \ 15 | ./io/DependencyReader.d \ 16 | ./io/DependencyWriter.d 17 | 18 | 19 | # Each subdirectory must supply rules for building sources it contributes 20 | io/%.o: ../io/%.cpp 21 | @echo 'Building file: $<' 22 | @echo 'Invoking: GCC C++ Compiler' 23 | g++ -D__GXX_EXPERIMENTAL_CXX0X__ -D__cplusplus=201103L -O3 -Wall -c -fmessage-length=0 -std=c++0x -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o "$@" "$<" 24 | @echo 'Finished building: $<' 25 | @echo ' ' 26 | 27 | 28 | -------------------------------------------------------------------------------- /Release/lattice_to_segmentation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Author : Reut Tsarfaty, July 2013 3 | # ligth modifs: Djame Seddah 4 | # +modif to support ptb's lattice files 5 | import sys 6 | 7 | if sys.argv[1] == "-ptb": 8 | ptb=1 9 | else: 10 | ptb=0 11 | 12 | 13 | 14 | 15 | prev_tok = "" 16 | out_line = "" 17 | first=1 18 | for line in sys.stdin: 19 | line = line.strip().split() 20 | if not line: 21 | #out_line += "\t".join([token,form]) 22 | if out_line: 23 | print out_line 24 | else: 25 | print "\n" 26 | prev_tok = "" 27 | out_line = "" 28 | #print "\n" 29 | continue 30 | 31 | if ptb == -1: #this code is bogus, the ptb hebrew files lacks the lemma field 32 | start, end, form, lemma, cpos, fpos, feats, token = line 33 | else: 34 | start, end, form = line[0:3] 35 | token = 
line[-1] 36 | 37 | if prev_tok == token: 38 | out_line += "".join([":",form]) 39 | prev_tok = token 40 | else: 41 | if first==1: #lame modif to avoid first line void 42 | first=0 43 | else: 44 | print out_line 45 | out_line = "" 46 | out_line += "\t".join([token,form]) 47 | prev_tok = token 48 | print "\n" 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /Release/makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | -include ../makefile.init 6 | 7 | RM := rm -rf 8 | 9 | # All of the sources participating in the build are defined here 10 | -include sources.mk 11 | -include util/subdir.mk 12 | -include io/subdir.mk 13 | -include decoder/subdir.mk 14 | -include subdir.mk 15 | -include objects.mk 16 | 17 | ifneq ($(MAKECMDGOALS),clean) 18 | ifneq ($(strip $(C++_DEPS)),) 19 | -include $(C++_DEPS) 20 | endif 21 | ifneq ($(strip $(C_DEPS)),) 22 | -include $(C_DEPS) 23 | endif 24 | ifneq ($(strip $(CC_DEPS)),) 25 | -include $(CC_DEPS) 26 | endif 27 | ifneq ($(strip $(CPP_DEPS)),) 28 | -include $(CPP_DEPS) 29 | endif 30 | ifneq ($(strip $(CXX_DEPS)),) 31 | -include $(CXX_DEPS) 32 | endif 33 | ifneq ($(strip $(C_UPPER_DEPS)),) 34 | -include $(C_UPPER_DEPS) 35 | endif 36 | endif 37 | 38 | -include ../makefile.defs 39 | 40 | # Add inputs and outputs from these tool invocations to the build variables 41 | 42 | # All Target 43 | all: SegParser 44 | 45 | # Tool invocations 46 | SegParser: $(OBJS) $(USER_OBJS) 47 | @echo 'Building target: $@' 48 | @echo 'Invoking: GCC C++ Linker' 49 | g++ -o "SegParser" $(OBJS) $(USER_OBJS) $(LIBS) 50 | @echo 'Finished building target: $@' 51 | @echo ' ' 52 | 53 | # Other Targets 54 | clean: 55 | -$(RM) 
$(OBJS)$(C++_DEPS)$(C_DEPS)$(CC_DEPS)$(CPP_DEPS)$(EXECUTABLES)$(CXX_DEPS)$(C_UPPER_DEPS) SegParser 56 | -@echo ' ' 57 | 58 | .PHONY: all clean dependents 59 | .SECONDARY: 60 | 61 | -include ../makefile.targets 62 | -------------------------------------------------------------------------------- /Release/objects.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | USER_OBJS := 6 | 7 | LIBS := -lboost_regex -lpthread 8 | 9 | -------------------------------------------------------------------------------- /Release/run.sh: -------------------------------------------------------------------------------- 1 | args=$1 2 | runid=$2 3 | 4 | ln -s $args.seg.cv2.ascii.train ../../data/$args/$args.train.$runid 5 | ln -s $args.seg.cv2.ascii.test ../../data/$args/$args.test.$runid 6 | 7 | ./SegParser train train-file:../../data/$args/$args.train.$runid model-name:../../data/$args/$args.model.$runid decode-type:non-proj test test-file:../../data/$args/$args.test.$runid seed:${runid} $@ 8 | 9 | rm ../../data/$args/$args.train.$runid 10 | rm ../../data/$args/$args.test.$runid 11 | 12 | -------------------------------------------------------------------------------- /Release/run_chinese.sh: -------------------------------------------------------------------------------- 1 | runid=$1 2 | 3 | ln -s ctb.seg.train ../data/ctb.train.$runid 4 | ln -s ctb.seg.dev ../data/ctb.test.$runid 5 | 6 | ./SegParser train test train-file:../data/ctb.train.$runid model-name:../runs/ctb.model.$runid test-file:../data/ctb.test.$runid output-file:../runs/ctb.out.$runid seed:14 earlystop:40 evalpunc:false C:0.001 train-converge:300 test-converge:300 $@ | tee ../runs/ctb.log.$runid 7 | 8 | rm ../data/ctb.train.$runid 9 | rm ../data/ctb.test.$runid 10 | 
11 | -------------------------------------------------------------------------------- /Release/run_chinese_test.sh: -------------------------------------------------------------------------------- 1 | runid=$1 2 | 3 | ln -s ctb.seg.test ../data/ctb.test.$runid 4 | 5 | ./SegParser model-name:../runs/ctb.model.$runid test test-file:../data/ctb.test.$runid output-file:../runs/ctb.out.$runid seed:14 evalpunc:false test-converge:300 devthread:10 $@ 6 | 7 | rm ../data/ctb.test.$runid 8 | 9 | -------------------------------------------------------------------------------- /Release/run_classical.sh: -------------------------------------------------------------------------------- 1 | runid=$1 2 | 3 | ln -s qatar.seg.train ../data/qatar.train.$runid 4 | ln -s qatar.seg.test ../data/qatar.test.$runid 5 | 6 | ./SegParser train test train-file:../data/qatar.train.$runid model-name:../runs/qatar.model.$runid test-file:../data/qatar.test.$runid output-file:../runs/qatar.out.$runid seed:1 ho:false earlystop:20 evalpunc:true C:0.0001 train-converge:200 test-converge:200 savebest:false iters:5 $@ | tee ../runs/qatar.log.$runid 7 | 8 | rm ../data/qatar.train.$runid 9 | rm ../data/qatar.test.$runid 10 | 11 | -------------------------------------------------------------------------------- /Release/run_classical_test.sh: -------------------------------------------------------------------------------- 1 | runid=$1 2 | 3 | ln -s qatar.seg.test ../data/qatar.test.$runid 4 | 5 | ./SegParser model-name:../runs/qatar.model.$runid test test-file:../data/qatar.test.$runid output-file:../runs/qatar.out.$runid seed:1 ho:false evalpunc:true test-converge:200 devthread:10 $@ 6 | 7 | rm ../data/qatar.test.$runid 8 | 9 | -------------------------------------------------------------------------------- /Release/run_spmrl.sh: -------------------------------------------------------------------------------- 1 | runid=$1 2 | 3 | ln -s spmrl.seg.train ../data/spmrl.train.$runid 4 | ln -s spmrl.seg.dev 
../data/spmrl.test.$runid 5 | 6 | ./SegParser train test train-file:../data/spmrl.train.$runid model-name:../runs/spmrl.model.$runid test-file:../data/spmrl.test.$runid output-file:../runs/spmrl.out.$runid seed:2 earlystop:20 evalpunc:true C:0.01 train-converge:200 test-converge:200 $@ | tee ../runs/spmrl.log.$runid 7 | 8 | rm ../data/spmrl.train.$runid 9 | rm ../data/spmrl.test.$runid 10 | 11 | -------------------------------------------------------------------------------- /Release/run_spmrl_test.sh: -------------------------------------------------------------------------------- 1 | runid=$1 2 | 3 | ln -s spmrl.seg.test ../data/spmrl.test.$runid 4 | 5 | ./SegParser model-name:../runs/spmrl.model.$runid test test-file:../data/spmrl.test.$runid output-file:../runs/spmrl.out.$runid seed:2 evalpunc:true test-converge:200 tedeval:true devthread:10 $@ 6 | 7 | rm ../data/spmrl.test.$runid 8 | 9 | -------------------------------------------------------------------------------- /Release/run_test.sh: -------------------------------------------------------------------------------- 1 | args=$1 2 | runid=$2 3 | 4 | ln -s $args.seg.test ../../data/$args/$args.test.$runid 5 | 6 | ./SegParser model-name:../../data/$args/$args.model.$runid decode-type:non-proj test test-file:../../data/$args/$args.test.$runid seed:${runid} $@ 7 | 8 | rm ../../data/$args/$args.test.$runid 9 | 10 | -------------------------------------------------------------------------------- /Release/sources.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 
3 | ################################################################################ 4 | 5 | O_SRCS := 6 | CPP_SRCS := 7 | C_UPPER_SRCS := 8 | C_SRCS := 9 | S_UPPER_SRCS := 10 | OBJ_SRCS := 11 | ASM_SRCS := 12 | CXX_SRCS := 13 | C++_SRCS := 14 | CC_SRCS := 15 | OBJS := 16 | C++_DEPS := 17 | C_DEPS := 18 | CC_DEPS := 19 | CPP_DEPS := 20 | EXECUTABLES := 21 | CXX_DEPS := 22 | C_UPPER_DEPS := 23 | 24 | # Every subdirectory with source files must be described here 25 | SUBDIRS := \ 26 | util \ 27 | io \ 28 | decoder \ 29 | . \ 30 | 31 | -------------------------------------------------------------------------------- /Release/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../DependencyInstance.cpp \ 8 | ../DependencyPipe.cpp \ 9 | ../FeatureEncoder.cpp \ 10 | ../FeatureExtractor.cpp \ 11 | ../Options.cpp \ 12 | ../Parameters.cpp \ 13 | ../SegParser.cpp 14 | 15 | OBJS += \ 16 | ./DependencyInstance.o \ 17 | ./DependencyPipe.o \ 18 | ./FeatureEncoder.o \ 19 | ./FeatureExtractor.o \ 20 | ./Options.o \ 21 | ./Parameters.o \ 22 | ./SegParser.o 23 | 24 | CPP_DEPS += \ 25 | ./DependencyInstance.d \ 26 | ./DependencyPipe.d \ 27 | ./FeatureEncoder.d \ 28 | ./FeatureExtractor.d \ 29 | ./Options.d \ 30 | ./Parameters.d \ 31 | ./SegParser.d 32 | 33 | 34 | # Each subdirectory must supply rules for building sources it contributes 35 | %.o: ../%.cpp 36 | @echo 'Building file: $<' 37 | @echo 'Invoking: GCC C++ Compiler' 38 | g++ -D__GXX_EXPERIMENTAL_CXX0X__ -D__cplusplus=201103L -O3 -Wall -c -fmessage-length=0 -std=c++0x -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o "$@" "$<" 39 | @echo 'Finished building: $<' 40 | @echo 
' ' 41 | 42 | 43 | -------------------------------------------------------------------------------- /Release/test.txt: -------------------------------------------------------------------------------- 1 | 班汉·西巴阿差 2 | -------------------------------------------------------------------------------- /Release/util/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../util/Alphabet.cpp \ 8 | ../util/Constant.cpp \ 9 | ../util/FeatureAlphabet.cpp \ 10 | ../util/FeatureVector.cpp \ 11 | ../util/Logarithm.cpp \ 12 | ../util/SerializationUtils.cpp \ 13 | ../util/StringUtils.cpp 14 | 15 | OBJS += \ 16 | ./util/Alphabet.o \ 17 | ./util/Constant.o \ 18 | ./util/FeatureAlphabet.o \ 19 | ./util/FeatureVector.o \ 20 | ./util/Logarithm.o \ 21 | ./util/SerializationUtils.o \ 22 | ./util/StringUtils.o 23 | 24 | CPP_DEPS += \ 25 | ./util/Alphabet.d \ 26 | ./util/Constant.d \ 27 | ./util/FeatureAlphabet.d \ 28 | ./util/FeatureVector.d \ 29 | ./util/Logarithm.d \ 30 | ./util/SerializationUtils.d \ 31 | ./util/StringUtils.d 32 | 33 | 34 | # Each subdirectory must supply rules for building sources it contributes 35 | util/%.o: ../util/%.cpp 36 | @echo 'Building file: $<' 37 | @echo 'Invoking: GCC C++ Compiler' 38 | g++ -D__GXX_EXPERIMENTAL_CXX0X__ -D__cplusplus=201103L -O3 -Wall -c -fmessage-length=0 -std=c++0x -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@:%.o=%.d)" -o "$@" "$<" 39 | @echo 'Finished building: $<' 40 | @echo ' ' 41 | 42 | 43 | -------------------------------------------------------------------------------- /SegParser.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * 
SegParser.cpp 3 | * 4 | * Created on: Mar 19, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #include "SegParser.h" 9 | #include 10 | #include "util/Random.h" 11 | #include 12 | #include "util/Timer.h" 13 | #include 14 | #include "util/SerializationUtils.h" 15 | #include 16 | 17 | namespace segparser { 18 | // Allocates both parameter vectors (sized by pipe->dataAlphabet) and the dev-test thread; a decoder is created only when options->train is set. 19 | SegParser::SegParser(DependencyPipe* pipe, Options* options) 20 | : pipe(pipe), options(options), devTimes(0) { 21 | // Set up arrays 22 | parameters = new Parameters(pipe->dataAlphabet->size(), options); 23 | devParams = new Parameters(pipe->dataAlphabet->size(), options); 24 | pruner = NULL; 25 | if (options->train) { 26 | decoder = DependencyDecoder::createDependencyDecoder(options, options->learningMode, options->trainThread, true); 27 | decoder->initialize(); 28 | } 29 | else { 30 | decoder = NULL; 31 | } 32 | dt = new DevelopmentThread(); 33 | FeatureVector::initVec(pipe->dataAlphabet->size()); 34 | } 35 | // Shuts the decoder down when one exists (the constructor leaves it NULL outside training). 36 | void SegParser::closeDecoder() { 37 | if (decoder) 38 | decoder->shutdown(); 39 | } 40 | // Deletes the owned parameters, decoder, dev thread and pruner; pipe and options are not deleted here. 41 | SegParser::~SegParser() { 42 | delete parameters; 43 | delete devParams; 44 | delete decoder; 45 | delete dt; 46 | 47 | delete pruner; 48 | } 49 | // Runs options->numIters training passes over il, predicting into a private copy of each instance, then averages the parameters. 50 | void SegParser::train(vector& il) { 51 | 52 | cout << "About to train" << endl; 53 | 54 | devTimes = 0; 55 | 56 | // construct pred instance list 57 | vector pred(il.size()); 58 | for (unsigned int i = 0; i < il.size(); ++i) { 59 | pred[i] = inst_ptr(new DependencyInstance()); 60 | *(pred[i].get()) = *(il[i].get()); 61 | } 62 | 63 | for(int i = 0; i < options->numIters; ++i) { 64 | 65 | cout << "========================" << endl; 66 | cout << "Iteration: " << i << endl; 67 | cout << "========================" << endl; 68 | cout << "Processed: "; 69 | cout.flush(); 70 | 71 | Timer timer; 72 | 73 | trainingIter(il, pred, i+1); 74 | 75 | double diff = timer.stop(); 76 | cout << "Training iter took: " << diff / 1000 << " secs."
<< endl; 77 | 78 | } 79 | 80 | parameters->averageParams(decoder->getUpdateTimes()); 81 | 82 | // wait until the running dev test finishes 83 | if (options->test) { 84 | if (dt->isDevTesting) 85 | pthread_join(dt->workThread, NULL); 86 | } 87 | 88 | if (options->saveBestModel) { 89 | cout << "Best model performance: " << options->bestScore << endl; 90 | } 91 | } 92 | // Runs one training pass: each gold instance is trained against its prediction copy, progress is printed every 100 instances, and a dev test is started afterwards when options->test is set. 93 | void SegParser::trainingIter(vector& goldList, vector& predList, int iter) { 94 | 95 | Timer timer; 96 | 97 | for(unsigned int i = 0; i < goldList.size(); ++i) { 98 | if((i+1) % 100 == 0) { 99 | cout << " " << (i+1); 100 | double diff = timer.stop(); 101 | cout << " (time=" << (int)(diff / 1000) << "s)"; 102 | cout.flush(); 103 | } 104 | // bind by reference: the lists keep the instances alive, so no extra smart-pointer copy is needed 105 | const inst_ptr& gold = goldList[i]; 106 | const inst_ptr& pred = predList[i]; 107 | 108 | FeatureExtractor fe(pred.get(), this, parameters, options->trainThread); 109 | 110 | // gold->fv must already be populated (non-empty binaryIndex) 111 | 112 | assert(gold->fv.binaryIndex.size() > 0); 113 | 114 | decoder->train(gold.get(), pred.get(), &fe, iter); 115 | // With useSP, clamp the SEG_PROB feature weight at zero so it never goes negative. 116 | if (options->useSP) { 117 | uint64_t code = pipe->fe->genCodePF(HighOrder::SEG_PROB, 0); 118 | int index = pipe->dataAlphabet->lookupIndex(TemplateType::THighOrder, code, false); 119 | if (index > 0 && parameters->parameters[index] < 0.0) { 120 | parameters->parameters[index] = 0.0; 121 | } 122 | } 123 | 124 | } 125 | 126 | cout << endl; 127 | 128 | cout << " " << goldList.size() << " instances" << endl; 129 | 130 | if (options->test) 131 | checkDevStatus(iter); 132 | } 133 | // Waits for a dev test that is still running, then starts a new one on an averaged copy of the current parameters. 134 | void SegParser::checkDevStatus(int iter) { 135 | if (dt->isDevTesting) { 136 | cout << "processing sentences: "; 137 | 138 | pthread_mutex_lock(&dt->finishMutex); 139 | cout << dt->currFinishID << " to "; 140 | pthread_mutex_unlock(&dt->finishMutex); 141 | 142 | pthread_mutex_lock(&dt->processMutex); 143 | cout << dt->currProcessID << endl; 144 | pthread_mutex_unlock(&dt->processMutex); 145 | 146 | cout << "Wait for testing to finish."
<< endl; 147 | pthread_join(dt->workThread, NULL); 148 | } 149 | 150 | // start new thread for dev 151 | string devfile = options->testFile; 152 | string devoutfile = options->outFile; 153 | 154 | cout << "build dev params" << endl; 155 | devParams->copyParams(parameters); 156 | devParams->averageParams(decoder->getUpdateTimes()); 157 | 158 | cout << "start new dev " << devTimes << endl; 159 | dt->start(devfile, devoutfile, this, false); 160 | devTimes++; 161 | } 162 | 163 | /////////////////////////////////////////////////////// 164 | // Saving and loading models 165 | /////////////////////////////////////////////////////// 166 | void SegParser::outputWeight(ofstream& fout, int type, Parameters* params) { 167 | unordered_map* intmap = pipe->dataAlphabet->getMap(type); 168 | for (auto kv : (*intmap)) { 169 | uint64_t s = kv.first; 170 | int index = kv.second; 171 | if (index > 0) { 172 | fout << s << "\t" << parameters->parameters[index] << "\t" << parameters->total[index] << endl; 173 | } 174 | } 175 | } 176 | 177 | void SegParser::outputWeight(string fStr) { 178 | cout << "output feature weight to " << fStr << endl; 179 | ofstream fout(fStr.c_str()); 180 | 181 | outputWeight(fout, TemplateType::TArc, parameters); 182 | 183 | fout.close(); 184 | } 185 | 186 | void SegParser::saveModel(string file, Parameters* params) { 187 | FILE *fs = fopen(file.c_str(), "wb"); 188 | params->writeParams(fs); 189 | pipe->dataAlphabet->writeObject(fs); 190 | pipe->typeAlphabet->writeObject(fs); 191 | pipe->posAlphabet->writeObject(fs); 192 | pipe->lexAlphabet->writeObject(fs); 193 | fclose(fs); 194 | } 195 | 196 | void SegParser::loadModel(string file) { 197 | FILE *fs = fopen(file.c_str(), "rb"); 198 | parameters->readParams(fs); 199 | pipe->dataAlphabet->readObject(fs); 200 | pipe->typeAlphabet->readObject(fs); 201 | pipe->posAlphabet->readObject(fs); 202 | pipe->lexAlphabet->readObject(fs); 203 | fclose(fs); 204 | 205 | pipe->closeAlphabets(); 206 | pipe->setAndCheckOffset(); 
207 | 208 | parameters->total.clear(); 209 | parameters->total.resize(parameters->parameters.size()); 210 | parameters->size = parameters->parameters.size(); 211 | } 212 | // Measures pruner recall on options->testFile: the fraction of segments (words 1..numWord-1) whose gold head survives pruning. 213 | void SegParser::evaluatePruning() { 214 | cout << "Evaluate pruning quality..." << endl; 215 | DependencyReader reader(options, options->testFile); 216 | inst_ptr gold = reader.nextInstance(); 217 | 218 | int numSeg = 0; 219 | double oracle = 0.0; 220 | 221 | while(gold) { 222 | gold->setInstIds(pipe, options); 223 | DependencyInstance pred; 224 | pred = *(gold.get()); 225 | 226 | PrunerFeatureExtractor pfe; 227 | pfe.init(&pred, this, 1); 228 | 229 | for (int i = 1; i < pred.numWord; ++i) { 230 | WordInstance& word = pred.word[i]; 231 | for (int j = 0; j < word.getCurrSeg().size(); ++j) { 232 | numSeg++; 233 | vector tmpPruned; 234 | HeadIndex m(i, j); 235 | pfe.prune(&pred, m, tmpPruned); 236 | 237 | HeadIndex& goldDep = gold->getElement(i, j).dep; 238 | int goldDepIndex = gold->wordToSeg(goldDep); 239 | // tmpPruned has no slot for the modifier itself; expand it to one flag per segment, marking the modifier as pruned 240 | vector pruned; 241 | int p = 0; 242 | for (int hw = 0; hw < pred.numWord; ++hw) { 243 | SegInstance& headSeg = pred.word[hw].getCurrSeg(); 244 | for (int hs = 0; hs < headSeg.size(); ++hs) { 245 | if (hw != m.hWord || hs != m.hSeg) { 246 | if (!tmpPruned[p]) { 247 | pruned.push_back(false); 248 | } 249 | else { 250 | pruned.push_back(true); 251 | } 252 | p++; 253 | } 254 | else { 255 | pruned.push_back(true); 256 | } 257 | } 258 | } 259 | 260 | if (!pruned[goldDepIndex]) 261 | oracle++; 262 | } 263 | } 264 | 265 | gold = reader.nextInstance(); 266 | } 267 | // an empty test file leaves numSeg at 0; report 0 instead of printing NaN 268 | cout << "Pruning recall: " << (numSeg > 0 ? oracle / numSeg : 0.0) << endl; 269 | } 270 | 271 | } /* namespace segparser */ 272 | 273 | using namespace segparser; 274 | // Entry point: parses the command line, optionally trains the pruner and the main model, then optionally runs the test pass. 275 | int main(int argc, char** argv) { 276 | //test1(); 277 | 278 | Options options; 279 | options.processArguments(argc, argv); 280 | 281 | Options prunerOptions = options; 282 | prunerOptions.setPrunerOptions(); 283 | 284 | SegParser* pruner = NULL; 285 | DependencyPipe
prunerPipe(&prunerOptions); 286 | 287 | DependencyPipe pipe(&options); 288 | 289 | if (options.train) { 290 | // Train the pruner first when requested; the main parser below takes it over. 291 | if (options.trainPruner) { 292 | 293 | cout << "Pruner flags:" << endl; 294 | prunerOptions.outputArg(); 295 | 296 | prunerPipe.loadCoarseMap(prunerOptions.trainFile); 297 | 298 | vector trainingData = prunerPipe.createInstances(prunerOptions.trainFile); 299 | 300 | pruner = new SegParser(&prunerPipe, &prunerOptions); 301 | pruner->pruner = NULL; 302 | 303 | int numFeats = prunerPipe.dataAlphabet->size() - 1; 304 | int numTypes = prunerPipe.typeAlphabet->size() - 1; 305 | cout << "Pruner Num Feats: " << numFeats << endl; 306 | cout << "Pruner Num Edge Labels: " << numTypes << endl; 307 | 308 | pruner->train(trainingData); 309 | pruner->closeDecoder(); 310 | 311 | pruner->evaluatePruning(); 312 | } 313 | // Train the main model. 314 | cout << "Model flags:" << endl; 315 | options.outputArg(); 316 | 317 | pipe.loadCoarseMap(options.trainFile); 318 | 319 | vector trainingData = pipe.createInstances(options.trainFile); 320 | 321 | //pipe.closeAlphabets(); 322 | // sp takes over pruner: ~SegParser deletes it when sp goes out of scope at the end of this block. 323 | SegParser sp(&pipe, &options); 324 | sp.pruner = pruner; 325 | 326 | int numFeats = pipe.dataAlphabet->size() - 1; 327 | int numTypes = pipe.typeAlphabet->size() - 1; 328 | cout << "Num Feats: " << numFeats << endl; 329 | cout << "Num Edge Labels: " << numTypes << endl; 330 | 331 | sp.train(trainingData); 332 | sp.closeDecoder(); 333 | } 334 | 335 | if (options.test) { 336 | DependencyPipe testPipe(&options); 337 | testPipe.loadCoarseMap(options.testFile); 338 | 339 | SegParser testSp(&testPipe, &options); 340 | 341 | cout << "\nLoading model ...
"; 342 | cout.flush(); 343 | pruner = NULL; 344 | if (options.trainPruner) { 345 | // Build a fresh pruner and load its weights from <modelName>.pruner. 346 | prunerPipe.loadCoarseMap(prunerOptions.testFile); 347 | 348 | pruner = new SegParser(&prunerPipe, &prunerOptions); 349 | pruner->pruner = NULL; 350 | pruner->loadModel(options.modelName + ".pruner"); 351 | 352 | int numFeats = prunerPipe.dataAlphabet->size() - 1; 353 | int numTypes = prunerPipe.typeAlphabet->size() - 1; 354 | cout << "Pruner Num Feats: " << numFeats << endl; 355 | cout << "Pruner Num Edge Labels: " << numTypes << endl; 356 | } 357 | testSp.pruner = pruner; 358 | testSp.loadModel(options.modelName); 359 | cout << "done." << endl; 360 | 361 | int numFeats = testPipe.dataAlphabet->size() - 1; 362 | int numTypes = testPipe.typeAlphabet->size() - 1; 363 | cout << "Num Feats: " << numFeats << endl; 364 | cout << "Num Edge Labels: " << numTypes << endl; 365 | 366 | //pipe.closeAlphabets(); 367 | 368 | // run the test pass on the development thread, using a copy of the loaded parameters 369 | string devfile = options.testFile; 370 | string devoutfile = options.outFile; 371 | cout << "build dev params" << endl; 372 | testSp.devParams->copyParams(testSp.parameters); 373 | testSp.dt->start(devfile, devoutfile, &testSp, true); 374 | 375 | // wait until the test thread finishes 376 | pthread_join(testSp.dt->workThread, NULL); 377 | testSp.closeDecoder(); 378 | } 379 | 380 | return 0; 381 | } 382 | 383 | 384 | -------------------------------------------------------------------------------- /SegParser.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SegParser.h 3 | * 4 | * Created on: Mar 19, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #ifndef SEGPARSER_H_ 9 | #define SEGPARSER_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | #include "DependencyPipe.h" 15 | #include "decoder/DevelopmentThread.h" 16 | #include "Parameters.h" 17 | #include "Options.h" 18 | #include "decoder/DependencyDecoder.h" 19 | 20 | namespace segparser { 21 | 22 | using namespace std; 23 | using namespace boost; 24 | 25 |
class Parameters; 26 | class DependencyDecoder; 27 | class DevelopmentThread; 28 | // Training/testing driver: owns the parameter vectors, the decoder, the dev-test thread and (once assigned) the pruner; pipe and options are borrowed from the caller. 29 | class SegParser { 30 | public: 31 | SegParser(DependencyPipe* pipe, Options* options); 32 | virtual ~SegParser(); 33 | void train(vector& il); 34 | void trainingIter(vector& goldList, vector& predList, int iter); 35 | void checkDevStatus(int iter); 36 | 37 | void outputWeight(ofstream& fout, int type, Parameters* params); 38 | void outputWeight(string fStr); 39 | void loadModel(string file); 40 | void saveModel(string file, Parameters* params); 41 | 42 | void closeDecoder(); 43 | 44 | void evaluatePruning(); 45 | 46 | DependencyPipe* pipe; 47 | DependencyDecoder* decoder; 48 | Parameters* parameters; 49 | Parameters* devParams; 50 | DevelopmentThread* dt; 51 | Options* options; 52 | SegParser* pruner; 53 | SegParser(const SegParser&) = delete; SegParser& operator=(const SegParser&) = delete; // the destructor deletes the raw pointers above, so a copy would double-delete them 54 | private: 55 | int devTimes; 56 | }; 57 | 58 | } /* namespace segparser */ 59 | #endif /* SEGPARSER_H_ */ 60 | -------------------------------------------------------------------------------- /TedWrappers_20131015/SharedTaskCommon.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/TedWrappers_20131015/SharedTaskCommon.pyc -------------------------------------------------------------------------------- /TedWrappers_20131015/TedEvalApps.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/TedWrappers_20131015/TedEvalApps.jar -------------------------------------------------------------------------------- /TedWrappers_20131015/TedPart.jar: --------------------------------------------------------------------------------
/TedWrappers_20131015/cleanconll.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Script that cleans treebank files and parser output 4 | # for use with tedeval 5 | # Djame Seddah 6 | 7 | use strict; 8 | 9 | 10 | use constant { 11 | # for conll data 12 | ID => 0, 13 | FORM => 1, 14 | LEMMA => 2, 15 | CPOS => 3, 16 | FPOS => 4, 17 | FEAT => 5, 18 | HEAD => 6, 19 | DEPREL => 7, 20 | PHEAD => 8, 21 | PDEPREL => 9, 22 | SOURCETOKEN => 10, 23 | # for morfette data 24 | FORMM =>0, 25 | LEMMAM=> 1, 26 | FEATM=> 2 27 | }; 28 | 29 | my $kk=0; 30 | if (@ARGV && $ARGV[0] eq "-pass"){ 31 | $kk=1; shift @ARGV; # drop the flag, otherwise <> below tries to open a file named "-pass" 32 | } 33 | 34 | 35 | while(<>){ 36 | chomp; 37 | my $line=$_; 38 | if ($line=~/^\s*$/){ print "\n"; next;} 39 | if ($kk ==1){ print "$line\n"; next;} # debugging aid: pass lines through unchanged 40 | my @FC=split(/\t/,$line); 41 | foreach my $field (LEMMA,CPOS,FPOS,FEAT,DEPREL){ 42 | $FC[DEPREL]=~s/^(.+)\|.+/$1/; # beware, destructive: strips the last |-suffix, so the tree can no longer be deprojectivized. NOTE(review): this runs once per field (five times), so several suffixes may be stripped - confirm that is intended 43 | $FC[$field]=~s/^[_|-]+$/dummy/; 44 | $FC[$field]=~s/\://g; # remove every colon 45 | $FC[$field]=~s/-(..)B-/$1B/; 46 | } 47 | $FC[FEAT]="_"; 48 | $FC[FORM]=~s/\://g; # remove colons from the form too
49 | $FC[FORM]=~s/-(..)B-/$1B/; 50 | # print the first 8 columns, then HEAD and DEPREL again, then the source token when present 51 | print join("\t",@FC[0..7]),"\t"; 52 | print join("\t",@FC[6..7]); 53 | if (defined $FC[SOURCETOKEN]){ 54 | print "\t",$FC[SOURCETOKEN]; 55 | } 56 | print "\n"; 57 | } 58 | 59 | 60 | -------------------------------------------------------------------------------- /TedWrappers_20131015/cleanptb.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Script that cleans treebank files and parser output 4 | # for use with tedeval 5 | # Djame Seddah 6 | 7 | use strict; 8 | 9 | 10 | my $kk=0; 11 | if (@ARGV && $ARGV[0] eq "-pass"){ 12 | $kk=1; shift @ARGV; # drop the flag, otherwise <> below tries to open a file named "-pass" 13 | } 14 | 15 | 16 | while(<>){ 17 | chomp; 18 | my $line=$_; 19 | $line=~s/^\( /(TOP /; 20 | #if ($line=~/^\s*$/){ print "\n"; next;} 21 | if ($kk ==1){ print "$line\n"; next;} # debugging aid: only the TOP label above is applied 22 | $line=~s/##[^#]+##//g; # removing all features 23 | # preterminal regexp taken from releaf.pl 24 | my $preterm='\(([^() \t]+)[ \t]+([^() \t]+)\)';# match (DT The) or (NC samere_en_short) 25 | $line=~s/$preterm/"(".&clean_all($1)." ".&clean_all($2).")"/ge; 26 | print "$line\n"; 27 | } 28 | 29 | 30 | sub clean_all{ 31 | my $string= shift; 32 | #$string=~s/^[_|-]+$/dummy/; 33 | $string=~s/\://g; # remove every colon
34 | $string=~s/-(..)B-/$1B/; # probably not necessary 35 | return $string; 36 | } 37 | -------------------------------------------------------------------------------- /TedWrappers_20131015/debug/check_sourceid.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Reports the first token whose last column is not a plain integer, then exits. 4 | 5 | use strict; 6 | 7 | my $sent=0; 8 | my $i=0; 9 | my $j=0; 10 | while(<>){ 11 | chomp; 12 | my $line=$_; 13 | $j++; 14 | if ($line=~/^\s*$/){$sent++; $i=0; next;} 15 | my @FC=split(/\t/,$line); 16 | $i++; 17 | if ($FC[$#FC] !~m/^[0-9]+$/){ print "sentence $sent, token $i (line $j)\n"; exit;}; 18 | 19 | } -------------------------------------------------------------------------------- /TedWrappers_20131015/debug/do_check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | for file in *.lattices; do 4 | [ -e "$file" ] || continue; echo "processing $file" 5 | ./check_sourceid.pl "$file" 6 | done 7 | 8 | -------------------------------------------------------------------------------- /TedWrappers_20131015/genere_tfm_tedeval.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # For each result-file path read from the input, prints the tedeval.sh command line that scores it against the matching gold file. 4 | use strict; 5 | my $LANG; 6 | my $GOLD; 7 | my $TYPE; 8 | while(<>){ 9 | chomp; 10 | my $line=$_; 11 | if($line=~/(HEBREW)/i) { 12 | $LANG="HEBREW"; 13 | }elsif($line=~/(ARABIC)/i) { 14 | $LANG="ARABIC"; 15 | } 16 | 17 | if ($line=~/ptb/){ 18 | $TYPE="ptb"; 19 | }elsif ($line=~/conll/){ 20 | $TYPE="conll"; 21 | } 22 | my $Lang=ucfirst lc $LANG; 23 | #print "$Lang\n"; next; 24 | my $file=$line; 25 | $GOLD="../READY_TO_SHIP_FINAL/${LANG}_SPMRL/gold/$TYPE/test/test.$Lang.gold.$TYPE"; 26 | my $TFM="tedeval.sh --unlabeled --$TYPE --any $LANG -k -g $GOLD -s $file | tee $file.djam_log ; cat $file.4tedeval.evalted.res.ted-unlabeled |grep \"AVG:\"| perl -ne 'chomp ; print \"\$_\\t$file\\n\"'" ; 27 | #print STDERR `eval $CMD | tee $file.log`; 28 | print $TFM."\n"; 29 | } 30 |
-------------------------------------------------------------------------------- /TedWrappers_20131015/get_cutoffed_sent.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Copyright (c) 2001 by David Chiang. All rights reserved. 3 | # modif to cope with conll files by DjamSeddah (2013) 4 | # usage: lines 2 3 < test.mrg or lines -c 2 3 < test.conll 5 | # Prints the 1-based index of every record longer than the -K cut-off (length = number of newlines in the record) and reports statistics on STDERR. 6 | use strict; 7 | my $KK="\n"; 8 | if ($ARGV[0] eq "-c"){ 9 | $KK="\n\n"; 10 | shift @ARGV; 11 | }elsif($ARGV[0] eq '-mada'){ 12 | $KK="--------------\nSENTENCE BREAK\n--------------\n"; 13 | shift @ARGV; 14 | } 15 | 16 | my $CUTOFF; 17 | if ($ARGV[0] eq "-K"){ 18 | $CUTOFF=$ARGV[1] or die "cut-off lenght not given.\n"; 19 | shift @ARGV; # drop both -K and its value; this case was added after the rest 20 | shift @ARGV; 21 | } 22 | 23 | 24 | open FICIN,"<$ARGV[0]" or die "[get_cutoffed_sent.pl] problem with $ARGV[0] or no file given\n"; 25 | my @lines2skip=; 26 | chomp @lines2skip; 27 | my %H= map { $_=~s/^\s*([0-9]+)\s*$/$1/; $_ => 1 } @lines2skip; 28 | 29 | #print join("__", keys %H),"__ICI\n"; 30 | 31 | #die; 32 | $/=$KK; 33 | 34 | 35 | my $i = 1; 36 | my $skipped=0; 37 | my $total=0; 38 | while () { 39 | # if ($_=~/^#/){print "$_";} # print comment 40 | my $len=&get_lenght($_); 41 | if ($len>$CUTOFF) { 42 | 43 | #print &get_lenght($_),"\n"; 44 | print "$i\n"; 45 | $skipped++; 46 | }else{ 47 | $total=$total+$len; 48 | # print STDERR "$i\n"; 49 | 50 | } 51 | $i++; 52 | } 53 | # $i is one past the last record read, so the record count is $i-1 (guarded against empty input) 54 | print STDERR "$skipped sentences skipped\n"; 55 | my $n=$i-1; my $perc=$n ? ($skipped/$n)*100 : 0; 56 | print STDERR "$perc \% of sentences removed ($skipped / $n)\n"; 57 | my $avg_lenght=$n ? $total/$n : 0; # NOTE(review): $total only sums kept records but is divided by all records - confirm the intended denominator 58 | print STDERR "Avg = ".$avg_lenght."\n"; 59 | 60 | sub get_lenght{ 61 | my $sent=shift; 62 | my $count= () = $sent =~ /\n/g; 63 | return $count; 64 | } 65 | -------------------------------------------------------------------------------- /TedWrappers_20131015/get_ted_res.pl:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | 4 | #use strict; 5 | 6 | 7 | #------------------------------------------------------------ 8 | # Sentence TED Exact #Spans TED 9 | # ID Length Accuracy match test gold gen Distance Normalization 10 | # gold gen L1 L2 L1 - L2 11 | #_____________________________________________________________________________________________________ 12 | my @td=qw(LEN ACC EX_gold Ex_gen Spans_test Spans_gold Spans_gen Dist_L1 Dist_L2 Dist_L1-L2 Norm file); 13 | 14 | while(<>){ 15 | chomp; 16 | my $line=$_; 17 | $line=~s/AVG:\s+//g; 18 | $line=~s/,/./g; 19 | my @res=split(/\s+/,$line); 20 | my %Hres=(); 21 | my $i=0; 22 | foreach my $el (@res){ 23 | my $key=$td[$i++]; 24 | $Hres{$key}=($el); 25 | #print "$key\t$el\n"; 26 | } 27 | #print qw(ACC EX_gold Ex_gen Norm Spans_test Spans_gold Spans_gen Dist_L1 Dist_L2 Dist_L1-L2); 28 | foreach my $key (qw(ACC EX_gold Ex_gen Norm Spans_test Spans_gold Spans_gen Dist_L1 Dist_L2 Dist_L1-L2)){ 29 | print "$key: $Hres{$key}\t"; 30 | } 31 | print "file: $Hres{file}\n"; 32 | } 33 | -------------------------------------------------------------------------------- /TedWrappers_20131015/lattice_to_segmentation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Author : Reut Tsarfaty, July 2013 3 | # ligth modifs: Djame Seddah 4 | # +modif to support ptb's lattice files 5 | import sys 6 | 7 | if sys.argv[1] == "-ptb": 8 | ptb=1 9 | else: 10 | ptb=0 11 | 12 | 13 | 14 | 15 | prev_tok = "" 16 | out_line = "" 17 | first=1 18 | for line in sys.stdin: 19 | line = line.strip().split() 20 | if not line: 21 | #out_line += "\t".join([token,form]) 22 | if out_line: 23 | print out_line 24 | else: 25 | print "\n" 26 | prev_tok = "" 27 | out_line = "" 28 | #print "\n" 29 | continue 30 | 31 | if ptb == -1: #this code is bogus, the ptb hebrew files lacks the lemma field 32 | start, end, form, lemma, 
cpos, fpos, feats, token = line 33 | else: 34 | start, end, form = line[0:3] 35 | token = line[-1] 36 | 37 | if prev_tok == token: 38 | out_line += "".join([":",form]) 39 | prev_tok = token 40 | else: 41 | if first==1: #lame modif to avoid first line void 42 | first=0 43 | else: 44 | print out_line 45 | out_line = "" 46 | out_line += "\t".join([token,form]) 47 | prev_tok = token 48 | print "\n" 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /TedWrappers_20131015/lines: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Copyright (c) 2001 by David Chiang. All rights reserved. 3 | # modif to cope with conll files by DjamSeddah (2013) 4 | # usage: lines 2 3 < test.mrg or lines -c 2 3 < test.conll 5 | 6 | 7 | # added by djame 8 | if (($ARGV[0] eq "-c")||($ARGV[0] eq "-L")){ 9 | $/="\n\n"; 10 | shift @ARGV; 11 | }elsif($ARGV[0] eq '-mada'){ 12 | $/="--------------\nSENTENCE BREAK\n--------------\n"; 13 | shift @ARGV; 14 | } 15 | 16 | 17 | if ($ARGV[0] eq "-ptb"){ 18 | shift @ARGV; 19 | #default mode for the sake of being compatible with one script 20 | } 21 | 22 | 23 | if ($ARGV[0] eq "-p"){ 24 | $DISPLAYNUM=1; 25 | shift @ARGV; 26 | } 27 | 28 | #if ($#ARGV <2 ) { 29 | # printf "Usage: lines \n"; 30 | # die; 31 | #} 32 | 33 | $start = $ARGV[0]; 34 | #shift @ARGV; 35 | $stop = $ARGV[1]; 36 | 37 | if (!defined($ARGV[1])){ 38 | $stop=$start+1; 39 | } 40 | 41 | 42 | $i = 1; 43 | 44 | while ($i < $start && ) { 45 | $i++; 46 | } 47 | 48 | while ($i >= $start && $i < $stop && defined($_ = )) { 49 | print $_ if ($DISPLAYNUM != 1); 50 | $i++; 51 | } 52 | 53 | print $i if ($DISPLAYNUM == 1); 54 | 55 | -------------------------------------------------------------------------------- /TedWrappers_20131015/pproj_24934/conllx.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 
-------------------------------------------------------------------------------- /TedWrappers_20131015/pproj_24934/pproj_24934_pseudo.info: -------------------------------------------------------------------------------- 1 | CONFIGURATION 2 | Configuration name: pproj_24934 3 | Configuration type: pseudo 4 | Created: Fri Sep 05 22:19:36 EDT 2014 5 | 6 | SYSTEM 7 | Operating system architecture: amd64 8 | Operating system name: Linux 9 | JRE vendor name: Oracle Corporation 10 | JRE version number: 1.8.0_05 11 | 12 | MALTPARSER 13 | Version: 1.7.2 14 | Build date: September 25 2012 15 | 16 | SETTINGS 17 | 2planar 18 | reduceonswitch (-2pr) false 19 | config 20 | logfile (-lfi) stdout 21 | workingdir ( -w) user.dir 22 | name ( -c) pproj_24934 23 | logging ( -cl) info 24 | flowchart ( -m) proj 25 | type ( -t) singlemalt 26 | url ( -u) 27 | covington 28 | allow_shift ( -cs) false 29 | allow_root ( -cr) true 30 | graph 31 | max_sentence_length (-gsl) 256 32 | root_label (-grl) ROOT 33 | head_rules (-ghr) 34 | guide 35 | features ( -F) 36 | data_split_threshold ( -T) 50 37 | kbest_type ( -kt) rank 38 | data_split_structure ( -s) 39 | data_split_column ( -d) 40 | learner ( -l) liblinear 41 | decision_settings (-gds) T.TRANS+A.DEPREL 42 | classitem_separator (-gcs) ~ 43 | kbest ( -k) -1 44 | input 45 | charset ( -ic) UTF-8 46 | reader ( -ir) tab 47 | reader_options (-iro) 48 | format ( -if) /appdata/dataformat/conllx.xml 49 | infile ( -i) /dev/stdin 50 | iterations ( -it) 1 51 | lib 52 | external ( -lx) 53 | save_instance_files ( -li) false 54 | options ( -lo) 55 | verbosity ( -lv) silent 56 | multiplanar 57 | planar_root_handling (-prh) normal 58 | nivre 59 | allow_reduce ( -ne) false 60 | allow_root ( -nr) true 61 | output 62 | charset ( -oc) UTF-8 63 | outfile ( -o) /dev/stdout 64 | format ( -of) 65 | writer_options (-owo) 66 | writer ( -ow) tab 67 | planar 68 | no_covered_roots (-pcov) false 69 | acyclicity (-pacy) true 70 | connectedness (-pcon) none 71 | pproj 72 | 
marking_strategy ( -pp) head 73 | lifting_order (-plo) shortest 74 | covered_root (-pcr) none 75 | singlemalt 76 | mode ( -sm) parse 77 | diagnostics ( -di) false 78 | use_partial_tree ( -up) false 79 | propagation ( -fp) 80 | parsing_algorithm ( -a) nivreeager 81 | guide_model ( -gm) single 82 | null_value ( -nv) one 83 | diafile (-dif) stdout 84 | -------------------------------------------------------------------------------- /TedWrappers_20131015/reprojectivize.sh: --------------------------------------------------------------------------------
#!/bin/sh
# Filter used by the tedeval wrappers: reads a treebank on stdin and writes it
# to stdout.
#   reprojectivize.sh -ptb    constituency file: copied through unchanged
#   reprojectivize.sh [-conll] dependency file: run through MaltParser with
#                             "-m proj -pp head"
#
# MALTHOME may be set in the environment to point at your own install.
#MALTHOME=$SHARED/TEDEVALSTUFF/maltparser-1.7.2
MALTHOME=${MALTHOME:-$HOME/public/workspace/Code/tedeval/maltparser-1.7.2}
MALTJAR=$MALTHOME/maltparser-1.7.2.jar

if test "$1" = "-ptb" ; then # do nothing if const. file
    echo "ptb file, doing nothing" >&2
    cat
else # reprojectivize the data
    if test ! -f "$MALTJAR" ; then
        echo "reprojectivize.sh: $MALTJAR not found (set MALTHOME)" >&2
        exit 1
    fi
    java -jar "$MALTJAR" -c "pproj_$$" -m proj -pp head -i /dev/stdin -o /dev/stdout
    status=$?
    # The configuration is named after our PID so concurrent runs do not clash.
    rm -f "pproj_$$.mco"
    # NOTE(review): the repository contains a stray pproj_24934/ directory,
    # which suggests a working directory of the same name is left behind as
    # well -- confirm against MaltParser before relying on this cleanup.
    rm -rf "pproj_$$"
    exit $status
fi
-------------------------------------------------------------------------------- /TedWrappers_20131015/skip_lines.pl: --------------------------------------------------------------------------------
#!/usr/bin/perl
# Copyright (c) 2001 by David Chiang. All rights reserved.
# Modified to cope with CoNLL files by DjamSeddah (2013).
#
# Copy the records read on STDIN to STDOUT, dropping every record whose
# 1-based index is listed (one number per line) in SKIPFILE.
# usage: skip_lines.pl [-c|-mada] [-5k LANGUAGE] SKIPFILE < input
#   -c            a record is a block terminated by an empty line
#   -mada         records are separated by the "SENTENCE BREAK" banner
#   -5k LANGUAGE  stop once the per-language limit below has been output

use strict;
use warnings;
use Data::Dumper;
use Getopt::Long;

# to generate the small test sets version
# will add the option later
my @test5knoskip = qw/145 436 149 285 223 291 409 486 319/;
my @dev5knoskip  = qw/126 337 166 338 203 157 388 493 238/;

# now those are the right ones (finally!)
# NOTE(review): @test5k holds 8 values for the 9 languages of @lang, so as
# written POLISH is mapped to 319 and SWEDISH has no test limit (compare the
# tail of @test5knoskip: 409 486 319). The missing value cannot be recovered
# from this file, so the table is left untouched and a missing limit is
# reported as an error below.
my @test5k = qw/153 436 152 285 223 291 409 319 /;
my @dev5k  = qw/130 337 169 338 209 161 388 493 256 /;

# ARABIC= 145 HEBREW 223
my @lang = qw/ARABIC BASQUE FRENCH GERMAN HEBREW HUNGARIAN KOREAN POLISH SWEDISH/;
my %data = ();
for my $idx (0 .. $#lang) {
    $data{ $lang[$idx] }{dev}  = $dev5k[$idx];
    $data{ $lang[$idx] }{test} = $test5k[$idx];
}
#print Dumper(\%data);

# Record separator for STDIN: one line per record unless told otherwise.
my $KK = "\n";
if (@ARGV && $ARGV[0] eq "-c") {
    $KK = "\n\n";
    shift @ARGV;
} elsif (@ARGV && $ARGV[0] eq '-mada') {
    $KK = "--------------\nSENTENCE BREAK\n--------------\n";
    shift @ARGV;
}

my $fiveK = 0;
my $pref  = "test";    # nothing in this script ever selects the "dev" limits
my $lang  = "";
if (@ARGV && $ARGV[0] eq "-5k") {
    defined $ARGV[1]
        or die "-5k must be followed by a language (Arabic,French..)\n";
    $lang  = uc $ARGV[1];
    $fiveK = 1;
    splice @ARGV, 0, 2;
    # Without a limit the comparison in the main loop is true from the first
    # record on and the script would silently output nothing.
    defined $data{$lang}{$pref}
        or die "[skip_lines.pl] no $pref sentence limit known for language '$lang'\n";
}

# The skip list is read with the default line separator, i.e. before $/ is
# switched to the record separator of the data stream.
my $skipfile = defined $ARGV[0] ? $ARGV[0] : "";
open my $skip_fh, '<', $skipfile
    or die "[skip_lines.pl] problem with $skipfile or no file given\n";
my @lines2skip = <$skip_fh>;
close $skip_fh;
chomp @lines2skip;

my %H;
for my $entry (@lines2skip) {
    $entry =~ s/^\s*([0-9]+)\s*$/$1/;    # strip blanks around a bare number
    $H{$entry} = 1;
}

#print join("__", keys %H),"__ICI\n";

$/ = $KK;

my $read    = 1;    # index of the record being read, even if skipped
my $written = 1;    # index of the next record to be output
my $skipped = 0;
while (my $record = <STDIN>) {
    if (!exists $H{$read}) {
        # -5k: stop as soon as the limit has been output
        last if $fiveK == 1 && $written > $data{$lang}{$pref};
        print $record;
        $written++;
    } else {
        $skipped++;
    }
    $read++;
}

print STDERR "$skipped sentences skipped\n";
print STDERR "$lang = $data{$lang}{$pref} sentences\n" if ($fiveK == 1);

__END__
-------------------------------------------------------------------------------- /TedWrappers_20131015/tedeval-2.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/TedWrappers_20131015/tedeval-2.2.jar -------------------------------------------------------------------------------- /TedWrappers_20131015/tedeval.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/TedWrappers_20131015/tedeval.jar -------------------------------------------------------------------------------- /TedWrappers_20131015/tedeval_cross2.sh: --------------------------------------------------------------------------------
#!/bin/sh
# Cross-framework TedEval run against the French SPMRL gold test files.
# usage: tedeval_cross2.sh TEST.conll TEST.ptb
# $FINAL must point at the directory that contains READY_TO_SHIP_FINAL/.

if [ $# -lt 2 ] ; then
    echo "usage: `basename "$0"` TEST.conll TEST.ptb" >&2
    exit 1
fi

PROGDIR=`dirname "$0"`
TEDEVALJAR=$PROGDIR/tedeval-2.2.jar
TEDEVALJARAPP=$PROGDIR/TedEvalApps.jar


PREF=$FINAL/READY_TO_SHIP_FINAL/FRENCH_SPMRL/
GOLDCONLL=$PREF/gold/conll/test/test.French.gold.conll
GOLDPTB=$PREF/gold/ptb/test/test.French.gold.ptb
TESTCONLL=$1
TESTPTB=$2

for f in "$GOLDCONLL" "$GOLDPTB" "$TESTCONLL" "$TESTPTB" ; do
    if [ ! -f "$f" ] ; then
        echo "`basename "$0"`: $f not found" >&2
        exit 1
    fi
done

#java -Xmx768m -cop /Archive/workspace/unipar/bin/ applications.Dtreebank2Ftreebank

# Rewrite an empty outer bracket "( " at the start of a line as "(TOP ".
perl -pe 's/^\( /(TOP /' < "$GOLDPTB" > "$GOLDPTB.4tedeval"
perl -pe 's/^\( /(TOP /' < "$TESTPTB" > "$TESTPTB.4tedeval.noeval"
GOLDPTB=$GOLDPTB.4tedeval
TESTPTB=$TESTPTB.4tedeval.noeval

# Produce the .ftrees files that the evaluation below reads for framework 1.
java -Xmx768m -cp "$TEDEVALJARAPP:$TEDEVALJAR:." applications.Dtreebank2Ftreebank "$GOLDCONLL" "$GOLDCONLL.ftrees"

java -Xmx768m -cp "$TEDEVALJARAPP:$TEDEVALJAR:." applications.Dtreebank2Ftreebank "$TESTCONLL" "$TESTCONLL.ftrees"

java -Xmx768m -jar "$TEDEVALJAR" -p1 "$TESTCONLL.ftrees" -g1 "$GOLDCONLL.ftrees" -o1 "$TESTCONLL.tedeval-res" -p2 "$TESTPTB" -g2 "$GOLDPTB" -o2 "$TESTPTB.tedeval-crossfram.res"
-------------------------------------------------------------------------------- /TedWrappers_20131015/tedeval_debug.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/TedWrappers_20131015/tedeval_debug.jar -------------------------------------------------------------------------------- /TedWrappers_20131015/tedeval_seg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # wrapper script to make tedeval work on SPMRL Shared task data set 4 | # Djame Seddah 5 | 6 | # version August 19, 03:49 7 | # fixing java.nullPointer exception (removing sentence 908 and 1889 from dev arabic pred+pred) 8 | # version August 18, 02:14 9 | # fixing java.nullPointer exception (removing sentence 799 from test test arabic pred+pred) 10 | 11 | 12 | 13 | # see 14 | # http://stackoverflow.com/questions/402377/using-getopts-in-bash-shell-script-to-get-long-and-short-command-line-options/7680682#7680682 15 | # options 16 | # -d (debug version), -n 17 | # -labeled, -unlalebed (*) 18 | # -ptb, -conll (*) 19 | # -ar (for arabic, default hebrew) 20 | # -test (test set gold file used, default dev) 21 | # -cut (cut-off lenght + bad sentences removed, fixed?)
22 | # -predfile FILE (predicted parsed file) 23 | # -predmap FILE (predicted mapping file) # if not given, calculated 24 | # -gold FILE 25 | # -begin starting line to be evaluated, 1 if nothing 26 | # -end end+1 line to be evaluated +1000000 if nothing 27 | 28 | 29 | set -x 30 | 31 | # VARIABLES 32 | PROGDIR=`dirname $0` 33 | TEDEVALJAR=$PROGDIR/tedeval-2.2.jar 34 | #TEDEVALJAR=$PROGDIR/tedeval.jar 35 | LABEL="-unlabeled" 36 | TYPE="conll" 37 | TYPENAME=conll 38 | ARG="-c" 39 | LANGUAGE="" 40 | SKIPFILE=/dev/null 41 | PREF=test 42 | DOCUT=1 43 | CUTOFF=700 #should be a parameter, later... 44 | PREDFILE="" 45 | PREDMAP="" 46 | PREDLAT="" 47 | GOLDFILE="" 48 | GOLDLAT="" 49 | LATSUF=tobeparsed.gold_tagged+gold_token.lattices 50 | START=1 51 | END=100000 # let's hope that no file will ever be that long (linewise) 52 | SPMRLDATA_ROOTDIR=$SHARED/SPMRL_FINAL/READY_TO_SHIP_FINAL/ 53 | FIVEK="" 54 | GOLDSEG="" 55 | PREDSEG="" 56 | 57 | if [ -z $1 ] ; then 58 | echo "tedeval.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 59 | -D | --debug use tedeval + debug outputs 60 | -n | --new use latest tedeval-2.2.jar (default is 2.1 61 | -u | --unlabeled unlabeled evaluation (default) 62 | -l | --labeled labeled evaluation 63 | -p | --ptb evaluate const. files 64 | -c | --conll evaluate conll files (default) 65 | -a | --arabic dev mode for Arabic, do not use 66 | -h | --hebrew dev mode for Hebrew, do not use 67 | -y | --any LANGUAGE LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...) 68 | -t | --test dev mode, test file, do not use 69 | -d | --dev dev mode, test file, do not use 70 | -k | --cut CUTOFF mode (tedeval is really slow for long sentences , length cutoff hardcoded to 70 71 | -s | --system FILE test file to be evaluated 72 | -g | --gold FILE gold standard file ;; 73 | -P | --P FILE pred seg 74 | -G | --G FILE gold seg 75 | -L | --predlat FILE predicted lattice files as provided by SPMRL. 
If not given, spmrl one will be used 76 | -m | --predmap mapping for predicted files (use it only when with non-spmrl predicted file . Generated from predlat file otherwise 77 | -b | --begin line ID to start evaluate 78 | -e | --end line ID+1 to stop the evaluation 79 | --spmrldata_rootdir root directory to the SPMRL FINAL DATA SET (default \$FINAL/SPMRL_FINAL/READY_TO_SHIP_FINAL/ 80 | --help 81 | " ; 82 | exit 83 | fi 84 | 85 | echo "###########################" 86 | echo "Running\: tedeval.sh $@" 87 | echo "###########################" 88 | echo "\n" 89 | TEMP=`getopt -o DnlupcAHy:tdks:m:L:g:b:e:P:G: --long help,debug,new,labeled,unlabeled,ptb,conll,Arabic,Hebrew,any:,test,dev,cut,predfile:,predmap:,predlat:,gold:,begin:,end:,spmrldata_rootdir,fivek,P:,G: -- "$@"` 90 | 91 | if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi 92 | 93 | # Note the quotes around `$TEMP': they are essential! 94 | eval set -- "$TEMP" 95 | 96 | 97 | while true; do 98 | case "$1" in 99 | -D | --debug ) TEDEVALJAR=$PROGDIR/tedeval_debug.jar; shift ;; 100 | -n | --new ) TEDEVALJAR=$PROGDIR/tedeval-2.2.jar; shift ;; 101 | -P | --P ) PREDSEG="$2" ; shift 2;; 102 | -G | --G ) GOLDSEG="$2" ; shift 2;; 103 | -u | --unlabeled ) LABEL="-unlabeled" ; shift ;; 104 | -l | --labeled ) LABEL="" ; shift ;; 105 | -p | --ptb ) TYPE=ptb ; TYPENAME=bracketed ; ARG="" ; shift ;; 106 | -c | --conll ) TYPE=conll ; TYPENAME=conll ; ARG="-c" ; shift ;; 107 | -A | --arabic ) LANGUAGE=ARABIC ;SUF=""; SKIPFILE="" ; exit ; shift ;; 108 | -H | --hebrew ) LANGUAGE=HEBREW ; shift ;; 109 | -y | --any ) LANGUAGE="$2" ; shift 2 ;; 110 | -t | --test ) PREF="test" ; shift ;; 111 | -d | --dev ) PREF="dev" ; shift ;; 112 | -k | --cut ) DOCUT=1 ; shift ;; 113 | -s | --system ) PREDFILE="$2" ; shift 2;; 114 | -g | --gold ) GOLDFILE="$2" ; shift 2;; 115 | -m | --predmap ) PREDMAP="$2" ; shift 2;; 116 | -L | --predlat ) PREDLAT="$2" ; shift 2;; 117 | -b | --begin ) START="$2" ; shift 2;; 118 | -e | --end ) END="$2" ; 
shift 2;; 119 | -R | --spmrldata_rootdir ) SPMRLDATA_ROOTDIR="$2" ; shift 2;; 120 | --fivek ) FIVEK="-5k" ; shift ;; 121 | --help ) 122 | echo "tedeval_simple.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 123 | -D | --debug use tedeval + debug outputs 124 | -n | --new use latest tedeval-2.2.jar (default is 2.1 125 | -u | --unlabeled unlabeled evaluation (default) 126 | -l | --labeled labeled evaluation 127 | -p | --ptb evaluate const. files 128 | -c | --conll evaluate conll files (default) 129 | -a | --arabic dev mode for Arabic, do not use 130 | -h | --hebrew dev mode for Hebrew, do not use 131 | -y | --any LANGUAGE LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...) 132 | -t | --test dev mode, test file, do not use 133 | -d | --dev dev mode, test file, do not use 134 | -k | --cut CUTOFF mode (tedeval is really slow for long sentences , length cutoff hardcoded to 70 135 | -s | --system FILE test file to be evaluated 136 | -g | --gold FILE gold standard file ;; 137 | -P | --P FILE pred seg 138 | -G | --G FILE gold seg 139 | -b | --begin line ID to start evaluate 140 | -e | --end line ID+1 to stop the evaluation 141 | --spmrldata_rootdir root directory to the SPMRL FINAL DATA SET (default \$FINAL/SPMRL_FINAL/READY_TO_SHIP_FINAL/ 142 | --help 143 | " ; 144 | shift ; exit 1;; 145 | -- ) shift; break ;; 146 | * ) break ;; 147 | esac 148 | done 149 | 150 | 151 | # *** INIT 152 | # arabic lines bug 153 | #START=699 # should be 799 but 100 sentences > 70 were removed 154 | #START=167 # same bug as 699 but in labeled 155 | 156 | #END=700 157 | #END=168 158 | LDIR=`echo $LANGUAGE | perl -ne 'print uc $_'`_SPMRL 159 | LANGUPPED=`echo $LANGUAGE | perl -ne 'print uc $_'` 160 | LANGUAGE=`echo $LANGUAGE|perl -p -ne '$_ = ucfirst lc $_ ;'` 161 | if [ ! 
-z "$FIVEK" ] ; then 162 | FIVEK="-5k $LANGUPPED" 163 | fi 164 | 165 | 166 | if [ -z "$PREDFILE" ]; then 167 | PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed$SUF 168 | fi 169 | 170 | if [ ! -f "$PREDFILE" ] ; then 171 | echo "PREDFILE: $PREDFILE not found" 172 | exit 0 173 | fi 174 | 175 | echo "LDIR = $LDIR LANGUAGE $LANGUAGE PREDLAT $PREDLAT" 176 | #exit 177 | 178 | 179 | 180 | 181 | # we're always picking the 182 | if [ -z "$GOLDFILE" ] ; then 183 | GOLDFILE=${SPMRLDATA_ROOTDIR}/${LDIR}/gold/$TYPE/${PREF}/${PREF}.${LANGUAGE}.gold.$TYPE 184 | fi 185 | 186 | 187 | 188 | # checking all files 189 | 190 | 191 | if [ -f "$GOLDFILE" ] ; then 192 | echo "gold: $GOLDFILE found" 193 | else 194 | wc -L $GOLDFILE 195 | echo "gold: $GOLDFILE not found" 196 | exit 197 | fi 198 | 199 | 200 | 201 | if [ -f "$PREDFILE" ] ; then 202 | echo "pred file: $PREDFILE found" 203 | else 204 | echo "pred file: $PREDFILE not found" 205 | exit 206 | fi 207 | 208 | 209 | 210 | #exit 211 | 212 | 213 | #PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices 214 | 215 | # dev.Hebrew.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.parsed.lattices.5k 216 | # test.Arabic.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed.full 217 | 218 | 219 | # fixing the blank line bug 220 | #perl -i.bak -pe 's/^\s*$/\n/g' $GOLFILE $PREDFILE $PREDLAT $GOLDLAT 221 | 222 | #################################################### 223 | ## real workd done here 224 | ###################################################### 225 | 226 | if test $DOCUT = 1 ; then 227 | echo "generating lines to be skipped" 228 | cat $GOLDFILE | perl -pe 's/^\s*$/\n/' | $PROGDIR/get_cutoffed_sent.pl -c -K $CUTOFF /dev/null > $GOLDFILE.tobeskipped 229 | if test "$PREF.$LANGUAGE" = "test.Arabic" ; then 230 | #echo -e "\n799" >> $GOLDFILELAT.tobeskipped # buggy sentence for unlabeld evaluation on arabic test 231 | echo "\n" >> 
$GOLDFILE.tobeskipped 232 | elif test "$PREF.$LANGUAGE" = "dev.Arabic" ; then 233 | #echo -e "\n904\n1889" >> $GOLDFILELAT.tobeskipped #buggy sentence for unlabeld evaluation on arabic dev 234 | echo "\n" >> $GOLDFILE.tobeskipped 235 | 236 | fi 237 | SKIPFILE=$GOLDFILE.tobeskipped 238 | else 239 | SKIPFILE=/dev/null 240 | fi 241 | 242 | 243 | #exit 244 | 245 | echo "generating normalized files" 246 | echo -e "\t==> gold" 247 | wc $ARG $GOLDFILE 248 | cat $GOLDFILE|perl -pe 's/^\s*$/\n/'| $PROGDIR/skip_lines.pl $ARG $FIVEK $SKIPFILE | $PROGDIR/lines $ARG $START $END | $PROGDIR/reprojectivize.sh -$TYPE| $PROGDIR/clean$TYPE.pl| perl -pe 's/^\s*$/\n/' |uniq > $GOLDFILE.4tedeval.$$ 249 | wc $ARG $GOLDFILE.4tedeval.$$ 250 | cat $GOLDSEG > $GOLDSEG.4tedeval.$$ 251 | 252 | echo -e "\t==> pred" 253 | wc $ARG $PREDFILE 254 | cat $PREDFILE|perl -pe 's/^\s*$/\n/'|$PROGDIR/skip_lines.pl $ARG $FIVEK $SKIPFILE | $PROGDIR/lines $ARG $START $END | $PROGDIR/reprojectivize.sh -$TYPE| $PROGDIR/clean$TYPE.pl| perl -pe 's/^\s*$/\n/' |uniq > $PREDFILE.4tedeval.$$ #no idea why twice 255 | wc $ARG $PREDFILE.4tedeval.$$ 256 | cat $PREDSEG > $PREDSEG.4tedeval.$$ 257 | #exit 258 | 259 | # that was to generate a fake arabic parsed file 260 | #cat $PREDLAT | lines $ARG 1 $END | cut -f2-7 | add_fake_col.pl | clean$TYPE.pl | perl -pe 's/^\s*$/\n/' |uniq> $PREDLAT.fake 261 | 262 | # normal gold vs pred (LABELED) 263 | 264 | if test $LABEL = "-unlabeled" ; then 265 | SUF="-unlabeled" 266 | else 267 | SUF="-labeled" 268 | fi 269 | 270 | #java -Xmx768m -jar $TEDEVALJAR $LABEL -g $GOLDFILE.4tedeval.$$ -p $PREDFILE.4tedeval.$$ -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF 271 | java -Xmx768m -jar $TEDEVALJAR $LABEL -g $GOLDFILE.4tedeval.$$ -sg $GOLDSEG.4tedeval.$$ -p $PREDFILE.4tedeval.$$ -sp $PREDSEG.4tedeval.$$ -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF 272 | file="$PREDFILE.4tedeval.simple_tedeval.res$SUF.ted" 273 | 274 | 275 | 276 | #echo "java -Xmx768m -jar 
$TEDEVALJAR $LABEL -g $GOLDFILE.4tedeval.$$ -p $PREDFILE.4tedeval.$$ -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF" > /dev/stderr 277 | echo "java -Xmx768m -jar $TEDEVALJAR $LABEL -g $GOLDFILE.4tedeval.$$ -sg $GOLDSEG.4tedeval.$$ -p $PREDFILE.4tedeval.$$ -sp $PREDSEG.4tedeval.$$ -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF" > /dev/stderr 278 | echo " " 279 | cat $file | grep "AVG:"| perl -p -s -e 'chomp ; s/(.)$/\1\t$file \n/' -- -file=$file | $PROGDIR/get_ted_res.pl 280 | 281 | 282 | echo -e "\n\n" 283 | 284 | rm -f $GOLDFILE.4tedeval.$$ $PREDFILE.4tedeval.$$ $GOLDSEG.4tedeval.$$ $PREDSEG.4tedeval.$$ 285 | 286 | #eval gold vs gold => 100% 287 | #java -Xmx3g -jar $TEDEVALJAR -g $GOLDFILE.4tedeval -sg $GOLDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE 288 | 289 | # eval pred vs gold 290 | #java -server -Xmx3g -jar $TEDEVALJAR -g $PREDFILE.4tedeval.$$ -sg $PREDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE 291 | 292 | # eval pred vs pred => 100% 293 | #java -server -Xmx3g -jar $TEDEVALJAR -g $PREDFILE.4tedeval.$$ -sg $PREDLAT.4tedeval.mapping -p $PREDFILE.4tedeval.$$ -sp $PREDLAT.4tedeval.mapping -format $TYPE 294 | 295 | exit 296 | 297 | -------------------------------------------------------------------------------- /TedWrappers_20131015/tedeval_simple.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # wrapper script to make tedeval work on SPMRL Shared task data set 4 | # Djame Seddah 5 | 6 | # version August 19, 03:49 7 | # fixing java.nullPointer exception (removing sentence 908 and 1889 from dev arabic pred+pred) 8 | # version August 18, 02:14 9 | # fixing java.nullPointer exception (removing sentence 799 from test test arabic pred+pred) 10 | 11 | 12 | 13 | # see 14 | # 
http://stackoverflow.com/questions/402377/using-getopts-in-bash-shell-script-to-get-long-and-short-command-line-options/7680682#7680682 15 | # options 16 | # -d (debug version), -n 17 | # -labeled, -unlalebed (*) 18 | # -ptb, -conll (*) 19 | # -ar (for arabic, default hebrew) 20 | # -test (test set gold file used, default dev) 21 | # -cut (cut-off lenght + bad sentences removed, fixed?) 22 | # -predfile FILE (predicted parsed file) 23 | # -predmap FILE (predicted mapping file) # if not given, calculated 24 | # -gold FILE 25 | # -begin starting line to be evaluated, 1 if nothing 26 | # -end end+1 line to be evaluated +1000000 if nothing 27 | 28 | 29 | set -x 30 | 31 | # VARIABLES 32 | PROGDIR=`dirname $0` 33 | TEDEVALJAR=$PROGDIR/tedeval-2.2.jar 34 | #TEDEVALJAR=$PROGDIR/tedeval.jar 35 | LABEL="-unlabeled" 36 | TYPE="conll" 37 | TYPENAME=conll 38 | ARG="-c" 39 | LANGUAGE="" 40 | SKIPFILE=/dev/null 41 | PREF=test 42 | DOCUT=1 43 | CUTOFF=70 #should be a parameter, later... 44 | PREDFILE="" 45 | PREDMAP="" 46 | PREDLAT="" 47 | GOLDFILE="" 48 | GOLDLAT="" 49 | LATSUF=tobeparsed.gold_tagged+gold_token.lattices 50 | START=1 51 | END=100000 # let's hope that no file will ever be that long (linewise) 52 | SPMRLDATA_ROOTDIR=$SHARED/SPMRL_FINAL/READY_TO_SHIP_FINAL/ 53 | FIVEK="" 54 | 55 | if [ -z $1 ] ; then 56 | echo "tedeval.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 57 | -D | --debug use tedeval + debug outputs 58 | -n | --new use latest tedeval-2.2.jar (default is 2.1 59 | -u | --unlabeled unlabeled evaluation (default) 60 | -l | --labeled labeled evaluation 61 | -p | --ptb evaluate const. files 62 | -c | --conll evaluate conll files (default) 63 | -a | --arabic dev mode for Arabic, do not use 64 | -h | --hebrew dev mode for Hebrew, do not use 65 | -y | --any LANGUAGE LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...) 
66 | -t | --test dev mode, test file, do not use 67 | -d | --dev dev mode, test file, do not use 68 | -k | --cut CUTOFF mode (tedeval is really slow for long sentences , length cutoff hardcoded to 70 69 | -s | --system FILE test file to be evaluated 70 | -g | --gold FILE gold standard file ;; 71 | -L | --predlat FILE predicted lattice files as provided by SPMRL. If not given, spmrl one will be used 72 | -m | --predmap mapping for predicted files (use it only when with non-spmrl predicted file . Generated from predlat file otherwise 73 | -b | --begin line ID to start evaluate 74 | -e | --end line ID+1 to stop the evaluation 75 | --spmrldata_rootdir root directory to the SPMRL FINAL DATA SET (default \$FINAL/SPMRL_FINAL/READY_TO_SHIP_FINAL/ 76 | --help 77 | " ; 78 | exit 79 | fi 80 | 81 | echo "###########################" 82 | echo "Running\: tedeval.sh $@" 83 | echo "###########################" 84 | echo "\n" 85 | TEMP=`getopt -o DnlupcAHy:tdks:m:L:g:b:e: --long help,debug,new,labeled,unlabeled,ptb,conll,Arabic,Hebrew,any:,test,dev,cut,predfile:,predmap:,predlat:,gold:,begin:,end:,spmrldata_rootdir,fivek -- "$@"` 86 | 87 | if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi 88 | 89 | # Note the quotes around `$TEMP': they are essential! 
90 | eval set -- "$TEMP" 91 | 92 | 93 | while true; do 94 | case "$1" in 95 | -D | --debug ) TEDEVALJAR=$PROGDIR/tedeval_debug.jar; shift ;; 96 | -n | --new ) TEDEVALJAR=$PROGDIR/tedeval-2.2.jar; shift ;; 97 | -u | --unlabeled ) LABEL="-unlabeled" ; shift ;; 98 | -l | --labeled ) LABEL="" ; shift ;; 99 | -p | --ptb ) TYPE=ptb ; TYPENAME=bracketed ; ARG="" ; shift ;; 100 | -c | --conll ) TYPE=conll ; TYPENAME=conll ; ARG="-c" ; shift ;; 101 | -A | --arabic ) LANGUAGE=ARABIC ;SUF=""; SKIPFILE="" ; exit ; shift ;; 102 | -H | --hebrew ) LANGUAGE=HEBREW ; shift ;; 103 | -y | --any ) LANGUAGE="$2" ; shift 2 ;; 104 | -t | --test ) PREF="test" ; shift ;; 105 | -d | --dev ) PREF="dev" ; shift ;; 106 | -k | --cut ) DOCUT=1 ; shift ;; 107 | -s | --system ) PREDFILE="$2" ; shift 2;; 108 | -g | --gold ) GOLDFILE="$2" ; shift 2;; 109 | -m | --predmap ) PREDMAP="$2" ; shift 2;; 110 | -L | --predlat ) PREDLAT="$2" ; shift 2;; 111 | -b | --begin ) START="$2" ; shift 2;; 112 | -e | --end ) END="$2" ; shift 2;; 113 | -R | --spmrldata_rootdir ) SPMRLDATA_ROOTDIR="$2" ; shift 2;; 114 | --fivek ) FIVEK="-5k" ; shift ;; 115 | --help ) 116 | echo "tedeval_simple.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 117 | -D | --debug use tedeval + debug outputs 118 | -n | --new use latest tedeval-2.2.jar (default is 2.1 119 | -u | --unlabeled unlabeled evaluation (default) 120 | -l | --labeled labeled evaluation 121 | -p | --ptb evaluate const. files 122 | -c | --conll evaluate conll files (default) 123 | -a | --arabic dev mode for Arabic, do not use 124 | -h | --hebrew dev mode for Hebrew, do not use 125 | -y | --any LANGUAGE LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...) 
126 | -t | --test dev mode, test file, do not use 127 | -d | --dev dev mode, test file, do not use 128 | -k | --cut CUTOFF mode (tedeval is really slow for long sentences , length cutoff hardcoded to 70 129 | -s | --system FILE test file to be evaluated 130 | -g | --gold FILE gold standard file ;; 131 | -b | --begin line ID to start evaluate 132 | -e | --end line ID+1 to stop the evaluation 133 | --spmrldata_rootdir root directory to the SPMRL FINAL DATA SET (default \$FINAL/SPMRL_FINAL/READY_TO_SHIP_FINAL/ 134 | --help 135 | " ; 136 | shift ; exit 1;; 137 | -- ) shift; break ;; 138 | * ) break ;; 139 | esac 140 | done 141 | 142 | 143 | # *** INIT 144 | # arabic lines bug 145 | #START=699 # should be 799 but 100 sentences > 70 were removed 146 | #START=167 # same bug as 699 but in labeled 147 | 148 | #END=700 149 | #END=168 150 | LDIR=`echo $LANGUAGE | perl -ne 'print uc $_'`_SPMRL 151 | LANGUPPED=`echo $LANGUAGE | perl -ne 'print uc $_'` 152 | LANGUAGE=`echo $LANGUAGE|perl -p -ne '$_ = ucfirst lc $_ ;'` 153 | if [ ! -z "$FIVEK" ] ; then 154 | FIVEK="-5k $LANGUPPED" 155 | fi 156 | 157 | 158 | if [ -z "$PREDFILE" ]; then 159 | PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed$SUF 160 | fi 161 | 162 | if [ ! 
-f "$PREDFILE" ] ; then 163 | echo "PREDFILE: $PREDFILE not found" 164 | exit 0 165 | fi 166 | 167 | echo "LDIR = $LDIR LANGUAGE $LANGUAGE PREDLAT $PREDLAT" 168 | #exit 169 | 170 | 171 | 172 | 173 | # we're always picking the 174 | if [ -z "$GOLDFILE" ] ; then 175 | GOLDFILE=${SPMRLDATA_ROOTDIR}/${LDIR}/gold/$TYPE/${PREF}/${PREF}.${LANGUAGE}.gold.$TYPE 176 | fi 177 | 178 | 179 | 180 | # checking all files 181 | 182 | 183 | if [ -f "$GOLDFILE" ] ; then 184 | echo "gold: $GOLDFILE found" 185 | else 186 | wc -L $GOLDFILE 187 | echo "gold: $GOLDFILE not found" 188 | exit 189 | fi 190 | 191 | 192 | 193 | if [ -f "$PREDFILE" ] ; then 194 | echo "pred file: $PREDFILE found" 195 | else 196 | echo "pred file: $PREDFILE not found" 197 | exit 198 | fi 199 | 200 | 201 | 202 | #exit 203 | 204 | 205 | #PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices 206 | 207 | # dev.Hebrew.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.parsed.lattices.5k 208 | # test.Arabic.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed.full 209 | 210 | 211 | # fixing the blank line bug 212 | #perl -i.bak -pe 's/^\s*$/\n/g' $GOLFILE $PREDFILE $PREDLAT $GOLDLAT 213 | 214 | #################################################### 215 | ## real workd done here 216 | ###################################################### 217 | 218 | if test $DOCUT = 1 ; then 219 | echo "generating lines to be skipped" 220 | cat $GOLDFILE | perl -pe 's/^\s*$/\n/' | ./get_cutoffed_sent.pl -c -K 70 /dev/null > $GOLDFILE.tobeskipped 221 | if test "$PREF.$LANGUAGE" = "test.Arabic" ; then 222 | #echo -e "\n799" >> $GOLDFILELAT.tobeskipped # buggy sentence for unlabeld evaluation on arabic test 223 | echo "\n" >> $GOLDFILE.tobeskipped 224 | elif test "$PREF.$LANGUAGE" = "dev.Arabic" ; then 225 | #echo -e "\n904\n1889" >> $GOLDFILELAT.tobeskipped #buggy sentence for unlabeld evaluation on arabic dev 226 | echo "\n" >> $GOLDFILE.tobeskipped 227 | 228 | fi 229 | 
SKIPFILE=$GOLDFILE.tobeskipped 230 | else 231 | SKIPFILE=/dev/null 232 | fi 233 | 234 | 235 | #exit 236 | 237 | echo "generating normalized files" 238 | echo -e "\t==> gold" 239 | wc $ARG $GOLDFILE 240 | cat $GOLDFILE|perl -pe 's/^\s*$/\n/'| ./skip_lines.pl $ARG $FIVEK $SKIPFILE | ./lines $ARG $START $END | ./reprojectivize.sh -$TYPE| ./clean$TYPE.pl| perl -pe 's/^\s*$/\n/' |uniq > $GOLDFILE.4tedeval.$$ 241 | wc $ARG $GOLDFILE.4tedeval.$$ 242 | 243 | echo -e "\t==> pred" 244 | wc $ARG $PREDFILE 245 | cat $PREDFILE|perl -pe 's/^\s*$/\n/'|./skip_lines.pl $ARG $FIVEK $SKIPFILE | ./lines $ARG $START $END | ./reprojectivize.sh -$TYPE| ./clean$TYPE.pl| perl -pe 's/^\s*$/\n/' |uniq > $PREDFILE.4tedeval.$$ #no idea why twice 246 | wc $ARG $PREDFILE.4tedeval.$$ 247 | #exit 248 | 249 | # that was to generate a fake arabic parsed file 250 | #cat $PREDLAT | lines $ARG 1 $END | cut -f2-7 | add_fake_col.pl | clean$TYPE.pl | perl -pe 's/^\s*$/\n/' |uniq> $PREDLAT.fake 251 | 252 | # normal gold vs pred (LABELED) 253 | 254 | if test $LABEL = "-unlabeled" ; then 255 | SUF="-unlabeled" 256 | else 257 | SUF="-labeled" 258 | fi 259 | 260 | java -Xmx768m -jar $TEDEVALJAR $LABEL -g $GOLDFILE.4tedeval.$$ -p $PREDFILE.4tedeval.$$ -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF 261 | file="$PREDFILE.4tedeval.simple_tedeval.res$SUF.ted" 262 | 263 | 264 | 265 | echo "java -Xmx768m -jar $TEDEVALJAR $LABEL -g $GOLDFILE.4tedeval.$$ -p $PREDFILE.4tedeval.$$ -format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF" > /dev/stderr 266 | echo " " 267 | cat $file | grep "AVG:"| perl -p -s -e 'chomp ; s/(.)$/\1\t$file \n/' -- -file=$file | ./get_ted_res.pl 268 | 269 | 270 | echo -e "\n\n" 271 | 272 | rm -f $GOLDFILE.4tedeval.$$ $PREDFILE.4tedeval.$$ 273 | 274 | #eval gold vs gold => 100% 275 | #java -Xmx3g -jar $TEDEVALJAR -g $GOLDFILE.4tedeval -sg $GOLDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE 276 | 277 | # eval pred vs gold 278 | #java 
-server -Xmx3g -jar $TEDEVALJAR -g $PREDFILE.4tedeval.$$ -sg $PREDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE 279 | 280 | # eval pred vs pred => 100% 281 | #java -server -Xmx3g -jar $TEDEVALJAR -g $PREDFILE.4tedeval.$$ -sg $PREDLAT.4tedeval.mapping -p $PREDFILE.4tedeval.$$ -sp $PREDLAT.4tedeval.mapping -format $TYPE 282 | 283 | exit 284 | 285 | -------------------------------------------------------------------------------- /TedWrappers_20131015/tedeval_simple.sh.good: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # wrapper script to make tedeval work on SPMRL Shared task data set 4 | # Djame Seddah 5 | 6 | # version August 19, 03:49 7 | # fixing java.nullPointer exception (removing sentence 908 and 1889 from dev arabic pred+pred) 8 | # version August 18, 02:14 9 | # fixing java.nullPointer exception (removing sentence 799 from test test arabic pred+pred) 10 | 11 | 12 | 13 | # see 14 | # http://stackoverflow.com/questions/402377/using-getopts-in-bash-shell-script-to-get-long-and-short-command-line-options/7680682#7680682 15 | # options 16 | # -d (debug version), -n 17 | # -labeled, -unlalebed (*) 18 | # -ptb, -conll (*) 19 | # -ar (for arabic, default hebrew) 20 | # -test (test set gold file used, default dev) 21 | # -cut (cut-off lenght + bad sentences removed, fixed?) 22 | # -predfile FILE (predicted parsed file) 23 | # -predmap FILE (predicted mapping file) # if not given, calculated 24 | # -gold FILE 25 | # -begin starting line to be evaluated, 1 if nothing 26 | # -end end+1 line to be evaluated +1000000 if nothing 27 | 28 | 29 | #set -x 30 | 31 | # VARIABLES 32 | PROGDIR=`dirname $0` 33 | TEDEVALJAR=$PROGDIR/tedeval-2.2.jar 34 | #TEDEVALJAR=$PROGDIR/tedeval.jar 35 | LABEL="-unlabeled" 36 | TYPE="conll" 37 | TYPENAME=conll 38 | ARG="-c" 39 | LANGUAGE="" 40 | SKIPFILE=/dev/null 41 | PREF=test 42 | DOCUT=1 43 | CUTOFF=70 #should be a parameter, later... 
44 | PREDFILE="" 45 | PREDMAP="" 46 | PREDLAT="" 47 | GOLDFILE="" 48 | GOLDLAT="" 49 | LATSUF=tobeparsed.gold_tagged+gold_token.lattices 50 | START=1 51 | END=100000 # let's hope that no file will ever be that long (linewise) 52 | SPMRLDATA_ROOTDIR=$SHARED/SPMRL_FINAL/READY_TO_SHIP_FINAL/ 53 | 54 | 55 | if [ -z $1 ] ; then 56 | echo "tedeval.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 57 | -D | --debug use tedeval + debug outputs 58 | -n | --new use latest tedeval-2.2.jar (default is 2.1 59 | -u | --unlabeled unlabeled evaluation (default) 60 | -l | --labeled labeled evaluation 61 | -p | --ptb evaluate const. files 62 | -c | --conll evaluate conll files (default) 63 | -a | --arabic dev mode for Arabic, do not use 64 | -h | --hebrew dev mode for Hebrew, do not use 65 | -y | --any LANGUAGE LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...) 66 | -t | --test dev mode, test file, do not use 67 | -d | --dev dev mode, test file, do not use 68 | -k | --cut CUTOFF mode (tedeval is really slow for long sentences , length cutoff hardcoded to 70 69 | -s | --system FILE test file to be evaluated 70 | -g | --gold FILE gold standard file ;; 71 | -L | --predlat FILE predicted lattice files as provided by SPMRL. If not given, spmrl one will be used 72 | -m | --predmap mapping for predicted files (use it only when with non-spmrl predicted file . 
Generated from predlat file otherwise 73 | -b | --begin line ID to start evaluate 74 | -e | --end line ID+1 to stop the evaluation 75 | --spmrldata_rootdir root directory to the SPMRL FINAL DATA SET (default \$FINAL/SPMRL_FINAL/READY_TO_SHIP_FINAL/ 76 | --help 77 | " ; 78 | exit 79 | fi 80 | 81 | echo "###########################" 82 | echo "Running\: tedeval.sh $@" 83 | echo "###########################" 84 | echo "\n" 85 | TEMP=`/sw/bin/getopt -o DnlupcAHy:tdks:m:L:g:b:e: --long help,debug,new,labeled,unlabeled,ptb,conll,Arabic,Hebrew,any:,test,dev,cut,predfile:,predmap:,predlat:,gold:,begin:,end:,spmrldata_rootdir -- "$@"` 86 | 87 | if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi 88 | 89 | # Note the quotes around `$TEMP': they are essential! 90 | eval set -- "$TEMP" 91 | 92 | 93 | while true; do 94 | case "$1" in 95 | -D | --debug ) TEDEVALJAR=$PROGDIR/tedeval_debug.jar; shift ;; 96 | -n | --new ) TEDEVALJAR=$PROGDIR/tedeval-2.2.jar; shift ;; 97 | -u | --unlabeled ) LABEL="-unlabeled" ; shift ;; 98 | -l | --labeled ) LABEL="" ; shift ;; 99 | -p | --ptb ) TYPE=ptb ; TYPENAME=bracketed ; ARG="" ; shift ;; 100 | -c | --conll ) TYPE=conll ; TYPENAME=conll ; ARG="-c" ; shift ;; 101 | -A | --arabic ) LANGUAGE=ARABIC ;SUF=""; SKIPFILE=""; echo "ici arabic" ; exit ; shift ;; 102 | -H | --hebrew ) LANGUAGE=HEBREW ; shift ;; 103 | -y | --any ) LANGUAGE="$2" ; shift 2 ;; 104 | -t | --test ) PREF="test" ; shift ;; 105 | -d | --dev ) PREF="dev" ; shift ;; 106 | -k | --cut ) DOCUT=1 ; shift ;; 107 | -s | --system ) PREDFILE="$2" ; shift 2;; 108 | -g | --gold ) GOLDFILE="$2" ; shift 2;; 109 | -m | --predmap ) PREDMAP="$2" ; shift 2;; 110 | -L | --predlat ) PREDLAT="$2" ; shift 2;; 111 | -b | --begin ) START="$2" ; shift 2;; 112 | -e | --end ) END="$2" ; shift 2;; 113 | -R | --spmrldata_rootdir ) SPMRLDATA_ROOTDIR="$2" ; shift 2;; 114 | --help ) 115 | echo "tedeval_simple.sh OPTIONS -g GOLDFILE -s SYSTEMFILE 116 | -D | --debug use tedeval + debug outputs 117 | -n 
| --new use latest tedeval-2.2.jar (default is 2.1 118 | -u | --unlabeled unlabeled evaluation (default) 119 | -l | --labeled labeled evaluation 120 | -p | --ptb evaluate const. files 121 | -c | --conll evaluate conll files (default) 122 | -a | --arabic dev mode for Arabic, do not use 123 | -h | --hebrew dev mode for Hebrew, do not use 124 | -y | --any LANGUAGE LANGUAGE being one the SPRML 2013 shared task (ARABIC, FRENCH,...) 125 | -t | --test dev mode, test file, do not use 126 | -d | --dev dev mode, test file, do not use 127 | -k | --cut CUTOFF mode (tedeval is really slow for long sentences , length cutoff hardcoded to 70 128 | -s | --system FILE test file to be evaluated 129 | -g | --gold FILE gold standard file ;; 130 | -b | --begin line ID to start evaluate 131 | -e | --end line ID+1 to stop the evaluation 132 | --spmrldata_rootdir root directory to the SPMRL FINAL DATA SET (default \$FINAL/SPMRL_FINAL/READY_TO_SHIP_FINAL/ 133 | --help 134 | " ; 135 | shift ; exit 1;; 136 | -- ) shift; break ;; 137 | * ) break ;; 138 | esac 139 | done 140 | 141 | # arabic lines bug 142 | #START=699 # should be 799 but 100 sentences > 70 were removed 143 | #START=167 # same bug as 699 but in labeled 144 | 145 | #END=700 146 | #END=168 147 | LDIR=`echo $LANGUAGE | perl -ne 'print uc $_'`_SPMRL 148 | LANGUAGE=`echo $LANGUAGE|perl -p -ne '$_ = ucfirst lc $_ ;'` 149 | 150 | if [ -z "$PREDFILE" ]; then 151 | PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed$SUF 152 | fi 153 | 154 | if [ ! 
-f "$PREDFILE" ] ; then 155 | echo "PREDFILE: $PREDFILE not found" 156 | exit 0 157 | fi 158 | 159 | echo "LDIR = $LDIR LANGUAGE $LANGUAGE PREDLAT $PREDLAT" 160 | #exit 161 | 162 | 163 | 164 | 165 | # we're always picking the 166 | if [ -z "$GOLDFILE" ] ; then 167 | GOLDFILE=${PREF}.${LANGUAGE}.gold.$TYPE 168 | fi 169 | 170 | 171 | 172 | # checking all files 173 | 174 | 175 | if [ -f "$GOLDFILE" ] ; then 176 | echo "gold: $GOLDFILE found" 177 | else 178 | wc -L $GOLDFILE 179 | echo "gold: $GOLDFILE not found" 180 | exit 181 | fi 182 | 183 | 184 | 185 | if [ -f "$PREDFILE" ] ; then 186 | echo "pred file: $PREDFILE found" 187 | else 188 | echo "pred file: $PREDFILE not found" 189 | exit 190 | fi 191 | 192 | 193 | 194 | #exit 195 | 196 | 197 | #PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices 198 | 199 | # dev.Hebrew.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.parsed.lattices.5k 200 | # test.Arabic.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed.full 201 | 202 | 203 | # fixing the blank line bug 204 | #perl -i.bak -pe 's/^\s*$/\n/g' $GOLFILE $PREDFILE $PREDLAT $GOLDLAT 205 | 206 | #################################################### 207 | ## real workd done here 208 | ###################################################### 209 | 210 | if test $DOCUT = 1 ; then 211 | echo "generating lines to be skipped" 212 | cat $GOLDFILE | perl -pe 's/^\s*$/\n/' |get_cutoffed_sent.pl -c -K 70 /dev/null > $GOLDFILE.tobeskipped 213 | if test "$PREF.$LANGUAGE" = "test.Arabic" ; then 214 | #echo -e "\n799" >> $GOLDFILELAT.tobeskipped # buggy sentence for unlabeld evaluation on arabic test 215 | echo "\n" >> $GOLDFILE.tobeskipped 216 | elif test "$PREF.$LANGUAGE" = "dev.Arabic" ; then 217 | #echo -e "\n904\n1889" >> $GOLDFILELAT.tobeskipped #buggy sentence for unlabeld evaluation on arabic dev 218 | echo "\n" >> $GOLDFILE.tobeskipped 219 | 220 | fi 221 | SKIPFILE=$GOLDFILE.tobeskipped 222 | else 223 | SKIPFILE=/dev/null 
224 | fi 225 | 226 | 227 | #exit 228 | 229 | echo "generating normalized files" 230 | echo -e "\t==> gold" 231 | wc $ARG $GOLDFILE 232 | cat $GOLDFILE|perl -pe 's/^\s*$/\n/'| skip_lines.pl $ARG $SKIPFILE | lines $ARG $START $END | reprojectivize.sh -$TYPE| clean$TYPE.pl| perl -pe 's/^\s*$/\n/' |uniq > $GOLDFILE.4tedeval.$$ 233 | wc $ARG $GOLDFILE.4tedeval.$$ 234 | 235 | echo -e "\t==> pred" 236 | wc $ARG $PREDFILE 237 | cat $PREDFILE|perl -pe 's/^\s*$/\n/'|skip_lines.pl $ARG $SKIPFILE | lines $ARG $START $END | reprojectivize.sh -$TYPE| clean$TYPE.pl| perl -pe 's/^\s*$/\n/' |uniq > $PREDFILE.4tedeval #no idea why twice 238 | wc $ARG $PREDFILE.4tedeval 239 | #exit 240 | 241 | # that was to generate a fake arabic parsed file 242 | #cat $PREDLAT | lines $ARG 1 $END | cut -f2-7 | add_fake_col.pl | clean$TYPE.pl | perl -pe 's/^\s*$/\n/' |uniq> $PREDLAT.fake 243 | 244 | # normal gold vs pred (LABELED) 245 | 246 | java -server -Xmx768m -jar $TEDEVALJAR $LABEL -g $GOLDFILE.4tedeval.$$ -p $PREDFILE.4tedeval-format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF 247 | if test $LABEL = "-unlabeled" ; then 248 | SUF="-unlabeled" 249 | else 250 | SUF="-labeled" 251 | fi 252 | 253 | echo "java -server -Xmx768m -jar $TEDEVALJAR $LABEL -g $GOLDFILE.4tedeval.$$ -p $PREDFILE.4tedeval-format $TYPENAME -o $PREDFILE.4tedeval.simple_tedeval.res$SUF" 254 | 255 | echo "\n\n" 256 | #rm -f $GOLDFILE.4tedeval.$$ $GOLDLAT.4tedeval.mapping.$$ $PREDMAP 257 | 258 | #eval gold vs gold => 100% 259 | #java -Xmx3g -jar $TEDEVALJAR -g $GOLDFILE.4tedeval -sg $GOLDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE 260 | 261 | # eval pred vs gold 262 | #java -server -Xmx3g -jar $TEDEVALJAR -g $PREDFILE.4tedeval -sg $PREDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE 263 | 264 | # eval pred vs pred => 100% 265 | #java -server -Xmx3g -jar $TEDEVALJAR -g $PREDFILE.4tedeval -sg $PREDLAT.4tedeval.mapping -p $PREDFILE.4tedeval 
-sp $PREDLAT.4tedeval.mapping -format $TYPE 266 | 267 | exit 268 | 269 | -------------------------------------------------------------------------------- /TedWrappers_20131015/tedeval_simple.sh.old: --------------------------------------------------------------------------------
#!/bin/sh

# wrapper script to make tedeval work on SPMRL Shared task data set
# Djame Seddah

# version August 19, 03:49
# fixing java.nullPointer exception (removing sentence 908 and 1889 from dev arabic pred+pred)
# version August 18, 02:14
# fixing java.nullPointer exception (removing sentence 799 from test test arabic pred+pred)

# What this script does, in order:
#   1. parses the command line with an enhanced getopt (long options);
#   2. resolves the predicted and gold files (explicit -s / -g, or names
#      derived from the split prefix, the language and the file type);
#   3. writes the list of sentences to skip to <gold>.tobeskipped;
#   4. normalizes both files into <file>.4tedeval.<pid>;
#   5. runs the tedeval jar on the two normalized files.
#
# get_cutoffed_sent.pl, skip_lines.pl, lines, reprojectivize.sh, clean<TYPE>.pl
# and wc are called by bare name, so they are resolved through PATH.
# NOTE(review): "wc $ARG file" is presumably meant to hit the wc wrapper that
# ships next to this script rather than the system wc -- confirm PATH setup.
#
# getopt idiom taken from
# http://stackoverflow.com/questions/402377/using-getopts-in-bash-shell-script-to-get-long-and-short-command-line-options/7680682#7680682

#set -x

# VARIABLES
PROGDIR=$(dirname "$0")
TEDEVALJAR=$PROGDIR/tedeval-2.2.jar
#TEDEVALJAR=$PROGDIR/tedeval.jar
LABEL="-unlabeled"      # passed as-is to tedeval; empty means labeled
TYPE="conll"
TYPENAME=conll          # value given to tedeval's -format
ARG="-c"                # flag handed to wc / skip_lines.pl / lines; empty for ptb
LANGUAGE=""
SKIPFILE=/dev/null
PREF=test
DOCUT=1                 # already on by default, so -k is currently a no-op
CUTOFF=70 #should be a parameter, later...
PREDFILE=""
PREDMAP=""
PREDLAT=""
GOLDFILE=""
GOLDLAT=""
LATSUF=tobeparsed.gold_tagged+gold_token.lattices
START=1
END=100000 # let's hope that no file will ever be that long (linewise)
SPMRLDATA_ROOTDIR=$SHARED/SPMRL_FINAL/READY_TO_SHIP_FINAL/
FIVEK=""

# usage: print the option summary on stdout.
# The heredoc is quoted so that $SHARED is shown literally.
usage() {
    cat <<'EOF'
tedeval_simple.sh OPTIONS -g GOLDFILE -s SYSTEMFILE
 -D | --debug                  use tedeval + debug outputs
 -n | --new                    use tedeval-2.2.jar (already the default)
 -u | --unlabeled              unlabeled evaluation (default)
 -l | --labeled                labeled evaluation
 -p | --ptb                    evaluate const. files
 -c | --conll                  evaluate conll files (default)
 -A | --arabic                 dev mode for Arabic, do not use
 -H | --hebrew                 dev mode for Hebrew, do not use
 -y | --any LANGUAGE           LANGUAGE being one of the SPMRL 2013 shared task (ARABIC, FRENCH,...)
 -t | --test                   dev mode, test file, do not use
 -d | --dev                    dev mode, dev file, do not use
 -k | --cut                    CUTOFF mode (tedeval is really slow for long sentences), length cutoff hardcoded to 70
 -s | --system FILE            test file to be evaluated
 -g | --gold FILE              gold standard file
 -L | --predlat FILE           predicted lattice files as provided by SPMRL. If not given, spmrl one will be used
 -m | --predmap FILE           mapping for predicted files (use it only with non-spmrl predicted file). Generated from predlat file otherwise
 -b | --begin N                line ID to start evaluate
 -e | --end N                  line ID+1 to stop the evaluation
 -R | --spmrldata_rootdir DIR  root directory to the SPMRL FINAL DATA SET (default $SHARED/SPMRL_FINAL/READY_TO_SHIP_FINAL/)
      --fivek                  hand "-5k LANGUAGE" to skip_lines.pl
      --help                   show this text
EOF
}

# normalize FILE: write FILE to stdout after blank-line cleanup, sentence
# skipping, line-range selection, reprojectivization and format cleaning.
# $ARG and $FIVEK are left unquoted on purpose: either may be empty, and
# FIVEK holds two words ("-5k LANGUAGE") that must reach skip_lines.pl as
# separate arguments.
normalize() {
    perl -pe 's/^\s*$/\n/' < "$1" \
        | skip_lines.pl $ARG $FIVEK "$SKIPFILE" \
        | lines $ARG "$START" "$END" \
        | reprojectivize.sh "-$TYPE" \
        | "clean$TYPE.pl" \
        | perl -pe 's/^\s*$/\n/' \
        | uniq
}

# No argument at all: show the help and stop (status kept as in the original).
if [ -z "${1:-}" ] ; then
    usage
    exit
fi

echo "###########################"
printf 'Running: %s %s\n' "$(basename "$0")" "$*"
echo "###########################"
printf '\n\n'

# /sw/bin/getopt stays the first choice; GETOPT can override it, and the
# getopt found on PATH is used when that file is not executable.
GETOPT=${GETOPT:-/sw/bin/getopt}
if [ ! -x "$GETOPT" ] ; then
    GETOPT=getopt
fi

# "getopt -T" exits with status 4 only for the enhanced implementation,
# which is the one that understands --long.
"$GETOPT" -T > /dev/null 2>&1
if [ $? -ne 4 ] ; then
    echo "$GETOPT is not an enhanced getopt, long options cannot be parsed" >&2
    exit 1
fi

# Every name matched in the case below is declared here, in both spellings
# where the script historically disagreed with itself (Arabic/arabic,
# Hebrew/hebrew, predfile/system). Options that consume "$2" carry a colon.
TEMP=$("$GETOPT" -o DnlupcAHy:tdks:m:L:g:b:e:R: --long help,debug,new,labeled,unlabeled,ptb,conll,arabic,Arabic,hebrew,Hebrew,any:,test,dev,cut,system:,predfile:,predmap:,predlat:,gold:,begin:,end:,spmrldata_rootdir:,fivek -- "$@")

if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi

# Note the quotes around `$TEMP': they are essential!
eval set -- "$TEMP"

while true; do
    case "$1" in
        -D | --debug )                  TEDEVALJAR=$PROGDIR/tedeval_debug.jar ; shift ;;
        -n | --new )                    TEDEVALJAR=$PROGDIR/tedeval-2.2.jar ; shift ;;
        -u | --unlabeled )              LABEL="-unlabeled" ; shift ;;
        -l | --labeled )                LABEL="" ; shift ;;
        -p | --ptb )                    TYPE=ptb ; TYPENAME=bracketed ; ARG="" ; shift ;;
        -c | --conll )                  TYPE=conll ; TYPENAME=conll ; ARG="-c" ; shift ;;
        # the original branch ran "exit" before "shift", ending the script here
        -A | --arabic | --Arabic )      LANGUAGE=ARABIC ; SUF="" ; SKIPFILE="" ; shift ;;
        -H | --hebrew | --Hebrew )      LANGUAGE=HEBREW ; shift ;;
        -y | --any )                    LANGUAGE="$2" ; shift 2 ;;
        -t | --test )                   PREF="test" ; shift ;;
        -d | --dev )                    PREF="dev" ; shift ;;
        -k | --cut )                    DOCUT=1 ; shift ;;
        -s | --system | --predfile )    PREDFILE="$2" ; shift 2 ;;
        -g | --gold )                   GOLDFILE="$2" ; shift 2 ;;
        -m | --predmap )                PREDMAP="$2" ; shift 2 ;;
        -L | --predlat )                PREDLAT="$2" ; shift 2 ;;
        -b | --begin )                  START="$2" ; shift 2 ;;
        -e | --end )                    END="$2" ; shift 2 ;;
        -R | --spmrldata_rootdir )      SPMRLDATA_ROOTDIR="$2" ; shift 2 ;;
        --fivek )                       FIVEK="-5k" ; shift ;;
        --help )                        usage ; exit 1 ;;   # status 1 kept from the original
        -- )                            shift ; break ;;
        * )                             break ;;
    esac
done


# *** INIT
# arabic lines bug
#START=699 # should be 799 but 100 sentences > 70 were removed
#START=167 # same bug as 699 but in labeled

#END=700
#END=168

# LANGUPPED is the upper-cased language, LANGUAGE becomes "Arabic"-style
# (first letter upper, rest lower); LDIR is the per-language data directory.
LANGUPPED=$(printf '%s\n' "$LANGUAGE" | perl -ne 'print uc $_')
LDIR=${LANGUPPED}_SPMRL
LANGUAGE=$(printf '%s\n' "$LANGUAGE" | perl -pe '$_ = ucfirst lc $_ ;')
if [ -n "$FIVEK" ] ; then
    FIVEK="-5k $LANGUPPED"
fi

if [ -z "$PREDFILE" ] ; then
    PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed$SUF
fi

# A missing input is an error: report it with a non-zero status.
if [ ! -f "$PREDFILE" ] ; then
    echo "PREDFILE: $PREDFILE not found"
    exit 1
fi

echo "LDIR = $LDIR LANGUAGE $LANGUAGE PREDLAT $PREDLAT"

# without -g the gold file is taken from the SPMRL data tree
if [ -z "$GOLDFILE" ] ; then
    GOLDFILE=${SPMRLDATA_ROOTDIR}/${LDIR}/gold/$TYPE/${PREF}/${PREF}.${LANGUAGE}.gold.$TYPE
fi

# checking all files
if [ -f "$GOLDFILE" ] ; then
    echo "gold: $GOLDFILE found"
else
    wc -L "$GOLDFILE"       # kept from the original; only yields the tool's own error text
    echo "gold: $GOLDFILE not found"
    exit 1
fi

# PREDFILE was already verified above, so only the confirmation is printed.
echo "pred file: $PREDFILE found"

#PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices

# dev.Hebrew.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.parsed.lattices.5k
# test.Arabic.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed.full

# fixing the blank line bug
#perl -i.bak -pe 's/^\s*$/\n/g' $GOLFILE $PREDFILE $PREDLAT $GOLDLAT

####################################################
## real work done here
######################################################

if [ "$DOCUT" = 1 ] ; then
    echo "generating lines to be skipped"
    perl -pe 's/^\s*$/\n/' < "$GOLDFILE" | get_cutoffed_sent.pl -c -K "$CUTOFF" /dev/null > "$GOLDFILE.tobeskipped"
    case "$PREF.$LANGUAGE" in
        test.Arabic | dev.Arabic )
            # Historic entries for buggy sentences in unlabeled evaluation:
            #   arabic test: echo -e "\n799"
            #   arabic dev : echo -e "\n904\n1889"
            # Only the blank lines are still appended. printf is used because
            # plain echo "\n" writes a literal backslash-n under some /bin/sh.
            printf '\n\n' >> "$GOLDFILE.tobeskipped"
            ;;
    esac
    SKIPFILE=$GOLDFILE.tobeskipped
else
    SKIPFILE=/dev/null
fi

GOLDNORM=$GOLDFILE.4tedeval.$$
PREDNORM=$PREDFILE.4tedeval.$$

echo "generating normalized files"
printf '\t==> gold\n'
wc $ARG "$GOLDFILE"
normalize "$GOLDFILE" > "$GOLDNORM"
wc $ARG "$GOLDNORM"

printf '\t==> pred\n'
wc $ARG "$PREDFILE"
normalize "$PREDFILE" > "$PREDNORM"
wc $ARG "$PREDNORM"

# that was to generate a fake arabic parsed file
#cat $PREDLAT | lines $ARG 1 $END | cut -f2-7 | add_fake_col.pl | clean$TYPE.pl | perl -pe 's/^\s*$/\n/' |uniq> $PREDLAT.fake

# normal gold vs pred
# LABEL is empty after -l, hence the quotes around it in the comparison.
if [ "$LABEL" = "-unlabeled" ] ; then
    SUF="-unlabeled"
else
    SUF="-labeled"
fi

RESFILE=$PREDFILE.4tedeval.simple_tedeval.res$SUF

# $LABEL stays unquoted so that an empty value adds no argument at all.
java -server -Xmx768m -jar "$TEDEVALJAR" $LABEL -g "$GOLDNORM" -p "$PREDNORM" -format "$TYPENAME" -o "$RESFILE"
file="$RESFILE.ted"

# This version only prints the command that extracts the AVG line.
printf '%s\n' "cat $file|grep \"AVG:\"|perl -p -s -e 'chomp ; s/(.)\$/\\1\\t\$file \\n/' -- -file=$file|get_ted_res.pl"

printf '%s\n' "java -server -Xmx768m -jar $TEDEVALJAR $LABEL -g $GOLDNORM -p $PREDNORM -format $TYPENAME -o $RESFILE"

printf '\n\n\n'
#rm -f $GOLDFILE.4tedeval.$$ $GOLDLAT.4tedeval.mapping.$$ $PREDMAP

#eval gold vs gold => 100%
#java -Xmx3g -jar $TEDEVALJAR -g $GOLDFILE.4tedeval -sg $GOLDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE

# eval pred vs gold
#java -server -Xmx3g -jar $TEDEVALJAR
-g $PREDFILE.4tedeval.$$ -sg $PREDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE 275 | 276 | # eval pred vs pred => 100% 277 | #java -server -Xmx3g -jar $TEDEVALJAR -g $PREDFILE.4tedeval.$$ -sg $PREDLAT.4tedeval.mapping -p $PREDFILE.4tedeval.$$ -sp $PREDLAT.4tedeval.mapping -format $TYPE 278 | 279 | exit 280 | 281 | -------------------------------------------------------------------------------- /TedWrappers_20131015/tedeval_simple_polish.sh: --------------------------------------------------------------------------------
#!/bin/sh

# wrapper script to make tedeval work on SPMRL Shared task data set
# Djame Seddah

# version August 19, 03:49
# fixing java.nullPointer exception (removing sentence 908 and 1889 from dev arabic pred+pred)
# version August 18, 02:14
# fixing java.nullPointer exception (removing sentence 799 from test test arabic pred+pred)

# What this script does, in order:
#   1. parses the command line with an enhanced getopt (long options);
#   2. resolves the predicted and gold files (explicit -s / -g, or names
#      derived from the split prefix, the language and the file type);
#   3. writes the list of sentences to skip to <gold>.tobeskipped;
#   4. normalizes both files into <file>.4tedeval.<pid>;
#   5. runs the tedeval jar and pipes the AVG line of its .ted output
#      through get_ted_res.pl.
#
# get_cutoffed_sent.pl, skip_lines.pl, lines, reprojectivize.sh, clean<TYPE>.pl,
# get_ted_res.pl and wc are called by bare name, so they are resolved
# through PATH.
# NOTE(review): "wc $ARG file" is presumably meant to hit the wc wrapper that
# ships next to this script rather than the system wc -- confirm PATH setup.
#
# getopt idiom taken from
# http://stackoverflow.com/questions/402377/using-getopts-in-bash-shell-script-to-get-long-and-short-command-line-options/7680682#7680682

set -x

# VARIABLES
PROGDIR=$(dirname "$0")
TEDEVALJAR=$PROGDIR/tedeval-2.2.jar
#TEDEVALJAR=$PROGDIR/tedeval.jar
LABEL="-unlabeled"      # passed as-is to tedeval; empty means labeled
TYPE="conll"
TYPENAME=conll          # value given to tedeval's -format
ARG="-c"                # flag handed to wc / skip_lines.pl / lines; empty for ptb
LANGUAGE=""
SKIPFILE=/dev/null
PREF=test
DOCUT=1                 # already on by default, so -k is currently a no-op
CUTOFF=70 #should be a parameter, later...
PREDFILE=""
PREDMAP=""
PREDLAT=""
GOLDFILE=""
GOLDLAT=""
LATSUF=tobeparsed.gold_tagged+gold_token.lattices
START=1
END=100000 # let's hope that no file will ever be that long (linewise)
SPMRLDATA_ROOTDIR=$SHARED/SPMRL_FINAL/READY_TO_SHIP_FINAL/
FIVEK=""

# usage: print the option summary on stdout.
# The heredoc is quoted so that $SHARED is shown literally.
usage() {
    cat <<'EOF'
tedeval_simple_polish.sh OPTIONS -g GOLDFILE -s SYSTEMFILE
 -D | --debug                  use tedeval + debug outputs
 -n | --new                    use tedeval-2.2.jar (already the default)
 -u | --unlabeled              unlabeled evaluation (default)
 -l | --labeled                labeled evaluation
 -p | --ptb                    evaluate const. files
 -c | --conll                  evaluate conll files (default)
 -A | --arabic                 dev mode for Arabic, do not use
 -H | --hebrew                 dev mode for Hebrew, do not use
 -y | --any LANGUAGE           LANGUAGE being one of the SPMRL 2013 shared task (ARABIC, FRENCH,...)
 -t | --test                   dev mode, test file, do not use
 -d | --dev                    dev mode, dev file, do not use
 -k | --cut                    CUTOFF mode (tedeval is really slow for long sentences), length cutoff hardcoded to 70
 -s | --system FILE            test file to be evaluated
 -g | --gold FILE              gold standard file
 -L | --predlat FILE           predicted lattice files as provided by SPMRL. If not given, spmrl one will be used
 -m | --predmap FILE           mapping for predicted files (use it only with non-spmrl predicted file). Generated from predlat file otherwise
 -b | --begin N                line ID to start evaluate
 -e | --end N                  line ID+1 to stop the evaluation
 -R | --spmrldata_rootdir DIR  root directory to the SPMRL FINAL DATA SET (default $SHARED/SPMRL_FINAL/READY_TO_SHIP_FINAL/)
      --fivek                  hand "-5k LANGUAGE" to skip_lines.pl
      --help                   show this text
EOF
}

# normalize FILE: write FILE to stdout after blank-line cleanup, sentence
# skipping, line-range selection, reprojectivization, format cleaning and
# squeezing of repeated blank lines (cat -s).
# $ARG and $FIVEK are left unquoted on purpose: either may be empty, and
# FIVEK holds two words ("-5k LANGUAGE") that must reach skip_lines.pl as
# separate arguments.
normalize() {
    perl -pe 's/^\s*$/\n/' < "$1" \
        | skip_lines.pl $ARG $FIVEK "$SKIPFILE" \
        | lines $ARG "$START" "$END" \
        | reprojectivize.sh "-$TYPE" \
        | "clean$TYPE.pl" \
        | perl -pe 's/^\s*$/\n/' \
        | cat -s \
        | uniq
}

# No argument at all: show the help and stop (status kept as in the original).
if [ -z "${1:-}" ] ; then
    usage
    exit
fi

echo "###########################"
printf 'Running: %s %s\n' "$(basename "$0")" "$*"
echo "###########################"
printf '\n\n'

# /sw/bin/getopt stays the first choice; GETOPT can override it, and the
# getopt found on PATH is used when that file is not executable.
GETOPT=${GETOPT:-/sw/bin/getopt}
if [ ! -x "$GETOPT" ] ; then
    GETOPT=getopt
fi

# "getopt -T" exits with status 4 only for the enhanced implementation,
# which is the one that understands --long.
"$GETOPT" -T > /dev/null 2>&1
if [ $? -ne 4 ] ; then
    echo "$GETOPT is not an enhanced getopt, long options cannot be parsed" >&2
    exit 1
fi

# Every name matched in the case below is declared here, in both spellings
# where the script historically disagreed with itself (Arabic/arabic,
# Hebrew/hebrew, predfile/system). Options that consume "$2" carry a colon.
TEMP=$("$GETOPT" -o DnlupcAHy:tdks:m:L:g:b:e:R: --long help,debug,new,labeled,unlabeled,ptb,conll,arabic,Arabic,hebrew,Hebrew,any:,test,dev,cut,system:,predfile:,predmap:,predlat:,gold:,begin:,end:,spmrldata_rootdir:,fivek -- "$@")

if [ $? != 0 ] ; then echo "Terminating..." >&2 ; exit 1 ; fi

# Note the quotes around `$TEMP': they are essential!
eval set -- "$TEMP"

while true; do
    case "$1" in
        -D | --debug )                  TEDEVALJAR=$PROGDIR/tedeval_debug.jar ; shift ;;
        -n | --new )                    TEDEVALJAR=$PROGDIR/tedeval-2.2.jar ; shift ;;
        -u | --unlabeled )              LABEL="-unlabeled" ; shift ;;
        -l | --labeled )                LABEL="" ; shift ;;
        -p | --ptb )                    TYPE=ptb ; TYPENAME=bracketed ; ARG="" ; shift ;;
        -c | --conll )                  TYPE=conll ; TYPENAME=conll ; ARG="-c" ; shift ;;
        # the original branch ran "exit" before "shift", ending the script here
        -A | --arabic | --Arabic )      LANGUAGE=ARABIC ; SUF="" ; SKIPFILE="" ; shift ;;
        -H | --hebrew | --Hebrew )      LANGUAGE=HEBREW ; shift ;;
        -y | --any )                    LANGUAGE="$2" ; shift 2 ;;
        -t | --test )                   PREF="test" ; shift ;;
        -d | --dev )                    PREF="dev" ; shift ;;
        -k | --cut )                    DOCUT=1 ; shift ;;
        -s | --system | --predfile )    PREDFILE="$2" ; shift 2 ;;
        -g | --gold )                   GOLDFILE="$2" ; shift 2 ;;
        -m | --predmap )                PREDMAP="$2" ; shift 2 ;;
        -L | --predlat )                PREDLAT="$2" ; shift 2 ;;
        -b | --begin )                  START="$2" ; shift 2 ;;
        -e | --end )                    END="$2" ; shift 2 ;;
        -R | --spmrldata_rootdir )      SPMRLDATA_ROOTDIR="$2" ; shift 2 ;;
        --fivek )                       FIVEK="-5k" ; shift ;;
        --help )                        usage ; exit 1 ;;   # status 1 kept from the original
        -- )                            shift ; break ;;
        * )                             break ;;
    esac
done


# *** INIT
# arabic lines bug
#START=699 # should be 799 but 100 sentences > 70 were removed
#START=167 # same bug as 699 but in labeled

#END=700
#END=168

# LANGUPPED is the upper-cased language, LANGUAGE becomes "Arabic"-style
# (first letter upper, rest lower); LDIR is the per-language data directory.
LANGUPPED=$(printf '%s\n' "$LANGUAGE" | perl -ne 'print uc $_')
LDIR=${LANGUPPED}_SPMRL
LANGUAGE=$(printf '%s\n' "$LANGUAGE" | perl -pe '$_ = ucfirst lc $_ ;')
if [ -n "$FIVEK" ] ; then
    FIVEK="-5k $LANGUPPED"
fi

if [ -z "$PREDFILE" ] ; then
    PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed$SUF
fi

# A missing input is an error: report it with a non-zero status.
if [ ! -f "$PREDFILE" ] ; then
    echo "PREDFILE: $PREDFILE not found"
    exit 1
fi

echo "LDIR = $LDIR LANGUAGE $LANGUAGE PREDLAT $PREDLAT"

# without -g the gold file is taken from the SPMRL data tree
if [ -z "$GOLDFILE" ] ; then
    GOLDFILE=${SPMRLDATA_ROOTDIR}/${LDIR}/gold/$TYPE/${PREF}/${PREF}.${LANGUAGE}.gold.$TYPE
fi

# checking all files
if [ -f "$GOLDFILE" ] ; then
    echo "gold: $GOLDFILE found"
else
    wc -L "$GOLDFILE"       # kept from the original; only yields the tool's own error text
    echo "gold: $GOLDFILE not found"
    exit 1
fi

# PREDFILE was already verified above, so only the confirmation is printed.
echo "pred file: $PREDFILE found"

#PREDFILE=${PREF}.${LANGUAGE}.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices

# dev.Hebrew.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.parsed.lattices.5k
# test.Arabic.pred.$TYPE.tobeparsed.pred_tagged+pred_token.disamb.lattices.parsed.full

# fixing the blank line bug
#perl -i.bak -pe 's/^\s*$/\n/g' $GOLFILE $PREDFILE $PREDLAT $GOLDLAT

####################################################
## real work done here
######################################################

if [ "$DOCUT" = 1 ] ; then
    echo "generating lines to be skipped"
    perl -pe 's/^\s*$/\n/' < "$GOLDFILE" | get_cutoffed_sent.pl -c -K "$CUTOFF" /dev/null > "$GOLDFILE.tobeskipped"
    case "$PREF.$LANGUAGE" in
        test.Arabic | dev.Arabic )
            # Historic entries for buggy sentences in unlabeled evaluation:
            #   arabic test: echo -e "\n799"
            #   arabic dev : echo -e "\n904\n1889"
            # Only the blank lines are still appended. printf is used because
            # plain echo "\n" writes a literal backslash-n under some /bin/sh.
            printf '\n\n' >> "$GOLDFILE.tobeskipped"
            ;;
    esac
    SKIPFILE=$GOLDFILE.tobeskipped
else
    SKIPFILE=/dev/null
fi

GOLDNORM=$GOLDFILE.4tedeval.$$
PREDNORM=$PREDFILE.4tedeval.$$

echo "generating normalized files"
printf '\t==> gold\n'
wc $ARG "$GOLDFILE"
normalize "$GOLDFILE" > "$GOLDNORM"
wc $ARG "$GOLDNORM"

printf '\t==> pred\n'
wc $ARG "$PREDFILE"
normalize "$PREDFILE" > "$PREDNORM"
wc $ARG "$PREDNORM"

# that was to generate a fake arabic parsed file
#cat $PREDLAT | lines $ARG 1 $END | cut -f2-7 | add_fake_col.pl | clean$TYPE.pl | perl -pe 's/^\s*$/\n/' |uniq> $PREDLAT.fake

# normal gold vs pred
# LABEL is empty after -l, hence the quotes around it in the comparison.
if [ "$LABEL" = "-unlabeled" ] ; then
    SUF="-unlabeled"
else
    SUF="-labeled"
fi

RESFILE=$PREDFILE.4tedeval.simple_tedeval.res$SUF

# $LABEL stays unquoted so that an empty value adds no argument at all.
java -Xmx768m -jar "$TEDEVALJAR" $LABEL -g "$GOLDNORM" -p "$PREDNORM" -format "$TYPENAME" -o "$RESFILE"
file="$RESFILE.ted"

# the command line goes to stderr so that stdout only carries the scores
printf '%s\n' "java -Xmx768m -jar $TEDEVALJAR $LABEL -g $GOLDNORM -p $PREDNORM -format $TYPENAME -o $RESFILE" >&2
echo " "
# append "<tab><file name> " to the AVG line, then hand it to get_ted_res.pl;
# $file inside the single quotes is the perl -s switch variable, not the shell one
grep "AVG:" "$file" | perl -p -s -e 'chomp ; s/(.)$/\1\t$file \n/' -- -file="$file" | get_ted_res.pl

printf '\n\n\n'
#rm -f $GOLDFILE.4tedeval.$$ $GOLDLAT.4tedeval.mapping.$$ $PREDMAP

#eval gold vs gold => 100%
#java -Xmx3g -jar $TEDEVALJAR -g $GOLDFILE.4tedeval -sg $GOLDLAT.4tedeval.mapping -p $GOLDFILE.4tedeval -sp $GOLDLAT.4tedeval.mapping -format $TYPE

# eval pred vs gold
#!/bin/sh
#
# Wrapper that shadows `wc` for the tedeval helper scripts.
#
#   wc -c FILE...   or   wc -L FILE...
#       For every listed file, print "<TAB><count> <file>", where <count> is
#       whatever the project-local `lines` helper reports for
#       `lines <flag> -p 1 100000000` on that file.
#   wc FILE...
#       Fall through to the system line counter (/usr/bin/wc -l).

if [ "$1" = "-c" ] || [ "$1" = "-L" ] ; then
	ARG=$1
	# The argument list (flag included) is still expanded through `eval ls`,
	# exactly as before, so quoted glob patterns keep working.
	# NOTE(review): this word-splits the listing, so file names containing
	# whitespace are not supported in this branch.
	for file in `eval ls $@` ; do
		NB=`cat "$file" | lines "$ARG" -p 1 100000000`
		# printf instead of `echo -e`: under a plain POSIX /bin/sh (e.g. dash)
		# `echo -e` prints a literal "-e" and may not expand "\t".
		printf '\t%s %s\n' "$NB" "$file"
	done

else
	# Quote "$@" so that each argument reaches wc unchanged.
	/usr/bin/wc -l "$@"
fi
/data/tags-all.mod.txt: -------------------------------------------------------------------------------- 1 | ABBREV 2 | ADJ 3 | ADJ.VN ADJ 4 | ADJ_COMP ADJ 5 | ADJ_NUM NUM 6 | ADV 7 | INTERROG_ADV ADV 8 | CASE 9 | CASE_INDEF_ACC CASE 10 | CONJ 11 | CV V 12 | CVSUFF VSUFF 13 | CVSUFF_DO:1S PRON 14 | CVSUFF_DO:3MS PRON 15 | CVSUFF_SUBJ:2MP VSUFF 16 | CVSUFF_SUBJ:2MS VSUFF 17 | CVSUFF_DO:3MP PRON 18 | CVSUFF_DO:1P PRON 19 | CVSUFF_DO:3FS PRON 20 | CVSUFF_DO:3D PRON 21 | CVSUFF_DO:3FP PRON 22 | CVSUFF_SUBJ:2FS VSUFF 23 | DEM_PRON_FD PRON 24 | DEM_PRON_FS PRON 25 | DEM_PRON_MD PRON 26 | DEM_PRON_MP PRON 27 | DEM_PRON_MS PRON 28 | DEM_PRON PRON 29 | DEM_PRON_F PRON 30 | DEM_PRON_P PRON 31 | DET 32 | FOREIGN 33 | INTERJ 34 | IV V 35 | IV1P V 36 | IV1S V 37 | IV2D V 38 | IV2FP V 39 | IV2FS V 40 | IV2MP V 41 | IV2MS V 42 | IV3FD V 43 | IV3FP V 44 | IV3FS V 45 | IV3MD V 46 | IV3MP V 47 | IV3MS V 48 | IVSUFF_DO:1P PRON 49 | IVSUFF_DO:1S PRON 50 | IVSUFF_DO:2FS PRON 51 | IVSUFF_DO:2D PRON 52 | IVSUFF_DO:2MP PRON 53 | IVSUFF_DO:2MS PRON 54 | IVSUFF_DO:3D PRON 55 | IVSUFF_DO:3FS PRON 56 | IVSUFF_DO:3MP PRON 57 | IVSUFF_DO:3MS PRON 58 | IVSUFF_DO:2FP PRON 59 | IVSUFF_DO:3FP PRON 60 | IVSUFF_MOOD VSUFF 61 | IVSUFF_SUBJ:2FS VSUFF 62 | IVSUFF_SUBJ:D VSUFF 63 | IVSUFF_SUBJ:FP VSUFF 64 | IVSUFF_SUBJ:MP VSUFF 65 | PVSUFF_SUBJ:2FP VSUFF 66 | IVSUFF_SUBJ:3FP VSUFF 67 | IVSUFF_SUBJ:3D VSUFF 68 | IVSUFF_MOOD:I VSUFF 69 | IVSUFF_MOOD:J VSUFF 70 | IVSUFF_MOOD:S VSUFF 71 | IVSUFF_SUBJ:2FS_MOOD:I VSUFF 72 | IVSUFF_SUBJ:D_MOOD:I VSUFF 73 | IVSUFF_SUBJ:D_MOOD:SJ VSUFF 74 | IVSUFF_SUBJ:MP_MOOD:I VSUFF 75 | IVSUFF_SUBJ:MP_MOOD:SJ VSUFF 76 | IVSUFF_SUBJ:2FS_MOOD:SJ VSUFF 77 | IVSUFF_SUBJ:3MP_MOOD:SJ VSUFF 78 | IV_PASS V 79 | LATIN FOREIGN 80 | NEG_PART PART 81 | NOUN 82 | NOUN_PROP NOUN_PROP 83 | NOUN_NUM NUM 84 | NOUN_QUANT NOUN 85 | NOUN.VN NOUN 86 | NSUFF_FEM_DU NSUFF 87 | NSUFF_FEM_PL NSUFF 88 | NSUFF_FEM_SG NSUFF 89 | NSUFF_MASC_DU NSUFF 90 | NSUFF_MASC_PL NSUFF 91 | NSUFF_FEM_DU_ACC NSUFF 92 
| NSUFF_FEM_DU_ACC_POSS NSUFF 93 | NSUFF_FEM_DU_GEN NSUFF 94 | NSUFF_FEM_DU_GEN_POSS NSUFF 95 | NSUFF_FEM_DU_NOM NSUFF 96 | NSUFF_FEM_DU_NOM_POSS NSUFF 97 | NSUFF_MASC_DU_ACC NSUFF 98 | NSUFF_MASC_DU_ACC_POSS NSUFF 99 | NSUFF_MASC_DU_GEN NSUFF 100 | NSUFF_MASC_DU_GEN_POSS NSUFF 101 | NSUFF_MASC_DU_NOM NSUFF 102 | NSUFF_MASC_DU_NOM_POSS NSUFF 103 | NSUFF_MASC_PL_ACC NSUFF 104 | NSUFF_MASC_PL_ACC_POSS NSUFF 105 | NSUFF_MASC_PL_GEN NSUFF 106 | NSUFF_MASC_PL_GEN_POSS NSUFF 107 | NSUFF_MASC_PL_NOM NSUFF 108 | NSUFF_MASC_PL_NOM_POSS NSUFF 109 | NUM 110 | O 111 | PART 112 | FUT PART 113 | CONNEC_PART PART 114 | FOCUS_PART PART 115 | RESTRIC_PART PART 116 | EMPHATIC_PART PART 117 | PSEUDO_VERB V 118 | VOC_PART PART 119 | INTERROG_PART PART 120 | INTERROG_PRON PART 121 | INTERJ PART 122 | JUS_PART PART 123 | EMPH_PART PART 124 | EXCEPT_PART PART 125 | POSS_PRON_1P PRON 126 | POSS_PRON_1S PRON 127 | POSS_PRON_2FP PRON 128 | POSS_PRON_2FS PRON 129 | POSS_PRON_2MP PRON 130 | POSS_PRON_2MS PRON 131 | POSS_PRON_3D PRON 132 | POSS_PRON_3FP PRON 133 | POSS_PRON_3FS PRON 134 | POSS_PRON_3MP PRON 135 | POSS_PRON_3MS PRON 136 | POSS_PRON_2D PRON 137 | PREP 138 | PRON 139 | PRON_1P PRON 140 | PRON_1S PRON 141 | PRON_2D PRON 142 | PRON_2FP PRON 143 | PRON_2FS PRON 144 | PRON_2MP PRON 145 | PRON_2MS PRON 146 | PRON_3D PRON 147 | PRON_3FP PRON 148 | PRON_3FS PRON 149 | PRON_3MP PRON 150 | PRON_3MS PRON 151 | EXCLAM_PRON PRON 152 | PUNC 153 | PV V 154 | PVSUFF_DO:1P PRON 155 | PVSUFF_DO:1S PRON 156 | PVSUFF_DO:2MP PRON 157 | PVSUFF_DO:2MS PRON 158 | PVSUFF_DO:3D PRON 159 | PVSUFF_DO:2D PRON 160 | PVSUFF_DO:3FS PRON 161 | PVSUFF_DO:3MP PRON 162 | PVSUFF_DO:3MS PRON 163 | PVSUFF_DO:2FS PRON 164 | PVSUFF_DO:2FP PRON 165 | PVSUFF_DO:3FP PRON 166 | PVSUFF_SUBJ:1P VSUFF 167 | PVSUFF_SUBJ:1S VSUFF 168 | PVSUFF_SUBJ:2FS VSUFF 169 | PVSUFF_SUBJ:2MP VSUFF 170 | PVSUFF_SUBJ:2MS VSUFF 171 | PVSUFF_SUBJ:3FD VSUFF 172 | PVSUFF_SUBJ:3FP VSUFF 173 | PVSUFF_SUBJ:3FS VSUFF 174 | PVSUFF_SUBJ:3MD VSUFF 175 | 
PVSUFF_SUBJ:3MP VSUFF 176 | PVSUFF_SUBJ:3MS VSUFF 177 | PVSUFF_SUBJ:2D VSUFF 178 | PV_PASS V 179 | NUMERIC_COMMA PUNC 180 | RC_PART PART 181 | REL_ADV REL 182 | REL_PRON REL 183 | SUB_CONJ CONJ 184 | VERB_PART PART 185 | TYPO FOREIGN 186 | DIALECT FOREIGN 187 | FUT_PART PART 188 | VERB V 189 | NO_FUNC NOUN 190 | DEM NOUN 191 | JUS JUS 192 | FUNC_WORD FUNC 193 | -------------------------------------------------------------------------------- /data/tags-mada2core12.txt: -------------------------------------------------------------------------------- 1 | PREP P 2 | NOUN N 3 | PUNC PNX 4 | FUNC N 5 | ADV AV 6 | PART PRT 7 | DET PRT 8 | V V 9 | CONJ C 10 | REL REL 11 | FOREIGN N 12 | ADJ AJ 13 | ABBREV ABBREV 14 | PRON PRO 15 | NUM N 16 | NOUN_PROP PN 17 | adj AJ 18 | adj_comp AJ 19 | adj_num N 20 | noun N 21 | noun_num N 22 | noun_quant N 23 | noun_prop PN 24 | verb V 25 | verb_pseudo V 26 | pron_rel REL 27 | adv_rel REL 28 | part_restrict PRT 29 | prep P 30 | -------------------------------------------------------------------------------- /data/test0.mada.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yuanzh/SegParser/dda3f6ca501b0c7ef0de26f08c9e05062c19d4fe/data/test0.mada.gz -------------------------------------------------------------------------------- /decoder/ClassifierDecoder.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ClassifierDecoder.cpp 3 | * 4 | * Created on: May 7, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #include "ClassifierDecoder.h" 9 | #include 10 | 11 | namespace segparser { 12 | 13 | ClassifierDecoder::ClassifierDecoder(Options* options) : DependencyDecoder(options) { 14 | } 15 | 16 | ClassifierDecoder::~ClassifierDecoder() { 17 | } 18 | 19 | void ClassifierDecoder::decode(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe) { 20 | ThrowException("no need to decode"); 21 | } 22 | 23 | void 
ClassifierDecoder::train(DependencyInstance* gold, DependencyInstance* pred, FeatureExtractor* fe, int trainintIter) { 24 | // check the gold and pred have same seg/pos 25 | for (int i = 1; i < pred->numWord; ++i) { 26 | assert(gold->word[i].currSegCandID == pred->word[i].currSegCandID); 27 | 28 | SegInstance& goldInst = gold->word[i].getCurrSeg(); 29 | SegInstance& predInst = pred->word[i].getCurrSeg(); 30 | 31 | assert(goldInst.size() == predInst.size()); 32 | for (int j = 0; j < predInst.size(); ++j) { 33 | assert(goldInst.element[j].currPosCandID == predInst.element[j].currPosCandID); 34 | } 35 | } 36 | 37 | CacheTable* cache = fe->getCacheTable(pred); 38 | boost::shared_ptr tmpCache = boost::shared_ptr(new CacheTable()); 39 | if (!cache) { 40 | cache = tmpCache.get(); // temporary cache for this run 41 | tmpCache->initCacheTable(fe->type, pred, NULL, options); 42 | } 43 | 44 | for (int mw = 1; mw < pred->numWord; ++mw) { 45 | SegInstance& segInst = pred->word[mw].getCurrSeg(); 46 | SegInstance& goldInst = gold->word[mw].getCurrSeg(); 47 | 48 | for (int ms = 0; ms < segInst.size(); ++ms) { 49 | HeadIndex m(mw, ms); 50 | findOptHead(pred, gold, m, fe, cache); 51 | 52 | if (segInst.element[ms].dep != goldInst.element[ms].dep) { 53 | FeatureVector newFV; 54 | fe->getArcFv(fe, gold, goldInst.element[ms].dep, m, &newFV, cache); 55 | double newScore = fe->parameters->getScore(&newFV); 56 | 57 | FeatureVector oldFV; 58 | fe->getArcFv(fe, pred, segInst.element[ms].dep, m, &oldFV, cache); 59 | double oldScore = fe->parameters->getScore(&oldFV); 60 | 61 | newFV.concatNeg(&oldFV); 62 | 63 | double err = fe->parameters->wordDepError(gold->word[mw], pred->word[mw]); 64 | 65 | if (err - (newScore - oldScore) > 1e-4) { 66 | fe->parameters->update(gold, pred, &newFV, err - (newScore - oldScore), fe, updateTimes); 67 | } 68 | } 69 | updateTimes++; 70 | } 71 | } 72 | } 73 | 74 | void ClassifierDecoder::findOptHead(DependencyInstance* pred, DependencyInstance* gold, HeadIndex& m, 
FeatureExtractor* fe, CacheTable* cache) { 75 | 76 | assert(cache && cache->numSeg == pred->getNumSeg()); 77 | 78 | // get pruned list 79 | vector isPruned = move(fe->isPruned(pred, m, cache)); 80 | int segID = -1; 81 | 82 | SegElement& predSegEle = pred->getElement(m); 83 | double bestScore = -DBL_MAX; 84 | HeadIndex bestDep(-1, 0); 85 | 86 | for (int hw = 0; hw < pred->numWord; ++hw) { 87 | SegInstance& segInst = pred->word[hw].getCurrSeg(); 88 | 89 | for (int hs = 0; hs < segInst.size(); ++hs) { 90 | segID++; 91 | if (isPruned[segID]) { 92 | continue; 93 | } 94 | 95 | assert(hw != m.hWord || hs != m.hSeg); 96 | 97 | HeadIndex h(hw, hs); 98 | 99 | predSegEle.dep = h; 100 | FeatureVector fv; 101 | fe->getArcFv(fe, pred, h, m, &fv, cache); 102 | double score = fe->parameters->getScore(&fv); 103 | if (gold) { 104 | // add loss 105 | score += fe->parameters->wordDepError(gold->word[m.hWord], pred->word[m.hWord]); 106 | } 107 | 108 | if (score > bestScore + 1e-6) { 109 | bestScore = score; 110 | bestDep = h; 111 | } 112 | } 113 | } 114 | assert(segID == (int)isPruned.size() - 1); 115 | predSegEle.dep = bestDep; 116 | } 117 | 118 | } /* namespace segparser */ 119 | -------------------------------------------------------------------------------- /decoder/ClassifierDecoder.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ClassifierDecoder.h 3 | * 4 | * Created on: May 7, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #ifndef CLASSIFIERDECODER_H_ 9 | #define CLASSIFIERDECODER_H_ 10 | 11 | #include "DependencyDecoder.h" 12 | 13 | namespace segparser { 14 | 15 | class ClassifierDecoder: public segparser::DependencyDecoder { 16 | public: 17 | ClassifierDecoder(Options* options); 18 | virtual ~ClassifierDecoder(); 19 | 20 | void decode(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe); 21 | void train(DependencyInstance* gold, DependencyInstance* pred, FeatureExtractor* fe, int trainintIter); 22 | 23 | private: 
/*
 * Common base class of the decoders (ClassifierDecoder and
 * HillClimbingDecoder derive from it).
 *
 * It declares the decode/train entry points, an update counter and a set
 * of sampling / tree-manipulation helpers. Only the inline members below
 * are defined in this header; everything else is defined elsewhere.
 *
 * NOTE(review): the bare `vector&` parameter types below are not valid C++
 * as written; the element types appear to have been lost in this copy of
 * the header -- restore them from the original source.
 */
class DependencyDecoder {
public:
	DependencyDecoder(Options* options);
	virtual ~DependencyDecoder();

	// Factory for a concrete decoder (definition not in this header).
	static DependencyDecoder* createDependencyDecoder(Options* options, int mode, int thread, bool isTrain);

	// Optional start-up hook; does nothing by default.
	virtual void initialize() {

	}

	// Optional tear-down hook; does nothing by default.
	virtual void shutdown() {

	}

	// Default implementation only raises through ThrowException;
	// subclasses are expected to override it.
	virtual void decode(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe) {
		ThrowException("should not go virtual decode function");
	}

	// Default implementation only raises through ThrowException;
	// subclasses are expected to override it.
	virtual void train(DependencyInstance* gold, DependencyInstance* pred, FeatureExtractor* fe, int trainintIter) {
		ThrowException("should not go virtual train function");
	}

	void prune(DependencyInstance* inst, HeadIndex& m, FeatureExtractor* fe, vector& pruned);

	// Current value of the update counter.
	int getUpdateTimes() {
		return updateTimes;
	}

	// Reset the update counter to zero.
	void resetUpdateTimes() {
		updateTimes = 0;
	}

	void initInst(DependencyInstance* inst, FeatureExtractor* fe);
	void removeGoldInfo(DependencyInstance* inst);

	int seed;
	Options* options;

	int getBottomUpOrder(DependencyInstance* inst, HeadIndex& arg, vector& idx, int id);
	double sampleSeg1O(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe, int wordID, Random& r);
	double samplePos1O(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe, int wordID, Random& r);
	bool randomWalkSampler(DependencyInstance* pred, DependencyInstance* gold, FeatureExtractor* fe, CacheTable* cache,
			vector& toBeSampled, Random& r, double T);

protected:
	// Update counter exposed through getUpdateTimes()/resetUpdateTimes().
	int updateTimes;

	bool isAncestor(DependencyInstance* s, HeadIndex& h, HeadIndex m);
	bool isProj(DependencyInstance* s, HeadIndex& h, HeadIndex& m);
	int samplePoint(vector& prob, Random& r);
	void convertScoreToProb(vector& score);
	double getSegPosProb(WordInstance& word);
	void getFirstOrderVec(DependencyInstance* inst, DependencyInstance* gold,
			FeatureExtractor* fe, HeadIndex& m, CacheTable* cache, bool treeConstraint, vector& candH, vector& score);
	double getMHDepProb(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe, int wordID);
	double sampleSegPos(DependencyInstance* inst, DependencyInstance* gold, int wordID, Random& r);
	double sampleMHDepProb(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe, int wordID, Random& r);
	void updateSeg(DependencyInstance* inst, WordInstance& word, int newSeg);
	void updatePos(WordInstance& word, SegElement& ele, int newPos);

	void cycleErase(DependencyInstance* inst, HeadIndex i, vector& toBeSampled);
	void updateSeg(DependencyInstance* pred, DependencyInstance* gold, HeadIndex& m,
			int newSeg, int oldSeg, int baseOptSeg, int baseOptPos, vector& oldPos, vector& oldHeadIndex,
			vector& relatedChildren, vector& relatedOldParent);
	void setGoldSegAndPos(DependencyInstance* pred, DependencyInstance* gold);
};
/*
 * State and entry points for evaluating the parser on a development file.
 *
 * All data members are public. Only declarations are visible here; the
 * member functions are defined elsewhere.
 *
 * NOTE(review): `vector decodeThread` and `unordered_map id2Pred` have no
 * template arguments in this copy of the header; they appear to have been
 * lost and must be restored from the original source.
 */
class DevelopmentThread {
public:
	DevelopmentThread();
	virtual ~DevelopmentThread();

	void start(string devfile, string devoutfile, SegParser* sp, bool verbal);
	void evaluate(DependencyInstance* inst, DependencyInstance* gold);
	double computeTedEval();

	bool isDevTesting;

	// Input and output file names (same names as the start() parameters).
	string devfile;
	string devoutfile;

	DependencyReader reader;

	int currProcessID;
	int currFinishID;

	// presumably guard currProcessID / currFinishID respectively -- TODO confirm
	pthread_mutex_t processMutex;
	pthread_mutex_t finishMutex;

	// Evaluation counters, kept as doubles.
	// presumably word / segment / POS / dependency totals and correct counts -- TODO confirm
	double wordNum;
	double corrWordSegNum;
	double goldSegNum;
	double predSegNum;
	double corrSegNum;
	double corrPosNum;
	double goldDepNum;
	double predDepNum;
	double corrDepNum;

	pthread_t workThread;
	pthread_t outputThread;
	vector decodeThread;
	int decodeThreadNum;

	unordered_map id2Pred;
	int finishThreadNum;

	SegParser* sp;
	Options* options;
	bool verbal;

private:
	bool isPunc(string& pos);
	int numSegWithoutPunc(DependencyInstance* inst);
	string normalize(string form);
};
-------------------------------------------------------------------------------- 1 | /* 2 | * HillClimbingDecoder.h 3 | * 4 | * Created on: May 1, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #ifndef HILLCLIMBINGDECODER_H_ 9 | #define HILLCLIMBINGDECODER_H_ 10 | 11 | #include "DependencyDecoder.h" 12 | #include 13 | 14 | namespace segparser { 15 | 16 | class HillClimbingDecoder: public segparser::DependencyDecoder { 17 | public: 18 | HillClimbingDecoder(Options* options, int thread, int convergeIter); 19 | virtual ~HillClimbingDecoder(); 20 | 21 | void initialize(); 22 | void shutdown(); 23 | void startTask(DependencyInstance* pred, DependencyInstance* gold, FeatureExtractor* fe); 24 | void waitAndGetResult(DependencyInstance* inst); 25 | void decode(DependencyInstance* inst, DependencyInstance* gold, FeatureExtractor* fe); 26 | void train(DependencyInstance* gold, DependencyInstance* pred, FeatureExtractor* fe, int trainintIter); 27 | double findOptHead(DependencyInstance* pred, DependencyInstance* gold, HeadIndex& m, FeatureExtractor* fe, CacheTable* cache); 28 | double findOptBigramHead(DependencyInstance* pred, DependencyInstance* gold, HeadIndex& m, HeadIndex& n, FeatureExtractor* fe, CacheTable* cache); 29 | double findOptPos(DependencyInstance* pred, DependencyInstance* gold, HeadIndex& m, FeatureExtractor* fe, CacheTable* cache); 30 | double findOptSeg(DependencyInstance* pred, DependencyInstance* gold, HeadIndex& m, FeatureExtractor* fe, CacheTable* cache); 31 | 32 | void debug(string msg, int id); 33 | 34 | vector threadID; 35 | vector taskMutex; 36 | vector taskStartCond; 37 | vector taskDoneCond; 38 | vector taskDone; 39 | vector threadExit; 40 | 41 | double bestScore; 42 | VariableInfo best; 43 | int unChangeIter; // converge criteria 44 | pthread_mutex_t updateMutex; 45 | pthread_mutex_t debugMutex; 46 | 47 | DependencyInstance* pred; 48 | DependencyInstance* gold; 49 | FeatureExtractor* fe; 50 | 51 | int thread; 52 | 53 | int convergeIter; 54 | 55 | int 
earlyStopIter; 56 | bool samplePos; 57 | bool sampleSeg; 58 | }; 59 | 60 | } /* namespace segparser */ 61 | #endif /* HILLCLIMBINGDECODER_H_ */ 62 | -------------------------------------------------------------------------------- /io/DependencyReader.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * DependencyReader.cpp 3 | * 4 | * Created on: Mar 27, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #include "DependencyReader.h" 9 | #include "../util/StringUtils.h" 10 | #include 11 | #include 12 | #include 13 | #include "../util/Constant.h" 14 | 15 | namespace segparser { 16 | 17 | DependencyReader::DependencyReader(Options* options, string file) : options(options) { 18 | hasCandidate = true; 19 | isTrain = false; 20 | startReading(file); 21 | } 22 | 23 | DependencyReader::DependencyReader() : options(NULL) { 24 | hasCandidate = true; 25 | isTrain = false; 26 | } 27 | 28 | DependencyReader::~DependencyReader() { 29 | } 30 | 31 | void DependencyReader::startReading(Options* options, string file) { 32 | this->options = options; 33 | fin.open(file.c_str()); 34 | } 35 | 36 | void DependencyReader::startReading(string file) { 37 | fin.open(file.c_str()); 38 | } 39 | 40 | void DependencyReader::close() { 41 | if (fin.is_open()) 42 | fin.close(); 43 | } 44 | 45 | HeadIndex DependencyReader::parseHeadIndex(string str) { 46 | unsigned int pos = str.find("/"); 47 | assert(pos != string::npos); 48 | int word = atoi(str.substr(0, pos).c_str()); 49 | int seg = atoi(str.substr(pos + 1).c_str()); 50 | return HeadIndex(word, seg); 51 | } 52 | 53 | void DependencyReader::addGoldSegElement(WordInstance* word, string form, string lemma, string pos, 54 | string morphStr, int segid, int hwordid, int hsegid, string lab) { 55 | word->goldForm.push_back(form); 56 | word->goldLemma.push_back(lemma); 57 | word->goldPos.push_back(pos); 58 | word->goldDep.push_back(HeadIndex(hwordid, hsegid)); 59 | word->goldLab.push_back(lab); 60 | word->goldMorphIndex 
= -1; 61 | word->goldAlIndex = -1; 62 | 63 | if (morphStr != "_") { 64 | vector data; 65 | StringSplit(morphStr, "|", &data); 66 | 67 | if (word->goldAlIndex == -1 && data[0][data[0].length() - 1] == 'y') { 68 | word->goldAlIndex = segid; 69 | } 70 | 71 | if (word->goldMorphIndex == -1) { 72 | assert(data.size() == 4); 73 | bool hasMorphInfo = false; 74 | for (int i = 1; i < 4; ++i) { 75 | string val = data[i].substr(data[i].find_last_of("=") + 1, string::npos); 76 | if (val != "na" && val != "NA") 77 | hasMorphInfo = true; 78 | } 79 | if (hasMorphInfo) { 80 | word->goldMorph.clear(); 81 | for (int i = 1; i < 4; ++i) { 82 | string val = data[i].substr(data[i].find_last_of("=") + 1, string::npos); 83 | word->goldMorph.push_back(val); 84 | } 85 | word->goldMorphIndex = segid; 86 | } 87 | } 88 | } 89 | } 90 | 91 | void DependencyReader::normalizeProb(WordInstance* word) { 92 | // normalize seg/pos candidate probability 93 | 94 | double sumSegProb = 0.0; 95 | for (unsigned int i = 0; i < word->candSeg.size(); ++i) { 96 | for (unsigned int j = 0; j < word->candSeg[i].element.size(); ++j) { 97 | SegElement& ele = word->candSeg[i].element[j]; 98 | double sumPosProb = 0.0; 99 | for (unsigned int k = 0; k < ele.candPos.size(); ++k) { 100 | sumPosProb += ele.candProb[k]; 101 | 102 | //if (k > 0) { 103 | // assert(ele.candProb[k - 1] >= ele.candProb[k]); 104 | //} 105 | } 106 | assert(sumPosProb > 0.0); 107 | for (unsigned int k = 0; k < ele.candPos.size(); ++k) { 108 | ele.candProb[k] /= sumPosProb; 109 | if (ele.candProb[k] > 1e-6) 110 | ele.candProb[k] = log(ele.candProb[k]); 111 | else 112 | ele.candProb[k] = -1000000; 113 | } 114 | } 115 | sumSegProb += word->candSeg[i].prob; 116 | } 117 | 118 | assert(sumSegProb > 0.0); 119 | for (unsigned int i = 0; i < word->candSeg.size(); ++i) { 120 | word->candSeg[i].prob /= sumSegProb; 121 | if (word->candSeg[i].prob > 1e-6) 122 | word->candSeg[i].prob = log(word->candSeg[i].prob); 123 | else 124 | word->candSeg[i].prob = 
-1000000; 125 | } 126 | } 127 | 128 | void DependencyReader::addGoldSegToCand(WordInstance* word) { 129 | // add the gold seg in to seg candidate if not exist (with prob 0) 130 | double prob = hasCandidate ? (isTrain ? 0.3 : 0.0) : 1.0; 131 | 132 | string goldSegStr = word->goldForm[0]; 133 | for (unsigned int i = 1; i < word->goldForm.size(); ++i) 134 | goldSegStr += "+" + word->goldForm[i]; 135 | 136 | unsigned int goldSegID = 0; 137 | for (; goldSegID < word->candSeg.size(); ++goldSegID) { 138 | if (goldSegStr.compare(word->candSeg[goldSegID].segStr) == 0) { 139 | break; 140 | } 141 | } 142 | 143 | if (goldSegID == word->candSeg.size()) { 144 | // new cand 145 | //cout << "gold seg not exist" << endl; 146 | 147 | SegInstance segInst; 148 | segInst.prob = prob; 149 | segInst.segStr = goldSegStr; 150 | segInst.morph = word->goldMorph; 151 | segInst.morphIndex = word->goldMorphIndex; 152 | segInst.AlIndex = word->goldAlIndex; 153 | 154 | segInst.element.resize(word->goldForm.size()); 155 | for (unsigned int i = 0; i < word->goldForm.size(); ++i) { 156 | SegElement& curr = segInst.element[i]; 157 | curr.currPosCandID = 0; 158 | curr.form = word->goldForm[i]; 159 | curr.lemma = word->goldLemma[i]; 160 | curr.candPos.resize(1); 161 | curr.candPosid.resize(1); 162 | curr.candDetPosid.resize(1); 163 | curr.candProb.resize(1); 164 | curr.candSpecialPos.resize(1); 165 | 166 | curr.candPos[0] = word->goldPos[i]; 167 | curr.candProb[0] = 1.0; 168 | } 169 | 170 | word->currSegCandID = goldSegID; 171 | word->candSeg.push_back(segInst); 172 | } 173 | else { 174 | // old cand, check pos 175 | SegInstance& segInst = word->candSeg[goldSegID]; 176 | word->currSegCandID = goldSegID; 177 | segInst.morph = word->goldMorph; 178 | segInst.morphIndex = word->goldMorphIndex; 179 | segInst.AlIndex = word->goldAlIndex; 180 | 181 | assert(word->goldForm.size() == segInst.element.size()); 182 | for (unsigned int i = 0; i < word->goldForm.size(); ++i) { 183 | 
assert(word->goldForm[i].compare(segInst.element[i].form) == 0); 184 | segInst.element[i].lemma = word->goldLemma[i]; 185 | string goldPos = word->goldPos[i]; 186 | unsigned int goldPosID = 0; 187 | SegElement& ele = segInst.element[i]; 188 | for (; goldPosID < ele.candPos.size(); ++goldPosID) { 189 | if (goldPos.compare(ele.candPos[goldPosID]) == 0) { 190 | break; 191 | } 192 | } 193 | 194 | if (goldPosID == ele.candPos.size()) { 195 | // new pos 196 | //cout << "gold pos not exist" << endl; 197 | ele.candPos.resize(goldPosID + 1); 198 | ele.candPosid.resize(goldPosID + 1); 199 | ele.candDetPosid.resize(goldPosID + 1); 200 | ele.candProb.resize(goldPosID + 1); 201 | ele.candSpecialPos.resize(goldPosID + 1); 202 | 203 | ele.candPos[goldPosID] = goldPos; 204 | ele.candProb[goldPosID] = prob; 205 | 206 | ele.currPosCandID = goldPosID; 207 | } 208 | else { 209 | // old pos 210 | ele.currPosCandID = goldPosID; 211 | } 212 | } 213 | } 214 | 215 | // add dep, lab will be set in segInstID 216 | SegInstance& segInst = word->getCurrSeg(); 217 | for (int i = 0; i < segInst.size(); ++i) { 218 | segInst.element[i].dep = word->goldDep[i]; 219 | } 220 | } 221 | 222 | void DependencyReader::addSegCand(WordInstance* word, string str) { 223 | vector dataList; 224 | StringSplit(str, "||", &dataList); 225 | if (dataList.size() != 5) 226 | cout << str << endl; 227 | assert(dataList.size() == 5); 228 | SegInstance segInst; 229 | 230 | double segProb = atof(dataList[4].c_str()); 231 | segInst.prob = segProb; 232 | 233 | int AlIndex = atoi(dataList[1].c_str()); 234 | segInst.AlIndex = AlIndex; 235 | 236 | int morphIndex = atoi(dataList[2].c_str()); 237 | segInst.morphIndex = morphIndex; 238 | 239 | StringSplit(dataList[3], "/", &segInst.morph); 240 | 241 | bool hasMorphValue = false; 242 | for (unsigned int i = 0; i < segInst.morph.size(); ++i) { 243 | if (segInst.morph[i] != "na" && segInst.morph[i] != "NA") { 244 | hasMorphValue = true; 245 | break; 246 | } 247 | } 248 | if 
(!hasMorphValue) { 249 | segInst.morphIndex = -1; 250 | segInst.morph.clear(); 251 | } 252 | 253 | string segStr = dataList[0]; 254 | vector segList; 255 | StringSplit(segStr, "&&", &segList); 256 | 257 | segInst.element.resize(segList.size()); 258 | for (unsigned int i = 0; i < segList.size(); ++i) { 259 | SegElement& curr = segInst.element[i]; 260 | vector posList; 261 | StringSplit(segList[i], "@#", &posList); 262 | curr.form = posList[0]; // normalize is done when set inst ids 263 | curr.lemma = posList[1]; 264 | curr.candPos.resize(posList.size() - 2); 265 | curr.candPosid.resize(posList.size() - 2); 266 | curr.candDetPosid.resize(posList.size() - 2); 267 | curr.candProb.resize(posList.size() - 2); 268 | curr.candSpecialPos.resize(posList.size() - 2); 269 | 270 | for (unsigned int j = 2; j < posList.size(); ++j) { 271 | int pos = posList[j].find_last_of("_"); 272 | curr.candPos[j - 2] = posList[j].substr(0, pos); 273 | curr.candProb[j - 2] = atof(posList[j].substr(pos + 1).c_str()); 274 | } 275 | 276 | if (i > 0) 277 | segInst.segStr += "+"; 278 | segInst.segStr += curr.form; 279 | } 280 | 281 | word->candSeg.push_back(move(segInst)); 282 | } 283 | 284 | void DependencyReader::concatSegStr(WordInstance* word) { 285 | word->wordStr = ""; 286 | for (unsigned int i = 0; i < word->goldForm.size(); ++i) { 287 | word->wordStr += word->goldForm[i]; 288 | } 289 | } 290 | 291 | inst_ptr DependencyReader::nextInstance() { 292 | 293 | if (fin.eof()) { 294 | return inst_ptr((DependencyInstance*)NULL); 295 | } 296 | 297 | string str; 298 | getline(fin, str); 299 | if (str.empty()) { 300 | return inst_ptr((DependencyInstance*)NULL); 301 | } 302 | 303 | inst_ptr s(new DependencyInstance()); 304 | vector data; 305 | while (!str.empty()) { 306 | data.push_back(str); 307 | getline(fin, str); 308 | } 309 | 310 | // get sentence length and seg counts 311 | // word id starts from 1, seg starts from 0 312 | int len = 0; 313 | for (unsigned int i = 0; i < data.size(); ++i) { 314 | 
int pos = data[i].find("\t"); 315 | HeadIndex hi = parseHeadIndex(data[i].substr(0, pos)); 316 | len = hi.hWord + 1; // include root 317 | } 318 | 319 | s->numWord = len; 320 | s->word.resize(len); 321 | 322 | // add root information 323 | addGoldSegElement(&s->word[0], "", "", "", "_", 0, -1, 0, ""); 324 | concatSegStr(&s->word[0]); 325 | addSegCand(&s->word[0], "@#@#_1.0||-1||0||_||1.0"); 326 | 327 | // process each line 328 | for (unsigned int i = 0; i < data.size(); ++i) { 329 | vector line; 330 | StringSplit(data[i], "\t", &line); 331 | 332 | HeadIndex id = parseHeadIndex(line[0]); 333 | string word = line[1]; 334 | string lemma = line[2]; 335 | string pos = line[3]; 336 | string morphStr = line[5]; 337 | HeadIndex head = parseHeadIndex(line[6]); 338 | string lab = ""; 339 | 340 | addGoldSegElement(&s->word[id.hWord], word, lemma, pos, morphStr, id.hSeg, head.hWord, head.hSeg, lab); 341 | assert((unsigned int)id.hSeg + 1 == s->word[id.hWord].goldForm.size()); 342 | } 343 | 344 | // complete information 345 | for (int i = 1; i < len; ++i) { 346 | concatSegStr(&s->word[i]); 347 | } 348 | 349 | // process segmentation candidate 350 | if (hasCandidate) { 351 | for (int i = 1; i < len; ++i) { 352 | getline(fin, str); 353 | vector segCand; 354 | StringSplit(str, "\t", &segCand); 355 | if (s->word[i].wordStr != segCand[0]) { 356 | cout << str << endl; 357 | cout << s->word[i].wordStr << " " << segCand[0] << endl; 358 | } 359 | assert(s->word[i].wordStr.compare(segCand[0]) == 0); 360 | for (unsigned int j = 1; j < segCand.size(); ++j) { 361 | addSegCand(&s->word[i], segCand[j]); 362 | } 363 | } 364 | getline(fin, str); 365 | assert(str.empty()); 366 | } 367 | 368 | // complete information 369 | for (int i = 0; i < len; ++i) { 370 | addGoldSegToCand(&s->word[i]); 371 | normalizeProb(&s->word[i]); 372 | } 373 | 374 | s->constructConversionList(); 375 | s->setOptSegPosCount(); 376 | s->buildChild(); 377 | 378 | return s; 379 | } 380 | 381 | } /* namespace segparser */ 
382 | -------------------------------------------------------------------------------- /io/DependencyReader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * DependencyReader.h 3 | * 4 | * Created on: Mar 27, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #ifndef DEPENDENCYREADER_H_ 9 | #define DEPENDENCYREADER_H_ 10 | 11 | #include 12 | #include "../Options.h" 13 | #include "../DependencyInstance.h" 14 | 15 | namespace segparser { 16 | 17 | using namespace std; 18 | 19 | class DependencyReader { 20 | public: 21 | DependencyReader(); 22 | DependencyReader(Options* options, string file); 23 | virtual ~DependencyReader(); 24 | 25 | void startReading(Options* options, string file); 26 | void startReading(string file); 27 | void close(); 28 | inst_ptr nextInstance(); 29 | 30 | bool hasCandidate; 31 | bool isTrain; 32 | 33 | private: 34 | ifstream fin; 35 | Options* options; 36 | 37 | HeadIndex parseHeadIndex(string str); 38 | void addGoldSegElement(WordInstance* word, string form, string lemma, string pos, 39 | string morphStr, int segid, int hwordid, int hsegid, string lab); 40 | void addGoldSegToCand(WordInstance* word); 41 | void normalizeProb(WordInstance* word); 42 | void addSegCand(WordInstance* word, string str); 43 | string normalize(string s); 44 | void concatSegStr(WordInstance* word); 45 | }; 46 | 47 | } /* namespace segparser */ 48 | #endif /* DEPENDENCYREADER_H_ */ 49 | -------------------------------------------------------------------------------- /io/DependencyWriter.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * DependencyWriter.cpp 3 | * 4 | * Created on: Apr 16, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #include "DependencyWriter.h" 9 | 10 | namespace segparser { 11 | 12 | DependencyWriter::DependencyWriter(Options* options) : options(options) { 13 | } 14 | 15 | DependencyWriter::DependencyWriter(Options* options, string file) : options(options) { 16 | 
startWriting(file); 17 | } 18 | 19 | DependencyWriter::~DependencyWriter() { 20 | } 21 | 22 | void DependencyWriter::startWriting(string file) { 23 | fout.open(file.c_str()); 24 | } 25 | 26 | void DependencyWriter::close() { 27 | if (fout.is_open()) 28 | fout.close(); 29 | } 30 | 31 | void DependencyWriter::writeInstance(DependencyInstance* inst) { 32 | for (int i = 1; i < inst->numWord; ++i) { 33 | WordInstance& word = inst->word[i]; 34 | SegInstance& segInst = word.getCurrSeg(); 35 | 36 | for (int j = 0; j < segInst.size(); ++j) { 37 | fout << i << "/" << j << "\t" << segInst.element[j].form << "\t" << segInst.element[j].form << "\t"; 38 | string pos = segInst.element[j].candPos[segInst.element[j].currPosCandID]; 39 | fout << pos << "\t" << pos << "\t_\t"; 40 | fout << segInst.element[j].dep << "\t" << word.currSegCandID << "\t" << segInst.element[j].currPosCandID << "\t_\n"; 41 | } 42 | } 43 | fout << endl; 44 | fout.flush(); 45 | } 46 | 47 | } /* namespace segparser */ 48 | -------------------------------------------------------------------------------- /io/DependencyWriter.h: -------------------------------------------------------------------------------- 1 | /* 2 | * DependencyWriter.h 3 | * 4 | * Created on: Apr 16, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #ifndef DEPENDENCYWRITER_H_ 9 | #define DEPENDENCYWRITER_H_ 10 | 11 | #include 12 | #include "../DependencyInstance.h" 13 | #include "../Options.h" 14 | 15 | namespace segparser { 16 | 17 | class DependencyWriter { 18 | public: 19 | DependencyWriter(Options* options); 20 | DependencyWriter(Options* options, string file); 21 | virtual ~DependencyWriter(); 22 | 23 | void startWriting(string file); 24 | void close(); 25 | void writeInstance(DependencyInstance* inst); 26 | 27 | private: 28 | ofstream fout; 29 | Options* options; 30 | 31 | }; 32 | 33 | } /* namespace segparser */ 34 | #endif /* DEPENDENCYWRITER_H_ */ 35 | -------------------------------------------------------------------------------- 
/runs/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /spmrl_code_generator/SpmrlDataGenerator.java: -------------------------------------------------------------------------------- 1 | import java.io.*; 2 | import java.util.*; 3 | 4 | public class SpmrlDataGenerator { 5 | 6 | public static boolean sameSentence(SpmrlSentence s1, MadaSentence s2) { 7 | String x1 = ""; 8 | for (int i = 0; i < s1.words.length; ++i) 9 | x1 += s1.words[i].word + " "; 10 | 11 | String x2 = ""; 12 | for (int i = 0; i < s2.words.length; ++i) 13 | x2 += s2.words[i].word + " "; 14 | 15 | if (!x1.equals(x2)) { 16 | System.out.println(x1); 17 | System.out.println(x2); 18 | } 19 | return x1.equals(x2); 20 | } 21 | 22 | public static void generateFile(int split, String fileName) throws IOException { 23 | int maxLength = 70; 24 | 25 | SpmrlReader reader = new SpmrlReader(); 26 | reader.open("../data/" + fileName + ".Arabic"); 27 | if (reader.useGoldSegDict) 28 | reader.buildGoldSegDict(); 29 | 30 | MadaReader[] madaReader = new MadaReader[split]; 31 | int maxLine = -1; 32 | for (int i = 0; i < split; ++i) { 33 | madaReader[i] = new MadaReader(); 34 | madaReader[i].open("../data/" + fileName + i + ".mada"); 35 | madaReader[i].maxLine = maxLine; 36 | } 37 | 38 | BufferedWriter bw = new BufferedWriter(new FileWriter("../data/spmrl/spmrl.seg.full." 
+ fileName)); 39 | 40 | SpmrlSentence sent = null; 41 | int id = 0; 42 | int cnt = 0; 43 | while ((sent = reader.readNextSentence()) != null) { 44 | MadaSentence madaSent = madaReader[id].readNextSentence(); 45 | 46 | MadaReader.Assert(sameSentence(sent, madaSent)); 47 | 48 | int segNum = 0; 49 | for (int i = 0; i < sent.words.length; ++i) 50 | segNum += sent.words[i].segs.length; 51 | if (segNum <= maxLength) { 52 | reader.outputGoldSentence(sent, bw); 53 | SegStruct[] segStruct = madaReader[id].generateCandidate(madaSent); 54 | for (int z = 0; z < segStruct.length; ++z) { 55 | if (segStruct[z].word.equals("l= 5) { 64 | reader.addGoldSegDict(segStruct); 65 | // for (int z = 0; z < segStruct.length; ++z) { 66 | // if (segStruct[z].word.equals("l& key) { 46 | key.reserve(numEntries); 47 | for (auto kv : map) { 48 | key.push_back(kv.first); 49 | } 50 | } 51 | 52 | int Alphabet::size() { 53 | return numEntries + 1; 54 | } 55 | 56 | void Alphabet::stopGrowth() { 57 | growthStopped = true; 58 | } 59 | 60 | void Alphabet::writeObject (FILE* fs) { 61 | CHECK(WriteInteger(fs, numEntries)); 62 | CHECK(WriteStringIntegerMap(fs, map)); 63 | CHECK(WriteBool(fs, growthStopped)); 64 | } 65 | 66 | void Alphabet::readObject (FILE* fs) { 67 | CHECK(ReadInteger(fs, &numEntries)); 68 | CHECK(ReadStringIntegerMap(fs, &map)); 69 | CHECK(ReadBool(fs, &growthStopped)); 70 | } 71 | 72 | } /* namespace segparser */ 73 | -------------------------------------------------------------------------------- /util/Alphabet.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Alphabet.h 3 | * 4 | * Created on: Mar 28, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #ifndef ALPHABET_H_ 9 | #define ALPHABET_H_ 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | namespace segparser { 16 | 17 | using namespace std; 18 | 19 | class Alphabet { 20 | public: 21 | Alphabet(); 22 | Alphabet(int capacity); 23 | virtual ~Alphabet(); 24 | 25 | int lookupIndex (const 
string& entry, bool addIfNotPresent); 26 | int lookupIndex (const string& entry); 27 | void toArray(vector& key); 28 | int size(); 29 | void stopGrowth(); 30 | 31 | void writeObject (FILE* fs); 32 | void readObject (FILE* fs); 33 | 34 | private: 35 | unordered_map map; 36 | int numEntries; 37 | bool growthStopped; 38 | }; 39 | 40 | } /* namespace segparser */ 41 | #endif /* ALPHABET_H_ */ 42 | -------------------------------------------------------------------------------- /util/Constant.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Constant.cpp 3 | * 4 | * Created on: Mar 27, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #include "Constant.h" 9 | 10 | namespace segparser { 11 | 12 | const string PossibleLang::langString[] = {"qatar", "spmrl", "ctb"}; 13 | 14 | } /* namespace segparser */ 15 | -------------------------------------------------------------------------------- /util/Constant.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Constant.h 3 | * 4 | * Created on: Mar 27, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #ifndef CONSTANT_H_ 9 | #define CONSTANT_H_ 10 | 11 | #include 12 | 13 | namespace segparser { 14 | 15 | class DependencyInstance; 16 | 17 | using namespace std; 18 | 19 | struct DecodingMode { 20 | enum types { 21 | HillClimb = 0, 22 | Exact, 23 | }; 24 | }; 25 | 26 | struct PossibleLang { 27 | enum types { 28 | Arabic = 0, 29 | SPMRL, 30 | Chinese, 31 | Count, 32 | }; 33 | 34 | static const string langString[PossibleLang::Count]; 35 | }; 36 | 37 | struct SpecialPos { 38 | enum types { 39 | C = 0, P, PNX, V, AJ, N, OTHER, COUNT, 40 | }; 41 | }; 42 | 43 | struct ConstPosLex { 44 | enum types { 45 | UNSEEN = 0, 46 | START, 47 | MID, 48 | END, 49 | QUOTE, 50 | LRB, 51 | RRB, 52 | }; 53 | }; 54 | 55 | struct ConstLab { 56 | enum types { 57 | UNSEEN = 0, 58 | NOTYPE, 59 | }; 60 | }; 61 | 62 | #define BINNED_BUCKET 8 63 | #define MAX_CHILD_NUM 5 64 | #define 
MAX_SPAN_LENGTH 5 65 | #define MAX_LEN_DIFF 4 66 | #define MAX_FEATURE_NUM 7 67 | 68 | } /* namespace segparser */ 69 | #endif /* CONSTANT_H_ */ 70 | -------------------------------------------------------------------------------- /util/FeatureAlphabet.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * FeatureAlphabet.cpp 3 | * 4 | * Created on: Mar 28, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #include "FeatureAlphabet.h" 9 | #include "SerializationUtils.h" 10 | #include "../FeatureEncoder.h" 11 | 12 | namespace segparser { 13 | 14 | FeatureAlphabet::FeatureAlphabet (int capacity) { 15 | arcMap.reserve(capacity); 16 | secondOrderMap.reserve(capacity); 17 | thirdOrderMap.reserve(capacity); 18 | highOrderMap.reserve(capacity); 19 | numEntries = 0; 20 | growthStopped = false; 21 | 22 | table[TemplateType::TArc] = &arcMap; 23 | table[TemplateType::TSecondOrder] = &secondOrderMap; 24 | table[TemplateType::TThirdOrder] = &thirdOrderMap; 25 | table[TemplateType::THighOrder] = &highOrderMap; 26 | } 27 | 28 | FeatureAlphabet::FeatureAlphabet() : FeatureAlphabet(10000) { 29 | } 30 | 31 | FeatureAlphabet::~FeatureAlphabet() { 32 | } 33 | 34 | unordered_map* FeatureAlphabet::getMap(int type) { 35 | 36 | unordered_map* intmap = NULL; 37 | if (type < TemplateType::COUNT) 38 | intmap = table[type]; 39 | else 40 | ThrowException("undefined template type"); 41 | return intmap; 42 | } 43 | 44 | int FeatureAlphabet::lookupIndex(const int type, const uint64_t entry, bool addIfNotPresent) { 45 | unordered_map* intmap = getMap(type); 46 | 47 | int ret = 0; 48 | if (intmap->find(entry) == intmap->end()) { 49 | if (!growthStopped && addIfNotPresent) { 50 | ret = numEntries + 1; 51 | numEntries++; 52 | (*intmap)[entry] = ret; 53 | } 54 | } 55 | else { 56 | ret = (*intmap)[entry]; 57 | } 58 | return ret; 59 | } 60 | 61 | int FeatureAlphabet::lookupIndex(unordered_map& intmap, const uint64_t entry, bool addIfNotPresent) { 62 | int ret = 0; 63 | 
if (intmap.find(entry) == intmap.end()) { 64 | if (!growthStopped && addIfNotPresent) { 65 | ret = numEntries + 1; 66 | numEntries++; 67 | intmap[entry] = ret; 68 | } 69 | } 70 | else { 71 | ret = intmap[entry]; 72 | } 73 | return ret; 74 | } 75 | 76 | int FeatureAlphabet::lookupIndex(const int type, const uint64_t entry) { 77 | return lookupIndex (type, entry, true); 78 | } 79 | 80 | int FeatureAlphabet::size() { 81 | return numEntries + 1; 82 | } 83 | 84 | void FeatureAlphabet::stopGrowth() { 85 | growthStopped = true; 86 | } 87 | 88 | void FeatureAlphabet::writeObject (FILE* fs) { 89 | CHECK(WriteInteger(fs, numEntries)); 90 | CHECK(WriteUINT64IntegerMap(fs, arcMap)); 91 | CHECK(WriteUINT64IntegerMap(fs, secondOrderMap)); 92 | CHECK(WriteUINT64IntegerMap(fs, thirdOrderMap)); 93 | CHECK(WriteUINT64IntegerMap(fs, highOrderMap)); 94 | CHECK(WriteBool(fs, growthStopped)); 95 | } 96 | 97 | void FeatureAlphabet::readObject (FILE* fs) { 98 | CHECK(ReadInteger(fs, &numEntries)); 99 | CHECK(ReadUINT64IntegerMap(fs, &arcMap)); 100 | CHECK(ReadUINT64IntegerMap(fs, &secondOrderMap)); 101 | CHECK(ReadUINT64IntegerMap(fs, &thirdOrderMap)); 102 | CHECK(ReadUINT64IntegerMap(fs, &highOrderMap)); 103 | CHECK(ReadBool(fs, &growthStopped)); 104 | } 105 | 106 | } /* namespace segparser */ 107 | -------------------------------------------------------------------------------- /util/FeatureAlphabet.h: -------------------------------------------------------------------------------- 1 | /* 2 | * FeatureAlphabet.h 3 | * 4 | * Created on: Mar 28, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #ifndef FEATUREALPHABET_H_ 9 | #define FEATUREALPHABET_H_ 10 | 11 | #include 12 | #include 13 | #include "../FeatureEncoder.h" 14 | 15 | namespace segparser { 16 | 17 | using namespace std; 18 | 19 | class FeatureAlphabet { 20 | public: 21 | FeatureAlphabet(int capacity); 22 | FeatureAlphabet(); 23 | virtual ~FeatureAlphabet(); 24 | 25 | int lookupIndex (const string& entry, bool addIfNotPresent); 26 | int 
lookupIndex (const string& entry); 27 | unordered_map* getMap(int type); 28 | int lookupIndex(const int type, const uint64_t entry, bool addIfNotPresent); 29 | int lookupIndex(unordered_map& intmap, const uint64_t entry, bool addIfNotPresent); 30 | int lookupIndex(const int type, const uint64_t entry); 31 | int size(); 32 | void stopGrowth(); 33 | void writeObject (FILE* fs); 34 | void readObject (FILE* fs); 35 | 36 | unordered_map arcMap; 37 | unordered_map secondOrderMap; 38 | unordered_map thirdOrderMap; 39 | unordered_map highOrderMap; 40 | 41 | private: 42 | int numEntries; 43 | bool growthStopped; 44 | 45 | unordered_map* table[TemplateType::COUNT]; 46 | }; 47 | 48 | } /* namespace segparser */ 49 | #endif /* FEATUREALPHABET_H_ */ 50 | -------------------------------------------------------------------------------- /util/FeatureVector.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * FeatureVector.cpp 3 | * 4 | * Created on: Mar 28, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #include "FeatureVector.h" 9 | #include 10 | #include 11 | 12 | namespace segparser { 13 | 14 | vector FeatureVector::dpVec; 15 | unsigned int FeatureVector::rows; 16 | 17 | FeatureVector::FeatureVector() { 18 | } 19 | 20 | FeatureVector::~FeatureVector() { 21 | } 22 | 23 | void FeatureVector::clear() { 24 | binaryIndex.clear(); 25 | negBinaryIndex.clear(); 26 | normalIndex.clear(); 27 | normalValue.clear(); 28 | } 29 | 30 | void FeatureVector::add(int index, double value) { 31 | normalIndex.push_back(index); 32 | normalValue.push_back(value); 33 | } 34 | 35 | void FeatureVector::addBinary(int index) { 36 | binaryIndex.push_back(index); 37 | } 38 | 39 | void FeatureVector::addNegBinary(int index) { 40 | negBinaryIndex.push_back(index); 41 | } 42 | 43 | void FeatureVector::concat(FeatureVector* fv) { 44 | for(unsigned int i = 0; i < fv->binaryIndex.size(); ++i) { 45 | binaryIndex.push_back(fv->binaryIndex[i]); 46 | } 47 | for(unsigned int i = 
0; i < fv->negBinaryIndex.size(); ++i) { 48 | negBinaryIndex.push_back(fv->negBinaryIndex[i]); 49 | } 50 | for(unsigned int i = 0; i < fv->normalIndex.size(); ++i) { 51 | normalIndex.push_back(fv->normalIndex[i]); 52 | normalValue.push_back(fv->normalValue[i]); 53 | } 54 | } 55 | 56 | void FeatureVector::concatNeg(FeatureVector* fv) { 57 | for(unsigned int i = 0; i < fv->binaryIndex.size(); ++i) { 58 | negBinaryIndex.push_back(fv->binaryIndex[i]); 59 | } 60 | for(unsigned int i = 0; i < fv->negBinaryIndex.size(); ++i) { 61 | binaryIndex.push_back(fv->negBinaryIndex[i]); 62 | } 63 | for(unsigned int i = 0; i < fv->normalIndex.size(); ++i) { 64 | normalIndex.push_back(fv->normalIndex[i]); 65 | normalValue.push_back(-fv->normalValue[i]); 66 | } 67 | } 68 | 69 | double FeatureVector::dotProduct(FeatureVector* fv) { 70 | double result = 0.0; 71 | 72 | double b = 2.0; 73 | 74 | for(unsigned int i = 0; i < binaryIndex.size(); ++i) { 75 | dpVec[binaryIndex[i]] += 1.0; 76 | } 77 | for(unsigned int i = 0; i < negBinaryIndex.size(); ++i) { 78 | dpVec[negBinaryIndex[i]] -= 1.0; 79 | } 80 | for(unsigned int i = 0; i < normalIndex.size(); ++i) { 81 | double val = min(b, max(-b, normalValue[i])); 82 | dpVec[normalIndex[i]] += val; 83 | } 84 | 85 | for(unsigned int i = 0; i < fv->binaryIndex.size(); ++i) { 86 | result += dpVec[fv->binaryIndex[i]]; 87 | } 88 | for(unsigned int i = 0; i < fv->negBinaryIndex.size(); ++i) { 89 | result -= dpVec[fv->negBinaryIndex[i]]; 90 | } 91 | for(unsigned int i = 0; i < fv->normalIndex.size(); ++i) { 92 | double val = min(b, max(-b, fv->normalValue[i])); 93 | result += dpVec[fv->normalIndex[i]] * val; 94 | } 95 | 96 | for(unsigned int i = 0; i < binaryIndex.size(); ++i) { 97 | dpVec[binaryIndex[i]] = 0.0; 98 | } 99 | for(unsigned int i = 0; i < negBinaryIndex.size(); ++i) { 100 | dpVec[negBinaryIndex[i]] = 0.0; 101 | } 102 | for(unsigned int i = 0; i < normalIndex.size(); ++i) { 103 | dpVec[normalIndex[i]] = 0.0; 104 | } 105 | 106 | return result; 
107 | } 108 | 109 | double FeatureVector::dotProduct(vector& param) { 110 | // get score 111 | double score = 0.0; 112 | for(unsigned int i = 0; i < binaryIndex.size(); ++i) { 113 | score += param[binaryIndex[i]]; 114 | } 115 | for(unsigned int i = 0; i < negBinaryIndex.size(); ++i) { 116 | score -= param[negBinaryIndex[i]]; 117 | } 118 | for(unsigned int i = 0; i < normalIndex.size(); ++i) { 119 | score += param[normalIndex[i]] * normalValue[i]; 120 | } 121 | return score; 122 | } 123 | 124 | void FeatureVector::initVec(unsigned int _rows) { 125 | rows = _rows; 126 | dpVec.resize(rows); 127 | } 128 | 129 | void FeatureVector::output() { 130 | cout << "bi: "; 131 | for(unsigned int i = 0; i < binaryIndex.size(); ++i) { 132 | cout << binaryIndex[i] << " "; 133 | } 134 | cout << endl; 135 | cout << "nbi: "; 136 | for(unsigned int i = 0; i < negBinaryIndex.size(); ++i) { 137 | cout << negBinaryIndex[i] << " "; 138 | } 139 | cout << endl; 140 | cout << "ni: "; 141 | for(unsigned int i = 0; i < normalIndex.size(); ++i) { 142 | cout << normalIndex[i] << " "; 143 | } 144 | cout << endl; 145 | 146 | int x; 147 | cin >> x; 148 | } 149 | 150 | } /* namespace segparser */ 151 | -------------------------------------------------------------------------------- /util/FeatureVector.h: -------------------------------------------------------------------------------- 1 | /* 2 | * FeatureVector.h 3 | * 4 | * Created on: Mar 28, 2014 5 | * Author: yuanz 6 | */ 7 | 8 | #ifndef FEATUREVECTOR_H_ 9 | #define FEATUREVECTOR_H_ 10 | 11 | #include 12 | 13 | namespace segparser { 14 | 15 | using namespace std; 16 | 17 | class FeatureVector { 18 | public: 19 | vector binaryIndex; 20 | vector negBinaryIndex; 21 | vector normalIndex; 22 | vector normalValue; 23 | 24 | FeatureVector(); 25 | virtual ~FeatureVector(); 26 | 27 | void clear(); 28 | void add(int index, double value); 29 | void addBinary(int index); 30 | void addNegBinary(int index); 31 | void concat(FeatureVector* fv); 32 | void 
// Returns log(exp(num1) + exp(num2)) without overflowing for large inputs.
//
// The larger argument is factored out, so only exp() of a non-positive
// number is evaluated. Equal arguments are answered directly: this also
// covers two equal infinities, where subtracting the maximum would give
// inf - inf = NaN. A NaN argument yields NaN.
double logsumexp(double num1, double num2) {
	if (num1 == num2) {
		// log(2 * exp(x)) = x + log 2; stays +/-inf for infinite x.
		return num1 + log(2.0);
	}

	const bool secondIsLarger = num2 > num1;
	const double hi = secondIsLarger ? num2 : num1;
	const double lo = secondIsLarger ? num1 : num2;

	// log1p keeps precision when exp(lo - hi) is tiny.
	return hi + log1p(exp(lo - hi));
}
// Binary (de)serialization helpers. Values are written in the host's native
// byte order and type sizes, so files are only portable between builds that
// agree on both. Every function returns true on success and false on a
// short read/write or a malformed count.

namespace {

// Writes the raw bytes of one value.
template <typename T>
bool WriteRaw(FILE *fs, const T& value) {
  return 1 == fwrite(&value, sizeof(T), 1, fs);
}

// Reads the raw bytes of one value.
template <typename T>
bool ReadRaw(FILE *fs, T *value) {
  return 1 == fread(value, sizeof(T), 1, fs);
}

// Writes a container size as the int the file format uses; fails instead of
// silently truncating a size that does not fit.
bool WriteCount(FILE *fs, size_t size) {
  const int count = static_cast<int>(size);
  if (count < 0 || static_cast<size_t>(count) != size) return false;
  return WriteRaw(fs, count);
}

// Reads a container size; a negative value means a corrupt file.
bool ReadCount(FILE *fs, int *count) {
  return ReadRaw(fs, count) && *count >= 0;
}

}  // namespace

// Writes a length-prefixed string (unsigned int length, then the bytes).
// The full contents are written, including embedded NUL characters.
bool WriteString(FILE *fs, const std::string& data) {
  const unsigned int length = static_cast<unsigned int>(data.size());
  if (length != data.size()) return false;
  if (!WriteRaw(fs, length)) return false;
  return length == fwrite(data.data(), sizeof(char), length, fs);
}

bool WriteBool(FILE *fs, bool value) {
  return WriteRaw(fs, value);
}

bool WriteInteger(FILE *fs, int value) {
  return WriteRaw(fs, value);
}

bool WriteUINT64(FILE *fs, uint64_t value) {
  return WriteRaw(fs, value);
}

bool WriteDouble(FILE *fs, double value) {
  return WriteRaw(fs, value);
}

// Writes the entry count followed by every (key, value) pair.
bool WriteStringIntegerMap(FILE *fs, const std::unordered_map<std::string, int>& map) {
  if (!WriteCount(fs, map.size())) return false;
  for (const auto& kv : map) {
    if (!WriteString(fs, kv.first)) return false;
    if (!WriteInteger(fs, kv.second)) return false;
  }
  return true;
}

// Writes the entry count followed by every (key, value) pair.
bool WriteUINT64IntegerMap(FILE *fs, const std::unordered_map<uint64_t, int>& map) {
  if (!WriteCount(fs, map.size())) return false;
  for (const auto& kv : map) {
    if (!WriteUINT64(fs, kv.first)) return false;
    if (!WriteInteger(fs, kv.second)) return false;
  }
  return true;
}

// Writes the element count followed by the elements.
bool WriteDoubleArray(FILE *fs, const std::vector<double>& arr) {
  if (!WriteCount(fs, arr.size())) return false;
  for (size_t i = 0; i < arr.size(); ++i) {
    if (!WriteDouble(fs, arr[i])) return false;
  }
  return true;
}

// Reads a string written by WriteString. *data is modified only on success.
// The bytes are read straight into a std::string, so nothing is leaked when
// the file ends early.
bool ReadString(FILE *fs, std::string *data) {
  unsigned int length = 0;
  if (!ReadRaw(fs, &length)) return false;
  std::string buffer(length, '\0');
  if (length > 0 && length != fread(&buffer[0], sizeof(char), length, fs)) {
    return false;
  }
  data->swap(buffer);
  return true;
}

bool ReadBool(FILE *fs, bool *value) {
  return ReadRaw(fs, value);
}

bool ReadInteger(FILE *fs, int *value) {
  return ReadRaw(fs, value);
}

bool ReadUINT64(FILE *fs, uint64_t *value) {
  return ReadRaw(fs, value);
}

bool ReadDouble(FILE *fs, double *value) {
  return ReadRaw(fs, value);
}

// Reads the pairs written by WriteStringIntegerMap into *map. Entries
// already in *map are kept; a key that is read again is overwritten.
bool ReadStringIntegerMap(FILE *fs, std::unordered_map<std::string, int>* map) {
  int size = 0;
  if (!ReadCount(fs, &size)) return false;
  for (int i = 0; i < size; ++i) {
    std::string key;
    int value = 0;
    if (!ReadString(fs, &key)) return false;
    if (!ReadInteger(fs, &value)) return false;
    (*map)[key] = value;
  }
  return true;
}

// Reads the pairs written by WriteUINT64IntegerMap into *map. Entries
// already in *map are kept; a key that is read again is overwritten.
bool ReadUINT64IntegerMap(FILE *fs, std::unordered_map<uint64_t, int>* map) {
  int size = 0;
  if (!ReadCount(fs, &size)) return false;
  for (int i = 0; i < size; ++i) {
    uint64_t key = 0;
    int value = 0;
    if (!ReadUINT64(fs, &key)) return false;
    if (!ReadInteger(fs, &value)) return false;
    (*map)[key] = value;
  }
  return true;
}

// Replaces the contents of *arr with the array written by WriteDoubleArray.
bool ReadDoubleArray(FILE* fs, std::vector<double>* arr) {
  int size = 0;
  if (!ReadCount(fs, &size)) return false;
  arr->clear();
  arr->reserve(size);
  for (int i = 0; i < size; ++i) {
    double value = 0.0;
    if (!ReadDouble(fs, &value)) return false;
    arr->push_back(value);
  }
  return true;
}
// Splits `str` on every occurrence of the whole string `delim` (not on its
// individual characters) and appends the non-empty pieces to *results.
// Empty pieces - leading, trailing or between adjacent delimiters - are
// dropped. An empty `delim` cannot split anything: `str` is appended as a
// single piece when it is non-empty.
void StringSplit(const std::string &str,
                 const std::string &delim,
                 std::vector<std::string> *results) {
  if (delim.empty()) {
    if (!str.empty()) results->push_back(str);
    return;
  }

  std::string::size_type start = 0;
  std::string::size_type cutAt;
  while ((cutAt = str.find(delim, start)) != std::string::npos) {
    if (cutAt > start) {
      results->push_back(str.substr(start, cutAt - start));
    }
    start = cutAt + delim.size();
  }
  if (start < str.size()) results->push_back(str.substr(start));
}

// Removes from the front of *line every character that appears in `delim`.
// A line made only of such characters becomes empty.
void TrimLeft(const std::string &delim, std::string *line) {
  const std::string::size_type cutAt = line->find_first_not_of(delim);
  if (cutAt == std::string::npos) {
    line->clear();
  } else {
    line->erase(0, cutAt);
  }
}

// Removes from the back of *line every character that appears in `delim`.
// A line made only of such characters becomes empty.
void TrimRight(const std::string &delim, std::string *line) {
  const std::string::size_type cutAt = line->find_last_not_of(delim);
  if (cutAt == std::string::npos) {
    line->clear();
  } else {
    line->erase(cutAt + 1);
  }
}

// Trims both ends of *line (TrimLeft followed by TrimRight).
void Trim(const std::string &delim, std::string *line) {
  TrimLeft(delim, line);
  TrimRight(delim, line);
}

// Despite the name this does not throw: it prints `msg` to standard error
// and terminates the process with exit status -1.
void ThrowException(const std::string& msg) {
  std::cerr << msg << std::endl;
  exit(-1);
}

// Counts the occurrences of the marker "ASC/" in `str`.
int ChineseStringLength(const std::string& str) {
  int len = 0;
  std::string::size_type p = str.find("ASC/");
  while (p != std::string::npos) {
    ++len;
    p = str.find("ASC/", p + 4);
  }
  return len;
}

// Returns the k-th (0-based) marker-delimited unit of `str`: the text from
// the (k+1)-th "ASC/" marker up to, but not including, the following marker
// or the end of the string. Asking for a unit beyond the last one trips the
// assertion in debug builds when fewer than k markers exist, and otherwise
// returns an empty string.
std::string GetChineseChar(const std::string& str, int k) {
  const std::string::size_type markerLen = 4;  // length of "ASC/"

  std::string::size_type p = 0;
  for (int i = 0; i < k; ++i) {
    const std::string::size_type hit = str.find("ASC/", p);
    assert(hit != std::string::npos);
    if (hit == std::string::npos) return std::string();
    p = hit + markerLen;
  }

  const std::string::size_type st = str.find("ASC/", p);
  if (st == std::string::npos) return std::string();
  const std::string::size_type en = str.find("ASC/", st + markerLen);

  return str.substr(st, en == std::string::npos ? std::string::npos : en - st);
}
// Wall-clock stopwatch based on gettimeofday(). The clock starts when the
// object is constructed.
class Timer {
public:

	Timer() {
		gettimeofday(&begin, NULL);
	}

	// Returns the milliseconds elapsed since construction, including the
	// fractional part. The timer keeps its start time, so stop() can be
	// called repeatedly to take successive readings.
	double stop() {
		timeval end;
		gettimeofday(&end, NULL);
		// Convert to double before scaling: integer arithmetic dropped the
		// sub-millisecond part and could overflow a 32-bit tv_sec product.
		const double seconds = static_cast<double>(end.tv_sec - begin.tv_sec);
		const double micros = static_cast<double>(end.tv_usec - begin.tv_usec);
		return seconds * 1000.0 + micros / 1000.0;
	}
private:
	timeval begin;
};