├── LICENSE ├── README.md ├── data ├── english.aff ├── english.dic ├── hu-en.dic ├── hu-en.stem.dic ├── hungarian.aff ├── hungarian.dic └── null.dic ├── examples ├── demo.en.stem ├── demo.hu.stem ├── demo.manual.ladder ├── en.raw └── hu.raw ├── regtest ├── handaligns │ ├── 1984.hu.handstem │ │ ├── README │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ └── hu.sen │ ├── 1984.hu │ │ ├── README │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ └── hu.sen │ ├── 1984.ro.utf8 │ │ ├── README │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ ├── hu.sen │ │ └── sgmlTolatin2.sed │ ├── 1984.ro │ │ ├── README │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ ├── hu.sen │ │ └── sgmlTolatin2.sed │ ├── dtm │ │ ├── README │ │ ├── dtm.bi │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ └── hu.sen │ ├── steinbeck.huntoken.nopara │ │ ├── README │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ └── hu.sen │ ├── steinbeck.huntoken │ │ ├── README │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ └── hu.sen │ └── steinbeck │ │ ├── README │ │ ├── auto.ladder │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ └── hu.sen ├── regtest.sh ├── results │ └── dummy └── targets │ ├── 1984.hu.cerr │ ├── 1984.hu.handstem.realign.cerr │ ├── 1984.ro.realign.cerr │ ├── 1984.ro.utf8.realign.cerr │ ├── dtm.realign.cerr │ └── steinbeck.huntoken.nopara.cerr ├── scripts ├── DCEP │ ├── README │ ├── README.md │ ├── batchfilebylangpair.2ndcpu.sh │ ├── batchfilebylangpair.sh │ ├── dictforlanguagepair.sh │ ├── dictsforalllanguagepairs.sh │ ├── extract-bisentences.sh │ ├── filteralign.sh │ ├── finalpackage.sh │ ├── finalpackageforlangpairs.sh │ ├── flatladdertolangpair.sh │ ├── flatladdertolangpairs.sh │ ├── ladder2text.py │ ├── languagepair.py │ ├── mergedicts.sh │ ├── normalizedict.sh │ ├── normalizesztakidict.sh │ ├── packaligninfobylangpair.sh │ ├── readme.sh │ ├── realignall.2ndcpu.sh │ ├── realignall.sh │ ├── renamesztakidicts.sh │ ├── reorg.py │ ├── tokenizeAll.sh │ └── verifylangpair.sh ├── en.sen.one.sh ├── hu.sen.one.sh ├── hunalignDriver.py ├── ladder2text.py ├── partialAlign.py ├── process.sh ├── release.howto.txt ├── subprocessTest.py ├── teed.py ├── testProcess-RealHunalign.sh ├── testProcess.sh ├── testProcess1.sh ├── testProcessWithInput.sh ├── tok.one.sh ├── translate.txt ├── visualizeAlignQuality.awk ├── visualizeLadder.awk └── visualizeLadder.noshrink.awk └── src ├── hunalign ├── DOMTreeErrorReporter.cpp ├── Makefile ├── TEIReader.cpp ├── TEIReader.h ├── alignerTool.cpp ├── alignment.cpp ├── alignment.h ├── bloom.cpp ├── bloom.h ├── bookToMatrix.cpp ├── bookToMatrix.h ├── cooccurrence.cpp ├── cooccurrence.h ├── cooccurrenceTool.cpp ├── dicTree.h ├── dictionary.cpp ├── dictionary.h ├── help.h ├── main.cpp ├── networkFlow.cpp ├── networkFlow.h ├── oldAlignTest.cpp ├── quasiDiagonal.h ├── similarityEvaluator.cpp ├── similarityEvaluator.h ├── trailPostprocessors.cpp ├── trailPostprocessors.h ├── translate.cpp ├── translate.h ├── wordAlignment.cpp ├── wordAlignment.h └── words.h ├── include ├── argumentsParser.h ├── histogram.h ├── portableHash.h ├── serializeImpl.h ├── stringsAndStreams.h └── timer.h └── utils ├── argumentsParser.cpp ├── histogram.cpp ├── stringsAndStreams.cpp └── timer.cpp /data/english.aff: -------------------------------------------------------------------------------- 1 | 2 | SET ISO8859-2 3 | FORBIDDENWORD ! 
4 | ONLYROOT ~ 5 | WORDCHARS -_ 6 | 7 | SFX n Y 1 8 | SFX n 0 es [^o]o > 9 | 10 | SFX i Y 1 11 | SFX i 0 s [^o]o > 12 | 13 | SFX k Y 2 14 | SFX k 0 s' [^o]o > 15 | SFX k 0 s [^o]o > 16 | 17 | SFX u Y 5 18 | SFX u 0 's [ck][^s] > 19 | SFX u 0 's [^ck]. > 20 | SFX u 0 ' nce > 21 | SFX u 0 ' [ck]s > 22 | SFX u 0 ' s > 23 | 24 | SFX h Y 1 25 | SFX h 0 0 . > 26 | 27 | SFX g Y 1 28 | SFX g e ing ge > 29 | 30 | SFX d Y 6 31 | SFX d y ied [^aeiou]y > 32 | SFX d 0 ed [bcdfgklmnprstvwxyz]c > 33 | SFX d 0 ked [aeiou]c > 34 | SFX d 0 ed [aeiou]y > 35 | SFX d 0 ed [xw] > 36 | SFX d e ed e > 37 | 38 | SFX c Y 14 39 | SFX c e ing [^ieg]e > 40 | SFX c 0 es [^c]s > 41 | SFX c 0 s [^sc]h > 42 | SFX c y ies [bcdfgklmnprstvwxyz]y > 43 | SFX c 0 es [sc]h > 44 | SFX c 0 s [aeiou]y > 45 | SFX c 0 s oo > 46 | SFX c 0 es x > 47 | SFX c 0 s [^shoxy] > 48 | SFX c 0 ing [xyw] > 49 | SFX c 0 ing [bcdfgklmnprstvwxyz]c > 50 | SFX c 0 king [aeiou]c > 51 | SFX c 0 ing ee > 52 | SFX c ie ying ie > 53 | 54 | SFX e Y 14 55 | SFX e 0 zing [^z]z > 56 | SFX e 0 ping [^p]p > 57 | SFX e 0 king [^ck]k > 58 | SFX e 0 ding [^d]d > 59 | SFX e 0 bing [^b]b > 60 | SFX e 0 ring [^r]r > 61 | SFX e 0 sing [^s]s > 62 | SFX e 0 ting [^t]t > 63 | SFX e 0 ning [^n]n > 64 | SFX e 0 fing [^f]f > 65 | SFX e 0 ming [^m]m > 66 | SFX e 0 ling [^l]l > 67 | SFX e 0 ving [^v]v > 68 | SFX e 0 ging [^g]g > 69 | 70 | SFX b Y 1 71 | SFX b 0 0 . > 72 | 73 | SFX a Y 1 74 | SFX a 0 0 . > 75 | 76 | SFX w Y 6 77 | SFX w 0 er [aciouxw] > 78 | SFX w y iest y > 79 | SFX w e est e > 80 | SFX w y ier y > 81 | SFX w e er e > 82 | SFX w 0 est [aciouxw] > 83 | 84 | SFX f Y 1 85 | SFX f 0 ing ge > 86 | 87 | SFX j Y 17 88 | SFX j 0 s [bcdfgklmnprstvwxyz]y > 89 | SFX j 0 0 cs > 90 | SFX j 0 es [sc]h > 91 | SFX j 0 s [aeiou]y > 92 | SFX j 0 s oo > 93 | SFX j 0 s [^shoxy] > 94 | SFX j 0 es' x > 95 | SFX j 0 es [^c]s > 96 | SFX j 0 s [^sc]h > 97 | SFX j 0 es x > 98 | SFX j 0 s' [^shoxy] > 99 | SFX j 0 es' [^c]s > 100 | SFX j 0 s' [bcdfgklmnprstvwxyz]y > 101 | SFX j 0 es' [sc]h > 102 | SFX j 0 s' [aeiou]y > 103 | SFX j 0 s' oo > 104 | SFX j 0 s' [^sc]h > 105 | 106 | SFX l Y 4 107 | SFX l fe ves fe > 108 | SFX l f ves' f > 109 | SFX l f ves f > 110 | SFX l fe ves' fe > 111 | 112 | SFX m Y 1 113 | SFX m 0 ing [^ecxyw] > 114 | 115 | SFX v Y 1 116 | SFX v 0 ed [^eycxw] > 117 | 118 | SFX o Y 2 119 | SFX o 0 es' [^o]o > 120 | SFX o 0 es [^o]o > 121 | 122 | SFX p Y 28 123 | SFX p 0 ler [^l]l > 124 | SFX p 0 pest [^p]p > 125 | SFX p 0 ker [^ck]k > 126 | SFX p 0 ver [^v]v > 127 | SFX p 0 ger [^g]g > 128 | SFX p 0 ber [^b]b > 129 | SFX p 0 kest [^ck]k > 130 | SFX p 0 rer [^r]r > 131 | SFX p 0 ser [^s]s > 132 | SFX p 0 dest [^d]d > 133 | SFX p 0 best [^b]b > 134 | SFX p 0 rest [^r]r > 135 | SFX p 0 sest [^s]s > 136 | SFX p 0 test [^t]t > 137 | SFX p 0 nest [^n]n > 138 | SFX p 0 der [^d]d > 139 | SFX p 0 ter [^t]t > 140 | SFX p 0 ner [^n]n > 141 | SFX p 0 fest [^f]f > 142 | SFX p 0 mest [^m]m > 143 | SFX p 0 zest [^z]z > 144 | SFX p 0 fer [^f]f > 145 | SFX p 0 lest [^l]l > 146 | SFX p 0 mer [^m]m > 147 | SFX p 0 zer [^z]z > 148 | SFX p 0 vest [^v]v > 149 | SFX p 0 gest [^g]g > 150 | SFX p 0 per [^p]p > 151 | 152 | SFX q Y 2 153 | SFX q 0 est [^ecxyw] > 154 | SFX q 0 er [^ecxyw] > 155 | 156 | SFX r Y 14 157 | SFX r 0 ded [^d]d > 158 | SFX r 0 ted [^t]t > 159 | SFX r 0 ned [^n]n > 160 | SFX r 0 fed [^f]f > 161 | SFX r 0 med [^m]m > 162 | SFX r 0 zed [^z]z > 163 | SFX r 0 ped [^p]p > 164 | SFX r 0 led [^l]l > 165 | SFX r 0 ked [^ck]k > 166 | SFX r 0 ved [^v]v > 167 | SFX r 0 ged [^g]g > 
168 | SFX r 0 bed [^b]b > 169 | SFX r 0 red [^r]r > 170 | SFX r 0 sed [^s]s > 171 | 172 | SFX s Y 2 173 | SFX s y ies [bcdfgklmnprstvwxyz]y > 174 | SFX s y ies' [bcdfgklmnprstvwxyz]y > 175 | 176 | SFX t Y 2 177 | SFX t 0 's [^s] > 178 | SFX t 0 ' s > 179 | 180 | -------------------------------------------------------------------------------- /data/english.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/data/english.dic -------------------------------------------------------------------------------- /data/hu-en.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/data/hu-en.dic -------------------------------------------------------------------------------- /data/hu-en.stem.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/data/hu-en.stem.dic -------------------------------------------------------------------------------- /data/hungarian.aff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/data/hungarian.aff -------------------------------------------------------------------------------- /data/hungarian.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/data/hungarian.dic -------------------------------------------------------------------------------- /data/null.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/data/null.dic -------------------------------------------------------------------------------- /examples/demo.hu.stem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/examples/demo.hu.stem -------------------------------------------------------------------------------- /examples/demo.manual.ladder: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 3 | 2 2 4 | 3 2 5 | 4 2 6 | 5 3 7 | 6 3 8 | 7 3 9 | 8 3 10 | 9 4 11 | 10 4 12 | 12 5 13 | 13 5 14 | 15 6 15 | 16 7 16 | 17 8 17 | 18 9 18 | 19 10 19 | 20 11 20 | 21 12 21 | 22 13 22 | 23 14 23 | 24 15 24 | 25 16 25 | 26 17 26 | 27 18 27 | 28 19 28 | 29 20 29 | 30 21 30 | 31 22 31 | 32 23 32 | 33 24 33 | 34 25 34 | 35 26 35 | 36 27 36 | 37 28 37 | 38 29 38 | 39 30 39 | 40 31 40 | 41 32 41 | 42 33 42 | 43 34 43 | 44 35 44 | 45 36 45 | 46 37 46 | 47 38 47 | 48 39 48 | 49 40 49 | 50 41 50 | 51 42 51 | 52 43 52 | 53 44 53 | 54 45 54 | 55 46 55 | 56 47 56 | 57 48 57 | 58 49 58 | 59 50 59 | 60 51 60 | 61 52 61 | 62 53 62 | 63 54 63 | 64 55 64 | 65 56 65 | 66 57 66 | 67 58 67 | 68 59 68 | 69 60 69 | 70 61 70 | 71 62 71 | 72 63 72 | 73 64 73 | 74 65 74 | 76 66 75 | 77 67 76 | 78 68 77 | 79 69 78 | 80 70 79 | 81 71 80 | 82 72 81 | 83 73 82 | 85 74 83 | 86 75 84 | 89 76 85 | 90 77 86 | 91 78 87 | 92 79 88 | 93 80 89 | 94 81 90 | 95 82 91 | 96 82 92 | 97 83 93 | 98 84 94 | 99 85 95 | 100 86 96 | 101 87 97 | 
102 88 98 | 103 89 99 | 104 90 100 | 105 91 101 | 106 92 102 | 107 93 103 | 108 94 104 | 109 95 105 | 110 96 106 | 112 97 107 | 113 98 108 | 114 99 109 | 115 99 110 | 116 100 111 | 117 101 112 | 118 103 113 | 119 104 114 | 120 105 115 | 121 106 116 | 122 107 117 | 123 108 118 | 124 109 119 | 125 110 120 | 126 111 121 | 127 112 122 | 130 113 123 | 131 114 124 | 132 115 125 | 132 116 126 | 132 117 127 | 133 118 128 | 134 119 129 | 135 120 130 | 136 121 131 | 137 122 132 | 138 123 133 | 139 124 134 | 140 125 135 | 141 126 136 | 142 127 137 | 143 128 138 | 144 129 139 | 145 130 140 | 146 131 141 | 147 132 142 | 148 133 143 | 149 134 144 | 150 135 145 | 151 136 146 | 152 137 147 | 153 138 148 | 154 139 149 | 155 140 150 | 156 141 151 | 157 142 152 | 158 143 153 | 159 144 154 | 160 145 155 | 161 146 156 | 162 147 157 | 163 148 158 | 164 150 159 | 165 151 160 | 166 152 161 | 167 153 162 | 168 154 163 | 169 155 164 | 170 156 165 | 171 157 166 | 172 158 167 | 173 159 168 | 174 160 169 | 175 161 170 | 176 162 171 | 177 163 172 | 178 164 173 | 179 165 174 | 180 166 175 | 181 167 176 | 182 168 177 | 183 169 178 | 183 170 179 | 184 171 180 | 185 172 181 | 186 173 182 | 186 174 183 | 187 175 184 | 189 176 185 | 190 177 186 | 191 178 187 | 192 179 188 | 193 179 189 | 194 180 190 | 195 181 191 | 195 182 192 | 196 183 193 | 197 184 194 | 198 185 195 | 199 186 196 | 200 187 197 | 201 188 198 | 202 189 199 | 203 190 200 | 204 191 201 | 205 192 202 | 206 193 203 | 207 194 204 | 207 195 205 | 208 197 206 | 209 198 207 | 210 199 208 | 211 200 209 | 212 201 210 | 213 202 211 | 214 203 212 | 214 204 213 | 215 206 214 | 216 207 215 | 217 208 216 | 218 209 217 | 219 210 218 | 220 211 219 | 221 212 220 | 222 213 221 | 222 214 222 | 223 215 223 | 224 216 224 | 225 217 225 | 226 218 226 | 227 219 227 | 228 220 228 | 229 221 229 | 229 222 230 | 230 223 231 | 231 224 232 | 232 225 233 | 233 226 234 | 234 227 235 | 235 228 236 | 236 229 237 | 237 230 238 | 238 231 239 | 239 232 240 | 240 234 241 | 241 235 242 | 242 236 243 | 243 237 244 | 246 238 245 | 247 239 246 | 248 240 247 | 249 241 248 | 251 242 249 | 252 243 250 | 253 244 251 | 255 245 252 | 256 245 253 | 257 246 254 | 258 247 255 | 259 248 256 | 260 249 257 | 261 250 258 | 263 251 259 | 265 252 260 | 266 253 261 | 267 254 262 | 268 255 263 | 269 256 264 | 270 257 265 | 271 258 266 | 272 259 267 | 273 260 268 | 274 261 269 | 275 262 270 | 276 263 271 | 277 264 272 | 278 265 273 | 279 266 274 | 280 267 275 | 281 268 276 | 282 269 277 | 283 270 278 | 284 271 279 | 285 272 280 | 286 273 281 | 288 274 282 | 289 275 283 | 290 276 284 | 292 277 285 | 293 278 286 | 295 279 287 | 296 280 288 | 297 281 289 | 298 282 290 | 299 283 291 | 300 284 292 | 301 285 293 | 302 286 294 | 303 286 295 | 304 288 296 | 305 289 297 | 306 290 298 | 307 291 299 | 308 292 300 | 309 293 301 | 310 294 302 | 311 295 303 | 312 296 304 | 313 297 305 | 314 298 306 | 315 299 307 | 316 300 308 | 317 301 309 | 318 302 310 | 319 303 311 | 320 303 312 | 322 304 313 | 323 305 314 | 324 306 315 | 325 307 316 | 326 308 317 | 327 309 318 | 328 310 319 | 329 311 320 | 331 312 321 | 332 313 322 | 333 314 323 | 334 315 324 | 335 315 325 | 336 316 326 | 337 317 327 | 338 318 328 | 339 319 329 | 340 320 330 | 341 321 331 | 342 322 332 | 343 323 333 | 344 324 334 | 345 325 335 | 346 326 336 | 347 327 337 | 349 328 338 | 350 329 339 | 352 331 340 | 353 332 341 | 354 333 342 | 355 334 343 | 357 335 344 | 358 336 345 | 359 337 346 | 359 338 347 | 360 340 348 | 361 341 349 | 362 342 350 | 363 343 351 | 364 344 352 | 
365 345 353 | 366 346 354 | 367 347 355 | 368 348 356 | 369 349 357 | 370 350 358 | 371 351 359 | 372 352 360 | 374 353 361 | 375 354 362 | 376 355 363 | 377 356 364 | 379 357 365 | 380 358 366 | 381 359 367 | 382 360 368 | 383 361 369 | 384 362 370 | 385 363 371 | 387 364 372 | 388 365 373 | 389 367 374 | 390 368 375 | 391 369 376 | 392 370 377 | 393 371 378 | 394 372 379 | 395 373 380 | 396 374 381 | 398 375 382 | 400 376 383 | 402 378 384 | 403 379 385 | 404 380 386 | 408 381 387 | 409 382 388 | 410 383 389 | 411 384 390 | 412 385 391 | 413 386 392 | 414 387 393 | 415 388 394 | 416 389 395 | 417 390 396 | 418 391 397 | 419 392 398 | 420 393 399 | 421 394 400 | 422 395 401 | 423 397 402 | 424 398 403 | 425 399 404 | 426 400 405 | 427 401 406 | 428 402 407 | 429 403 408 | 430 404 409 | 431 405 410 | 432 406 411 | 433 407 412 | 434 408 413 | 435 409 414 | 436 410 415 | 437 411 416 | 438 414 417 | 439 415 418 | 440 416 419 | 441 417 420 | 442 418 421 | 443 419 422 | 444 420 423 | 445 421 424 | 446 422 425 | 447 423 426 | 448 424 427 | 449 425 428 | 450 425 429 | 451 426 430 | 452 427 431 | 453 428 432 | 454 429 433 | 455 430 434 | 456 431 435 | 457 432 436 | 458 433 437 | 459 434 438 | 460 435 439 | 461 436 440 | 462 437 441 | 463 438 442 | 464 439 443 | 465 440 444 | 467 441 445 | 468 442 446 | 470 443 447 | 471 444 448 | 472 445 449 | 473 446 450 | 473 447 451 | 474 448 452 | 475 449 453 | 476 450 454 | 477 451 455 | 478 452 456 | 479 453 457 | 480 454 458 | 481 455 459 | 482 456 460 | 483 457 461 | 484 458 462 | 485 459 463 | 486 460 464 | 487 461 465 | 488 462 466 | 489 463 467 | 490 464 468 | 492 465 469 | 493 466 470 | 494 467 471 | 495 468 472 | 496 469 473 | 497 470 474 | 498 471 475 | 499 472 476 | 500 473 477 | 501 474 478 | 502 475 479 | 503 476 480 | 504 477 481 | 505 478 482 | 506 479 483 | 507 480 484 | 508 481 485 | 509 482 486 | 510 483 487 | 511 484 488 | 512 485 489 | 513 486 490 | 514 487 491 | 515 488 492 | 516 489 493 | 517 490 494 | 518 491 495 | 519 492 496 | 520 493 497 | 521 494 498 | 522 495 499 | 523 496 500 | 524 496 501 | 525 497 502 | 526 498 503 | 527 499 504 | 528 500 505 | 529 501 506 | -------------------------------------------------------------------------------- /examples/hu.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/examples/hu.raw -------------------------------------------------------------------------------- /regtest/handaligns/1984.hu.handstem/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | 5 | Data obtained from the English-Hungarian parallel Multext-East "1984" corpus. 6 | 7 | .sen files contain the sentence-level information of that corpus, excluding word-level information and paragraph structure. 8 | 9 | .pre files (unlike in ../1984.hu) contain tokenization and word stems that were manually obtained (well, were partly manually verified) by the Multext-East people. 10 | 11 | ==== 12 | Notes for Hunglish developers: 13 | 14 | Originally at 15 | 16 | sen: 17 | ~/hunglish/data/experiments/1984.nonlemmatized.improved/Hungarian/1984.sen 18 | ~/hunglish/data/experiments/1984.nonlemmatized.improved/English/1984.sen 19 | 20 | preproc: 21 | ~/hunglish/data/experiments/1984.hu.lemmas 22 | ~/hunglish/data/experiments/1984.en.lemmas 23 | 24 | (Actually, Multext-East ids were truncated from these files.) 
25 | 26 | hand: 27 | ~/hunglish/data/experiments/hand.indexes 28 | -------------------------------------------------------------------------------- /regtest/handaligns/1984.hu.handstem/hu.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.hu.handstem/hu.pre -------------------------------------------------------------------------------- /regtest/handaligns/1984.hu.handstem/hu.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.hu.handstem/hu.sen -------------------------------------------------------------------------------- /regtest/handaligns/1984.hu/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | 5 | Data obtained from the English-Hungarian parallel Multext-East "1984" corpus. 6 | 7 | .sen files contain the sentence-level information of that corpus, excluding word-level information and paragraph structure. 8 | 9 | .pre files contain an automatically processed version of these, for aligner consumption. 10 | Processing steps are: rude tokenization, lowercasing, stemming by the hunstem tool with Hungarian and English resources. 11 | 12 | 13 | ==== 14 | Notes for Hunglish developers: 15 | 16 | Originally at 17 | 18 | sen: 19 | /home/daniel/hunglish/data/experiments/1984.nonlemmatized.improved/Hungarian/1984.sen 20 | /home/daniel/hunglish/data/experiments/1984.nonlemmatized.improved/English/1984.sen 21 | 22 | preproc: 23 | /home/daniel/hunglish/data/experiments/1984.nonlemmatized.improved/Hungarian/1984.sen.low.rok.stem 24 | /home/daniel/hunglish/data/experiments/1984.nonlemmatized.improved/English/1984.sen.low.rok.stem 25 | 26 | hand: 27 | /home/daniel/hunglish/data/experiments/hand.indexes 28 | -------------------------------------------------------------------------------- /regtest/handaligns/1984.hu/en.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.hu/en.pre -------------------------------------------------------------------------------- /regtest/handaligns/1984.hu/hu.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.hu/hu.pre -------------------------------------------------------------------------------- /regtest/handaligns/1984.hu/hu.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.hu/hu.sen -------------------------------------------------------------------------------- /regtest/handaligns/1984.ro.utf8/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | Data obtained from the English-Romanian parallel Multext-East "1984" corpus. 5 | 6 | Note: The Romanian files are named hu.* , to make life easier for our scripts. We apologize for this approach. :) Basically, "hu" is a codeword for "source language" and "en" is a codeword for "target language". 
7 | 8 | .sen files contain the token-level information of that corpus, not including stem information and paragraph structure. 9 | 10 | Encoding is UTF-8. This is the only difference from ../1984.ro 11 | 12 | .pre files contain versions of the sen files with some very crude tokenization, and with punctuation marks deleted. 13 | 14 | ==== 15 | Notes for Hunglish developers: 16 | 17 | Originally at 18 | 19 | sen: 20 | /home/daniel/hunglish/data/experiments/roman2/ro.sen 21 | /home/daniel/hunglish/data/experiments/roman2/en.sen 22 | 23 | preproc: 24 | /home/daniel/hunglish/data/experiments/roman2/ro.sen.deent.low.rok 25 | /home/daniel/hunglish/data/experiments/roman2/en.sen.low.rok 26 | 27 | hand: 28 | /home/daniel/hunglish/data/experiments/roman2/hand.indexes 29 | -------------------------------------------------------------------------------- /regtest/handaligns/1984.ro.utf8/sgmlTolatin2.sed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.ro.utf8/sgmlTolatin2.sed -------------------------------------------------------------------------------- /regtest/handaligns/1984.ro/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | Data obtained from the English-Romanian parallel Multext-East "1984" corpus. 5 | 6 | Note: The Romanian files are named hu.* , to make life easier for our scripts. We apologize for this approach. :) Basically, "hu" is a codeword for "source language" and "en" is a codeword for "target language". 7 | 8 | .sen files contain the token-level information of that corpus, not including stem information and paragraph structure. 9 | 10 | Encoding is ISO Latin 2. 11 | 12 | .pre files contain versions of the sen files with some very crude tokenization, and with punctuation marks deleted. 
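A minimal sketch of what this crude preprocessing might look like, using only standard Unix tools (the exact commands are an assumption; the original preprocessing scripts are not included in this directory):

cat hu.sen | tr '[:upper:]' '[:lower:]' | tr -d '[:punct:]' > hu.pre

The real pipeline also decoded SGML entities first, as suggested by the .deent step in the preproc file names below.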
13 | 14 | ==== 15 | Notes for Hunglish developers: 16 | 17 | Originally at 18 | 19 | sen: 20 | /home/daniel/hunglish/data/experiments/roman2/ro.sen 21 | /home/daniel/hunglish/data/experiments/roman2/en.sen 22 | 23 | preproc: 24 | /home/daniel/hunglish/data/experiments/roman2/ro.sen.deent.low.rok 25 | /home/daniel/hunglish/data/experiments/roman2/en.sen.low.rok 26 | 27 | hand: 28 | /home/daniel/hunglish/data/experiments/roman2/hand.indexes 29 | -------------------------------------------------------------------------------- /regtest/handaligns/1984.ro/hu.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.ro/hu.pre -------------------------------------------------------------------------------- /regtest/handaligns/1984.ro/hu.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.ro/hu.sen -------------------------------------------------------------------------------- /regtest/handaligns/1984.ro/sgmlTolatin2.sed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.ro/sgmlTolatin2.sed -------------------------------------------------------------------------------- /regtest/handaligns/dtm/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | 5 | From Diplomacy and Trade Magazine. (I'm not sure exactly which part of which issue.) 6 | 7 | We are grateful to the original copyright holder for the raw data. 8 | 9 | Sentence-level segmentation and manual alignment built at the 10 | Department of Corpus Linguistics at the Hungarian Research Institute for Linguistics. 11 | 12 | .sen files contain the sentence-level information. 13 | 14 | .pre files contain an automatically processed version of these, for aligner consumption. 15 | Processing steps are: rude tokenization and stemming by the hunstem tool with Hungarian and English resources. 16 | 17 | hand.ladder is the manual alignment of the bitext. 18 | 19 | ==== 20 | Notes for Hunglish developers: 21 | 22 | Based on the text-format alignment ./dtm.bi 23 | 24 | See ../steinbeck/README for details on the conversion from text format to .sen and .ladder files. 
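Judging from the conversion scripts quoted in ../steinbeck/README, a text-format alignment like ./dtm.bi holds one alignment segment per line, with the two language halves separated by a tab and multiple sentences within a half joined by " ~~~ ". A made-up illustration, not taken from dtm.bi (<TAB> stands for a tab character):

Hideg volt. ~~~ Esett az eső.<TAB>It was cold and it was raining.

This would be one 2-1 segment: two Hungarian sentences aligned to a single English sentence.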
25 | 26 | The .pre files were built like this: 27 | 28 | export BINDIR=/home/daniel/Bicorpus/scripts 29 | cat hu.sen | $BINDIR/tok.one.sh | $BINDIR/hu.stem.one.sh > hu.pre 30 | cat en.sen | $BINDIR/tok.one.sh | $BINDIR/en.stem.one.sh > en.pre 31 | -------------------------------------------------------------------------------- /regtest/handaligns/dtm/dtm.bi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/dtm/dtm.bi -------------------------------------------------------------------------------- /regtest/handaligns/dtm/en.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/dtm/en.pre -------------------------------------------------------------------------------- /regtest/handaligns/dtm/en.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/dtm/en.sen -------------------------------------------------------------------------------- /regtest/handaligns/dtm/hand.ladder: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 3 | 2 2 4 | 3 3 5 | 4 4 6 | 5 5 7 | 6 6 8 | 7 7 9 | 8 8 10 | 9 9 11 | 10 10 12 | 11 11 13 | 12 12 14 | 13 13 15 | 14 14 16 | 15 15 17 | 16 16 18 | 17 17 19 | 18 18 20 | 19 19 21 | 20 20 22 | 21 21 23 | 22 22 24 | 23 23 25 | 24 24 26 | 25 25 27 | 26 26 28 | 27 27 29 | 28 28 30 | 29 29 31 | 30 30 32 | 31 31 33 | 32 32 34 | 33 33 35 | 34 34 36 | 35 35 37 | 36 36 38 | 37 37 39 | 38 38 40 | 39 39 41 | 40 40 42 | 41 41 43 | 42 42 44 | 43 43 45 | 44 44 46 | 45 45 47 | 46 46 48 | 47 47 49 | 48 48 50 | 49 49 51 | 50 50 52 | 51 51 53 | 52 52 54 | 53 53 55 | 54 54 56 | 55 55 57 | 56 56 58 | 57 57 59 | 58 58 60 | 59 59 61 | 60 60 62 | 61 61 63 | 62 62 64 | 63 63 65 | 64 64 66 | 65 65 67 | 66 66 68 | 67 67 69 | 68 68 70 | 69 69 71 | 70 70 72 | 71 71 73 | 72 72 74 | 73 73 75 | 74 74 76 | 75 75 77 | 76 76 78 | 77 77 79 | 78 78 80 | 79 79 81 | 80 80 82 | 81 81 83 | 82 82 84 | 83 83 85 | 84 84 86 | 85 85 87 | 86 86 88 | 87 87 89 | 88 88 90 | 89 89 91 | 90 90 92 | 91 91 93 | 92 92 94 | 93 93 95 | 94 94 96 | 95 96 97 | 96 97 98 | 97 98 99 | 98 99 100 | 100 100 101 | 101 101 102 | 102 102 103 | 103 103 104 | 104 104 105 | 105 105 106 | 106 106 107 | 107 107 108 | 108 108 109 | 109 109 110 | 110 110 111 | 111 111 112 | 112 113 113 | 114 114 114 | 115 115 115 | 116 116 116 | 117 117 117 | 118 118 118 | 119 119 119 | 120 120 120 | 122 122 121 | 123 123 122 | 124 124 123 | 126 125 124 | 127 126 125 | 128 127 126 | 129 128 127 | 130 129 128 | 131 130 129 | 132 131 130 | 133 132 131 | 134 133 132 | 135 134 133 | 136 135 134 | 137 136 135 | 138 137 136 | 139 138 137 | 140 139 138 | 141 140 139 | 142 141 140 | 143 142 141 | 144 143 142 | 145 144 143 | 146 145 144 | 147 146 145 | 148 147 146 | 149 148 147 | 150 149 148 | 151 150 149 | 152 151 150 | 153 152 151 | 154 153 152 | 155 154 153 | 156 155 154 | 157 156 155 | 158 157 156 | 159 158 157 | 160 159 158 | 161 160 159 | 162 161 160 | 163 162 161 | 164 163 162 | 165 164 163 | 166 165 164 | 167 166 165 | 168 167 166 | 169 168 167 | 170 169 168 | 171 170 169 | 172 171 170 | 173 172 171 | 174 173 172 | 175 174 173 | 176 175 174 | 177 176 175 | 178 177 176 | 179 178 177 | 180 
179 178 | 181 180 179 | 182 181 180 | 183 182 181 | 184 183 182 | 185 184 183 | 186 185 184 | 187 186 185 | 188 187 186 | 189 188 187 | 191 190 188 | 192 192 189 | 193 193 190 | 194 194 191 | 195 195 192 | 196 196 193 | 197 197 194 | 198 198 195 | 199 199 196 | 200 200 197 | 201 201 198 | 202 202 199 | 203 203 200 | 204 204 201 | 205 205 202 | 206 206 203 | 207 207 204 | 208 208 205 | 209 209 206 | 210 210 207 | 211 211 208 | 212 212 209 | 213 213 210 | 214 214 211 | 215 215 212 | 216 216 213 | 217 218 214 | 218 219 215 | 219 220 216 | 220 221 217 | 221 222 218 | 222 223 219 | 223 224 220 | 224 225 221 | 225 226 222 | 226 227 223 | 227 228 224 | 228 229 225 | 229 230 226 | 231 231 227 | 232 232 228 | 233 233 229 | 234 234 230 | 235 235 231 | 236 236 232 | 237 237 233 | 238 238 234 | 239 239 235 | 240 240 236 | 241 241 237 | 242 242 238 | 243 243 239 | 244 244 240 | 245 245 241 | 246 246 242 | 247 247 243 | 248 248 244 | 249 249 245 | 250 250 246 | 251 251 247 | 252 252 248 | 253 253 249 | 254 254 250 | 255 255 251 | 256 256 252 | 257 257 253 | 258 258 254 | 259 259 255 | 260 260 256 | 261 261 257 | 262 262 258 | 263 263 259 | 264 264 260 | 265 265 261 | 266 266 262 | 267 267 263 | 268 268 264 | 269 269 265 | 270 270 266 | 271 271 267 | 272 272 268 | 273 273 269 | 274 274 270 | 275 275 271 | 276 276 272 | 277 277 273 | 278 278 274 | 279 279 275 | 280 280 276 | 281 281 277 | 282 282 278 | 283 283 279 | 284 284 280 | 285 285 281 | 286 286 282 | 287 287 283 | 288 288 284 | 289 289 285 | 290 290 286 | 291 291 287 | 292 292 288 | 293 293 289 | 295 295 290 | 296 296 291 | 297 297 292 | 298 298 293 | 300 300 294 | 301 301 295 | 302 302 296 | 303 303 297 | 304 304 298 | 305 305 299 | 306 306 300 | 307 307 301 | 308 308 302 | 309 309 303 | 310 310 304 | 311 311 305 | 312 312 306 | 313 313 307 | 314 314 308 | 315 315 309 | 316 316 310 | 317 317 311 | 318 318 312 | 319 319 313 | 320 320 314 | 321 321 315 | 322 322 316 | 323 323 317 | 324 324 318 | 325 325 319 | 326 326 320 | 327 327 321 | 328 328 322 | 329 329 323 | 330 330 324 | 331 331 325 | 332 332 326 | 333 333 327 | 334 334 328 | 335 335 329 | 336 336 330 | 337 337 331 | 338 338 332 | 339 339 333 | 340 340 334 | 341 341 335 | 342 342 336 | 343 343 337 | 344 344 338 | 345 345 339 | 346 346 340 | 347 347 341 | 348 348 342 | 349 349 343 | 350 350 344 | 351 351 345 | 352 352 346 | 353 353 347 | 354 354 348 | 355 355 349 | 356 356 350 | 357 357 351 | 358 358 352 | 359 359 353 | 360 360 354 | 361 361 355 | 362 362 356 | 363 363 357 | 364 364 358 | 365 365 359 | 366 366 360 | 367 367 361 | 368 369 362 | 369 370 363 | 370 371 364 | 371 372 365 | 372 373 366 | 373 374 367 | 374 375 368 | 375 376 369 | 376 377 370 | 377 378 371 | 378 379 372 | 379 380 373 | 380 381 374 | 381 382 375 | 382 383 376 | 383 384 377 | 384 385 378 | 385 386 379 | 386 387 380 | 387 388 381 | 388 389 382 | 389 390 383 | 390 391 384 | 391 392 385 | 392 393 386 | 393 394 387 | 394 395 388 | 395 396 389 | 396 397 390 | 397 398 391 | 398 399 392 | 399 400 393 | 400 401 394 | 401 402 395 | 402 403 396 | 403 404 397 | 404 405 398 | 405 406 399 | 407 407 400 | 408 408 401 | 409 410 402 | 410 411 403 | 411 412 404 | 412 413 405 | 413 414 406 | 414 415 407 | 415 416 408 | 416 417 409 | 417 418 410 | 418 419 411 | 419 420 412 | 420 421 413 | 421 422 414 | 422 423 415 | 423 424 416 | 424 425 417 | 425 426 418 | 426 427 419 | 427 428 420 | 428 429 421 | 429 430 422 | 430 431 423 | 431 432 424 | 432 433 425 | 433 434 426 | 434 435 427 | 435 436 428 | 436 437 429 | 437 438 430 | 438 439 431 | 
439 440 432 | 440 441 433 | 441 442 434 | 442 443 435 | 443 444 436 | 444 445 437 | 445 446 438 | 446 447 439 | 447 448 440 | 448 449 441 | 449 450 442 | 450 451 443 | 451 452 444 | 452 453 445 | 453 454 446 | 454 455 447 | 455 456 448 | 456 457 449 | 457 458 450 | 458 459 451 | 459 460 452 | 460 461 453 | 461 462 454 | 462 463 455 | 463 464 456 | 464 465 457 | 465 467 458 | 466 468 459 | 467 469 460 | 468 470 461 | 469 471 462 | 470 472 463 | 471 473 464 | 472 474 465 | 473 475 466 | 474 476 467 | 475 477 468 | 476 478 469 | 477 479 470 | 478 480 471 | 479 481 472 | 480 482 473 | 481 483 474 | 482 484 475 | 483 485 476 | 484 486 477 | 485 487 478 | 486 488 479 | 487 489 480 | 489 491 481 | 490 492 482 | 491 493 483 | 492 494 484 | 493 495 485 | 494 496 486 | 495 497 487 | 496 498 488 | 497 499 489 | 498 500 490 | 499 501 491 | 500 502 492 | 501 503 493 | 502 504 494 | 503 505 495 | 504 506 496 | 505 507 497 | 506 508 498 | 507 509 499 | 508 510 500 | 509 511 501 | 510 512 502 | 511 513 503 | 512 514 504 | 513 515 505 | 514 516 506 | 515 517 507 | 516 518 508 | 517 519 509 | 518 520 510 | 519 521 511 | 520 522 512 | 521 523 513 | 522 524 514 | 523 525 515 | 524 526 516 | 525 527 517 | 526 528 518 | 527 529 519 | 528 530 520 | 529 531 521 | 530 532 522 | 531 533 523 | 532 534 524 | 533 535 525 | 534 536 526 | 535 537 527 | 536 538 528 | 537 539 529 | 538 540 530 | 539 541 531 | 540 542 532 | 541 543 533 | 542 544 534 | 543 545 535 | 544 546 536 | 545 547 537 | 546 548 538 | 547 549 539 | 548 550 540 | 549 551 541 | 550 552 542 | 551 553 543 | 552 554 544 | 553 555 545 | 554 556 546 | 555 557 547 | 556 558 548 | 557 559 549 | 558 560 550 | 559 561 551 | 560 562 552 | 561 563 553 | 562 564 554 | 563 565 555 | 564 566 556 | 565 567 557 | 566 568 558 | 567 569 559 | 568 570 560 | 569 571 561 | 570 572 562 | 571 573 563 | 572 574 564 | 573 575 565 | 574 576 566 | 575 577 567 | 576 578 568 | 577 579 569 | 578 580 570 | 579 581 571 | 580 582 572 | 581 583 573 | -------------------------------------------------------------------------------- /regtest/handaligns/dtm/hu.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/dtm/hu.pre -------------------------------------------------------------------------------- /regtest/handaligns/dtm/hu.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/dtm/hu.sen -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken.nopara/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | John Steinbeck's "Cup of Gold". Raw data obtained from the Hungarian Electronic Library and Project Gutenberg. 5 | 6 | .sen files contain the sentence-level information. Sentence-level segmentation by our huntoken tool. (Note: Unedited by hand, so not perfect at all.) 7 | 8 | Although it strictly respects this imperfect sentence-level segmentation, the alignment itself is hand-edited, and should be error-free. 9 | 10 | IMPORTANT NOTE: This directory contains the same data as steinbeck.huntoken, with just one difference: This corpus was built by throwing away all the (automatically obtained) paragraph information from the steinbeck.huntoken corpus. 
11 | 12 | .pre files contain an automatically processed version of the .sen files, for aligner consumption. 13 | Processing steps are: rude tokenization, lowercasing, stemming by the hunstem tool with Hungarian and English resources. 14 | 15 | ==== 16 | Notes for Hunglish developers: 17 | 18 | Originally at 19 | 20 | sen: 21 | ~/hunglish/data/experiments/microsoft/steinbeck/kornaiscript/hu.really.sen 22 | ~/hunglish/data/experiments/microsoft/steinbeck/kornaiscript/en.really.sen 23 | 24 | preproc: 25 | ~/hunglish/data/experiments/microsoft/steinbeck/kornaiscript/hu.sen.low.rok.stem 26 | ~/hunglish/data/experiments/microsoft/steinbeck/kornaiscript/en.sen.low.rok.stem 27 | 28 | hand: 29 | ~/hunglish/data/experiments/microsoft/steinbeck/kornaiscript/autole1tra 30 | -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken.nopara/en.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken.nopara/en.pre -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken.nopara/en.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken.nopara/en.sen -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken.nopara/hu.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken.nopara/hu.pre -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken.nopara/hu.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken.nopara/hu.sen -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | John Steinbeck's "Cup of Gold". Raw data obtained from the Hungarian Electronic Library and Project Gutenberg. 5 | 6 | Sentence-level segmentation and paragraph-information by our huntoken tool. (Note: Unedited by hand, so not perfect at all.) 7 | 8 | Although it strictly respects this imperfect sentence-level segmentation, the alignment itself is hand-edited, and should be error-free. 9 | It was built at the Department of Corpus Linguistics at the Hungarian Research Institute for Linguistics. 10 | 11 | .sen files contain the sentence-level information. Paragraph structure is described by paragraph-delimiter quasi-sentences. 12 | 13 | .pre files contain an automatically processed version of these, for aligner consumption. 14 | Processing steps are: rude tokenization, lowercasing, stemming by the hunstem tool with Hungarian and English resources. 
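As far as we know, a paragraph-delimiter quasi-sentence is a line consisting of the single token <p>, following hunalign's usual convention (the exact marker is an assumption, not verified against these particular .sen files):

Last sentence of a paragraph.
<p>
First sentence of the next paragraph.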
15 | 16 | ==== 17 | Notes for Hunglish developers: 18 | 19 | Originally at 20 | 21 | sen: 22 | ~/hunglish/data/experiments/Steinbeck2/Steinbeck_1.hu.sen 23 | ~/hunglish/data/experiments/Steinbeck2/Steinbeck_1.en.sen 24 | 25 | preproc: 26 | ~/hunglish/data/experiments/Steinbeck.improve/Steinbeck_1.hu.sen.low.rok.stem 27 | ~/hunglish/data/experiments/Steinbeck.improve/Steinbeck_1.en.sen.low.rok.stem 28 | 29 | hand: 30 | ~/hunglish/data/experiments/Steinbeck.compare/ladder.hand.nostartendpara.txt 31 | -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken/en.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken/en.pre -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken/en.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken/en.sen -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken/hu.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken/hu.pre -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken/hu.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken/hu.sen -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | John Steinbeck's "Cup of Gold". Raw data obtained from the Hungarian Electronic Library and Project Gutenberg. 5 | 6 | Sentence-level segmentation and paragraph information were hand-edited at the 7 | Department of Corpus Linguistics at the Hungarian Research Institute for Linguistics. 8 | 9 | .sen files contain the sentence-level information. Paragraph structure is described by paragraph-delimiter quasi-sentences. 10 | 11 | .pre files contain an automatically processed version of these, for aligner consumption. 12 | Processing steps are: rude tokenization and stemming by the hunstem tool with Hungarian and English resources. 13 | 14 | auto.ladder was built by hunalign from hu.pre and en.pre, with default arguments. 15 | 16 | hand.ladder is the manual alignment of the bitext, also by the 17 | Department of Corpus Linguistics at the Hungarian Research Institute for Linguistics. 18 | It is based on auto.ladder. 
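A note on the ladder format, as far as it can be reconstructed from the conversion script quoted below: each line of a .ladder file is a rung, a pair of cumulative sentence counts (source count, then target count) at a segment boundary, so two consecutive rungs delimit one aligned segment. For example, the rungs

0 0
1 1
3 2

describe a 1-1 segment (the first source sentence paired with the first target sentence) followed by a 2-1 segment (the next two source sentences paired with the next target sentence). hunalign's own output, such as auto.ladder, adds a third column holding a per-rung confidence score (this is our recollection of hunalign's behavior, so treat it as an assumption).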
19 | 20 | ==== 21 | Notes for Hunglish developers: 22 | 23 | Originally at 24 | 25 | sen: 26 | 27 | Originally from 28 | /home/daniel/hunglish/data/experiments/Steinbeck.handsegment/hu.sen 29 | /home/daniel/hunglish/data/experiments/Steinbeck.handsegment/en.sen 30 | 31 | , but some whitespaces were retroactively removed, based on the text version of the manual alignment: 32 | cat align.hand.txt | cut -f1 | awk 'BEGIN {FS=" ~~~ "} { for (i=1;i<=NF;++i) { print $(i) } }' | grep -v "^$" | sed "s/ $//" | sed "s/^ //" > hu.sen.hand 33 | cat align.hand.txt | cut -f2 | awk 'BEGIN {FS=" ~~~ "} { for (i=1;i<=NF;++i) { print $(i) } }' | grep -v "^$" | sed "s/ $//" | sed "s/^ //" > en.sen.hand 34 | 35 | preproc: 36 | /home/daniel/hunglish/data/experiments/Steinbeck.handsegment/hu.pre 37 | /home/daniel/hunglish/data/experiments/Steinbeck.handsegment/en.pre 38 | 39 | auto: 40 | /home/daniel/hunglish/data/experiments/Steinbeck.handsegment/auto.ladder 41 | 42 | hand: 43 | /home/joker/steinbeck/kesz.steinbeck_align_rev.v1.0.txt 44 | or 45 | /home/daniel/hunglish/data/handaligns/steinbeck/kesz.steinbeck_align_rev.v1.0.txt 46 | 47 | The text-to-ladder conversion script used: 48 | 49 | cat kesz.steinbeck_align_rev.v1.0.txt | sed "s/\([^ ]\)~~~ /\1 ~~~ /g" | sed "s/ ~~~\([^ ]\)/ ~~~ \1/g" | sed "s/\([^ ]\)~~~\([^ ]\)/\1 ~~~ \2/g" | grep -v "^$" > align.hand.txt 50 | mkdir tmp 51 | export file=align.hand.txt 52 | half=1 ; cat $file | grep -v "^.$" | cut -f$half | awk 'BEGIN {FS=" ~~~ "; s=0 } { print s; s+=NF } END { print s }' > tmp/ladder.$half 53 | half=2 ; cat $file | grep -v "^.$" | cut -f$half | awk 'BEGIN {FS=" ~~~ "; s=0 } { print s; s+=NF } END { print s }' > tmp/ladder.$half 54 | paste tmp/ladder.1 tmp/ladder.2 | tr '\t' ' ' > hand.ladder 55 | -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck/en.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck/en.pre -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck/en.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck/en.sen -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck/hu.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck/hu.pre -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck/hu.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck/hu.sen -------------------------------------------------------------------------------- /regtest/regtest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bindir=../src/hunalign 4 | 5 | 6 | fscorer() { 7 | prec=`cat $1 | grep "^Precision" | tr -d ',' | cut -f2 -d' '` 8 | recall=`cat $1 | grep "^Precision" | tr -d ',' | cut -f4 -d' '` 9 | fscore=`echo "2/(1/$prec+1/$recall)" | bc -l | awk '{ print $0+0 }'` 10 | 
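# Note: the formula above is the harmonic mean of precision and recall,
# F1 = 2*P*R/(P+R), written as 2/(1/P+1/R) for bc; the awk '{ print $0+0 }'
# pass coerces bc's output (e.g. ".993488...") into a plain decimal number.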
echo "F-score: $fscore" 11 | echo 12 | } 13 | 14 | evaluator() { 15 | echo "==================================" 16 | echo "Expected:" 17 | cat $1 | tail -3 18 | fscorer $1 19 | targetfscore=$fscore 20 | echo "Achieved:" 21 | cat $2 | tail -3 22 | fscorer $2 23 | targetfscore=$targetfscore fscore=$fscore awk ' 24 | BEGIN{ 25 | print "Expected F-score:", ENVIRON["targetfscore"], " Achieved F-score:",ENVIRON["fscore"] 26 | if (ENVIRON["targetfscore"]>ENVIRON["fscore"]) 27 | { 28 | print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> REGRESSION in F-score <<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 29 | } 30 | else 31 | { 32 | print "No regression found" 33 | } 34 | }' 35 | echo 36 | } 37 | 38 | name=1984.ro.utf8.realign 39 | echo "Testing $name ..." 40 | target=targets/$name.cerr 41 | file=results/$name.cerr 42 | ofile=results/$name.cout 43 | $bindir/hunalign -realign -hand=handaligns/1984.ro.utf8/hand.ladder ../data/null.dic handaligns/1984.ro.utf8/hu.pre handaligns/1984.ro.utf8/en.pre -bisent > $ofile 2> $file 44 | evaluator $target $file 45 | 46 | name=1984.hu 47 | echo "Testing $name ..." 48 | target=targets/$name.cerr 49 | file=results/$name.cerr 50 | ofile=results/$name.cout 51 | $bindir/hunalign -onebyteencoding -hand=handaligns/1984.hu/hand.ladder ../data/hu-en.dic handaligns/1984.hu.handstem/hu.pre handaligns/1984.hu.handstem/en.pre -bisent > $ofile 2> $file 52 | evaluator $target $file 53 | 54 | name=1984.hu.handstem.realign 55 | echo "Testing $name ..." 56 | target=targets/$name.cerr 57 | file=results/$name.cerr 58 | ofile=results/$name.cout 59 | $bindir/hunalign -onebyteencoding -realign -hand=handaligns/1984.hu/hand.ladder ../data/hu-en.dic handaligns/1984.hu.handstem/hu.pre handaligns/1984.hu.handstem/en.pre -bisent > $ofile 2> $file 60 | evaluator $target $file 61 | 62 | name=steinbeck.huntoken.nopara 63 | echo "Testing $name ..." 64 | target=targets/$name.cerr 65 | file=results/$name.cerr 66 | ofile=results/$name.cout 67 | $bindir/hunalign -onebyteencoding -hand=handaligns/steinbeck.huntoken.nopara/hand.ladder ../data/hu-en.dic handaligns/steinbeck.huntoken.nopara/hu.pre handaligns/steinbeck.huntoken.nopara/en.pre > $ofile 2> $file 68 | evaluator $target $file 69 | 70 | name=1984.ro.realign 71 | echo "Testing $name ..." 72 | target=targets/$name.cerr 73 | file=results/$name.cerr 74 | ofile=results/$name.cout 75 | $bindir/hunalign -onebyteencoding -realign -hand=handaligns/1984.ro/hand.ladder ../data/null.dic handaligns/1984.ro/hu.pre handaligns/1984.ro/en.pre -bisent > $ofile 2> $file 76 | evaluator $target $file 77 | 78 | name=dtm.realign 79 | echo "Testing $name ..." 80 | target=targets/$name.cerr 81 | file=results/$name.cerr 82 | ofile=results/$name.cout 83 | $bindir/hunalign -onebyteencoding -hand=handaligns/dtm/hand.ladder ../data/hu-en.dic handaligns/dtm/hu.pre handaligns/dtm/en.pre > $ofile 2> $file 84 | evaluator $target $file 85 | -------------------------------------------------------------------------------- /regtest/results/dummy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/results/dummy -------------------------------------------------------------------------------- /regtest/targets/1984.hu.cerr: -------------------------------------------------------------------------------- 1 | Reading dictionary... 2 | 6733 hungarian sentences read. 3 | 6738 english sentences read. 4 | quasiglobal_stopwordRemoval is set to 0 5 | Simplified dictionary ready. 
6 | Rough translation ready. 7 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 8 | Rough translation-based similarity matrix ready. 9 | Matrix built. 10 | Trail found. 11 | Align ready. 12 | Global quality of unfiltered align 1.06407 13 | quasiglobal_spaceOutBySentenceLength is set to 1 14 | Trail spaced out by sentence length. 15 | Global quality of unfiltered align after realign 1.06407 16 | 42 misaligned out of 6446 correct items, 6450 bets. 17 | Precision: 0.993488, Recall: 0.994105 18 | Quality 1.06407 19 | -------------------------------------------------------------------------------- /regtest/targets/1984.hu.handstem.realign.cerr: -------------------------------------------------------------------------------- 1 | Reading dictionary... 2 | 6733 hungarian sentences read. 3 | 6738 english sentences read. 4 | quasiglobal_stopwordRemoval is set to 0 5 | Simplified dictionary ready. 6 | Rough translation ready. 7 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 8 | Rough translation-based similarity matrix ready. 9 | Matrix built. 10 | Trail found. 11 | Align ready. 12 | Global quality of unfiltered align 1.06407 13 | 141409 items inside the border. 14 | Border of realign zone determined. 15 | 6558 bisentences collected. 16 | Plausible bisentences filtered. 17 | Removing stopwords...Removing identicals... 1900 1914 1920 1925 1930 1940 1944 1945 1960 1965 1968 1970 1973 1983 1984 2050 2713 6079 B Berlin Brazzaville Ceylon Colchester Emmanuel Goldstein India J. Kent London Martin's-in-the-Fields Minipax O'Brien Parsons Smith Swift Syme Tibet Winston Withers York agitprop album alibi atom cent film frigid front gallon gin hall hang lift memorandum mind minimum modern most overall park pint propaganda reflex reform reformer sport staccato stop tank tea terror times vitriol 18 | 73 identical translations found. 19 | Removing hapaxes...503 hapax-based dictionary items found. 20 | Building CorpusConstellation... Done. 21 | 3795 items left in original dictionary. 22 | Removing stopwords...Removing identicals... 23 | 0 identical translations found. 24 | Removing hapaxes...0 hapax-based dictionary items found. 25 | Building CorpusConstellation... Done. 26 | 2234 new dictionary items found. 27 | Simplified dictionary ready. 28 | Rough translation ready. 29 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 Matrix built. 30 | Trail found. 31 | Detail realign ready. 32 | Global quality of unfiltered align after realign 1.02754 33 | quasiglobal_spaceOutBySentenceLength is set to 1 34 | Trail spaced out by sentence length. 35 | Global quality of unfiltered align after realign 1.02754 36 | 49 misaligned out of 6446 correct items, 6446 bets. 
37 | Precision: 0.992398, Recall: 0.992398 38 | Quality 1.02754 39 | -------------------------------------------------------------------------------- /regtest/targets/1984.ro.realign.cerr: -------------------------------------------------------------------------------- 1 | Reading dictionary... 2 | 6481 hungarian sentences read. 3 | 6706 english sentences read. 4 | quasiglobal_stopwordRemoval is set to 0 5 | Simplified dictionary ready. 6 | Rough translation ready. 7 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 8 | Rough translation-based similarity matrix ready. 9 | Matrix built. 10 | Trail found. 11 | Align ready. 12 | Global quality of unfiltered align 0.318931 13 | 138326 items inside the border. 14 | Border of realign zone determined. 15 | 6151 bisentences collected. 16 | Plausible bisentences filtered. 17 | Removing stopwords...Removing identicals... ! ( ) - . 100 12 1900 1914 1920 1925 1930 1940 1944 1945 1960 1965 1968 1970 1973 1983 1984 20 2050 2713 3 6079 83 85 98 99 : ; ? aaronson absurd accent accident acid act adam adjectival adoptive adverb africa african agent agitprop ah album alibi amersham ampleforth ancestral animal anti-sex apologetic are artificial atlantic australasia avenue b baal bar barman berkhamsted boy brazzaville bumstead byron c canada cap capitalism capitalist care caricaturist caste cent central charrington chaucer china civil clang clement colchester comintern competent complex concept concrete conflict congo consider constant contact continue contralto control coral crime cromwell darwin de definitive demonstrative dickens din direct district doctor doctrine document elaborate electric emmanuel etc eurasia eurasian european evident exact exist face false familiar fanatic film force fragment franca frigid front general gestapo gin goldstein helicopter i ideal ignorant iii imagine important in incident independent india indian indirect individual individualism inferior inprecorr instinct instinctive instrument interval j java jefferson jones julia karl kipling leopoldville lift lingua malabar martin marx material metal milton minimum minipax minute moment mongolia monument mother motor murmur natural nazi negroid new noctis normal o'brien oceania ogilvy oliver omnipotent optimism or orator orgiastic osiris pacific paddington palimpsest pancras pardon paris parsons patriotic patriotism pedant perfect permanent persia pistol plan pneumatic popular pornosec post pretext primae primitive principal protector protest public pure raid rare real respect rest reverie rival robin romantic rutherford secret separate sex sexual shaftesbury shakespeare shoreditch siberia simple slogan smith socialism socialist solar solemn solid solipsism spasm special specialist spirit splendid sport standard stepney stop submarine superior surplus suspect swift syme tibet tic tillotson times tolerant tom total tour transparent trivial valet vast verb verbal versificator violent vistula vitriol w weeks wilsher winston withers york 18 | 294 identical translations found. 19 | Removing hapaxes...558 hapax-based dictionary items found. 20 | Building CorpusConstellation... Done. 21 | 0 items left in original dictionary. 22 | Removing stopwords...Removing identicals... 23 | 0 identical translations found. 
24 | Removing hapaxes...0 hapax-based dictionary items found. 25 | Building CorpusConstellation... Done. 26 | 2305 new dictionary items found. 27 | Simplified dictionary ready. 28 | Rough translation ready. 29 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 Matrix built. 30 | Trail found. 31 | Detail realign ready. 32 | Global quality of unfiltered align after realign 0.819735 33 | quasiglobal_spaceOutBySentenceLength is set to 1 34 | Trail spaced out by sentence length. 35 | Global quality of unfiltered align after realign 0.819735 36 | 119 misaligned out of 6015 correct items, 6060 bets. 37 | Precision: 0.980363, Recall: 0.987697 38 | Quality 0.819735 39 | -------------------------------------------------------------------------------- /regtest/targets/1984.ro.utf8.realign.cerr: -------------------------------------------------------------------------------- 1 | Reading dictionary... 2 | 6481 hungarian sentences read. 3 | 6706 english sentences read. 4 | quasiglobal_stopwordRemoval is set to 0 5 | Simplified dictionary ready. 6 | Rough translation ready. 7 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 8 | Rough translation-based similarity matrix ready. 9 | Matrix built. 10 | Trail found. 11 | Align ready. 12 | Global quality of unfiltered align 0.318931 13 | 138326 items inside the border. 14 | Border of realign zone determined. 15 | 6151 bisentences collected. 16 | Plausible bisentences filtered. 17 | Removing stopwords...Removing identicals... ! ( ) - . 100 12 1900 1914 1920 1925 1930 1940 1944 1945 1960 1965 1968 1970 1973 1983 1984 20 2050 2713 3 6079 83 85 98 99 : ; ? 
aaronson absurd accent accident acid act adam adjectival adoptive adverb africa african agent agitprop ah album alibi amersham ampleforth ancestral animal anti-sex apologetic are artificial atlantic australasia avenue b baal bar barman berkhamsted boy brazzaville bumstead byron c canada cap capitalism capitalist care caricaturist caste cent central charrington chaucer china civil clang clement colchester comintern competent complex concept concrete conflict congo consider constant contact continue contralto control coral crime cromwell darwin de definitive demonstrative dickens din direct district doctor doctrine document elaborate electric emmanuel etc eurasia eurasian european evident exact exist face false familiar fanatic film force fragment franca frigid front general gestapo gin goldstein helicopter i ideal ignorant iii imagine important in incident independent india indian indirect individual individualism inferior inprecorr instinct instinctive instrument interval j java jefferson jones julia karl kipling leopoldville lift lingua malabar martin marx material metal milton minimum minipax minute moment mongolia monument mother motor murmur natural nazi negroid new noctis normal o'brien oceania ogilvy oliver omnipotent optimism or orator orgiastic osiris pacific paddington palimpsest pancras pardon paris parsons patriotic patriotism pedant perfect permanent persia pistol plan pneumatic popular pornosec post pretext primae primitive principal protector protest public pure raid rare real respect rest reverie rival robin romantic rutherford secret separate sex sexual shaftesbury shakespeare shoreditch siberia simple slogan smith socialism socialist solar solemn solid solipsism spasm special specialist spirit splendid sport standard stepney stop submarine superior surplus suspect swift syme tibet tic tillotson times tolerant tom total tour transparent trivial valet vast verb verbal versificator violent vistula vitriol w weeks wilsher winston withers york 18 | 294 identical translations found. 19 | Removing hapaxes...558 hapax-based dictionary items found. 20 | Building CorpusConstellation... Done. 21 | 0 items left in original dictionary. 22 | Removing stopwords...Removing identicals... 23 | 0 identical translations found. 24 | Removing hapaxes...0 hapax-based dictionary items found. 25 | Building CorpusConstellation... Done. 26 | 2305 new dictionary items found. 27 | Simplified dictionary ready. 28 | Rough translation ready. 29 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 Matrix built. 30 | Trail found. 31 | Detail realign ready. 32 | Global quality of unfiltered align after realign 0.819839 33 | quasiglobal_spaceOutBySentenceLength is set to 1 34 | Trail spaced out by sentence length. 35 | Global quality of unfiltered align after realign 0.819839 36 | 119 misaligned out of 6015 correct items, 6060 bets. 37 | Precision: 0.980363, Recall: 0.987697 38 | Quality 0.819839 39 | -------------------------------------------------------------------------------- /regtest/targets/dtm.realign.cerr: -------------------------------------------------------------------------------- 1 | Reading dictionary... 2 | 582 hungarian sentences read. 3 | 584 english sentences read. 
4 | quasiglobal_stopwordRemoval is set to 0 5 | Simplified dictionary ready. 6 | Rough translation ready. 7 | 0 100 200 300 400 500 8 | Rough translation-based similarity matrix ready. 9 | Matrix built. 10 | Trail found. 11 | Align ready. 12 | Global quality of unfiltered align 1.31448 13 | quasiglobal_spaceOutBySentenceLength is set to 1 14 | Trail spaced out by sentence length. 15 | Global quality of unfiltered align after realign 1.31448 16 | 11 misaligned out of 572 correct items, 580 bets. 17 | Precision: 0.981034, Recall: 0.994755 18 | Quality 1.31448 19 | -------------------------------------------------------------------------------- /regtest/targets/steinbeck.huntoken.nopara.cerr: -------------------------------------------------------------------------------- 1 | Reading dictionary... 2 | 5487 hungarian sentences read. 3 | 5357 english sentences read. 4 | quasiglobal_stopwordRemoval is set to 0 5 | Simplified dictionary ready. 6 | Rough translation ready. 7 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 8 | Rough translation-based similarity matrix ready. 9 | Matrix built. 10 | Trail found. 11 | Align ready. 12 | Global quality of unfiltered align 0.951101 13 | quasiglobal_spaceOutBySentenceLength is set to 1 14 | Trail spaced out by sentence length. 15 | Global quality of unfiltered align after realign 0.951101 16 | 152 misaligned out of 5180 correct items, 5229 bets. 17 | Precision: 0.970931, Recall: 0.980116 18 | Quality 0.951101 19 | -------------------------------------------------------------------------------- /scripts/DCEP/README: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | # 3 | # DCEP sentence aligned corpora for 276 language pairs 4 | # 5 | ####################################################### 6 | 7 | ######## 8 | # Usage 9 | 10 | Example: How to get Danish-Latvian sentence-aligned text? 11 | 12 | 0. Enter a directory where the corpus building will take place. 13 | (You can build several language pairs in this same directory.) 14 | 15 | 1. Download and extract the two sentence-segmented monolingual corpora: 16 | 17 | wget http://optima.jrc.it/Resources/DCEP-2013/sentences/DCEP-sentence-DA-pub.tar.bz2 18 | wget http://optima.jrc.it/Resources/DCEP-2013/sentences/DCEP-sentence-LV-pub.tar.bz2 19 | (Note to coauthors: shame on me, maybe we should remove the year reference from the path?) 20 | tar jxf DCEP-sentence-DA-pub.tar.bz2 21 | tar jxf DCEP-sentence-LV-pub.tar.bz2 22 | 23 | The sentence-segmented text is now in the ./DCEP/sentence/(xml|sgml)/(DA|LV) subdirectories. 24 | 25 | 2. Download and extract the alignment information: 26 | 27 | wget http://people.mokk.bme.hu/~daniel/DCEP/langpairs/DCEP-DA-LV.tar.bz2 28 | tar jxf DCEP-DA-LV.tar.bz2 29 | 30 | The alignment information contains the correspondence between numerical indices 31 | of sentences; in the next step we will turn these into actual sentence pairs. 32 | 33 | Note that the order is alphabetical in language code: DA-LV is good, LV-DA is not. 34 | The alignment information is now in the aligns/DA-LV subdirectory, 35 | and the index describing the correspondence between text documents is in the indices/DA-LV text file. 36 | Bidocuments are identified by 6-digit numeric ids. 37 | 
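For orientation: each file under aligns/DA-LV is a so-called ladder file, with one rung per line. A rung is a pair of 0-based sentence indices (a third, confidence-score column may also be present; the tools accept both forms), and two consecutive rungs delimit one aligned block. A schematic example, with invented indices:

    0	0
    1	1
    2	3

Here the rungs (1,1) and (2,3) say that sentence 1 of the DA document corresponds to sentences 1 and 2 of the LV document. The final rung of a ladder only marks the total sizes of the two documents.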
38 | 3. Download, extract, and run the tool that generates the bicorpus from the above data: 39 | 40 | wget http://people.mokk.bme.hu/~daniel/DCEP/DCEP-tools.tgz 41 | tar zxvf DCEP-tools.tgz 42 | ./src/languagepair.py DA-LV > DA-LV-bisentences.txt 43 | 44 | You need Python installed, version 2.5, 2.6, or 2.7. 45 | 46 | The output is a tab-delimited UTF-8 text file with two columns. 47 | It contains all corresponding sentence pairs identified by hunalign, the 48 | automatic sentence aligner we used to create the alignment information. 49 | The information about the source document of the sentence pair is lost 50 | in this output format. See below for command line switches that can alter this 51 | behavior. 52 | 53 | If you don't roll your own filter, we recommend using the --numbering-filter 54 | switch, which drops most of the numberings and other lower-quality sentences: 55 | 56 | ./src/languagepair.py --numbering-filter DA-LV > DA-LV-bisentences.txt 57 | 58 | See below for more detail. 59 | 60 | 61 | ################# 62 | # Advanced usage 63 | 64 | ./src/languagepair.py -h shows the available command line options. 65 | Here we give a bit more background for them. 66 | 67 | The original document structure is preserved with the --no-merge command line switch. 68 | This will create aligned text documents in ./bitexts/DA-LV. 69 | The numeric ids are used as file names, e.g. bitexts/DA-LV/013563. 70 | The indices/DA-LV table can be used to find the correspondence between the bidocument and the 71 | original DCEP filenames. 72 | 73 | By default, the script takes the index file describing the document pairings 74 | from the indices/DA-LV file. This behavior can be changed with the --index-file argument. 75 | Here is a Unix example that only processes the first 10 documents of the index: 76 | ./src/languagepair.py --index-file <( head -10 indices/DA-LV ) > DA-LV-bisentences.txt 77 | 78 | With the --not-just-bisentences switch, the output format changes: 79 | It is one alignment unit per line, where an alignment unit consists of 80 | two tab-separated columns, one for each language. In each column, 81 | there is a " ~~~ "-separated list of sentences. It is possible that one 82 | of the columns is empty: that means that the aligner did not find a matching 83 | pair for the other column. The default " ~~~ " can be changed with the 84 | --delimiter command line argument. 85 | 86 | There are command line arguments that can be used to throw away suspicious 87 | bisentences if extra precision is required, at the expense of recall. 88 | 89 | The --numbering-filter is a crude but useful heuristic that attempts to drop numberings 90 | and short titles from the output. It works simply by matching sentences on both sides 91 | against a Unicode regex that looks for two alphabetic characters with a space between them. 92 | 93 | The --length-filter-level=LENGTH_FILTER_LEVEL argument is used to throw away as suspicious 94 | all bisentences where the ratio of the shorter and the longer sentence (in character length) 95 | is less than LENGTH_FILTER_LEVEL percent. 96 | 97 | The --topo-filter-level=TOPO_FILTER_LEVEL argument is used to throw away 98 | bisentences that appear in suspicious blocks of bisegments. A block of 99 | bisegments is determined to be suspicious if the ratio of 1-to-1 bisegments it contains 100 | is less than TOPO_FILTER_LEVEL percent. The heuristic works with blocks of size 100. 101 | This heuristic is useful to identify and remove segments of text where the original 102 | documents differed in larger parts. (Parts were left untranslated, different order of chapters, etc.)
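For concreteness, the length filter boils down to the test below, shown as a condensed
Python sketch of the isAcceptableLength helper shipped in the bundled ladder2text.py
(the helper name acceptable_length is ours; the +1 terms guard against empty sentences):

    def acceptable_length(sen1, sen2, length_filter_level):
        if length_filter_level is None:
            return True                # filtering is off by default
        h, e = len(sen1) + 1, len(sen2) + 1
        ratio = float(h) / e
        if ratio > 1:
            ratio = 1 / ratio          # shorter-to-longer ratio, in (0,1]
        return ratio >= length_filter_level / 100.0

A bisentence is kept only if this test returns True for its two sides.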
103 | 104 | -------------------------------------------------------------------------------- /scripts/DCEP/README.md: -------------------------------------------------------------------------------- 1 | # DCEP sentence aligned corpora for 276 language pairs 2 | 3 | ## Basic usage 4 | 5 | Example: How to get Danish-Latvian sentence-aligned text? 6 | 7 | ###### Get monolingual data 8 | 9 | Enter a directory where the corpus building will take place. 10 | (You can build several language pairs in this same directory.) 11 | 12 | Download and extract the two sentence-segmented monolingual corpora: 13 | 14 | ``` 15 | wget http://optima.jrc.it/Resources/DCEP-2013/sentences/DCEP-sentence-DA-pub.tar.bz2 16 | wget http://optima.jrc.it/Resources/DCEP-2013/sentences/DCEP-sentence-LV-pub.tar.bz2 17 | tar jxf DCEP-sentence-DA-pub.tar.bz2 18 | tar jxf DCEP-sentence-LV-pub.tar.bz2 19 | ``` 20 | 21 | The sentence-segmented text is now in the `./DCEP/sentence/(xml|sgml)/(DA|LV)` subdirectories. 22 | 23 | ###### Get alignment data 24 | 25 | Download and extract the alignment information: 26 | 27 | ``` 28 | wget http://people.mokk.bme.hu/~daniel/DCEP/langpairs/DCEP-DA-LV.tar.bz2 29 | tar jxf DCEP-DA-LV.tar.bz2 30 | ``` 31 | 32 | The alignment information contains the correspondence between numerical indices 33 | of sentences; in the next step we will turn these into actual sentence pairs. 34 | 35 | Note that the order is alphabetical in language code: `DA-LV` is good, `LV-DA` is not. 36 | The alignment information is now in the `aligns/DA-LV` subdirectory, 37 | and the index describing the correspondence between text documents is in the `indices/DA-LV` text file. 38 | Bidocuments are identified by 6-digit numeric ids. 39 | 40 | ###### Create bicorpus 41 | 42 | Now we download, extract, and run the tool that generates the bicorpus from the above data: 43 | 44 | ``` 45 | wget http://people.mokk.bme.hu/~daniel/DCEP/DCEP-tools.tgz 46 | tar zxvf DCEP-tools.tgz 47 | ./src/languagepair.py DA-LV > DA-LV-bisentences.txt 48 | ``` 49 | 50 | You need Python 2.5, 2.6, or 2.7 installed to run the tool. 51 | 52 | The output is a tab-delimited UTF-8 text file with two columns. 53 | It contains all corresponding sentence pairs identified by hunalign, the 54 | automatic sentence aligner we used to create the alignment information. 55 | The information about the source document of the sentence pair is lost 56 | in this output format. See below for command line switches that can alter this 57 | behavior. 58 | 59 | If you don't roll your own sentence filter, we recommend using the `--numbering-filter` 60 | option, which drops most of the numberings that are very common in the corpus: 61 | 62 | ``` 63 | ./src/languagepair.py --numbering-filter DA-LV > DA-LV-bisentences.txt 64 | ``` 65 | 66 | See below for more detail. 67 | 68 | 69 | ## Advanced usage 70 | 71 | `./src/languagepair.py -h` shows the available command line options. 72 | Here we give a bit more background for them. 73 | 74 | The original document structure is preserved with the `--no-merge` option. 75 | This will create aligned text documents in `./bitexts/DA-LV`. 76 | The numeric ids are used as file names, e.g. `bitexts/DA-LV/013563`. 77 | The `indices/DA-LV` table can be used to find the correspondence between the bidocument and the 78 | original DCEP filenames. 79 | 80 | By default, the script looks for the index file describing the document pairings 81 | at `indices/DA-LV`. This behavior can be changed with the `--index-file` argument.
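Each row of an index file is tab-separated: the 6-digit bidocument id, followed by the paths of the two sentence-segmented files. Schematically (the file names below are made up; real entries follow the DCEP directory structure):

```
013563	DCEP/sentence/xml/DA/somedoc.xml	DCEP/sentence/xml/LV/somedoc.xml
```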
82 | Here is a Unix example that only processes the first 10 documents of the index: 83 | 84 | ```./src/languagepair.py --index-file <( head -10 indices/DA-LV ) > DA-LV-bisentences.txt``` 85 | 86 | With the `--not-just-bisentences` switch, the output format changes: 87 | It is one alignment unit per line, where an alignment unit consists of 88 | two tab-separated columns, one for each language. In each column, 89 | there is a `" ~~~ "`-separated list of sentences. It is possible that one 90 | of the columns is empty: that means that the aligner did not find a matching 91 | pair for the other column. The default `" ~~~ "` can be changed with the 92 | `--delimiter` command line argument. 93 | 94 | There are command line arguments that can be used to throw away suspicious 95 | bisentences if extra precision is required, at the expense of recall. 96 | 97 | The `--numbering-filter` is a crude but useful heuristic that attempts to drop numberings 98 | and short titles from the output. It works simply by matching sentences on both sides 99 | against a Unicode regex that looks for two alphabetic characters with a space between them. 100 | 101 | The `--length-filter-level=LENGTH_FILTER_LEVEL` argument is used to throw away as suspicious 102 | all bisentences where the ratio of the shorter and the longer sentence (in character length) 103 | is less than `LENGTH_FILTER_LEVEL` percent. 104 | 105 | The `--topo-filter-level=TOPO_FILTER_LEVEL` argument is used to throw away 106 | bisentences that appear in suspicious blocks of bisegments. A block of 107 | bisegments is determined to be suspicious if the ratio of 1-to-1 bisegments it contains 108 | is less than `TOPO_FILTER_LEVEL` percent. The heuristic works with blocks of size 100. 109 | This heuristic is useful to identify and remove segments of text where the original 110 | documents differed in larger parts. (Parts were left untranslated, different order of chapters, etc.) 111 | 112 | -------------------------------------------------------------------------------- /scripts/DCEP/batchfilebylangpair.2ndcpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p langpairs/batch2 4 | 5 | awk 'BEGIN { 6 | langnum=0 7 | while ( getline < "langs.txt" ) 8 | { 9 | lang[langnum]=$0 10 | ++langnum; 11 | } 12 | 13 | for (i=0; i langpairs/batch2/" l1 "-" l2 ".batch") 21 | } 22 | } 23 | 24 | }' 25 | -------------------------------------------------------------------------------- /scripts/DCEP/batchfilebylangpair.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p langpairs/batch 4 | 5 | awk 'BEGIN { 6 | langnum=0 7 | while ( getline < "langs.txt" ) 8 | { 9 | lang[langnum]=$0 10 | ++langnum; 11 | } 12 | 13 | for (i=0; i langpairs/batch/" l1 "-" l2 ".batch") 21 | } 22 | } 23 | 24 | }' 25 | -------------------------------------------------------------------------------- /scripts/DCEP/dictforlanguagepair.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p langpairs/autodict 4 | mkdir -p langpairs/autodict.log 5 | 6 | l1=$1 7 | l2=$2 8 | 9 | bi=langpairs/biqf/$l1-$l2 10 | 11 | # I forgot to do this filtering step in the biqf creation step.
12 | # I'm lazy and do it with byte length 13 | cat $bi | awk '(length($0)<1000)' > tmp.bi 14 | 15 | cat tmp.bi | cut -f1 > tmp.l1 16 | cat tmp.bi | cut -f2 > tmp.l2 17 | 18 | ./acquisScripts/scripts/coocc.forAcquis -mc10 -ms40 tmp.l1 tmp.l2 2> langpairs/autodict.log/$l1-$l2 > langpairs/autodict/$l1-$l2.dic 19 | -------------------------------------------------------------------------------- /scripts/DCEP/dictsforalllanguagepairs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk 'BEGIN { 4 | langnum=0 5 | while ( getline < "langs.txt" ) 6 | { 7 | lang[langnum]=$0 8 | ++langnum; 9 | } 10 | 11 | for (i=0; i> langpairs/raw/$l1-$l2 2>> cerr.ladder2text 8 | done 9 | 10 | mkdir langpairs/biqf 11 | cat total.aligninfo.shuffled.limitin1000 | while read did l1 l2 tok1 tok2 ladder 12 | do 13 | hunalign/scripts/ladder2text.py $ladder $tok1 $tok2 | hunalign/scripts/DCEP/filteralign.sh >> langpairs/biqf/$l1-$l2 2> /dev/null 14 | done 15 | -------------------------------------------------------------------------------- /scripts/DCEP/filteralign.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | grep -v "~~~" | grep -v "
<p>
" | awk 'BEGIN {FS="\t"} { ra = ( length($2)>length($3) ? (length($2)+10)/(length($3)+10) : (length($3)+10)/(length($2)+10) ) ; if ((ra<1.5)&&($2!=$3)) print $2 "\t" $3 }' 4 | -------------------------------------------------------------------------------- /scripts/DCEP/finalpackage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | p=$1 4 | 5 | cat langpairs/aligninfo/$p.aligninfo | cut -f1,4,5 | sed "s/\.\/tree\/tok\///g" > final/indices/$p 6 | cd final 7 | tar jcf packages/DCEP-$p.tar.bz2 aligns/$p indices/$p 8 | # scp DCEP-$p.tar.bz2 kruso.mokk.bme.hu:./public_html/DCEP/langpairs/ 9 | cd .. 10 | -------------------------------------------------------------------------------- /scripts/DCEP/finalpackageforlangpairs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p final/indices 4 | mkdir -p final/packages 5 | 6 | awk 'BEGIN { 7 | langnum=0 8 | while ( getline < "langs.txt" ) 9 | { 10 | lang[langnum]=$0 11 | ++langnum; 12 | } 13 | 14 | for (i=0; i> tmp/cerr.$p | cut -f1,2 > $targ/$l1-$l2/$id 27 | done 28 | -------------------------------------------------------------------------------- /scripts/DCEP/flatladdertolangpairs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # With an uncomfortable tie-in, it does several things: 4 | # 1. creates the final directory structure. 5 | # 2. restructures the flatly structured ladder files into a language-pair based structure. 6 | # 3. gets rid of the unreliable quality values in the ladder files. (cut -f1,2) 7 | # 4. makes the final filenames as simple as possible, 8 | # from flat/ladder/89/033089.ES.LT.ladder to final/aligns/ES-LT/033089 9 | 10 | targ=final/aligns 11 | mkdir -p $targ 12 | ls langpairs/batch | cut -f1 -d'.' | while read p ; do mkdir $targ/$p ; done 13 | 14 | sub=ladder2 # This is the realign, not ladder22 that's only the second half, this one went through completely. 15 | find flat/$sub -type f | sed "s/^flat\/$sub\///" | sed "s/\.ladder$//" | tr '/.' ' ' |\ 16 | while read dig id l1 l2 17 | do 18 | cat flat/$sub/$dig/$id.$l1.$l2.ladder | cut -f1,2 > $targ/$l1-$l2/$id 19 | done 20 | -------------------------------------------------------------------------------- /scripts/DCEP/ladder2text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # In the code you can see lots of variables named hu* and en*, as 4 | # in Hungarian and English. This does not mean that the tool 5 | # is language-specific: it is completely language-agnostic. By convention, and 6 | # for obvious historical reasons, hu and en should be interpreted 7 | # as language #1 and language #2. 8 | 9 | import sys 10 | import itertools 11 | import re 12 | 13 | # An especially crude but quite useful heuristic for 14 | # detecting sentences (as opposed to numberings, separators etc). 15 | # Two alphabetic characters with a space between them. 16 | # See http://stackoverflow.com/a/2039476/383313 for an explanation. 17 | TWO_WORDS_REGEX = re.compile(r"""\w \w""", re.UNICODE) 18 | # TWO_WORDS_REGEX = re.compile(r"""[^\W\d_] [^\W\d_]""", re.UNICODE) 19 | 20 | def readfile(name): 21 | # Open the input files and read lines 22 | infile = file(name, 'r') 23 | lines = map( lambda s : s.strip("\n"), infile.readlines() ) 24 | return lines 25 | 26 | '''s -> (s0,s1), (s1,s2), (s2, s3), ...
27 | see http://docs.python.org/library/itertools.html''' 28 | def pairwise(iterable): 29 | a, b = itertools.tee(iterable) 30 | b.next() 31 | return itertools.izip(a, b) 32 | 33 | '''Create aligned text from two sentence files and hunalign's ladder-style output. 34 | Usage: ladder2text.py > aligned.txt 35 | See http://mokk.bme.hu/resources/hunalign for detailed format specification and more. 36 | The output file is tab-delimited, with two or three columns. 37 | The first and second columns are the chunks corresponding to each other. 38 | " ~~~ " is the sentence delimiter inside chunks. 39 | The third column is a probability score, if the input file had one. 40 | ''' 41 | 42 | def parseLadderLine(l): 43 | a = l.split() 44 | # We allow both scored and score-less input. 45 | assert 2<=len(a)<=3 46 | # The score we leave as a string, to avoid small diffs caused by different numerical representations. 47 | a[0],a[1] = int(a[0]),int(a[1]) 48 | return a 49 | 50 | # a hole is supposed to be two consecutive items in the array holding the lines of the ladder. /an array of holes is returned by pairwise(ladder)/ 51 | # the following segment returns an interval of sentences corresponding to a hole: 52 | # hulines[int(hole[0][0]):int(hole[1][0])] 53 | def holeToBisegment(hole,hulines,enlines) : 54 | if len(hole[0])==3 : 55 | quality = hole[0][2] 56 | else : 57 | quality = None 58 | 59 | huSens = hulines[hole[0][0]:hole[1][0]] 60 | enSens = enlines[hole[0][1]:hole[1][1]] 61 | return huSens,enSens,quality 62 | 63 | #serializeSens(huSens, enSens, quality, delimiter) 64 | 65 | def serializeBisegment(huSens,enSens,quality=None,delimiter=" ~~~ ") : 66 | huText = delimiter.join(huSens) 67 | enText = delimiter.join(enSens) 68 | text = huText+"\t"+enText 69 | if quality is not None : 70 | text += "\t"+str(quality) 71 | return text 72 | 73 | def isBisen(hole) : 74 | return (hole[1][0]-hole[0][0]==1) and (hole[1][1]-hole[0][1]==1) 75 | 76 | def isBisenPos(pos,ladder) : 77 | assert pos+2<=len(ladder) 78 | hole = ladder[pos:pos+2] 79 | return isBisen(hole) 80 | 81 | def crudeSentenceDetector(huSenUtf,enSenUtf) : 82 | return TWO_WORDS_REGEX.search(huSenUtf) is not None and TWO_WORDS_REGEX.search(enSenUtf) is not None 83 | 84 | def isAcceptableLength(huSenUtf,enSenUtf,lengthFilterLevel) : 85 | if lengthFilterLevel is None : 86 | return True 87 | lengthFilterRatio = float(lengthFilterLevel)/100 # TODO Casting in every inner loop, how lame is that. 88 | h = len(huSenUtf)+1 89 | e = len(enSenUtf)+1 90 | ratio = float(h)/e 91 | if ratio>1 : 92 | ratio = 1/ratio 93 | return ratio>=lengthFilterRatio 94 | 95 | def filterTopology(ladder, topoFilterLevel) : 96 | if topoFilterLevel is None : 97 | return ladder 98 | 99 | WINDOW = 100 100 | # the higher the stricter. 101 | topoFilterRatio = float(topoFilterLevel)/100 102 | rungsToKill = set() 103 | trailSize = len(ladder) 104 | for pos in range(1,trailSize-1-WINDOW) : 105 | huStart = ladder[pos][0] 106 | enStart = ladder[pos][1] 107 | huEnd = ladder[pos+WINDOW][0] 108 | enEnd = ladder[pos+WINDOW][1] 109 | deviation = float(huEnd-huStart+1)/(enEnd-enStart+1) # TODO We don't currently use it. 110 | if deviation>1 : 111 | deviation = 1/deviation 112 | bisenCnt = 0 113 | for pos2 in range(pos,pos+WINDOW) : 114 | if isBisenPos(pos2,ladder) : 115 | bisenCnt += 1 116 | ratio = float(bisenCnt)/WINDOW 117 | # sys.stderr.write("%f %f\n" % (ratio,deviation)) 118 | # TODO That's lame algorithmically, will switch to proper window-sliding when the basic algorithm is validated.
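# Windows whose 1-to-1 ratio falls below topoFilterRatio count as suspicious;
# the threshold comparison below collects all rungs of such windows in rungsToKill,
# and the surviving rungs are reassembled into the filtered ladder that the function returns.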
119 | if ratio > aligned.txt\n' ) 177 | sys.exit(-1) 178 | 179 | 180 | if __name__ == "__main__" : 181 | main() 182 | -------------------------------------------------------------------------------- /scripts/DCEP/languagepair.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import os.path 5 | import os, errno 6 | import optparse 7 | 8 | import ladder2text 9 | 10 | 11 | dcepLanguages = [ "BG", "CS", "DA", "DE", "EL", "EN", "ES", "ET", "FI", "FR", "GA", "HU", "IT", "LT", "LV", "MT", "NL", "PL", "PT", "RO", "SK", "SL", "SV", "TR" ] 12 | 13 | 14 | class InputError(Exception): 15 | pass 16 | 17 | def error(s) : 18 | sys.stderr.write("ERROR: "+s+"\n") 19 | sys.exit(-1) 20 | 21 | def mkdir_p(path) : 22 | try : 23 | os.makedirs(path) 24 | except OSError, exc : 25 | if exc.errno == errno.EEXIST and os.path.isdir(path) : 26 | pass 27 | else : 28 | raise 29 | 30 | def main(): 31 | 32 | parser = optparse.OptionParser() 33 | defaultDelimiter = " ~~~ " 34 | parser.add_option("--no-merge", action="store_true", dest="noMerge", help="Keep the output bidocuments in separate files under bitext/L1-L2/, instead of merging them and writing them to the standard output.") 35 | parser.add_option("--not-just-bisentences", action="store_false", dest="justBisen", default=True, help="Save all alignment units, not just 1-to-1 correspondences.") 36 | parser.add_option("--delimiter", dest="delimiter", type="string", default=defaultDelimiter, help="String for delimiting sentences within alignment units. Only meaningful when combined with --not-just-bisentences. Default value: '"+defaultDelimiter+"'.") 37 | 38 | # TODO Commented out, hardly finished. 39 | # parser.add_option("--topo-filter-level", action="store", type="int", dest="topoFilterLevel", metavar="TOPO_FILTER_LEVEL", 40 | # help="Aggressiveness of context-based bisentence filtering. Between 0 and 100. By default it is not employed. Cannot be combined with --not-just-bisentences.") 41 | 42 | parser.add_option("--length-filter-level", action="store", type="int", dest="lengthFilterLevel", metavar="LENGTH_FILTER_LEVEL", 43 | help="Aggressiveness of sentence character length based bisentence filtering. Between 0 and 100. By default it is not employed. Cannot be combined with --not-just-bisentences.") 44 | parser.add_option("--numbering-filter", action="store_true", dest="sentenceDetector", default=False, 45 | help="A crude heuristic that drops numberings and short titles from the output. Cannot be combined with --not-just-bisentences.") 46 | 47 | parser.add_option("--index-file", action="store", type="string", dest="indexFilename", metavar="INDEX_FILE", 48 | help="Use this file to decide which documents to process, instead of the default indices/L1-L2. Tab-separated file with rows containing document-id L1-sentence-segmented-file L2-sentence-segmented-file. When combined with --no-merge, the bitext/L1-L2 directory is deduced from the sentence file paths, assuming DCEP directory structure.") 49 | 50 | parser.usage = "%prog [options] L1-L2\nwhere L1-L2 is a language pair, and L1 and L2 are in alphabetical order. E.g. DE-EN.\n" 51 | parser.usage += "or\n%prog [options] --index-file INDEX_FILE." 52 | 53 | try : 54 | assert len(sys.argv)>1 55 | (options, args) = parser.parse_args(sys.argv[1:]) 56 | except : 57 | parser.print_help() 58 | sys.exit(-1) 59 | 60 | # TODO Remove after topoFilter is finished.
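# Note: the assignment below unconditionally disables the topology filter.
# Since the --topo-filter-level option is commented out above, filterTopology()
# in ladder2text.py receives None and returns the ladder unchanged.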
61 | options.topoFilterLevel = None 62 | 63 | if options.indexFilename : 64 | if len(args)>0 : 65 | error("Should not give a language pair when the --index-file argument is used.") 66 | l1 = None 67 | l2 = None 68 | else : 69 | try : 70 | assert len(args)==1 71 | lp = args[0] 72 | l1,l2 = lp.split("-") 73 | assert len(l1)==len(l2)==2 74 | assert l1docCounter+100 : 165 | error("Too many align/"+lp+" files missing, the directory structure was probably not set up properly.") 166 | 167 | main() 168 | -------------------------------------------------------------------------------- /scripts/DCEP/mergedicts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p langpairs/fulldict 4 | 5 | awk 'BEGIN { 6 | langnum=0 7 | while ( getline < "langs.txt" ) 8 | { 9 | lang[langnum]=$0 10 | ++langnum; 11 | } 12 | 13 | for (i=0; i langpairs/fulldict/" p ) 22 | system("bash hunalign/scripts/DCEP/normalizesztakidict.sh < langpairs/sztaki/" p " >> langpairs/fulldict/" p " 2> /dev/null" ) 23 | } 24 | } 25 | 26 | }' 27 | -------------------------------------------------------------------------------- /scripts/DCEP/normalizedict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cut -f1,2 | grep -v "@" | awk '{ print $2,"@",$1 }' 4 | -------------------------------------------------------------------------------- /scripts/DCEP/normalizesztakidict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk '{ print $2,"@",$1 }' 4 | -------------------------------------------------------------------------------- /scripts/DCEP/packaligninfobylangpair.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p langpairs/aligninfo 4 | 5 | awk 'BEGIN { 6 | langnum=0 7 | while ( getline < "langs.txt" ) 8 | { 9 | lang[langnum]=$0 10 | ++langnum; 11 | } 12 | 13 | for (i=0; i langpairs/aligninfo/" l1 "-" l2 ".aligninfo") 21 | } 22 | } 23 | 24 | }' 25 | -------------------------------------------------------------------------------- /scripts/DCEP/realignall.2ndcpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # flat/ladder2 already created. 4 | 5 | mkdir -p langpairs/realign2.log 6 | 7 | awk 'BEGIN { 8 | langnum=0 9 | while ( getline < "langs.txt" ) 10 | { 11 | lang[langnum]=$0 12 | ++langnum; 13 | } 14 | 15 | k=0 16 | 17 | for (i=0; i langpairs/realign2.log/" p ) 28 | } 29 | } 30 | 31 | }' 32 | -------------------------------------------------------------------------------- /scripts/DCEP/realignall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # flat/ladder2 already created. 
4 | 5 | mkdir -p langpairs/realign.log 6 | 7 | awk 'BEGIN { 8 | langnum=0 9 | while ( getline < "langs.txt" ) 10 | { 11 | lang[langnum]=$0 12 | ++langnum; 13 | } 14 | 15 | for (i=0; i langpairs/realign.log/" p ) 24 | } 25 | } 26 | 27 | }' 28 | -------------------------------------------------------------------------------- /scripts/DCEP/renamesztakidicts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p langpairs/sztaki 4 | 5 | awk 'BEGIN { 6 | langnum=0 7 | while ( getline < "langs.txt" ) 8 | { 9 | lang[langnum]=$0 10 | ++langnum; 11 | } 12 | 13 | for (i=0; i "$t" 8 | done 9 | 10 | echo "Starting sentence count verification" 11 | date 12 | 13 | find tree/sentence/ -type f | while read f 14 | do 15 | t=`echo $f | sed "s/^tree\/sentence/tree\/tok/"` 16 | lSen=`wc -l < $f` 17 | lStrip=`wc -l < $t` 18 | if [ "$lSen" -ne "$lStrip" ] 19 | then 20 | echo Mismatch: $f $lSen $t $lStrip 21 | fi 22 | done 23 | 24 | echo "Done." 25 | date 26 | -------------------------------------------------------------------------------- /scripts/DCEP/verifylangpair.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # quantifies the difference between align and realign for a given language pair 6 | 7 | p=$1 8 | seed=$2 9 | if [ -z "$seed" ]; then seed=0 ; fi 10 | 11 | cat langpairs/aligninfo/$p.aligninfo | cut -f6 | awk -v seed=$seed '(NR%100==seed)' > tmp/$p.ladsam 12 | cat tmp/$p.ladsam | xargs cat | cut -f1,2 > tmp/$p.ladcat 13 | cat tmp/$p.ladsam | sed "s/ladder/ladder2/" | xargs cat | cut -f1,2 > tmp/$p.ladcat2 14 | l1=`wc -l < tmp/$p.ladcat` 15 | l2=`wc -l < tmp/$p.ladcat2` 16 | d=`diff tmp/$p.ladcat tmp/$p.ladcat2 | wc -l` 17 | echo $p $l1 $l2 $d 18 | -------------------------------------------------------------------------------- /scripts/en.sen.one.sh: -------------------------------------------------------------------------------- 1 | huntoken -e -b | grep "\(^\)\|\(^

\)" | sed "s/^//" | sed "s/^

/

/" 2 | -------------------------------------------------------------------------------- /scripts/hu.sen.one.sh: -------------------------------------------------------------------------------- 1 | huntoken -b | grep "\(^\)\|\(^

\)" | sed "s/^//" | sed "s/^

/

/" 2 | -------------------------------------------------------------------------------- /scripts/hunalignDriver.py: -------------------------------------------------------------------------------- 1 | from StringIO import StringIO 2 | 3 | from teed import * 4 | 5 | from partialAlign import * 6 | 7 | 8 | OUTPUT_FILENAME="tmp/tmp_chunk" 9 | BATCH_JOB_FILENAME = "tmp/tmp_batch" 10 | MAXIMAL_CHUNK_SIZE = 1000 11 | 12 | 13 | def parseLadderData(fileContent) : 14 | ls = fileContent.split("\n") 15 | if ls[-1]=="" : 16 | del ls[-1] 17 | 18 | result = [] 19 | for l in ls : 20 | a = l.split() 21 | if len(a)!=3 : 22 | raise Exception("hunalign should return 3-column data.") 23 | result.append((int(a[0]),int(a[1]),float(a[2]))) 24 | 25 | return result 26 | 27 | def serializeLadderData(ladder) : 28 | def serializeLine(l) : 29 | return "\t".join(map(str,l)) 30 | return "\n".join( map(serializeLine, ladder) ) + "\n" 31 | 32 | def hunalignDriver(hunalignExecutablePath, hunalignArgs) : 33 | 34 | cmd = [ hunalignExecutablePath ] + hunalignArgs 35 | 36 | fout, ferr = StringIO(), StringIO() 37 | exitcode = teed_call(cmd, stdout=fout, stderr=ferr) 38 | stdout = fout.getvalue() 39 | stderr = ferr.getvalue() 40 | 41 | if exitcode!=0 : 42 | raise Exception("hunalign returned with exit code "+str(exitcode)) 43 | 44 | result = parseLadderData(stdout) 45 | 46 | return result, stderr 47 | 48 | 49 | def batchHunalignDriver(hunalignExecutablePath, hunalignArgs) : 50 | cmd = [ hunalignExecutablePath, "-batch" ] + hunalignArgs 51 | 52 | fout, ferr = StringIO(), StringIO() 53 | exitcode = teed_call(cmd, stdout=fout, stderr=ferr) 54 | stdout = fout.getvalue() 55 | stderr = ferr.getvalue() 56 | 57 | if exitcode!=0 : 58 | raise Exception("hunalign returned with exit code "+str(exitcode)) 59 | 60 | assert len(stdout)==0 61 | 62 | 63 | def partialAlignDriver(huFilename, enFilename) : 64 | chain,stdout = partialAlignWithIO(huFilename, enFilename, outputFilename=OUTPUT_FILENAME, huLangName="a", enLangName="b", maximalChunkSize=MAXIMAL_CHUNK_SIZE) 65 | return chain,stdout 66 | 67 | def fullStack(hunalignExecutablePath, huFilename, enFilename, dictFilename) : 68 | chain,stdout = partialAlignDriver(huFilename, enFilename) 69 | 70 | with open(BATCH_JOB_FILENAME,'w') as f : 71 | f.write(stdout) 72 | 73 | extraCareful = True 74 | if extraCareful : 75 | # Output should arrive in files named OUTPUT_FILENAME +"_"+ str(number) +"."+ ("a" if hu else "b") 76 | chunkNumber = len(chain)-1 77 | huSenCnt = 0 78 | enSenCnt = 0 79 | for chunkId in range(1,chunkNumber+1) : 80 | chunkFilename = "%s_%d." 
% (OUTPUT_FILENAME, chunkId) 81 | huChunkFilename = chunkFilename+"a" 82 | enChunkFilename = chunkFilename+"b" 83 | with open(huChunkFilename) as huChunkFile : 84 | huSenCnt += len(huChunkFile.readlines()) 85 | with open(enChunkFilename) as enChunkFile : 86 | enSenCnt += len(enChunkFile.readlines()) 87 | assert chain[chunkId] == (huSenCnt,enSenCnt) 88 | 89 | hunalignArgs = [ dictFilename, BATCH_JOB_FILENAME ] 90 | batchHunalignDriver(hunalignExecutablePath, hunalignArgs) 91 | 92 | # Output should now arrive in files named OUTPUT_FILENAME +"_"+ str(number) + ".align" 93 | 94 | totalLadder = [] 95 | for chunkId in range(1,chunkNumber+1) : 96 | alignChunkFilename = "%s_%d.align" % (OUTPUT_FILENAME, chunkId) 97 | chunkStarts = chain[chunkId-1] 98 | huStart,enStart = chunkStarts 99 | if len(totalLadder)>0 : 100 | if totalLadder[-1][:2] != chunkStarts : 101 | log( "ERROR: In %s rung %s should match with %s" % (alignChunkFilename, str(totalLadder[-1]), chunkStarts) ) 102 | raise Exception("chunk aligns inconsistent with chunking data") 103 | 104 | # The last element of the ladder is only there to mark the size of the whole bidocument. 105 | # Supposedly its quality value is always 0.3. (We don't check this.) 106 | del totalLadder[-1] 107 | try : 108 | with open(alignChunkFilename) as f : 109 | chunkLadder = parseLadderData(f.read()) 110 | assert chunkLadder[0][:2] == (0,0) 111 | for rung in chunkLadder : 112 | huStep,enStep,quality = rung 113 | totalLadder.append( (huStart+huStep, enStart+enStep, quality) ) 114 | except IOError : 115 | log( "ERROR: %s missing, hunalign probably gave up on input" % alignChunkFilename) 116 | raise Exception("chunk align missing") 117 | 118 | sys.stdout.write(serializeLadderData(totalLadder)) 119 | 120 | def testBatchHunalign() : 121 | hunalignExecutablePath = '../src/hunalign/hunalign' 122 | hunalignArgs = ['../data/null.dic', 'batch.job'] 123 | batchHunalignDriver(hunalignExecutablePath, hunalignArgs) 124 | 125 | 126 | def testHunalign() : 127 | hunalignExecutablePath = '../src/hunalign/hunalign' 128 | ladderDir = '../regtest/handaligns/1984.ro.utf8/' 129 | hunalignArgs = [ '-hand='+ladderDir+'hand.ladder', '../data/null.dic', ladderDir+'/hu.pre', ladderDir+'/en.pre' ] 130 | 131 | result,stderr = hunalignDriver(hunalignExecutablePath, hunalignArgs) 132 | 133 | print "\n".join(map(str,result)) 134 | 135 | 136 | def testFullStack() : 137 | hunalignExecutablePath = '../src/hunalign/hunalign' 138 | ladderDir = '../regtest/handaligns/1984.ro.utf8/' 139 | huFilename, enFilename, dictFilename = ( ladderDir+'/hu.pre', ladderDir+'/en.pre', '../data/null.dic' ) 140 | 141 | fullStack(hunalignExecutablePath, huFilename, enFilename, dictFilename) 142 | 143 | def main() : 144 | # testHunalign() 145 | # testBatchHunalign() 146 | testFullStack() 147 | 148 | if __name__=='__main__' : 149 | main() 150 | -------------------------------------------------------------------------------- /scripts/ladder2text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | import itertools 4 | 5 | '''file -> array holding the lines of the file''' 6 | def readfile(name): 7 | # Open the input files and read lines 8 | infile = file(name, 'r') 9 | lines = map( lambda s : s.strip("\n"), infile.readlines() ) 10 | return lines 11 | 12 | '''s -> (s0,s1), (s1,s2), (s2, s3), ...
13 | see http://docs.python.org/library/itertools.html''' 14 | def pairwise(iterable): 15 | a, b = itertools.tee(iterable) 16 | b.next() 17 | return itertools.izip(a, b) 18 | 19 | '''Create aligned text from two sentence files and hunalign's ladder-style output. 20 | Usage: ladder2text.py > aligned.txt 21 | See http://mokk.bme.hu/resources/hunalign for detailed format specification and more. 22 | The output file is tab-delimited, with three columns. The first is a probability score. 23 | The second and third columns are the chunks corresponding to each other. 24 | " ~~~ " is the sentence delimiter inside chunks. 25 | ''' 26 | def main() : 27 | if len(sys.argv) == 4: 28 | ladderlines = readfile(sys.argv[1]) 29 | hulines = readfile(sys.argv[2]) 30 | enlines = readfile(sys.argv[3]) 31 | def parseLadderLine(l) : 32 | a = l.split() 33 | assert len(a)==3 34 | return ( int(a[0]), int(a[1]), a[2] ) # The score we leave as a string, to avoid small diffs caused by different numerical representations. 35 | ladder = map( parseLadderLine, ladderlines ) 36 | 37 | # the next map() does all the work, so here are some comments... 38 | # the map() iterates over the holes of the ladder. 39 | # a hole is supposed to be two consecutive items in the array holding the lines of the ladder. /an array of holes is returned by pairwise(ladder)/ 40 | # the following segment returns an interval of sentences corresponding to a hole: 41 | # hulines[int(hole[0][0]):int(hole[1][0])] 42 | outputlines = map( lambda hole: 43 | hole[0][2] + "\t" + 44 | " ~~~ ".join(hulines[int(hole[0][0]):int(hole[1][0])]) 45 | + "\t" + 46 | " ~~~ ".join(enlines[int(hole[0][1]):int(hole[1][1])]) 47 | , 48 | pairwise(ladder) 49 | ) 50 | 51 | for l in outputlines : 52 | print l 53 | else: 54 | print 'usage: ladder2text.py > aligned.txt' 55 | sys.exit(-1) 56 | 57 | 58 | if __name__ == "__main__" : 59 | main() 60 | -------------------------------------------------------------------------------- /scripts/process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | lang=Hungarian ; slang=hu ; cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do mkdir $lang/$au/sen 2>/dev/null ; echo $lang/$au/sen/${au}_$ind.$slang.sen ; $BINDIR/hu.sen.one.sh < $lang/$au/raw/${au}_$ind.$slang.raw > $lang/$au/sen/${au}_$ind.$slang.sen ; done 4 | 5 | lang=English ; slang=en ; cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do mkdir $lang/$au/sen 2>/dev/null ; echo $lang/$au/sen/${au}_$ind.$slang.sen ; $BINDIR/en.sen.one.sh < $lang/$au/raw/${au}_$ind.$slang.raw > $lang/$au/sen/${au}_$ind.$slang.sen ; done 6 | 7 | 8 | lang=Hungarian ; slang=hu ; cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do mkdir $lang/$au/sen.tok 2>/dev/null ; echo $lang/$au/sen.tok/${au}_$ind.$slang.sen.tok ; $BINDIR/tok.one.sh < $lang/$au/sen/${au}_$ind.$slang.sen > $lang/$au/sen.tok/${au}_$ind.$slang.sen.tok ; done 9 | 10 | lang=English ; slang=en ; cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do mkdir $lang/$au/sen.tok 2>/dev/null ; echo $lang/$au/sen.tok/${au}_$ind.$slang.sen.tok ; $BINDIR/tok.one.sh < $lang/$au/sen/${au}_$ind.$slang.sen > $lang/$au/sen.tok/${au}_$ind.$slang.sen.tok ; done 11 | 12 | 13 | lang=Hungarian ; slang=hu ; cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do mkdir $lang/$au/sen.tok.stem 2>/dev/null ; echo $lang/$au/sen.tok.stem/${au}_$ind.$slang.sen.tok.stem ; $BINDIR/$slang.stem.one.sh < $lang/$au/sen.tok/${au}_$ind.$slang.sen.tok > 
$lang/$au/sen.tok.stem/${au}_$ind.$slang.sen.tok.stem ; done 14 | 15 | lang=English ; slang=en ; cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do mkdir $lang/$au/sen.tok.stem 2>/dev/null ; echo $lang/$au/sen.tok.stem/${au}_$ind.$slang.sen.tok.stem ; $BINDIR/$slang.stem.one.sh < $lang/$au/sen.tok/${au}_$ind.$slang.sen.tok > $lang/$au/sen.tok.stem/${au}_$ind.$slang.sen.tok.stem ; done 16 | 17 | # ///////////////////////////////////////////////// 18 | # FINALLY THE ALIGNMENT, WITH POSTPROCESSORS: 19 | 20 | mkdir $BICDIR/Align 21 | 22 | cat $CATALOG | cut -f1 | sort -u | while read d ; do mkdir $BICDIR/Align/$d ; mkdir $BICDIR/Align/$d/ladder ; done 23 | 24 | 25 | cat $CATALOG | awk '{ print "Hungarian/"$1"/sen.tok.stem/"$1"_"$2".hu.sen.tok.stem" "\t" "English/"$1"/sen.tok.stem/"$1"_"$2".en.sen.tok.stem" "\t" "Align/"$1"/ladder/"$1"_"$2".ladder" }' > align.batch 26 | 27 | $BINDIR/alignerTool -batch -headerthresh=100 -ppthresh=30 $BINDIR/vonyo7.nojoker.stemmed align.batch > align.cout 2> align.cerr 28 | 29 | cat $CATALOG | cut -f1 | sort -u | while read d ; do mkdir $BICDIR/Align/$d/text ; done 30 | 31 | cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do lang=Hungarian ; slang=hu ; $BINDIR/ladder2text.sh $BICDIR/Align/$au/ladder/${au}_$ind.ladder $BICDIR/Hungarian/$au/sen/${au}_$ind.hu.sen $BICDIR/English/$au/sen/${au}_$ind.en.sen > $BICDIR/Align/$au/text/${au}_$ind.text ; done 32 | 33 | 34 | cat $CATALOG | cut -f1 | sort -u | while read d ; do mkdir $BICDIR/Align/$d/text.qf ; done 35 | 36 | cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do echo Align/$au/text.qf/${au}_$ind.text.qf ; cat Align/$au/text/${au}_$ind.text | grep -v "~~~" | grep -v "
<p>
" | awk 'BEGIN {FS="\t"} { ra = ( length($2)>length($3) ? (length($2)+10)/(length($3)+10) : (length($3)+10)/(length($2)+10) ) ; if (ra<1.5) print $0}' > Align/$au/text.qf/${au}_$ind.text.qf ; done 37 | 38 | 39 | cat $CATALOG | cut -f1 | sort -u | while read d ; do mkdir $BICDIR/Align/$d/shuffled ; done 40 | 41 | cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do echo Align/$au/shuffled/${au}_$ind.shuffled ; cat Align/$au/text.qf/${au}_$ind.text.qf | cut -f2,3 | sort > $BICDIR/Align/$au/shuffled/${au}_$ind.shuffled ; done 42 | 43 | 44 | mkdir measure 45 | 46 | cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do lang=Hungarian ; slang=hu ; hu=`cat $lang/$au/sen/${au}_$ind.$slang.sen | grep -v "
<p>
" | wc -l` ; lang=English ; slang=en ; en=`cat $lang/$au/sen/${au}_$ind.$slang.sen | grep -v "
<p>
" | wc -l` ; echo ${au}_$ind $hu $en ; done | awk '{ h=$2+1; e=$3+1; print (h measure/senratio.txt 47 | 48 | cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do i=$BICDIR/Align/$au/ladder/${au}_$ind.ladder ; echo -n "$i" ; cat $i | awk '{ if (($1-one==1)&&($2-two==1)) { ++bis } ; one=$1; two=$2 } END { print "\t" 0+bis "\t" 0+one "\t" 0+two "\t" bis/(1+(one measure/otor.txt 49 | -------------------------------------------------------------------------------- /scripts/release.howto.txt: -------------------------------------------------------------------------------- 1 | cvs co hunalign 2 | cd hunalign 3 | 4 | # Manually bump version number all over readme.html. 5 | ... 6 | 7 | # Remove CVS metadata. Commit before this! 8 | find . | grep "/CVS$" | while read x ; do rm -rf "$x" ; done 9 | 10 | # I have no rights to release the parallel corpora. 11 | rm -rf regtest 12 | 13 | cd .. 14 | ver=3 15 | mv hunalign hunalign-1.$ver 16 | cp -r hunalign-1.$ver /public/Hunglish/src/hunalign/latest/ 17 | tar zcvf hunalign-1.$ver.tgz hunalign-1.$ver 18 | cp hunalign-1.$ver.tgz /public/Hunglish/src/hunalign/latest/ 19 | 20 | # Build for Windows. Make a zip called hunalign-1.$ver-windows.zip. 21 | # The zip should contain a directory called hunalign-1.$ver-windows, containing 22 | # the hunalign.exe, and 23 | # msvcp100.dll, msvcr100.dll files for MSVC++ or 24 | # cygwin1.dll for CYGWIN make. 25 | # Copy the zip to /public/Hunglish/src/hunalign/latest/. 26 | 27 | # Manually copy the readme.html to the http://mokk.bme.hu/resources/hunalign plone page 28 | # in its html source edit mode. 29 | -------------------------------------------------------------------------------- /scripts/subprocessTest.py: -------------------------------------------------------------------------------- 1 | from StringIO import StringIO 2 | 3 | from teed import * 4 | 5 | cmd = ["./testProcess.sh"] 6 | cmd = ["./testProcess-RealHunalign.sh"] 7 | 8 | ladderDir = '../regtest/handaligns/1984.ro.utf8/' 9 | 10 | cmd = ['../src/hunalign/hunalign', '-utf', 11 | '-hand='+ladderDir+'hand.ladder', '../data/null.dic', ladderDir+'/hu.pre', ladderDir+'/en.pre', '-bisent', '-realign'] 12 | 13 | fout, ferr = StringIO(), StringIO() 14 | exitcode = teed_call(cmd, stdout=fout, stderr=ferr) 15 | stdout = fout.getvalue() 16 | stderr = ferr.getvalue() 17 | 18 | # print len(stdout),len(stderr) 19 | 20 | print stderr 21 | -------------------------------------------------------------------------------- /scripts/teed.py: -------------------------------------------------------------------------------- 1 | # http://stackoverflow.com/questions/4984428/python-subprocess-get-childrens-output-to-file-and-terminal/4985080#4985080 2 | # Thanks http://stackoverflow.com/users/4279/j-f-sebastian 3 | 4 | import sys 5 | from subprocess import Popen, PIPE 6 | from threading import Thread 7 | 8 | def tee(infile, *files): 9 | """Print `infile` to `files` in a separate thread.""" 10 | def fanout(infile, *files): 11 | for line in iter(infile.readline, ''): 12 | for f in files: 13 | f.write(line) 14 | infile.close() 15 | t = Thread(target=fanout, args=(infile,)+files) 16 | t.daemon = True 17 | t.start() 18 | return t 19 | 20 | def teed_call(cmd_args, **kwargs): 21 | stdout, stderr = [kwargs.pop(s, None) for s in 'stdout', 'stderr'] 22 | p = Popen(cmd_args, 23 | stdout=PIPE if stdout is not None else None, 24 | stderr=PIPE if stderr is not None else None, 25 | **kwargs) 26 | threads = [] 27 | # Here I changed Sebastian's original version, because I don't want to tee 
stdout, just stderr: 28 | # ORIGINAL: 29 | # if stdout is not None: threads.append(tee(p.stdout, stdout, sys.stdout)) 30 | # MINE: 31 | if stdout is not None: threads.append(tee(p.stdout, stdout)) 32 | 33 | if stderr is not None: threads.append(tee(p.stderr, stderr, sys.stderr)) 34 | for t in threads: t.join() # wait for IO completion 35 | return p.wait() 36 | 37 | 38 | if __name__ == '__main__': 39 | outf, errf = open('out.txt', 'w'), open('err.txt', 'w') 40 | assert not teed_call(["cat", __file__], stdout=None, stderr=errf) 41 | assert not teed_call(["echo", "abc"], stdout=outf, stderr=errf, bufsize=0) 42 | assert teed_call(["gcc", "a b"], close_fds=True, stdout=outf, stderr=errf) 43 | 44 | -------------------------------------------------------------------------------- /scripts/testProcess-RealHunalign.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ladderDir=../regtest/handaligns/1984.ro.utf8 4 | ../src/hunalign/hunalign -utf -hand=$ladderDir/hand.ladder ../data/null.dic $ladderDir/hu.pre $ladderDir/en.pre -bisent 5 | -------------------------------------------------------------------------------- /scripts/testProcess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo -n "Working... " > /dev/stderr 4 | for (( i=1 ; i<10 ; ++i )) ; do 5 | sleep 1 6 | echo -n "$i " > /dev/stderr 7 | done 8 | 9 | echo "Done." > /dev/stderr 10 | echo "End result 1" 11 | echo "End result 2" 12 | echo "End result 3" 13 | -------------------------------------------------------------------------------- /scripts/testProcess1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk 'BEGIN { for (i=1;i<100;++i) { if (i%20==0) { print "Working on number",i > "/dev/stderr" } ; s += i ; print s } }' 4 | -------------------------------------------------------------------------------- /scripts/testProcessWithInput.sh: -------------------------------------------------------------------------------- 1 | awk '{ if (NR%100==0) { print "Working on line",NR > "/dev/stderr" } ; s += $0 ; print s }' 2 | -------------------------------------------------------------------------------- /scripts/tok.one.sh: -------------------------------------------------------------------------------- 1 | sed 's/\([\.,:\/;()?\!\"]\)/ \1 /g' | sed "s/\([^ -]\)\(--\+\)/\1 \2/g" | sed "s/\(--\+\)\([^ -]\)/\1 \2/g" | sed 's/ \+/ /g' | sed 's/ $//' | sed "s/^ //" 2 | -------------------------------------------------------------------------------- /scripts/translate.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/scripts/translate.txt -------------------------------------------------------------------------------- /scripts/visualizeAlignQuality.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk -f 2 | 3 | BEGIN { 4 | n = 50 ; 5 | r = ""; 6 | for ( i=0; i. 
55 | */ 56 | 57 | /* 58 | * $Id: DOMTreeErrorReporter.cpp,v 1.1.1.1 2009-07-06 14:05:31 daniel Exp $ 59 | */ 60 | 61 | // --------------------------------------------------------------------------- 62 | // Includes 63 | // --------------------------------------------------------------------------- 64 | #include 65 | #include "DOMTreeErrorReporter.hpp" 66 | 67 | #include 68 | 69 | #include 70 | #include 71 | 72 | XERCES_CPP_NAMESPACE_USE 73 | 74 | void DOMTreeErrorReporter::warning(const SAXParseException&) 75 | { 76 | // 77 | // Ignore all warnings. 78 | // 79 | } 80 | 81 | void DOMTreeErrorReporter::error(const SAXParseException& toCatch) 82 | { 83 | fSawErrors = true; 84 | std::cerr << "Error at file \"" << StrX(toCatch.getSystemId()) 85 | << "\", line " << toCatch.getLineNumber() 86 | << ", column " << toCatch.getColumnNumber() 87 | << "\n Message: " << StrX(toCatch.getMessage()) << std::endl; 88 | } 89 | 90 | void DOMTreeErrorReporter::fatalError(const SAXParseException& toCatch) 91 | { 92 | fSawErrors = true; 93 | std::cerr << "Fatal Error at file \"" << StrX(toCatch.getSystemId()) 94 | << "\", line " << toCatch.getLineNumber() 95 | << ", column " << toCatch.getColumnNumber() 96 | << "\n Message: " << StrX(toCatch.getMessage()) << std::endl; 97 | } 98 | 99 | void DOMTreeErrorReporter::resetErrors() 100 | { 101 | fSawErrors = false; 102 | } 103 | 104 | std::ostream& operator<<( std::ostream& target, const StrX& toDump) 105 | { 106 | target << toDump.localForm(); 107 | return target; 108 | } 109 | -------------------------------------------------------------------------------- /src/hunalign/Makefile: -------------------------------------------------------------------------------- 1 | sources = alignerTool.cpp alignment.cpp bloom.cpp bookToMatrix.cpp cooccurrence.cpp cooccurrenceTool.cpp dictionary.cpp main.cpp networkFlow.cpp oldAlignTest.cpp trailPostprocessors.cpp translate.cpp wordAlignment.cpp ../utils/stringsAndStreams.cpp ../utils/argumentsParser.cpp ../utils/timer.cpp 2 | 3 | headers = alignment.h bloom.h bookToMatrix.h cooccurrence.h dictionary.h dicTree.h help.h networkFlow.h quasiDiagonal.h similarityEvaluator.h TEIReader.h trailPostprocessors.h translate.h wordAlignment.h words.h 4 | 5 | objects = alignerTool.o alignment.o bloom.o bookToMatrix.o cooccurrence.o cooccurrenceTool.o dictionary.o main.o networkFlow.o oldAlignTest.o trailPostprocessors.o translate.o wordAlignment.o ../utils/stringsAndStreams.o ../utils/argumentsParser.o ../utils/timer.o 6 | 7 | SHELL = /bin/bash 8 | CXX = g++ 9 | CPPFLAGS = -O9 -ffast-math -funroll-loops -I ../include 10 | LIBS = -lstdc++ 11 | RM = rm -f 12 | 13 | hunalign: $(objects) 14 | $(CXX) $(CPPFLAGS) $(LIBS) -o hunalign $(objects) 15 | 16 | depend: 17 | makedepend -Y -s"# DO NOT DELETE THIS LINE -- hunaligndep" $(sources) 18 | 19 | clean: 20 | $(RM) hunalign $(objects) 21 | 22 | # DO NOT DELETE THIS LINE -- hunaligndep 23 | 24 | alignerTool.o: alignment.h quasiDiagonal.h words.h bookToMatrix.h translate.h 25 | alignerTool.o: dictionary.h cooccurrence.h trailPostprocessors.h help.h 26 | alignment.o: alignment.h quasiDiagonal.h words.h dictionary.h 27 | bloom.o: bloom.h words.h 28 | bookToMatrix.o: bookToMatrix.h words.h alignment.h quasiDiagonal.h 29 | bookToMatrix.o: dictionary.h 30 | cooccurrence.o: cooccurrence.h words.h networkFlow.h dictionary.h translate.h 31 | cooccurrenceTool.o: cooccurrence.h words.h networkFlow.h dictionary.h 32 | dictionary.o: dictionary.h words.h 33 | networkFlow.o: networkFlow.h 34 | oldAlignTest.o: dictionary.h 
35 | oldAlignTest.o: quasiDiagonal.h bookToMatrix.h dicTree.h
36 | trailPostprocessors.o: trailPostprocessors.h alignment.h quasiDiagonal.h
37 | trailPostprocessors.o: words.h bookToMatrix.h
38 | translate.o: translate.h words.h dictionary.h dicTree.h
39 | wordAlignment.o: wordAlignment.h words.h dictionary.h
40 | -------------------------------------------------------------------------------- /src/hunalign/TEIReader.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_TEIREADER_TEIREADER_H
12 | #define __HUNGLISH_TEIREADER_TEIREADER_H
13 |
14 | #include "words.h"
15 |
16 | #include <xercesc/dom/DOM.hpp>
17 |
18 | #include <iostream>
19 |
20 | namespace Hunglish
21 | {
22 |
23 | std::string toString( const XMLCh* wstr );
24 |
25 | std::ostream& operator<<( std::ostream& os, const XMLCh* wstr );
26 |
27 | void traverseDOMTree( const XERCES_CPP_NAMESPACE::DOMNode* doc, int depth );
28 |
29 | void trivialSerializeSubTree( const XERCES_CPP_NAMESPACE::DOMNode* node, std::ostream& os );
30 |
31 | const XERCES_CPP_NAMESPACE::DOMNode* findFirstSubTree( const XERCES_CPP_NAMESPACE::DOMNode* parent, const String& key );
32 |
33 | const XERCES_CPP_NAMESPACE::DOMNode* findNextSubTree( const XERCES_CPP_NAMESPACE::DOMNode* root, const XERCES_CPP_NAMESPACE::DOMNode* current, String& key );
34 |
35 | void buildWordFromDOMTree_Hu( const XERCES_CPP_NAMESPACE::DOMNode* parent, Word& word, bool lemma );
36 |
37 | void buildWordFromDOMTree_En( const XERCES_CPP_NAMESPACE::DOMNode* parent, Word& word, bool lemma );
38 |
39 | String getIdOfSentence( const XERCES_CPP_NAMESPACE::DOMNode* parent );
40 |
41 | void buildSentenceFromDOMTree_Hu( const XERCES_CPP_NAMESPACE::DOMNode* parent, Sentence& sentence );
42 |
43 | void buildSentenceFromDOMTree_En( const XERCES_CPP_NAMESPACE::DOMNode* parent, Sentence& sentence );
44 |
45 | void buildSentenceFromDOMTree( const XERCES_CPP_NAMESPACE::DOMNode* parent, Sentence& sentence, bool english );
46 |
47 | void buildSentenceListFromDOMTree( const XERCES_CPP_NAMESPACE::DOMNode* parent, SentenceList& seg, bool english );
48 |
49 | // If this interface were a class, parseTEI would be its only public method:
50 | int parseTEI( const char* xmlFile, bool english, SentenceList& sentenceList );
51 |
52 | // ...And this would be the test for the class:
53 | int main_TEIReader( int argC, char* argV[] );
54 |
55 | } // namespace Hunglish
56 |
57 | #endif // #define __HUNGLISH_TEIREADER_TEIREADER_H
58 | -------------------------------------------------------------------------------- /src/hunalign/alignment.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_ALIGNMENT_H
12 | #define __HUNGLISH_ALIGNMENT_ALIGNMENT_H
13 |
14 | #include "quasiDiagonal.h"
15 |
16 | #include <vector>
17 | #include <set>
18 |
19 | namespace Hunglish
20 | {
21 |
22 | // Simply double values for each sentence. Right now we store sentence lengths in them.
23 | typedef std::vector<double> SentenceValues;
24 |
25 | // See quasiDiagonal.h
26 | typedef QuasiDiagonal<double> AlignMatrix;
27 |
28 | // Contains directions, a bit like a force field.
29 | typedef QuasiDiagonal<unsigned char> TrelliMatrix;
30 |
31 | // A Rundle (x,y) cuts the bitext into two sub-bitexts:
32 | // [0,x)+[0,y) and [x,huSize)+[y,enSize).
33 | typedef std::pair<int,int> Rundle;
34 |
35 | // A Trail is a strictly ordered list of Rundles.
36 | // It cuts the bitext into small bitexts.
37 | // Such a small bitext is called a hole or segmentum.
38 | // A hole can contain zero Hungarian sentences,
39 | // it can contain zero English sentences, but not both.
40 | // A Trail is sometimes referred to as a Ladder.
41 | typedef std::vector<Rundle> Trail;
42 |
43 | // A BisentenceList is formally identical to a Trail, but semantically very different.
44 | // It represents an ordered list of bisentences.
45 | // There are some functions which utilize the formal identity,
46 | // manipulating both structures.
47 | typedef std::vector< std::pair<int,int> > BisentenceList;
48 |
49 | // OBSOLETE:
50 | // TrailValues gives scores to the Rundles of a Trail (of the same size).
51 | // Conceptually TrailValues should be attached to Trails.
52 | // A TrailValues structure always accompanies a Trails list,
53 | // but their consistency must be maintained by hand, pre-OO-style. (TODO)
54 | // typedef std::vector<double> TrailValues;
55 |
56 | // OBSOLETE:
57 | // Has exactly the same relation to BisentenceList as
58 | // a TrailValues has to a Trail. But note that these
59 | // scores mark the confidence in a bisentence. This is
60 | // very different from the confidence in a rundle.
61 | // typedef std::vector<double> BisentenceValues;
62 |
63 | double closeness( double twoSentenceLength, double oneSentenceLength );
64 |
65 | const double skipScore = -0.3;
66 |
67 |
68 | // The main align function.
69 | // Gets a confidence value for every sentence-pair,
70 | // and sentence lengths for each sentence (for a Gale-Church-like scoring).
71 | // Returns a trail with the best total score, and the computed dynMatrix matrix:
72 | // dynMatrix[huPos][enPos] gives the similarity of the [0,huPos) and [0,enPos) intervals.
73 | void align( const AlignMatrix& w, const SentenceValues& huLength, const SentenceValues& enLength,
74 | Trail& bestTrail, AlignMatrix& dynMatrix );
75 |
76 |
77 | bool oneToOne( const Trail& bestTrail, int pos );
78 |
79 | // Collect bisentences.
80 | void trailToBisentenceList( const Trail& bestTrail,
81 | BisentenceList& bisentenceList );
82 |
83 | // Score precision-recall of a BisentenceList according to a hand-aligned bicorpus.
84 | // For best results, zero-to-many holes of the hand-alignment should be subdivided to zero-to-ones.
85 | // Builds the manual bisentencelist. The compared sets consist of Bisentences.
86 | double scoreBisentenceList( const BisentenceList& bisentenceList, const Trail& trailHand );
87 |
88 | // The same precision-recall calculation for Trails. The compared sets consist of Rundles.
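// Illustrative example (annotation, not part of the original header):
// if trailHand = { (0,0), (1,1), (2,3), (4,5) } and
//    trailAuto = { (0,0), (1,1), (2,2), (4,5) },
// then three of the four auto rundles occur in the hand alignment, so
// scoreTrail reports precision = 3/4 and recall = 3/4.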
89 | double scoreTrail ( const Trail& trailAuto, const Trail& trailHand );
90 |
91 |
92 | const int outsideOfRadiusValue = -1000000;
93 | const int insideOfRadiusValue = 0;
94 |
95 | // Fills the complement of the radius of the trail with minus infinities.
96 | // The return value true means success. Failure means that during the fill,
97 | // we intersected the outside of the quasidiagonal area.
98 | // In this case, the operation is not finished.
99 | bool borderDetailedAlignMatrix( AlignMatrix& m, const Trail& trail, int radius );
100 |
101 | // What the name implies.
102 | void dumpAlignMatrix( const AlignMatrix& m, bool graphical );
103 |
104 | template <class T>
105 | void dumpAlignMatrix( const QuasiDiagonal<T>& alignMatrix );
106 |
107 | void dumpAlignMatrix( const QuasiDiagonal& alignMatrix, bool graphical );
108 |
109 | void dumpTrelliMatrix( const TrelliMatrix& trellis );
110 |
111 |
112 | } // namespace Hunglish
113 |
114 | #endif // #define __HUNGLISH_ALIGNMENT_ALIGNMENT_H
115 | -------------------------------------------------------------------------------- /src/hunalign/bloom.cpp: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #pragma warning ( disable : 4786 )
12 |
13 | #include "bloom.h"
14 |
15 | namespace Hunglish
16 | {
17 |
18 | int BloomFilter::hash( const Word& w )
19 | {
20 | unsigned int v=0;
21 |
22 | for ( Word::const_iterator it=w.begin(); it!=w.end(); ++it )
23 | {
24 | unsigned int top = v >> 24;
25 | // This was designed assuming that (unsigned int)(*it) is a real two-byte value.
26 | // The more correct solution would be to walk through it two bytes at a time.
27 | // But that is overkill: when xerces hashes unicode, it does exactly
28 | // the same thing as this code does.
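// Annotation (not original source): each step folds the top byte of the
// running state back in and mixes in the next character,
//   v_new = v_old + 37*v_old + (v_old >> 24) + (unsigned int)c,
// and hash() finally reduces v modulo bloomSize (512), so set() and test()
// always address one of the 512 bits of the filter.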
29 | v += (v * 37) + top + (unsigned int)(*it);
30 | }
31 |
32 | // Divide by modulus
33 | return v % bloomSize;
34 | }
35 |
36 | void BloomFilter::set( const Word& w )
37 | {
38 | ((std::bitset<bloomSize>*)this)->set( hash(w) % bloomSize );
39 |
40 | }
41 |
42 | bool BloomFilter::test ( const Word& w ) const
43 | {
44 | return ((std::bitset<bloomSize>*)this)->test( hash(w) % bloomSize );
45 | }
46 |
47 | int BloomFilter::count() const
48 | {
49 | return ((std::bitset<bloomSize>*)this)->count();
50 | }
51 |
52 | std::bitset<bloomSize>& BloomFilter::getBitset()
53 | {
54 | return * (std::bitset<bloomSize>*)this ;
55 | }
56 |
57 | const std::bitset<bloomSize>& BloomFilter::getBitset() const
58 | {
59 | return * (const std::bitset<bloomSize>*)this ;
60 | }
61 |
62 | int intersectionSize( const BloomFilter& b1, const BloomFilter& b2 )
63 | {
64 | int count(0);
65 | for ( int i=0; i<bloomSize; ++i )
66 | {
67 | // (The rest of this function and the head of bloom.h were lost in the
68 | // archived copy; the following lines are a reconstruction from the interface.)
69 | if ( b1.getBitset().test(i) && b2.getBitset().test(i) )
70 | ++count;
71 | }
72 | return count;
73 | }
74 |
75 | } // namespace Hunglish
76 | -------------------------------------------------------------------------------- /src/hunalign/bloom.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_BLOOM_H
12 | #define __HUNGLISH_ALIGNMENT_BLOOM_H
13 |
14 | #include <bitset>
15 |
16 | // TODO
17 | #include "words.h"
18 |
19 | namespace Hunglish
20 | {
21 |
22 | const int bloomSize=512;
23 |
24 | class BloomFilter : private std::bitset<bloomSize>
25 | {
26 | public:
27 | void set( const Word& w );
28 | bool test ( const Word& w ) const;
29 | int count() const;
30 |
31 | public:
32 | std::bitset<bloomSize>& getBitset();
33 | const std::bitset<bloomSize>& getBitset() const;
34 |
35 | public:
36 | friend int intersectionSize( const BloomFilter& b1, const BloomFilter& b2 );
37 |
38 | public:
39 | static int hash( const Word& w );
40 | };
41 |
42 | int intersectionSize( const BloomFilter& b1, const BloomFilter& b2 );
43 |
44 | typedef std::vector<BloomFilter> BloomBook;
45 |
46 | } // namespace Hunglish
47 |
48 | #endif // #define __HUNGLISH_ALIGNMENT_BLOOM_H
49 | -------------------------------------------------------------------------------- /src/hunalign/bookToMatrix.cpp: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/src/hunalign/bookToMatrix.cpp
-------------------------------------------------------------------------------- /src/hunalign/bookToMatrix.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_BOOKTOMATRIX_H
12 | #define __HUNGLISH_ALIGNMENT_BOOKTOMATRIX_H
13 |
14 | #include "words.h"
15 | #include "alignment.h"
16 |
17 | namespace Hunglish
18 | {
19 |
20 | const double scoreOfParagraphMatch = 0.31;
21 |
22 | const double scoreOfParagraphMisMatch = -1.0;
23 |
24 | bool isParagraph( const Phrase& phrase );
25 |
26 | // (!!!) We assert that sx and sy are ordered sets of Word-s!
27 | int intersectionSize( const WordList& sx, const WordList& sy );
28 |
29 | void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix );
30 |
31 | class TransLex;
32 |
33 | double scoreByIdentity( const Phrase& hu, const Phrase& en );
34 |
35 | double scoreByTranslation( const Phrase& hu, const Phrase& en, const TransLex& transLex );
36 |
37 | // This is much-much slower, but instead of identity, uses a many-to-many dictionary.
38 | // For performance reasons, by convention it does not calculate the similarity if the
39 | // alignMatrix element contains outsideOfRadiusValue, a big negative number.
40 | void sentenceListsToAlignMatrixTranslation(
41 | const SentenceList& huSentenceListPretty, const SentenceList& enSentenceList,
42 | const TransLex& transLex,
43 | AlignMatrix& alignMatrixDetailed );
44 |
45 | class IBMModelOne;
46 |
47 | void sentenceListsToAlignMatrixIBMModelOne(
48 | const SentenceList& huSentenceList, const SentenceList& enSentenceList,
49 | const IBMModelOne& modelOne,
50 | AlignMatrix& alignMatrix );
51 |
52 | int characterLength( const Word& words, bool utfCharCountingMode );
53 |
54 | double characterLength( const Phrase& words, bool utfCharCountingMode );
55 |
56 | double characterLength( int start, int end, const SentenceList& sentenceList, bool utfCharCountingMode );
57 |
58 | void setSentenceValues( const SentenceList& sentences, SentenceValues& lengths, bool utfCharCountingMode );
59 |
60 | } // namespace Hunglish
61 |
62 | #endif // #define __HUNGLISH_ALIGNMENT_BOOKTOMATRIX_H
63 | -------------------------------------------------------------------------------- /src/hunalign/cooccurrence.cpp: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/src/hunalign/cooccurrence.cpp
-------------------------------------------------------------------------------- /src/hunalign/cooccurrence.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_COOCCURRENCE_H
12 | #define __HUNGLISH_ALIGNMENT_COOCCURRENCE_H
13 |
14 | #include "words.h"
15 |
16 | namespace Hunglish
17 | {
18 |
19 | void cooccurenceAnalysis( SentenceList& huSentenceList, SentenceList& enSentenceList,
20 | double minScore, int minCoocc );
21 |
22 | void flowBuilderXml( const SentenceList& huSentenceList, const SentenceList& enSentenceList,
23 | std::ostream& flowStream );
24 |
25 | void lexiconByEdmondsKarp( const SentenceList& huSentenceListC, const SentenceList& enSentenceListC );
26 |
27 |
28 | typedef std::pair<Word,Word> BiWord;
29 | typedef std::vector<BiWord> BiWords;
30 |
31 | // This should be done after removeStopwords, simply because of bilanguage words like
32 | // "a","is","be". We absolutely don't care about rare bilanguage words like "petty".
33 | void removeIdenticals( SentenceList& huSentenceList, SentenceList& enSentenceList,
34 | BiWords& idTranslations );
35 |
36 | void removeHapaxes( SentenceList& huSentenceList, SentenceList& enSentenceList,
37 | BiWords& hapaxTranslations );
38 |
39 | class DictionaryItems;
40 |
41 | void filterBicorpusByLexicon
42 | ( SentenceList& huSentenceList, SentenceList& enSentenceList,
43 | const DictionaryItems& dictionaryItems );
44 |
45 | // Adds plausible items to the dictionary it receives as input.
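// Rough sketch of the realign loop that calls this (annotation, not
// original code; variable names are hypothetical):
//   DictionaryItems dictionary;          // seed dictionary, possibly empty
//   // ...first alignment pass, collect the identified bisentences...
//   autoDictionaryForRealign( huSentences, enSentences, dictionary,
//                             minScore, minCoocc );
//   // ...second alignment pass with the enlarged dictionary...
// This is the mechanism behind the aligner tool's -realign switch.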
46 | void autoDictionaryForRealign( SentenceList& huSentenceList, SentenceList& enSentenceList,
47 | DictionaryItems& dictionary,
48 | double minScore, int minCoocc );
49 |
50 | // Removes dictionary items for which it doesn't find cooccurrences in the bicorpus.
51 | // Typically, the bicorpus is built from a primary alignment.
52 | void filterDictionaryForRealign( SentenceList& huSentenceList, SentenceList& enSentenceList,
53 | DictionaryItems& dictionary );
54 |
55 | } // namespace Hunglish
56 |
57 | #endif // #define __HUNGLISH_ALIGNMENT_COOCCURRENCE_H
58 | -------------------------------------------------------------------------------- /src/hunalign/dicTree.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_TEIREADER_DICTIONARIES_H
12 | #define __HUNGLISH_TEIREADER_DICTIONARIES_H
13 |
14 | #include <map>
15 | #include <set>
16 | #include <vector>
17 | #include <iostream>
18 |
19 | namespace Hunglish
20 | {
21 |
22 | // A simple tree class.
23 | //
24 | template <class Atom, class Identifier>
25 | class DicTree
26 | {
27 | public:
28 | // Gets value a bit below. Ugly C++.
29 | static const bool WarnOnConflict;
30 |
31 | DicTree() : id(0) {}
32 | DicTree( const Identifier& id_ ) : id(id_) {}
33 |
34 | ~DicTree();
35 |
36 | const Identifier& getIdentifier() const { return id; }
37 | void setIdentifier( const Identifier& id_) { id=id_; }
38 | DicTree* lookup( const Atom& word ) const;
39 | DicTree& add( const Atom& word, const Identifier& id );
40 | bool empty() const { return children.empty(); }
41 |
42 | void dump( std::ostream& os ) const;
43 |
44 | private:
45 | typedef std::map< Atom, DicTree* > DicTreeMap;
46 | DicTreeMap children;
47 | Identifier id;
48 | };
49 |
50 | template <class Atom, class Identifier>
51 | const bool DicTree<Atom,Identifier>::WarnOnConflict = false;
52 |
53 | // This structure stores a very sparse set-system of words.
54 | // (A dictionary of complex expressions.)
55 | //
56 | // It supports the following query:
57 | // It receives a set of words S. It gives back the sets
58 | // of the set system that are contained in this set S.
59 | //
60 | // For it to be effective, we must be careful during the building phase:
61 | // words in vector 'words' must be ordered by INCREASING frequency. Rare words first.
62 |
63 | template <class Atom, class Identifier>
64 | class SubsetLookup
65 | {
66 | public:
67 |
68 | typedef std::vector<Atom> Atoms;
69 |
70 | void add( const Atoms& words, const Identifier& id );
71 |
72 | void lookup( const Atoms& words, std::set<Identifier>& results ) const;
73 |
74 | void dump( std::ostream& os ) const;
75 |
76 | private:
77 | DicTree<Atom,Identifier> tree;
78 | };
79 |
80 | // Implementation. F.ck C++ for having to put this in a header.
81 |
82 | template <class Atom, class Identifier>
83 | DicTree<Atom,Identifier>::~DicTree()
84 | {
85 | for ( typename DicTreeMap::iterator it=children.begin(); it!=children.end(); ++it )
86 | {
87 | delete it->second;
88 | }
89 | }
90 |
91 | // Never overwrites a nonzero id with zero.
92 | // If it overwrites a nonzero id with another nonzero one, it warns first.
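// Example of these semantics (annotation, not original code):
//   tree.add( atom, 0 );   // creates the node, id stays 0
//   tree.add( atom, 7 );   // zero -> nonzero: silently fills in the id
//   tree.add( atom, 9 );   // nonzero -> nonzero: warns (if enabled), then overwrites
// so after the three calls the node carries id 9.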
93 | template <class Atom, class Identifier>
94 | DicTree<Atom,Identifier>& DicTree<Atom,Identifier>::add( const Atom& word, const Identifier& id )
95 | {
96 | DicTree* v = lookup(word);
97 | if (!v)
98 | {
99 | v = new DicTree();
100 | v->id = id;
101 | children[word] = v;
102 | }
103 | else
104 | {
105 | if ( ( v->id != 0 ) && ( id != 0 ) )
106 | {
107 | if (WarnOnConflict)
108 | std::cerr << "warning: conflict in tree" << std::endl;
109 | }
110 | if ( id != 0 )
111 | {
112 | v->id = id;
113 | }
114 | }
115 |
116 | return (*v);
117 | }
118 |
119 | template <class Atom, class Identifier>
120 | DicTree<Atom,Identifier>* DicTree<Atom,Identifier>::lookup( const Atom& word ) const
121 | {
122 | typename DicTreeMap::const_iterator ft = children.find(word);
123 |
124 | if (ft==children.end())
125 | {
126 | return 0;
127 | }
128 | else
129 | {
130 | return ft->second;
131 | }
132 | }
133 |
134 | template <class Atom, class Identifier>
135 | void DicTree<Atom,Identifier>::dump( std::ostream& os ) const
136 | {
137 | if (id!=0)
138 | {
139 | os << id << " ";
140 | }
141 | os << "{" << std::endl;
142 | for ( typename DicTreeMap::const_iterator it=children.begin(); it!=children.end(); ++it )
143 | {
144 | os << it->first << " ";
145 | it->second->dump(os);
146 | }
147 | os << "}" << std::endl;
148 | }
149 |
150 | template <class Atom, class Identifier>
151 | void SubsetLookup<Atom,Identifier>::add( const Atoms& words, const Identifier& id )
152 | {
153 | DicTree<Atom,Identifier>* v = &tree;
154 |
155 | for ( typename Atoms::const_iterator it=words.begin(); it!=words.end(); ++it )
156 | {
157 | DicTree<Atom,Identifier>& newv = v->add(*it,0);
158 | v = &newv;
159 | }
160 | if ( v->getIdentifier() == 0 )
161 | {
162 | v->setIdentifier(id);
163 | }
164 | else
165 | {
166 | if (DicTree<Atom,Identifier>::WarnOnConflict)
167 | std::cerr << "warning: conflict in tree" << std::endl;
168 | }
169 | }
170 |
171 | template <class Atom, class Identifier>
172 | void SubsetLookup<Atom,Identifier>::lookup( const Atoms& words, std::set<Identifier>& results ) const
173 | {
174 | typedef std::set< const DicTree<Atom,Identifier>* > Pebbles;
175 | Pebbles pebbles;
176 | pebbles.insert(&tree);
177 |
178 | results.clear();
179 |
180 | for ( typename Atoms::const_iterator it=words.begin(); it!=words.end(); ++it )
181 | {
182 | const Atom& word = *it;
183 |
184 | for ( typename Pebbles::const_iterator jt=pebbles.begin(); jt!=pebbles.end(); ++jt )
185 | {
186 | const DicTree<Atom,Identifier>* subTree = (*jt)->lookup(word) ;
187 |
188 | if (!subTree)
189 | continue;
190 |
191 | const Identifier& id = subTree->getIdentifier();
192 | if (id!=0)
193 | {
194 | results.insert(id);
195 | }
196 |
197 | if (!subTree->empty())
198 | {
199 | pebbles.insert(subTree);
200 | }
201 | }
202 | }
203 | }
204 |
205 | template <class Atom, class Identifier>
206 | void SubsetLookup<Atom,Identifier>::dump( std::ostream& os ) const
207 | {
208 | tree.dump(os);
209 | }
210 |
211 | } // namespace Hunglish
212 |
213 |
214 | #endif // #define __HUNGLISH_TEIREADER_DICTIONARIES_H
215 | -------------------------------------------------------------------------------- /src/hunalign/dictionary.cpp: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/src/hunalign/dictionary.cpp
-------------------------------------------------------------------------------- /src/hunalign/dictionary.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_DICTIONARY_H
12 | #define __HUNGLISH_ALIGNMENT_DICTIONARY_H
13 |
14 | #include "words.h"
15 |
16 | #include <map>
17 | #include <set>
18 | #include <vector>
19 | #include <iostream>
20 |
21 |
22 | namespace Hunglish
23 | {
24 |
25 | typedef std::pair<Phrase,Phrase> DictionaryItem;
26 |
27 | class DictionaryItems : public std::vector<DictionaryItem>
28 | {
29 | public:
30 | void read( std::istream& is );
31 | };
32 |
33 | class HalfDictionary : public std::vector<Phrase>
34 | {
35 | public:
36 | void read( std::istream& is );
37 | };
38 |
39 |
40 | // After reading, this dictionary cannot be altered.
41 | // Also, this is a strictly one-directional dictionary.
42 | // If the other direction is needed, build another dictionary with reverse( const Dictionary& dic ).
43 | class Dictionary
44 | {
45 | public:
46 | void read( const char* dictionaryFile );
47 | void reverse( const Dictionary& dic );
48 | void build( const DictionaryItems& dictionaryItems );
49 |
50 | bool lookupWord( const Word& word, DictionaryItems& results ) const;
51 | bool lookupWordSet( const WordList& words, DictionaryItems& results ) const;
52 |
53 | private:
54 | void buildWordLookupTable();
55 |
56 | private:
57 | DictionaryItems dictionaryItems;
58 |
59 | typedef std::map wordLookupTable;
60 | };
61 |
62 | class FrequencyMap : public std::map<Word,int>
63 | {
64 | public:
65 | void add( const Word& word );
66 | void remove( const Word& word );
67 | void build( const WordList& wordList );
68 | void remove( const WordList& wordList );
69 | void build( const SentenceList& sentenceList ); // Just for convenience.
70 | int total() const;
71 | void dump( std::ostream& os, int itemNum ) const;
72 | void lowPassFilter( WordList& allowedWords, double ratio ) const;
73 | void highPassFilter( WordList& allowedWords, double ratio ) const;
74 |
75 | private:
76 | typedef std::multimap<int,Word> ReFrequencyMap;
77 | void reverseMap( ReFrequencyMap& reFrequencyMap ) const;
78 | };
79 |
80 |
81 | void filterSentences( SentenceList& sentenceList, const WordList& words );
82 |
83 | void removeHungarianStopwords( SentenceList& huSentenceList );
84 | void removeEnglishStopwords ( SentenceList& enSentenceList );
85 | void removeStopwords ( SentenceList& huSentenceList, SentenceList& enSentenceList );
86 |
87 |
88 | typedef std::pair<Word,Word> WordPair;
89 |
90 | class TransLex
91 | {
92 | public:
93 |
94 | typedef std::multimap<Word,Word> WordMultimap;
95 | typedef WordMultimap::const_iterator WordMultimapIt;
96 | typedef std::pair<WordMultimapIt,WordMultimapIt> DictInterval;
97 |
98 | void add( const Word& huWord, const Word& enWord );
99 | void build( const DictionaryItems& dictionaryItems );
100 |
101 | DictInterval lookupLeftWord ( const Word& huWord ) const;
102 | DictInterval lookupRightWord( const Word& enWord ) const;
103 | bool isPresent( const Word& huWord, const Word& enWord ) const;
104 |
105 | private:
106 | WordMultimap forward;
107 | WordMultimap backward;
108 | };
109 |
110 | class IBMModelOne
111 | {
112 | public:
113 | double lookup( const Word& hu, const Word& en ) const;
114 |
115 | double distance( const Phrase& hu, const Phrase& en ) const;
116 |
117 | void build( const SentenceList& huSentenceList, const SentenceList& enSentenceList );
118 |
119 | void reestimate( const SentenceList& huSentenceList, const SentenceList& enSentenceList );
120 |
121 | public:
122 | typedef std::pair<Word,Word> WordPair;
123 | typedef std::map<WordPair,double> TransProbs;
124 |
125 | TransProbs transProbs;
126 | };
127 |
128 | } // namespace Hunglish
129 |
130 | #endif // #define __HUNGLISH_ALIGNMENT_DICTIONARY_H
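// Usage sketch (annotation, not part of the original header; the variable
// names are hypothetical, hu-en.dic is the dictionary shipped in data/):
//   Hunglish::DictionaryItems items;
//   std::ifstream dictFile( "data/hu-en.dic" );
//   items.read( dictFile );
//   Hunglish::TransLex transLex;
//   transLex.build( items );
//   bool known = transLex.isPresent( huWord, enWord );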
131 | -------------------------------------------------------------------------------- /src/hunalign/help.h: --------------------------------------------------------------------------------
1 | #include <string>
2 |
3 | std::string helpString = "Usage (either):\n\
4 | alignerTool [ common_arguments ] [ -hand=hand_align_file ] dictionary_file source_text target_text\n\
5 | \n\
6 | or:\n\
7 | alignerTool [ common_arguments ] -batch dictionary_file batch_file\n\
8 | \n\
9 | where\n\
10 | common_arguments ::= [ -text ] [ -bisent ] [ -utf ] [ -cautious ] [ -realign [ -autodict=filename ] ]\n\
11 | [ -thresh=n ] [ -ppthresh=n ] [ -headerthresh=n ] [ -topothresh=n ]\n\
12 | \n\
13 | Arguments:\n\
14 | \n\
15 | -text\n\
16 | The output should be in text format, rather than the default (numeric) ladder format.\n\
17 | \n\
18 | -bisent\n\
19 | Only bisentences (one-to-one alignment segments) are printed. In non-text mode, their\n\
20 | starting rung is printed.\n\
21 | \n\
22 | -cautious\n\
23 | In -bisent mode, only bisentences for which both the preceding and the following\n\
24 | segments are one-to-one are printed. In the default non-bisent mode, only rungs\n\
25 | for which both the preceding and the following segments are one-to-one are printed.\n\
26 | \n\
27 | -hand=file\n\
28 | When this argument is given, the precision and recall of the alignment is calculated\n\
29 | based on the manually built ladder file. Information like the following is written\n\
30 | on the standard error: \n\
31 | 53 misaligned out of 6446 correct items, 6035 bets.\n\
32 | Precision: 0.991218, Recall: 0.928017\n\
33 | \n\
34 | Note that by default, 'item' means rung. The switch -bisent also changes the semantics\n\
35 | of the scoring from rung-based to bisentence-based and in this case 'item' means bisentences.\n\
36 | See File formats about the format of this input align file.\n\
37 | \n\
38 | -realign\n\
39 | If this option is set, the alignment is built in three phases.\n\
40 | After an initial alignment, the algorithm heuristically adds items\n\
41 | to the dictionary based on cooccurrences in the identified bisentences.\n\
42 | Then it re-runs the alignment process based on this larger dictionary.\n\
43 | This option is recommended to achieve the highest possible alignment quality.\n\
44 | It is not set by default because it approximately triples the running time\n\
45 | while the quality improvements it yields are typically small.\n\
46 | \n\
47 | -autodict=filename\n\
48 | The dictionary built during realign is saved to this file. By default, it is not saved.\n\
49 | \n\
50 | \n\
51 | -onebyteencoding\n\
52 | The system uses the character counts of the sentences as information for the\
53 | pairing of sentences. By default, it assumes UTF-8 encoding.\
54 | With this switch, it treats byte count as character count.\n\
55 | This should be used for ISO encodings.\n\
56 | -utf\n\
57 | This switch is obsolete, UTF-8 is the default input encoding in later versions.\n\
58 | Note: UTF-16 input is not supported.\n\
59 | \n\
60 | Postfiltering options:\n\
61 | There are various postprocessors which remove implausible rungs based on various heuristics.\n\
62 | \n\
63 | -thresh=n\n\
64 | Don't print out segments with score lower than n/100.\n\
65 | \n\
66 | -ppthresh=n\n\
67 | Filter rungs with less than n/100 average score in their vicinity.\n\
68 | \n\
69 | -headerthresh=n\n\
70 | Filter all rungs at the start and end of texts until finding a reliably\n\
71 | plausible region.\n\
72 | \n\
73 | -topothresh=n\n\
74 | Filter rungs with less than n percent of one-to-one segments in their vicinity.\n\
75 | \n\
76 | ";
77 | -------------------------------------------------------------------------------- /src/hunalign/main.cpp: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #pragma warning ( disable : 4786 )
12 |
13 |
14 | #include <string>
15 |
16 |
17 | namespace Hunglish
18 | {
19 |
20 | #ifdef WIN32
21 | const std::string globalHome = "/";
22 | #else
23 | const std::string globalHome = "/home/daniel/";
24 | #endif
25 |
26 | std::string hunglishHome = globalHome + "hunglish/";
27 | std::string hunglishExperimentsHome = hunglishHome + "data/experiments/";
28 | std::string hunglishDictionaryHome = hunglishHome + "data/szotar/";
29 |
30 | } // namespace Hunglish
31 |
32 |
33 | namespace Hunglish
34 | {
35 | ///////////////////////////////////
36 | // Entry points of important tools:
37 |
38 | // Implemented in alignerTool.cpp
39 | int main_alignerTool(int argC, char* argV[]);
40 |
41 | // Implemented in cooccurrenceTool.cpp
42 | int main_cooccurrenceTool(int argC, char* argV[]);
43 |
44 | // Implemented in cooccurrenceTool.cpp
45 | int main_bicorpusProcessor(int argC, char* argV[]);
46 |
47 | ///////////////////////////////////
48 | // Just Tests:
49 |
50 | // We don't want to include DOM just for this function.
51 | // On the other hand, we don't want to create a header file just for this function. :)
52 | // Implemented in TEIReader.cpp
53 | int main_TEIReader( int argC, char* argV[] );
54 |
55 | // Implemented in networkFlow.cpp
56 | void main_edmondsKarpTest();
57 |
58 | // Implemented in oldAlignTest.cpp
59 | void main_alignTest();
60 |
61 | // Implemented in oldAlignTest.cpp
62 | void main_scoreByHandAlign();
63 |
64 | // Implemented in oldAlignTest.cpp
65 | void main_SmallSubsetLookupTest();
66 |
67 | // Implemented in oldAlignTest.cpp
68 | void main_HunHalfTest();
69 |
70 | // Implemented in oldAlignTest.cpp
71 | void main_translationTest();
72 |
73 | // Implemented in wordAlignment.cpp
74 | void main_wordAlignmentTest();
75 |
76 | // Implemented in bookToMatrix.cpp
77 | void main_similarityEvaluatorTool(int argC, char* argV[]);
78 |
79 | } // namespace Hunglish
80 |
81 | #include <iostream>
82 | #include <timer.h>
83 |
84 | void rectangleCacheTest()
85 | {
86 | int xmax = 5000;
87 |
88 | const int ymaxmax=10000;
89 | const int step=100;
90 |
91 | char* a = new char[xmax*ymaxmax];
92 |
93 | {
94 | for ( int ymax=step; ymax<=ymaxmax; ymax+=step )
95 | {
96 | Hunglish::Ticker ticker;
97 | for ( int i=0; i<xmax*ymax; ++i )
// (The remainder of main.cpp -- the rest of this cache test and the program
// entry point -- was lost in the archived copy; the loop bound above and the
// two include targets at lines 81-82 are likewise reconstructions.)
-------------------------------------------------------------------------------- /src/hunalign/networkFlow.h: --------------------------------------------------------------------------------
// (The head of networkFlow.h was lost in the archived copy; the banner,
// the header guard, and the include targets below are reconstructed.)
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_NETWORKFLOW_H
12 | #define __HUNGLISH_ALIGNMENT_NETWORKFLOW_H
13 | #include <map>
14 | #include <set>
15 | #include <vector>
16 | #include <utility>
17 | #include <iostream>
18 |
19 | namespace Hunglish
20 | {
21 |
22 | // It cannot represent graphs with isolated vertices.
23 | // But you don't really need them, do you?
24 | // I could add an addNode class, and throw an exception
25 | // when forwardNeighbours-ing a non-node.
26 | class DiGraph
27 | {
28 | public:
29 | void addEdge( int a, int b );
30 | bool isEdge ( int a, int b ) const;
31 | void clear();
32 |
33 | public:
34 | typedef std::set<int> Nodes;
35 |
36 | const Nodes& forwardNeighbours ( int a ) const;
37 | const Nodes& backwardNeighbours( int a ) const;
38 |
39 | private:
40 | typedef std::map< int, Nodes > ToNodes;
41 |
42 | ToNodes forward;
43 | ToNodes backward;
44 | };
45 |
46 |
47 | class NetworkWithFlow : public DiGraph
48 | {
49 | public:
50 | typedef std::pair<int,int> Edge;
51 | typedef std::map<Edge,double> Valuation;
52 |
53 | public:
54 | void addEdge( int a, int b, double v );
55 | void edmondsKarp( int s, int t );
56 | const Valuation& getFlow() const
57 | {
58 | return flow;
59 | }
60 | const Valuation& getCapacity() const
61 | {
62 | return capacity;
63 | }
64 | void dumpFlow( std::ostream& os, int s ) const;
65 |
66 | private:
67 |
68 | double dfs( int s, int t, std::vector<int>& path, bool justWithForwards );
69 | void augment( const std::vector<int>& path, const double& excess );
70 | double evaluateAugmentation( const std::vector<int>& path );
71 | DiGraph::Nodes::const_iterator nextFwd
72 | ( int x, DiGraph::Nodes::const_iterator it, DiGraph::Nodes::const_iterator end, double& excess );
73 | DiGraph::Nodes::const_iterator nextBwd
74 | ( int x, DiGraph::Nodes::const_iterator it, DiGraph::Nodes::const_iterator end, double& excess );
75 |
76 | private:
77 | Valuation capacity;
78 | Valuation flow;
79 | };
80 |
81 | } // namespace Hunglish
82 |
83 | #endif // #define __HUNGLISH_ALIGNMENT_NETWORKFLOW_H
84 | -------------------------------------------------------------------------------- /src/hunalign/oldAlignTest.cpp: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/src/hunalign/oldAlignTest.cpp
-------------------------------------------------------------------------------- /src/hunalign/quasiDiagonal.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_QUASIDIAGONAL_H
12 | #define __HUNGLISH_ALIGNMENT_QUASIDIAGONAL_H
13 |
14 | #include <vector>
15 |
16 | namespace Hunglish
17 | {
18 |
19 | template <class T>
20 | class QuasiDiagonal
21 | {
22 | public:
23 |
24 | // Quite slow, because of the many bounds checks.
25 | class QuasiDiagonalRow
26 | {
27 | public:
28 |
29 | // QuasiDiagonalRow is similar to a vector of size size_. The difference is
30 | // that only the [offset_,offset_+thickness) subinterval can be written.
31 | // Reading from outside this interval yields the default T().
32 | // Reading from outside the [0,size) interval yields a throw.
33 | // It is NOT asserted that [offset_,offset_+thickness)
34 | // should be a subset of [0,size).
35 | //
36 | QuasiDiagonalRow( int size_=0, int offset_=0, int thickness=0, T outsideDefault_=T() )
37 | : offset(offset_), size(size_), data(thickness,T()), outsideDefault(outsideDefault_) {}
38 |
39 | //x T operator[](int k) const
40 | //x {
41 | //x if ( ! ((k>=0) && (k<size)) )
42 | //x {
43 | //x throw "out of quasidiagonal";
44 | //x }
45 | //x int d = k-offset;
46 | //x if ( (d>=0) && (d<(int)data.size()) )
47 | //x {
48 | //x return data[k-offset];
49 | //x }
50 | //x else
51 | //x {
52 | //x return outsideDefault;
53 | //x }
54 | //x }
55 |
56 | enum ZoneType
57 | {
58 | DiagZone = 1,
59 | MatrixZone = 2,
60 | OutsideZone = 3
61 | };
62 |
63 | ZoneType zone(int k) const
64 | {
65 | if ( ! ((k>=0) && (k<size)) )
66 | {
67 | return OutsideZone;
68 | }
69 | int d = k-offset;
70 | if ( (d>=0) && (d<(int)data.size()) )
71 | {
72 | return DiagZone;
73 | }
74 | else
75 | {
76 | return MatrixZone;
77 | }
78 | }
79 |
80 | const T& operator[](int k) const
81 | {
82 | if ( ! ((k>=0) && (k<size)) )
83 | {
84 | throw "out of quasidiagonal";
85 | }
86 | int d = k-offset;
87 | if ( (d>=0) && (d<(int)data.size()) )
88 | {
89 | return data[k-offset];
90 | }
91 | else
92 | {
93 | return outsideDefault;
94 | }
95 | }
96 |
97 | T& cell(int k)
98 | {
99 | if ( ! ((k>=0) && (k<size)) )
100 | {
101 | throw "out of quasidiagonal";
102 | }
103 | int d = k-offset;
104 | if ( (d>=0) && (d<(int)data.size()) )
105 | {
106 | return data[k-offset];
107 | }
108 | else
109 | {
110 | throw "out of quasidiagonal";
111 | }
112 | }
113 |
114 | private:
115 | std::vector<T> data;
116 | int offset;
117 | int size;
118 | T outsideDefault;
119 | };
120 |
121 | QuasiDiagonal( int height_, int width_, int thickness_, T outsideDefault_=T() )
122 | : height(height_), width(width_), thicknes(thickness_)
123 | {
124 | for ( int i=0; i<height_; ++i )
125 | {
126 | rows.push_back( QuasiDiagonalRow( width_, offset(i), thickness_, outsideDefault_ ) );
127 | }
128 | }
129 |
130 | // (Lines 124-139 of the original were lost in the archived copy; the loop
131 | // above and the two members below are reconstructions. Only the closing
132 | // return statement at line 140 survived. The computation of offset(),
133 | // which skews the band along the quasidiagonal, could not be recovered.)
134 |
135 | // Leftmost column of the writable band in the given row.
136 | int offset( int row ) const;
137 |
138 | int rowStart( int row ) const
139 | {
140 | int s = offset(row);
141 | return ( s>0 ? s : 0 );
142 | }
143 |
144 | int rowEnd( int row ) const
145 | {
146 | int e=offset(row)+thicknes;
147 | return ( e<width ? e : width );
148 | }
149 |
150 | // (Reconstructed; lost in the archived copy.) Read access to whole rows:
151 | // m[y][x] is legal anywhere in the matrix.
152 | const QuasiDiagonalRow& operator[]( int y ) const
153 | {
154 | return rows[y];
155 | }
156 |
157 | T& cell( int y, int x )
158 | {
159 | if ((y<0)||(y>=height))
160 | {
161 | throw "out of matrix";
162 | }
163 |
164 | return rows[y].cell(x);
165 | }
166 |
167 | bool setCell( int y, int x, const T& t )
168 | {
169 | cell(y,x) = t;
170 | return true;
171 | }
172 |
173 | int size() const { return height; }
174 | // Yes, I know it's a stupid name. The reason is, I don't want to
175 | // put width/height on the interface, because usually
176 | // the first coord is the columns, but not here.
177 | // This could lead to confusion.
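// Illustrative annotation (not original source): for a
// QuasiDiagonal<int> m( 4 /*height*/, 6 /*width*/, 3 /*thickness*/ ),
// m.size() is 4 and m.otherSize() is 6; in each row y only the 3 cells of
// [ offset(y), offset(y)+3 ) are writable through cell(y,x), while m[y][x]
// outside the band just reads back the outsideDefault value.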
178 | int otherSize() const { return width; }
179 |
180 | int thickness() const { return thicknes; }
181 |
182 | private:
183 | std::vector<QuasiDiagonalRow> rows;
184 | int height,width,thicknes;
185 | };
186 |
187 | } // namespace Hunglish
188 |
189 | #endif // #define __HUNGLISH_ALIGNMENT_QUASIDIAGONAL_H
190 | -------------------------------------------------------------------------------- /src/hunalign/similarityEvaluator.cpp: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #pragma warning ( disable : 4786 )
12 |
13 | #include "bookToMatrix.h"
14 | #include "translate.h"
15 |
16 | #include
17 | #include
18 |
19 | #include
20 | #include
21 | #include
22 |
23 | namespace Hunglish
24 | {
25 |
26 |
27 | void bisentenceListToBicorpus(
28 | const SentenceList& huSentenceListC, const SentenceList& enSentenceListC,
29 | const BisentenceList& bisentenceList,
30 | SentenceList& huBisentenceHalves, SentenceList& enBisentenceHalves
31 | )
32 | {
33 | huBisentenceHalves.clear();
34 | enBisentenceHalves.clear();
35 |
36 | for ( int i=0; ifirst << "\t" << it->second << std::endl;
// (Lines 36-120 of the original -- the body of bisentenceListToBicorpus, the
// scorer classes, and most of averageSimilarity -- were lost in the archived
// copy; the fused fragment above and the bare includes at lines 16-21 are
// what survived.)
121 | }
122 | }
123 |
124 | return ( sum / huSentenceList.size() );
125 | }
126 |
127 |
128 | void similarityEvaluator( const DictionaryItems& dictionary,
129 | const SentenceList& huSentenceListPretty, const SentenceList& enSentenceListPretty )
130 | {
131 | SentenceList huSentenceList, enSentenceList;
132 |
133 | normalizeTextsForIdentity( dictionary, huSentenceListPretty, enSentenceListPretty, huSentenceList, enSentenceList );
134 |
135 | for ( int i=0; i<5; ++i )
136 | {
137 | std::cout << huSentenceList[i].words << " --- " << enSentenceList[i].words << std::endl;
138 | }
139 |
140 | DiscreteDoubleMap distribution;
141 |
142 | IdentityScorer identityScorer;
143 | GaleScorer galeScorer;
144 |
145 | CombinatorScorer similarityScorer( identityScorer, galeScorer, 1.0 );
146 |
147 | double realSimilarity = averageSimilarity( huSentenceList, enSentenceList, similarityScorer, distribution );
148 |
149 | std::cerr << "Real similarity " << realSimilarity << std::endl;
150 |
151 | SentenceList huSentenceListWarped(huSentenceList);
152 | SentenceList enSentenceListWarped(enSentenceList);
153 |
154 | huSentenceListWarped.insert( huSentenceListWarped.begin(), huSentenceListWarped.back() );
155 | huSentenceListWarped.resize( huSentenceListWarped.size()-1 );
156 | double warpedSimilarity1 = averageSimilarity( huSentenceListWarped, enSentenceListWarped, similarityScorer, distribution );
157 |
158 | std::cerr << "Placebo similarity #1 " << warpedSimilarity1 << std::endl;
159 |
160 | //x huSentenceListWarped.insert( huSentenceListWarped.begin(), huSentenceListWarped.back() );
161 | //x huSentenceListWarped.resize( huSentenceListWarped.size()-1 );
162 | //x double warpedSimilarity2 = averageSimilarity( huSentenceListWarped, enSentenceListWarped, similarityScorer, distribution );
163 | //x
164 | //x std::cerr << "Placebo similarity #2 " << warpedSimilarity2 << std::endl;
165 | //x
166 | //x huSentenceListWarped.insert( huSentenceListWarped.begin(), huSentenceListWarped.back() );
167 | //x huSentenceListWarped.resize( huSentenceListWarped.size()-1 );
168 | //x double warpedSimilarity3 = averageSimilarity( huSentenceListWarped, enSentenceListWarped, similarityScorer, distribution );
169 | //x
170 | //x std::cerr << "Placebo similarity #3 " << warpedSimilarity3 << std::endl;
171 | //x
172 | //x std::random_shuffle( huSentenceListWarped.begin(), huSentenceListWarped.end() );
173 | //x double randomSimilarity = averageSimilarity( huSentenceListWarped, enSentenceListWarped, similarityScorer, distribution );
174 | //x
175 | //x std::cerr << "Random similarity " << randomSimilarity << std::endl;
176 | }
177 |
178 | void main_similarityEvaluatorTool(int argC, char* argV[])
179 | {
180 | if (argC!=4)
181 | throw "argument error";
182 |
183 | const char* dicFilename = argV[1];
184 | const char* huFilename = argV[2];
185 | const char* enFilename = argV[3];
186 |
187 | DictionaryItems dictionary;
188 | std::ifstream dis(dicFilename);
189 | dictionary.read(dis);
190 |
191 | SentenceList huSentenceList;
192 | SentenceList enSentenceList;
193 |
194 | std::ifstream hus(huFilename);
195 | huSentenceList.readNoIds(hus);
196 | std::ifstream ens(enFilename);
197 | enSentenceList.readNoIds(ens);
198 |
199 | if (huSentenceList.size()!=enSentenceList.size())
200 | {
201 | std::cerr << "Number of sentences not matching: "
202 | << huSentenceList.size() << " versus " << enSentenceList.size() << "."
203 | << std::endl;
204 | throw "data error";
205 | }
206 | else
207 | {
208 | std::cerr << huSentenceList.size() << " bisentences read." << std::endl;
209 | }
210 |
211 | similarityEvaluator( dictionary, huSentenceList, enSentenceList );
212 | }
213 |
214 | } // namespace Hunglish
215 | -------------------------------------------------------------------------------- /src/hunalign/similarityEvaluator.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_SIMILARITYEVALUATOR_H
12 | #define __HUNGLISH_ALIGNMENT_SIMILARITYEVALUATOR_H
13 |
14 | namespace Hunglish
15 | {
16 |
17 | } // namespace Hunglish
18 |
19 | #endif // #define __HUNGLISH_ALIGNMENT_SIMILARITYEVALUATOR_H
20 | -------------------------------------------------------------------------------- /src/hunalign/trailPostprocessors.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_TRAILPOSTPROCESSORS_H
12 | #define __HUNGLISH_ALIGNMENT_TRAILPOSTPROCESSORS_H
13 |
14 | #include "alignment.h"
15 |
16 | namespace Hunglish
17 | {
18 |
19 | // Helper class that calculates scores of holes.
20 | class TrailScores
21 | {
22 | public:
23 | TrailScores( const Trail& trail_, const AlignMatrix& dynMatrix_ );
24 | // The score of the jth segmentum. The bigger the better.
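// Illustrative annotation (not original source): given
//   TrailScores ts( trail, dynMatrix );
//   double gain = ts(j);
// the score of segmentum j is, in spirit, the dynMatrix value at one of its
// bounding rundles minus the value at the other: what the aligner earned
// while crossing that hole.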
25 | double operator()( int j ) const;
26 |
27 | private:
28 | const Trail& trail;
29 | const AlignMatrix& dynMatrix;
30 | };
31 |
32 |
33 | class SentenceList;
34 |
35 |
36 | // Helper class that calculates scores of segmentums.
37 | class TrailScoresInterval
38 | {
39 | public:
40 | TrailScoresInterval( const Trail& trail_, const AlignMatrix& dynMatrix_,
41 | const SentenceList& huSentenceList_, const SentenceList& enSentenceList_ );
42 |
43 | // The average score of the jth segmentum. The bigger the better.
44 | // Division is by the maximum of the Hungarian and English intervals.
45 | // This is a somewhat arbitrary decision, and goes very badly with the
46 | // scoring of the knight's moves. But we really have no better choice.
47 | //
48 | // Also, the method applies some very ugly hacks to avoid the effect of
49 | // paragraph-delimiters. It strips both intervals of <p>s, and
50 | // modifies the dynMatrix-based score assuming that all <p>s got paired,
51 | // except surplus <p>s.
52 | double scoreSegmentum( const Rundle& start, const Rundle& end ) const;
53 |
54 | // The score of a segment identified by its index.
55 | double operator()( int j ) const;
56 | // The score of a union of segments identified by its start and end rundles' index.
57 | // Both these methods rely on scoreSegmentum():
58 | // This means an important thing: the score only depends
59 | // on the start and end rundle, not the rundles in between.
60 | double operator()( int j, int k ) const;
61 |
62 | private:
63 | const Trail& trail;
64 | const AlignMatrix& dynMatrix;
65 | const SentenceList& huSentenceList;
66 | const SentenceList& enSentenceList;
67 | };
68 |
69 | // Helper class that calculates scores of one-to-one holes.
70 | class BisentenceListScores
71 | {
72 | public:
73 | BisentenceListScores( const BisentenceList& bisentenceList_, const AlignMatrix& dynMatrix_ );
74 | // The score of the jth bisentence. The bigger the better.
75 | double operator()( int j ) const;
76 |
77 | private:
78 | const BisentenceList& bisentenceList;
79 | const AlignMatrix& dynMatrix;
80 | };
81 |
82 | void removeRundles( Trail& trail, const std::set<int>& rundlesToKill );
83 |
84 | // In cautious mode, auto-aligned rundles are thrown away if
85 | // their left or right neighbour holes are not one-to-one.
86 | // From the point of view of the resultant bisentences:
87 | // In cautious mode, one-to-one bisentences are thrown away if
88 | // they have left or right neighbours which are not one-to-one.
89 | // This of course dramatically improves precision while slightly degrading recall.
90 | void cautiouslyFilterTrail( Trail& bestTrail );
91 |
92 | void spaceOutBySentenceLength( Trail& bestTrail,
93 | const SentenceList& huSentenceListPretty,
94 | const SentenceList& enSentenceList,
95 | bool utfCharCountingMode );
96 |
97 | // The function gets a nonconst reference to bestTrail.
98 | // On the other hand, it gets a const reference to bestTrail, through trailScoresInterval.
99 | // Therefore, the function may only modify bestTrail after it has finished reading trailScoresInterval.
100 | void postprocessTrailStart( Trail& bestTrail,
101 | const TrailScoresInterval& trailScoresInterval,
102 | const double& qualityThreshold );
103 |
104 | // The function gets a nonconst reference to bestTrail.
105 | // On the other hand, it gets a const reference to bestTrail, through trailScoresInterval.
106 | // Therefore, the function may only modify bestTrail after it has finished reading trailScoresInterval.
107 | void postprocessTrailStartAndEnd( Trail& bestTrail,
108 | const TrailScoresInterval& trailScoresInterval,
109 | double qualityThreshold );
110 |
111 | // The function gets a nonconst reference to bestTrail.
112 | // On the other hand, it gets a const reference to bestTrail, through trailScoresInterval.
113 | // Therefore, the function may only modify bestTrail after it has finished reading trailScoresInterval.
114 | void postprocessTrail( Trail& bestTrail,
115 | const TrailScoresInterval& trailScoresInterval,
116 | double qualityThreshold );
117 |
118 |
119 | // Throws away rundles which are predominantly surrounded by not-one-to-one holes.
120 | void postprocessTrailByTopology( Trail& bestTrail, double qualityThreshold );
121 |
122 |
123 | // Only collect bisentences with score at least qualityThreshold.
124 | void trailToBisentenceList( const Trail& bestTrail, const TrailScores& trailScores, double qualityThreshold,
125 | BisentenceList& bisentenceList );
126 |
127 | // This is basically incorrect.
128 | // Here we use the score of the right-hand segment to decide about the rundle.
129 | //
130 | // The function gets a nonconst reference to bestTrail.
131 | // On the other hand, it gets a const reference to bestTrail, through trailScoresInterval.
132 | // Therefore, the function may only modify bestTrail after it has finished reading trailScoresInterval.
133 | void filterTrailByQuality( Trail& trail, const TrailScoresInterval& trailScoresInterval,
134 | const double& qualityThreshold );
135 |
136 | void filterBisentenceListByQuality( BisentenceList& bisentenceList, const AlignMatrix& dynMatrix,
137 | const double& qualityThreshold );
138 |
139 | } // namespace Hunglish
140 |
141 | #endif // #define __HUNGLISH_ALIGNMENT_TRAILPOSTPROCESSORS_H
142 | -------------------------------------------------------------------------------- /src/hunalign/translate.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_TRANSLATE_H
12 | #define __HUNGLISH_ALIGNMENT_TRANSLATE_H
13 |
14 | #include "words.h"
15 | #include "dictionary.h"
16 |
17 | namespace Hunglish
18 | {
19 |
20 | typedef std::map< std::string, Phrase > DumbDictionary;
21 |
22 | // This will become a class, with dictionary initialization, and a translate method.
23 | // It will have various implementations.
24 |
25 | void buildDumbDictionary( const DictionaryItems& dictionary, DumbDictionary& dumbDictionary );
26 |
27 | void buildDumbDictionaryUsingFrequencies(
28 | const DictionaryItems& dictionary,
29 | FrequencyMap& enFreq,
30 | DumbDictionary& dumbDictionary );
31 |
32 | void buildDumbDictionary( Hunglish::DumbDictionary& dumbDictionary,
33 | const std::string& dictionaryFilename,
34 | const Hunglish::SentenceList& enSentenceList = Hunglish::SentenceList()
35 | );
36 |
37 | void trivialTranslateWord(
38 | const DumbDictionary& dumbDictionary,
39 | const Word& originalWord,
40 | Phrase& words
41 | );
42 |
43 | void trivialTranslate(
44 | const DumbDictionary& dumbDictionary,
45 | const Sentence& sentence,
46 | Sentence& translatedSentence
47 | );
48 |
49 | void trivialTranslateSentenceList(
50 | const DumbDictionary& dumbDictionary,
51 | const SentenceList& sentenceList,
52 | SentenceList& translatedSentenceList
53 | );
54 |
55 | void naiveTranslate(
56 | const DictionaryItems& dictionary,
57 | const SentenceList& sentenceList,
58 | SentenceList& translatedSentenceList
59 | );
60 |
61 | typedef std::multimap< std::string, Phrase > DumbMultiDictionary;
62 |
63 | void buildDumbMultiDictionary( const DictionaryItems& dictionary, DumbMultiDictionary& dumbMultiDictionary, bool reverse );
64 |
65 | void sortNormalizeSentences( Hunglish::SentenceList& sentenceList );
66 |
67 | // This function preprocesses the sentences so that sentenceListsToAlignMatrixIdentity can be applied to them.
68 | // It does a rough translation and an alphabetic sort of words.
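// Illustrative annotation (not original source): "rough translation plus
// alphabetic sort" means that a bisentence such as the (hypothetical)
//   hu: "a kutya latta a macskat"  /  en: "the dog saw the cat"
// is garbled into two sorted token lists that share surface forms wherever
// the dictionary maps one side onto the other, so the cheap ordered-set
// intersection of sentenceListsToAlignMatrixIdentity can count the overlap.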
69 | void normalizeTextsForIdentity( const DictionaryItems& dictionary,
70 | const SentenceList& huSentenceListPretty, const SentenceList& enSentenceListPretty,
71 | SentenceList& huSentenceListGarbled, SentenceList& enSentenceListGarbled );
72 |
73 | } // namespace Hunglish
74 |
75 | #endif // #define __HUNGLISH_ALIGNMENT_TRANSLATE_H
76 | -------------------------------------------------------------------------------- /src/hunalign/wordAlignment.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_WORDALIGNMENT_H
12 | #define __HUNGLISH_ALIGNMENT_WORDALIGNMENT_H
13 |
14 | #include "words.h"
15 | #include "dictionary.h"
16 |
17 | #include <set>
18 | #include <vector>
19 |
20 |
21 | namespace Hunglish
22 | {
23 |
24 | const int NullWord = -3 ;
25 |
26 | typedef int WordIndex;
27 |
28 | typedef std::pair<WordIndex,WordIndex> WordRelation;
29 |
30 | typedef std::vector<WordRelation> WordRelations;
31 |
32 | typedef std::set<WordIndex> WordSet;
33 |
34 | // Describes the word-to-word structure of a bisentence. Many-to-one and one-to-NIL relationships are allowed.
35 | // Many-to-many is currently supported but not encouraged: disjoint complete bigraphs are allowed.
36 | // The bisentence itself is not stored. It is referred into by integer word indices.
37 | // Word-to-NIL relations must be made explicit. An initial empty WordAlignment means no knowledge, not knowledge of NIL.
38 | // TODO confidence values may be incorporated later.
39 | // TODO maybe even more importantly, flags to denote kinds of relations:
40 | // - suitable as dictionary item
41 | // - the result of ellipsis, not suitable as dictionary item
42 | // - ?
43 | //
44 | class WordAlignment
45 | {
46 | public:
47 | const WordRelations& getWordRelations() const ;
48 | void addWordRelation( const WordRelation& wordRelation ) ;
49 |
50 | // Under the current, unindexed implementation this is an O(n) operation.
51 | // leftSide refers to the argument being on the left side, not the result! Major f.ck up possibility!
52 | WordSet relation( WordIndex wordIndex, bool leftSide ) const ;
53 |
54 | // Under the current, unindexed implementation this is implemented by two *Friends operations, so it is very very slow.
55 | // leftSide refers to the argument being on the left side, not the result! Major f.ck up possibility!
56 | WordSet group ( WordIndex wordIndex, bool leftSide ) const ;
57 |
58 | // Inconsistency can be caused by the following:
59 | // - word connected to NIL and other.
60 | // - two words connected twice.
61 | // - graph is not a disjoint union of stars. (Or complete bigraphs, if many-to-many is supported.)
62 | bool isConsistent() const;
63 |
64 | // Reorders the data lexicographically, without changing its semantics in any way.
65 | void resort();
66 |
67 | void clear();
68 |
69 | private:
70 | WordSet rightFriends( WordIndex wordIndex ) const;
71 | WordSet leftFriends ( WordIndex wordIndex ) const;
72 |
73 | private:
74 | WordRelations wordRelations;
75 | };
76 |
77 | // BiSentence::first is the source (Hungarian) sentence.
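// Illustrative annotation (not original source): word indices point into the
// two sentences of the bisentence, so for the (hypothetical) pair
//   hu: [0]="kek" [1]="haz"  /  en: [0]="blue" [1]="house"
// the relations { (0,0), (1,1) } link "kek"-"blue" and "haz"-"house", and an
// untranslated word i is recorded explicitly as the relation (i, NullWord).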
78 | typedef std::pair<Sentence,Sentence> BiSentence;
79 |
80 | class WordAlignedBisentence : public BiSentence // Inheritance from nonvirtual. It sounds so strange but it feels so good.
81 | {
82 | public:
83 | void markDictionaryItem( const DictionaryItem& dictionaryItem );
84 |
85 | void findDictionaryItemsByGaps( DictionaryItems& dictionaryItems ); // Not const because it resorts.
86 |
87 | // Removes all words that the align can account for.
88 | void elimination();
89 |
90 | public:
91 | WordAlignment wordAlignment;
92 | };
93 |
94 | class WordAlignedBisentences : public std::vector<WordAlignedBisentence> // Inheritance from nonvirtual. It sounds so strange but it feels so good.
95 | {
96 | public:
97 |
98 | void markDictionaryItem( const DictionaryItem& dictionaryItem );
99 |
100 | void importBicorpus( SentenceList& huSentenceList, SentenceList& enSentenceList );
101 |
102 | void findDictionaryItemsByGaps( DictionaryItems& dictionaryItems ); // Not const because it resorts.
103 |
104 | // Removes all words that the align can account for.
105 | void elimination();
106 | };
107 |
108 | } // namespace Hunglish
109 |
110 | #endif // #define __HUNGLISH_ALIGNMENT_WORDALIGNMENT_H
111 | -------------------------------------------------------------------------------- /src/hunalign/words.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_WORDS_H
12 | #define __HUNGLISH_ALIGNMENT_WORDS_H
13 |
14 | #include <string>
15 | #include <vector>
16 | #include <iostream>
17 |
18 | namespace Hunglish
19 | {
20 |
21 | typedef std::string String;
22 |
23 | typedef String Word;
24 |
25 | typedef std::vector<Word> WordList;
26 |
27 | typedef WordList Phrase;
28 |
29 | typedef std::vector<Phrase> Book;
30 |
31 | struct Sentence
32 | {
33 | WordList words;
34 | String sentence;
35 | String id;
36 | };
37 |
38 | // Implemented in dictionary.cpp
39 | class SentenceList : public std::vector<Sentence>
40 | {
41 | public:
42 | void read ( std::istream& is );
43 | void readNoIds( std::istream& is );
44 | void write( std::ostream& os ) const;
45 | void writeNoIds( std::ostream& os ) const;
46 | };
47 |
48 | // Implemented in dictionary.cpp
49 | void readBicorpus( std::istream& is, SentenceList& huSentenceList, SentenceList& enSentenceList);
50 | void writeBicorpus( std::ostream& os, const SentenceList& huSentenceList, const SentenceList& enSentenceList);
51 |
52 | } // namespace Hunglish
53 |
54 | #endif // #define __HUNGLISH_ALIGNMENT_WORDS_H
55 | -------------------------------------------------------------------------------- /src/include/argumentsParser.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
} // namespace Hunglish

#endif // #define __HUNGLISH_ALIGNMENT_WORDS_H

-------------------------------------------------------------------------------- /src/include/argumentsParser.h: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#ifndef __ARGUMENTSPARSER_H
#define __ARGUMENTSPARSER_H

#include <map>
#include <string>
#include <vector>

// Current usage and limitations:
// Every argument starts with a '-'.
// It is a key/value pair. The delimiter
// is either the first '=' (erased), or the
// first nonalphabetic character (not erased).

class AnyData
{
public:
  enum Kind { Int, String, Float, Set };

public:
  AnyData() : kind(String), dInt(-1) {}
  AnyData( const int& d ) : kind(Int), dInt(d) {}
  AnyData( const std::string& d ) : kind(String), dInt(-1), dString(d) {}
  // AnyData( const float& d ) : kind(Float), dFloat(d) {}
  // AnyData( const std::set<int>& d ) : kind(Set), dSet(d), dInt(-1) {}

public:
  Kind kind;
  int dInt;
  std::string dString;
  // float dFloat;
  // std::set<int> dSet;
};

typedef std::string ArgName;
typedef std::map< ArgName, AnyData > ArgumentMap;

class Arguments : public ArgumentMap
{
public:
  // Very important note: when read finds a numeric/set argument,
  // it sets anyData.kind to Int. But it STILL fills anyData.dString,
  // just in case. So if the ArgumentMap was built by Arguments::read,
  // the dString fields are all filled.
  bool read( int argc, char **argv );

  // remains is filled with the arguments not starting with '-'.
  bool read( int argc, char **argv, std::vector<std::string>& remains );

  // Stays const if it fails, erases the argument if it succeeds.
  bool getNumericParam( const ArgName& name, int& num );

  // sw is true if the switch is present. The function
  // returns false if the argument value is not empty.
  bool getSwitch( const ArgName& name, bool& sw );

  bool getSwitchConst( const ArgName& name, bool& sw ) const;

  // Returns true if the switch is present. Throws an error message
  // if the argument value is not empty.
  bool getSwitchCompact( const ArgName& name );

  void checkEmptyArgs() const;
};
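// Illustrative sketch, added for exposition (not part of the original header),
// of the intended calling convention. The flag names are made up.
inline int argumentsUsageSketch( int argc, char **argv )
{
  Arguments args;
  std::vector<std::string> remains; // Collects the non-'-' arguments, e.g. filenames.
  if (!args.read( argc, argv, remains ))
    return 1;

  int thresh = 0;
  args.getNumericParam( "thresh", thresh ); // Consumes -thresh=N if present.

  bool verbose = args.getSwitchCompact( "verbose" ); // Consumes -verbose; throws on -verbose=x.
  (void)verbose;

  args.checkEmptyArgs(); // Throws if any argument was left unconsumed.
  return 0;
}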
#endif // #define __ARGUMENTSPARSER_H

-------------------------------------------------------------------------------- /src/include/histogram.h: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Applied Logic Laboratory, Ltd.                    *
*      All rights reserved.                                              *
*                                                                        *
*  Developed by Daniel Varga                                             *
*                                                                        *
*************************************************************************/

#ifndef __PRIVATE_DANIEL_NEIGHBOURS_HISTOGRAM_H
#define __PRIVATE_DANIEL_NEIGHBOURS_HISTOGRAM_H

#include <iosfwd>
#include <map>
#include <vector>

class Histogram : public std::vector<double>
{
public:
  void add( int x, double val = 1 );

  void write( std::ostream& os ) const;
  void write_othernonull( std::ostream& os ) const;
  void read( std::istream& is );

  double sumFromOne() const;
  void setZeroByTotal( double total );
};

// Semantic chaos: two semantically radically different structures can be
// stored in a DoubleMap.
// One is a function that is zero everywhere we did not say otherwise.
// Histograms are like that.
// The other is one we would interpolate between the explicitly given values.
// Binning results are like that.

// TODO: Derive an Interpolable class.
class DoubleMap : public std::map<double,double>
{
public:

  void read ( std::istream& is );
  void write( std::ostream& os ) const;

};

class SmoothDoubleMap : public DoubleMap
{
public:
  void from( const Histogram& h );

  // Interpolating methods will go here eventually.
};

class DiscreteDoubleMap : public DoubleMap
{
public:
  // Zero appears implicitly (i.e., not at all) in the DoubleMap!
  void from( const Histogram& h );

  void binning( bool logBin, bool dontShowZeros, double step, SmoothDoubleMap& binned ) const;
};

#endif // #define __PRIVATE_DANIEL_NEIGHBOURS_HISTOGRAM_H

-------------------------------------------------------------------------------- /src/include/portableHash.h: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#ifndef __HUNGLISH_INCLUDE_PORTABLEHASH_H
#define __HUNGLISH_INCLUDE_PORTABLEHASH_H


#ifdef WIN32

#define EXTNAMESPACE std
#include <hash_map>

#else

#define EXTNAMESPACE __gnu_cxx
#include <ext/hash_map>

#endif

#endif // #define __HUNGLISH_INCLUDE_PORTABLEHASH_H

-------------------------------------------------------------------------------- /src/include/serializeImpl.h: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#ifndef __HUNGLISH_INCLUDE_SERIALIZEIMPL_H
#define __HUNGLISH_INCLUDE_SERIALIZEIMPL_H

#include <ostream>
#include <set>
#include <vector>

template <class T>
std::ostream& operator<<( std::ostream& os, const std::vector<T>& v )
{
  for ( typename std::vector<T>::const_iterator it=v.begin(); it!=v.end(); ++it )
  {
    os << *it;
    if (it+1!=v.end())
      os << " ";
  }
  return os;
}

template <class T>
std::ostream& operator<<( std::ostream& os, const std::set<T>& v )
{
  // Guard against the empty set: the loop below dereferences the iterator
  // before testing for the end.
  if (v.empty())
    return os;

  typename std::set<T>::const_iterator it=v.begin();
  while (true)
  {
    os << *it;

    typename std::set<T>::const_iterator itplus = it;
    ++itplus;

    if (itplus == v.end())
      break;
    else
      os << " ";

    it = itplus;
  }
  return os;
}
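// Illustrative sketch, added for exposition (not part of the original header):
// both overloads print the elements space-separated, with no trailing space.
inline void serializeUsageSketch( std::ostream& os )
{
  std::vector<int> v;
  v.push_back(1);
  v.push_back(2);
  v.push_back(3);
  os << v; // Prints "1 2 3".

  std::set<int> s( v.begin(), v.end() );
  os << s; // Also prints "1 2 3".
}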
#endif // #define __HUNGLISH_INCLUDE_SERIALIZEIMPL_H

-------------------------------------------------------------------------------- /src/include/stringsAndStreams.h: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#ifndef __HUNGLISH_INCLUDE_STRINGSANDSTREAMS_H
#define __HUNGLISH_INCLUDE_STRINGSANDSTREAMS_H

#include <string>
#include <vector>

namespace Hunglish
{

void split( const std::string line, std::vector<std::string>& words, char delim='\t' );

} // namespace Hunglish

#endif // #define __HUNGLISH_INCLUDE_STRINGSANDSTREAMS_H

-------------------------------------------------------------------------------- /src/include/timer.h: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#ifndef __HUNGLISH_INCLUDE_TIMER_H
#define __HUNGLISH_INCLUDE_TIMER_H

// Don't use this for anything important:
// The Windows version overflows at 2^32 = 4294967296 milliseconds (49.71 days) after boot time.
// The Unix version wraps around every seven weeks (3600*24*49*1000 = 4233600000 ms).

namespace Hunglish
{

// In milliseconds.
class Timer
{
public:
  static int getTick();
};

class Ticker
{
public:
  Ticker() { start(); }

  void start() { time = Timer::getTick(); }
  int get()    { return Timer::getTick()-time; }
  int next()   { int t=get(); start(); return t; }

private:
  int time;
};
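// Illustrative sketch, added for exposition (not part of the original header):
// timing a block of work with Ticker. Mind the wraparound caveat above.
inline int tickerUsageSketch()
{
  Ticker ticker;

  // ... do some work here ...

  int elapsedMs = ticker.get();     // Milliseconds since construction.
  int sinceStartMs = ticker.next(); // Reports the elapsed time, then restarts.
  return elapsedMs + sinceStartMs;
}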
} // namespace Hunglish

#ifndef WIN32

void itoa( int n, char* s, int radix );

#endif

#endif // #define __HUNGLISH_INCLUDE_TIMER_H

-------------------------------------------------------------------------------- /src/utils/argumentsParser.cpp: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#pragma warning ( disable : 4786 )

#include "argumentsParser.h"

#include <iostream>
#include <stdlib.h>

// Could be better.
bool alphabetic( char c )
{
  return ((c>='a')&&(c<='z')) || ((c>='A')&&(c<='Z')) || (c=='_');
}

bool Arguments::read( int argc, char **argv )
{
  std::vector<std::string> remains;
  bool ok = read( argc, argv, remains );

  if (!remains.empty())
  {
    std::cerr << "Invalid argument: " << remains[0] << std::endl;
    return false;
  }
  return ok;
}

bool Arguments::read( int argc, char **argv, std::vector<std::string>& remains )
{
  remains.clear();

  for ( int i=1; i<argc; ++i )
  {
    std::string arg = argv[i];

    if (arg.empty())
      continue;

    if (arg[0]!='-')
    {
      remains.push_back(arg);
      continue;
    }

    // The name ends at the first '=' (erased) or at the first
    // nonalphabetic character (not erased).
    std::string name, value;
    std::string::size_type j;
    for ( j=1; j<arg.size(); ++j )
    {
      char c = arg[j];
      if (c=='=')
      {
        value = arg.substr(j+1);
        break;
      }
      if (!alphabetic(c))
      {
        value = arg.substr(j);
        break;
      }
      name += c;
    }

    AnyData anyData(value);

    // For a numeric value, set kind to Int, but keep dString filled as well.
    if ( !value.empty() && value.find_first_not_of("-0123456789")==std::string::npos )
    {
      anyData.kind = AnyData::Int;
      anyData.dInt = atoi(value.c_str());
    }

    operator[](name) = anyData;
  }
  return true;
}

bool Arguments::getNumericParam( const ArgName& name, int& num )
{
  iterator it = find(name);
  if (it==end())
  {
    return false;
  }

  if (it->second.kind != AnyData::Int)
  {
    std::cerr << "Argument -" << name << ": integer expected.\n";
    throw "argument error";
  }

  num = it->second.dInt;
  erase(name);
  return true;
}

bool Arguments::getSwitchConst( const ArgName& name, bool& sw ) const
{
  const_iterator it=find(name);
  if (it==end())
  {
    sw = false;
    return true;
  }
  else if (! it->second.dString.empty())
  {
    std::cerr << "Argument -" << name << ": value is not allowed.\n";
    return false;
  }
  else
  {
    sw = true;
    return true;
  }
}

bool Arguments::getSwitch( const ArgName& name, bool& sw )
{
  bool ok = getSwitchConst(name, sw);
  if (ok)
    erase(name);

  return ok;
}

bool Arguments::getSwitchCompact( const ArgName& name )
{
  bool sw(false);
  bool ok = getSwitchConst(name, sw);
  if (ok)
  {
    erase(name);
    return sw;
  }
  else
  {
    std::cerr << "No value is allowed for argument -" << name << ".\n";
    throw "argument error";
  }
}

void Arguments::checkEmptyArgs() const
{
  if (!empty())
  {
    std::cerr << "Invalid argument: ";

    for ( Arguments::const_iterator it=begin(); it!=end(); ++it )
    {
      std::cerr << "-" << it->first;
      if (!it->second.dString.empty())
        std::cerr << "=" << it->second.dString;
      std::cerr << " ";
    }
    std::cerr << std::endl;

    throw "argument error";
  }
}
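// Illustrative sketch, added for exposition (not part of the original file),
// of the splitting rule implemented by Arguments::read above. The flag names
// are made up.
void argvSplittingSketch()
{
  char prog[] = "prog", a1[] = "-thresh=50", a2[] = "-D0.5", a3[] = "-verbose";
  char* argv[] = { prog, a1, a2, a3 };

  Arguments args;
  args.read( 4, argv );

  // "-thresh=50" : name "thresh", numeric value 50 (the '=' is erased).
  // "-D0.5"      : name "D", string value "0.5" (the delimiter is kept).
  // "-verbose"   : name "verbose", empty value, i.e. a switch.
}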
-------------------------------------------------------------------------------- /src/utils/histogram.cpp: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2002. Daniel Varga                                      *
*      All rights reserved by the author.                                *
*                                                                        *
*************************************************************************/

#pragma warning ( disable : 4786 )

#include <histogram.h>

#include <iostream>

void DiscreteDoubleMap::binning( bool logBin, bool dontShowZeros, double step, SmoothDoubleMap& binned ) const
{
  const DiscreteDoubleMap& m = *this;

  binned.clear();

  if (m.empty())
    return;

  double leftestValue = m.begin()->first;

  double leftFloat = 0.0;
  if (logBin)
  {
    if ( leftestValue < 0 )
    {
      std::cerr << "Logbinning currently does not work for values smaller than 0." << std::endl;
      throw "data error";
    }
    else if ( leftestValue < 1 )
    {
      // A very primitive, basically incorrect way to get something for sub-one values. Not a real logbin.
      double left = 0.0;
      double right = 1.0;

      DoubleMap::const_iterator leftit = m.lower_bound(left);
      DoubleMap::const_iterator rightit = m.lower_bound(right);

      if (leftit!=m.end())
      {
        double sum=0;

        for ( ; leftit!=rightit; ++leftit )
        {
          sum += leftit->second;
        }

        // It is not clear what the right policy is for choosing x. Here are two primitive ones:
        // double adHocCenter = left;
        double adHocCenter = (left+right-1)/2;

        if ( (!dontShowZeros) || (sum>0) )
        {
          binned[adHocCenter] = sum/(right-left);
        }
      }
    }

    leftFloat = 1.0;
  }
  else
  {
    leftFloat = 0.0;
    if ( leftestValue < 0 )
    {
      std::cerr << "Binning currently does not work for values smaller than 0." << std::endl;
      throw "data error";
    }
  }

  while (true)
  {
    double rightFloat;
    if (logBin)
      rightFloat = leftFloat * step;
    else
      rightFloat = leftFloat + step;

    // Zero-length bin interval.
    if ((int)leftFloat==(int)rightFloat)
    {
      leftFloat = rightFloat;
      continue;
    }

    double left = (int)leftFloat;
    double right = (int)rightFloat;

    DoubleMap::const_iterator leftit = m.lower_bound(left);
    DoubleMap::const_iterator rightit = m.lower_bound(right);

    if (leftit==m.end())
      break;

    double sum=0;

    for ( ; leftit!=rightit; ++leftit )
    {
      sum += leftit->second;
    }

    // It is not clear what the right policy is for choosing x. Here are two primitive ones:
    // double adHocCenter = left;
    double adHocCenter = (left+right-1)/2;

    if ( (!dontShowZeros) || (sum>0) )
    {
      binned[adHocCenter] = sum/(right-left);
    }
    leftFloat = rightFloat;
  }
}

void DoubleMap::read( std::istream& is )
{
  clear();

  while ( !is.eof() && (is.good()) )
  {
    double x(-1024), y(-1024);
    is >> x >> y;
    is.ignore(); // New line.

    if (!is.good())
      break;

    operator[](x) = y;
  }
}

void DoubleMap::write( std::ostream& os ) const
{
  for ( DoubleMap::const_iterator it=begin(); it!=end(); ++it )
  {
    os << it->first << "\t" << it->second << std::endl;
  }
}

// Zero appears implicitly (i.e., not at all) in the DiscreteDoubleMap!
void DiscreteDoubleMap::from( const Histogram& h )
{
  clear();
  for ( int i=0; i<(int)h.size(); ++i )
  {
    if (h[i]!=0)
      operator[](i) = h[i];
  }
}

void SmoothDoubleMap::from( const Histogram& h )
{
  clear();
  for ( int i=0; i<(int)h.size(); ++i )
  {
    operator[](i) = h[i];
  }
}

void Histogram::add( int x, double val /*=1*/ )
{
  if (x>=(int)size())
  {
    resize(x+1);
  }
  operator[](x) += val;
}

void Histogram::write( std::ostream& os ) const
{
  for ( int i=0; i<(int)size(); ++i )
  {
    os << i << "\t" << operator[](i) << std::endl;
  }
}

// Writes bin zero unconditionally, the other bins only if they are nonzero.
void Histogram::write_othernonull( std::ostream& os ) const
{
  for ( int i=0; i<(int)size(); ++i )
  {
    if ( (i==0) || (operator[](i)!=0) )
      os << i << "\t" << operator[](i) << std::endl;
  }
}

void Histogram::read( std::istream& is )
{
  clear();

  while ( !is.eof() && (is.good()) )
  {
    int x(-1);
    double y(-1024);
    is >> x >> y;
    is.ignore(); // New line.

    if (!is.good())
      break;

    if (x>=(int)size())
    {
      resize(x+1);
    }
    operator[](x) = y;
  }
}

double Histogram::sumFromOne() const
{
  double n=0;
  for ( int i=1; i<(int)size(); ++i )
  {
    n += operator[](i);
  }
  return n;
}

// Sets bin zero so that the total weight of the histogram becomes total.
void Histogram::setZeroByTotal( double total )
{
  if (empty())
    resize(1);
  operator[](0) = total - sumFromOne();
}

-------------------------------------------------------------------------------- /src/utils/stringsAndStreams.cpp: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#pragma warning ( disable : 4786 )

#include "stringsAndStreams.h"

#include <string>

namespace Hunglish
{

void split( const std::string line, std::vector<std::string>& words, char delim /*='\t'*/ )
{
  words.clear();

  std::string current;
  int i;
  for ( i=0; i<(int)line.size(); ++i )
  {
    if (line[i]==delim)
    {
      words.push_back(current);
      current = "";
    }
    else
    {
      current += line[i];
    }
  }
  words.push_back(current);
}

} // namespace Hunglish
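// Illustrative sketch, added for exposition (not part of the original file):
// split() with the default tab delimiter; empty fields are preserved.
void splitUsageSketch()
{
  std::vector<std::string> words;
  Hunglish::split( "alma\t\tapple", words );
  // words is now { "alma", "", "apple" }.
}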
-------------------------------------------------------------------------------- /src/utils/timer.cpp: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#pragma warning ( disable : 4786 )

#include "timer.h"

#include <assert.h>
#include <string.h>

//#include <iostream> // Just for testing.

#ifdef WIN32
#include <windows.h>
#else
#include <sys/time.h>
#include <unistd.h>
#endif

#ifndef WIN32
#include <sstream> // For the itoa implementation.
#endif

namespace Hunglish
{

int Timer::getTick()
{
#ifdef WIN32

  return GetTickCount();

#else

  timeval tv;

  // The call is kept outside assert(), so that it still runs under NDEBUG.
  int rc = gettimeofday( &tv, 0 );
  assert( rc == 0 );
  (void)rc;

  // std::cerr << "sec:" << tv.tv_sec << " usec:" << tv.tv_usec << std::endl;

  return (tv.tv_sec % (3600*24*49))*1000 + tv.tv_usec/1000;

#endif
}

} // namespace Hunglish

// Ugly portability layer:

#ifndef WIN32

void itoa( int n, char* s, int radix )
{
  assert( radix==10 );
  std::ostringstream ss;
  ss << n;
  strcpy(s,ss.str().c_str());
}

#endif

--------------------------------------------------------------------------------