├── LICENSE ├── README.md ├── data ├── english.aff ├── english.dic ├── hu-en.dic ├── hu-en.stem.dic ├── hungarian.aff ├── hungarian.dic └── null.dic ├── examples ├── demo.en.stem ├── demo.hu.stem ├── demo.manual.ladder ├── en.raw └── hu.raw ├── regtest ├── handaligns │ ├── 1984.hu.handstem │ │ ├── README │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ └── hu.sen │ ├── 1984.hu │ │ ├── README │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ └── hu.sen │ ├── 1984.ro.utf8 │ │ ├── README │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ ├── hu.sen │ │ └── sgmlTolatin2.sed │ ├── 1984.ro │ │ ├── README │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ ├── hu.sen │ │ └── sgmlTolatin2.sed │ ├── dtm │ │ ├── README │ │ ├── dtm.bi │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ └── hu.sen │ ├── steinbeck.huntoken.nopara │ │ ├── README │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ └── hu.sen │ ├── steinbeck.huntoken │ │ ├── README │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ └── hu.sen │ └── steinbeck │ │ ├── README │ │ ├── auto.ladder │ │ ├── en.pre │ │ ├── en.sen │ │ ├── hand.ladder │ │ ├── hu.pre │ │ └── hu.sen ├── regtest.sh ├── results │ └── dummy └── targets │ ├── 1984.hu.cerr │ ├── 1984.hu.handstem.realign.cerr │ ├── 1984.ro.realign.cerr │ ├── 1984.ro.utf8.realign.cerr │ ├── dtm.realign.cerr │ └── steinbeck.huntoken.nopara.cerr ├── scripts ├── DCEP │ ├── README │ ├── README.md │ ├── batchfilebylangpair.2ndcpu.sh │ ├── batchfilebylangpair.sh │ ├── dictforlanguagepair.sh │ ├── dictsforalllanguagepairs.sh │ ├── extract-bisentences.sh │ ├── filteralign.sh │ ├── finalpackage.sh │ ├── finalpackageforlangpairs.sh │ ├── flatladdertolangpair.sh │ ├── flatladdertolangpairs.sh │ ├── ladder2text.py │ ├── languagepair.py │ ├── mergedicts.sh │ ├── normalizedict.sh │ ├── normalizesztakidict.sh │ ├── packaligninfobylangpair.sh │ ├── readme.sh │ ├── realignall.2ndcpu.sh │ ├── realignall.sh │ ├── renamesztakidicts.sh │ ├── reorg.py │ ├── tokenizeAll.sh │ └── verifylangpair.sh ├── en.sen.one.sh ├── hu.sen.one.sh ├── hunalignDriver.py ├── ladder2text.py ├── partialAlign.py ├── process.sh ├── release.howto.txt ├── subprocessTest.py ├── teed.py ├── testProcess-RealHunalign.sh ├── testProcess.sh ├── testProcess1.sh ├── testProcessWithInput.sh ├── tok.one.sh ├── translate.txt ├── visualizeAlignQuality.awk ├── visualizeLadder.awk └── visualizeLadder.noshrink.awk └── src ├── hunalign ├── DOMTreeErrorReporter.cpp ├── Makefile ├── TEIReader.cpp ├── TEIReader.h ├── alignerTool.cpp ├── alignment.cpp ├── alignment.h ├── bloom.cpp ├── bloom.h ├── bookToMatrix.cpp ├── bookToMatrix.h ├── cooccurrence.cpp ├── cooccurrence.h ├── cooccurrenceTool.cpp ├── dicTree.h ├── dictionary.cpp ├── dictionary.h ├── help.h ├── main.cpp ├── networkFlow.cpp ├── networkFlow.h ├── oldAlignTest.cpp ├── quasiDiagonal.h ├── similarityEvaluator.cpp ├── similarityEvaluator.h ├── trailPostprocessors.cpp ├── trailPostprocessors.h ├── translate.cpp ├── translate.h ├── wordAlignment.cpp ├── wordAlignment.h └── words.h ├── include ├── argumentsParser.h ├── histogram.h ├── portableHash.h ├── serializeImpl.h ├── stringsAndStreams.h └── timer.h └── utils ├── argumentsParser.cpp ├── histogram.cpp ├── stringsAndStreams.cpp └── timer.cpp /data/english.aff: -------------------------------------------------------------------------------- 1 | 2 | SET ISO8859-2 3 | FORBIDDENWORD ! 
4 | ONLYROOT ~ 5 | WORDCHARS -_ 6 | 7 | SFX n Y 1 8 | SFX n 0 es [^o]o > 9 | 10 | SFX i Y 1 11 | SFX i 0 s [^o]o > 12 | 13 | SFX k Y 2 14 | SFX k 0 s' [^o]o > 15 | SFX k 0 s [^o]o > 16 | 17 | SFX u Y 5 18 | SFX u 0 's [ck][^s] > 19 | SFX u 0 's [^ck]. > 20 | SFX u 0 ' nce > 21 | SFX u 0 ' [ck]s > 22 | SFX u 0 ' s > 23 | 24 | SFX h Y 1 25 | SFX h 0 0 . > 26 | 27 | SFX g Y 1 28 | SFX g e ing ge > 29 | 30 | SFX d Y 6 31 | SFX d y ied [^aeiou]y > 32 | SFX d 0 ed [bcdfgklmnprstvwxyz]c > 33 | SFX d 0 ked [aeiou]c > 34 | SFX d 0 ed [aeiou]y > 35 | SFX d 0 ed [xw] > 36 | SFX d e ed e > 37 | 38 | SFX c Y 14 39 | SFX c e ing [^ieg]e > 40 | SFX c 0 es [^c]s > 41 | SFX c 0 s [^sc]h > 42 | SFX c y ies [bcdfgklmnprstvwxyz]y > 43 | SFX c 0 es [sc]h > 44 | SFX c 0 s [aeiou]y > 45 | SFX c 0 s oo > 46 | SFX c 0 es x > 47 | SFX c 0 s [^shoxy] > 48 | SFX c 0 ing [xyw] > 49 | SFX c 0 ing [bcdfgklmnprstvwxyz]c > 50 | SFX c 0 king [aeiou]c > 51 | SFX c 0 ing ee > 52 | SFX c ie ying ie > 53 | 54 | SFX e Y 14 55 | SFX e 0 zing [^z]z > 56 | SFX e 0 ping [^p]p > 57 | SFX e 0 king [^ck]k > 58 | SFX e 0 ding [^d]d > 59 | SFX e 0 bing [^b]b > 60 | SFX e 0 ring [^r]r > 61 | SFX e 0 sing [^s]s > 62 | SFX e 0 ting [^t]t > 63 | SFX e 0 ning [^n]n > 64 | SFX e 0 fing [^f]f > 65 | SFX e 0 ming [^m]m > 66 | SFX e 0 ling [^l]l > 67 | SFX e 0 ving [^v]v > 68 | SFX e 0 ging [^g]g > 69 | 70 | SFX b Y 1 71 | SFX b 0 0 . > 72 | 73 | SFX a Y 1 74 | SFX a 0 0 . > 75 | 76 | SFX w Y 6 77 | SFX w 0 er [aciouxw] > 78 | SFX w y iest y > 79 | SFX w e est e > 80 | SFX w y ier y > 81 | SFX w e er e > 82 | SFX w 0 est [aciouxw] > 83 | 84 | SFX f Y 1 85 | SFX f 0 ing ge > 86 | 87 | SFX j Y 17 88 | SFX j 0 s [bcdfgklmnprstvwxyz]y > 89 | SFX j 0 0 cs > 90 | SFX j 0 es [sc]h > 91 | SFX j 0 s [aeiou]y > 92 | SFX j 0 s oo > 93 | SFX j 0 s [^shoxy] > 94 | SFX j 0 es' x > 95 | SFX j 0 es [^c]s > 96 | SFX j 0 s [^sc]h > 97 | SFX j 0 es x > 98 | SFX j 0 s' [^shoxy] > 99 | SFX j 0 es' [^c]s > 100 | SFX j 0 s' [bcdfgklmnprstvwxyz]y > 101 | SFX j 0 es' [sc]h > 102 | SFX j 0 s' [aeiou]y > 103 | SFX j 0 s' oo > 104 | SFX j 0 s' [^sc]h > 105 | 106 | SFX l Y 4 107 | SFX l fe ves fe > 108 | SFX l f ves' f > 109 | SFX l f ves f > 110 | SFX l fe ves' fe > 111 | 112 | SFX m Y 1 113 | SFX m 0 ing [^ecxyw] > 114 | 115 | SFX v Y 1 116 | SFX v 0 ed [^eycxw] > 117 | 118 | SFX o Y 2 119 | SFX o 0 es' [^o]o > 120 | SFX o 0 es [^o]o > 121 | 122 | SFX p Y 28 123 | SFX p 0 ler [^l]l > 124 | SFX p 0 pest [^p]p > 125 | SFX p 0 ker [^ck]k > 126 | SFX p 0 ver [^v]v > 127 | SFX p 0 ger [^g]g > 128 | SFX p 0 ber [^b]b > 129 | SFX p 0 kest [^ck]k > 130 | SFX p 0 rer [^r]r > 131 | SFX p 0 ser [^s]s > 132 | SFX p 0 dest [^d]d > 133 | SFX p 0 best [^b]b > 134 | SFX p 0 rest [^r]r > 135 | SFX p 0 sest [^s]s > 136 | SFX p 0 test [^t]t > 137 | SFX p 0 nest [^n]n > 138 | SFX p 0 der [^d]d > 139 | SFX p 0 ter [^t]t > 140 | SFX p 0 ner [^n]n > 141 | SFX p 0 fest [^f]f > 142 | SFX p 0 mest [^m]m > 143 | SFX p 0 zest [^z]z > 144 | SFX p 0 fer [^f]f > 145 | SFX p 0 lest [^l]l > 146 | SFX p 0 mer [^m]m > 147 | SFX p 0 zer [^z]z > 148 | SFX p 0 vest [^v]v > 149 | SFX p 0 gest [^g]g > 150 | SFX p 0 per [^p]p > 151 | 152 | SFX q Y 2 153 | SFX q 0 est [^ecxyw] > 154 | SFX q 0 er [^ecxyw] > 155 | 156 | SFX r Y 14 157 | SFX r 0 ded [^d]d > 158 | SFX r 0 ted [^t]t > 159 | SFX r 0 ned [^n]n > 160 | SFX r 0 fed [^f]f > 161 | SFX r 0 med [^m]m > 162 | SFX r 0 zed [^z]z > 163 | SFX r 0 ped [^p]p > 164 | SFX r 0 led [^l]l > 165 | SFX r 0 ked [^ck]k > 166 | SFX r 0 ved [^v]v > 167 | SFX r 0 ged [^g]g > 
168 | SFX r 0 bed [^b]b > 169 | SFX r 0 red [^r]r > 170 | SFX r 0 sed [^s]s > 171 | 172 | SFX s Y 2 173 | SFX s y ies [bcdfgklmnprstvwxyz]y > 174 | SFX s y ies' [bcdfgklmnprstvwxyz]y > 175 | 176 | SFX t Y 2 177 | SFX t 0 's [^s] > 178 | SFX t 0 ' s > 179 | 180 | -------------------------------------------------------------------------------- /data/english.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/data/english.dic -------------------------------------------------------------------------------- /data/hu-en.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/data/hu-en.dic -------------------------------------------------------------------------------- /data/hu-en.stem.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/data/hu-en.stem.dic -------------------------------------------------------------------------------- /data/hungarian.aff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/data/hungarian.aff -------------------------------------------------------------------------------- /data/hungarian.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/data/hungarian.dic -------------------------------------------------------------------------------- /data/null.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/data/null.dic -------------------------------------------------------------------------------- /examples/demo.hu.stem: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/examples/demo.hu.stem -------------------------------------------------------------------------------- /examples/demo.manual.ladder: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 3 | 2 2 4 | 3 2 5 | 4 2 6 | 5 3 7 | 6 3 8 | 7 3 9 | 8 3 10 | 9 4 11 | 10 4 12 | 12 5 13 | 13 5 14 | 15 6 15 | 16 7 16 | 17 8 17 | 18 9 18 | 19 10 19 | 20 11 20 | 21 12 21 | 22 13 22 | 23 14 23 | 24 15 24 | 25 16 25 | 26 17 26 | 27 18 27 | 28 19 28 | 29 20 29 | 30 21 30 | 31 22 31 | 32 23 32 | 33 24 33 | 34 25 34 | 35 26 35 | 36 27 36 | 37 28 37 | 38 29 38 | 39 30 39 | 40 31 40 | 41 32 41 | 42 33 42 | 43 34 43 | 44 35 44 | 45 36 45 | 46 37 46 | 47 38 47 | 48 39 48 | 49 40 49 | 50 41 50 | 51 42 51 | 52 43 52 | 53 44 53 | 54 45 54 | 55 46 55 | 56 47 56 | 57 48 57 | 58 49 58 | 59 50 59 | 60 51 60 | 61 52 61 | 62 53 62 | 63 54 63 | 64 55 64 | 65 56 65 | 66 57 66 | 67 58 67 | 68 59 68 | 69 60 69 | 70 61 70 | 71 62 71 | 72 63 72 | 73 64 73 | 74 65 74 | 76 66 75 | 77 67 76 | 78 68 77 | 79 69 78 | 80 70 79 | 81 71 80 | 82 72 81 | 83 73 82 | 85 74 83 | 86 75 84 | 89 76 85 | 90 77 86 | 91 78 87 | 92 79 88 | 93 80 89 | 94 81 90 | 95 82 91 | 96 82 92 | 97 83 93 | 98 84 94 | 99 85 95 | 100 86 96 | 101 87 97 | 
102 88 98 | 103 89 99 | 104 90 100 | 105 91 101 | 106 92 102 | 107 93 103 | 108 94 104 | 109 95 105 | 110 96 106 | 112 97 107 | 113 98 108 | 114 99 109 | 115 99 110 | 116 100 111 | 117 101 112 | 118 103 113 | 119 104 114 | 120 105 115 | 121 106 116 | 122 107 117 | 123 108 118 | 124 109 119 | 125 110 120 | 126 111 121 | 127 112 122 | 130 113 123 | 131 114 124 | 132 115 125 | 132 116 126 | 132 117 127 | 133 118 128 | 134 119 129 | 135 120 130 | 136 121 131 | 137 122 132 | 138 123 133 | 139 124 134 | 140 125 135 | 141 126 136 | 142 127 137 | 143 128 138 | 144 129 139 | 145 130 140 | 146 131 141 | 147 132 142 | 148 133 143 | 149 134 144 | 150 135 145 | 151 136 146 | 152 137 147 | 153 138 148 | 154 139 149 | 155 140 150 | 156 141 151 | 157 142 152 | 158 143 153 | 159 144 154 | 160 145 155 | 161 146 156 | 162 147 157 | 163 148 158 | 164 150 159 | 165 151 160 | 166 152 161 | 167 153 162 | 168 154 163 | 169 155 164 | 170 156 165 | 171 157 166 | 172 158 167 | 173 159 168 | 174 160 169 | 175 161 170 | 176 162 171 | 177 163 172 | 178 164 173 | 179 165 174 | 180 166 175 | 181 167 176 | 182 168 177 | 183 169 178 | 183 170 179 | 184 171 180 | 185 172 181 | 186 173 182 | 186 174 183 | 187 175 184 | 189 176 185 | 190 177 186 | 191 178 187 | 192 179 188 | 193 179 189 | 194 180 190 | 195 181 191 | 195 182 192 | 196 183 193 | 197 184 194 | 198 185 195 | 199 186 196 | 200 187 197 | 201 188 198 | 202 189 199 | 203 190 200 | 204 191 201 | 205 192 202 | 206 193 203 | 207 194 204 | 207 195 205 | 208 197 206 | 209 198 207 | 210 199 208 | 211 200 209 | 212 201 210 | 213 202 211 | 214 203 212 | 214 204 213 | 215 206 214 | 216 207 215 | 217 208 216 | 218 209 217 | 219 210 218 | 220 211 219 | 221 212 220 | 222 213 221 | 222 214 222 | 223 215 223 | 224 216 224 | 225 217 225 | 226 218 226 | 227 219 227 | 228 220 228 | 229 221 229 | 229 222 230 | 230 223 231 | 231 224 232 | 232 225 233 | 233 226 234 | 234 227 235 | 235 228 236 | 236 229 237 | 237 230 238 | 238 231 239 | 239 232 240 | 240 234 241 | 241 235 242 | 242 236 243 | 243 237 244 | 246 238 245 | 247 239 246 | 248 240 247 | 249 241 248 | 251 242 249 | 252 243 250 | 253 244 251 | 255 245 252 | 256 245 253 | 257 246 254 | 258 247 255 | 259 248 256 | 260 249 257 | 261 250 258 | 263 251 259 | 265 252 260 | 266 253 261 | 267 254 262 | 268 255 263 | 269 256 264 | 270 257 265 | 271 258 266 | 272 259 267 | 273 260 268 | 274 261 269 | 275 262 270 | 276 263 271 | 277 264 272 | 278 265 273 | 279 266 274 | 280 267 275 | 281 268 276 | 282 269 277 | 283 270 278 | 284 271 279 | 285 272 280 | 286 273 281 | 288 274 282 | 289 275 283 | 290 276 284 | 292 277 285 | 293 278 286 | 295 279 287 | 296 280 288 | 297 281 289 | 298 282 290 | 299 283 291 | 300 284 292 | 301 285 293 | 302 286 294 | 303 286 295 | 304 288 296 | 305 289 297 | 306 290 298 | 307 291 299 | 308 292 300 | 309 293 301 | 310 294 302 | 311 295 303 | 312 296 304 | 313 297 305 | 314 298 306 | 315 299 307 | 316 300 308 | 317 301 309 | 318 302 310 | 319 303 311 | 320 303 312 | 322 304 313 | 323 305 314 | 324 306 315 | 325 307 316 | 326 308 317 | 327 309 318 | 328 310 319 | 329 311 320 | 331 312 321 | 332 313 322 | 333 314 323 | 334 315 324 | 335 315 325 | 336 316 326 | 337 317 327 | 338 318 328 | 339 319 329 | 340 320 330 | 341 321 331 | 342 322 332 | 343 323 333 | 344 324 334 | 345 325 335 | 346 326 336 | 347 327 337 | 349 328 338 | 350 329 339 | 352 331 340 | 353 332 341 | 354 333 342 | 355 334 343 | 357 335 344 | 358 336 345 | 359 337 346 | 359 338 347 | 360 340 348 | 361 341 349 | 362 342 350 | 363 343 351 | 364 344 352 | 
365 345 353 | 366 346 354 | 367 347 355 | 368 348 356 | 369 349 357 | 370 350 358 | 371 351 359 | 372 352 360 | 374 353 361 | 375 354 362 | 376 355 363 | 377 356 364 | 379 357 365 | 380 358 366 | 381 359 367 | 382 360 368 | 383 361 369 | 384 362 370 | 385 363 371 | 387 364 372 | 388 365 373 | 389 367 374 | 390 368 375 | 391 369 376 | 392 370 377 | 393 371 378 | 394 372 379 | 395 373 380 | 396 374 381 | 398 375 382 | 400 376 383 | 402 378 384 | 403 379 385 | 404 380 386 | 408 381 387 | 409 382 388 | 410 383 389 | 411 384 390 | 412 385 391 | 413 386 392 | 414 387 393 | 415 388 394 | 416 389 395 | 417 390 396 | 418 391 397 | 419 392 398 | 420 393 399 | 421 394 400 | 422 395 401 | 423 397 402 | 424 398 403 | 425 399 404 | 426 400 405 | 427 401 406 | 428 402 407 | 429 403 408 | 430 404 409 | 431 405 410 | 432 406 411 | 433 407 412 | 434 408 413 | 435 409 414 | 436 410 415 | 437 411 416 | 438 414 417 | 439 415 418 | 440 416 419 | 441 417 420 | 442 418 421 | 443 419 422 | 444 420 423 | 445 421 424 | 446 422 425 | 447 423 426 | 448 424 427 | 449 425 428 | 450 425 429 | 451 426 430 | 452 427 431 | 453 428 432 | 454 429 433 | 455 430 434 | 456 431 435 | 457 432 436 | 458 433 437 | 459 434 438 | 460 435 439 | 461 436 440 | 462 437 441 | 463 438 442 | 464 439 443 | 465 440 444 | 467 441 445 | 468 442 446 | 470 443 447 | 471 444 448 | 472 445 449 | 473 446 450 | 473 447 451 | 474 448 452 | 475 449 453 | 476 450 454 | 477 451 455 | 478 452 456 | 479 453 457 | 480 454 458 | 481 455 459 | 482 456 460 | 483 457 461 | 484 458 462 | 485 459 463 | 486 460 464 | 487 461 465 | 488 462 466 | 489 463 467 | 490 464 468 | 492 465 469 | 493 466 470 | 494 467 471 | 495 468 472 | 496 469 473 | 497 470 474 | 498 471 475 | 499 472 476 | 500 473 477 | 501 474 478 | 502 475 479 | 503 476 480 | 504 477 481 | 505 478 482 | 506 479 483 | 507 480 484 | 508 481 485 | 509 482 486 | 510 483 487 | 511 484 488 | 512 485 489 | 513 486 490 | 514 487 491 | 515 488 492 | 516 489 493 | 517 490 494 | 518 491 495 | 519 492 496 | 520 493 497 | 521 494 498 | 522 495 499 | 523 496 500 | 524 496 501 | 525 497 502 | 526 498 503 | 527 499 504 | 528 500 505 | 529 501 506 | -------------------------------------------------------------------------------- /examples/hu.raw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/examples/hu.raw -------------------------------------------------------------------------------- /regtest/handaligns/1984.hu.handstem/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | 5 | Data obtained from the English-Hungarian parallel Multext-East "1984" corpus. 6 | 7 | .sen files contain the sentence-level information of that corpus, excluding word-level information and paragraph structure. 8 | 9 | .pre files (unlike in ../1984.hu) contain tokenization and word stems that were manually obtained (well, were partly manually verified) by the Multext-East people. 10 | 11 | ==== 12 | Notes for Hunglish developers: 13 | 14 | Originally at 15 | 16 | sen: 17 | ~/hunglish/data/experiments/1984.nonlemmatized.improved/Hungarian/1984.sen 18 | ~/hunglish/data/experiments/1984.nonlemmatized.improved/English/1984.sen 19 | 20 | preproc: 21 | ~/hunglish/data/experiments/1984.hu.lemmas 22 | ~/hunglish/data/experiments/1984.en.lemmas 23 | 24 | (Actually, Multext-East ids were truncated from these files.) 
25 | 26 | hand: 27 | ~/hunglish/data/experiments/hand.indexes 28 | -------------------------------------------------------------------------------- /regtest/handaligns/1984.hu.handstem/hu.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.hu.handstem/hu.pre -------------------------------------------------------------------------------- /regtest/handaligns/1984.hu.handstem/hu.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.hu.handstem/hu.sen -------------------------------------------------------------------------------- /regtest/handaligns/1984.hu/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | 5 | Data obtained from the English-Hungarian parallel Multext-East "1984" corpus. 6 | 7 | .sen files contain the sentence-level information of that corpus, excluding word-level information and paragraph structure. 8 | 9 | .pre files contain an automatically processed version of these, for aligner consumption. 10 | Processing steps are: rude tokenization, lowercasing, stemming by the hunstem tool with Hungarian and English resources. 11 | 12 | 13 | ==== 14 | Notes for Hunglish developers: 15 | 16 | Originally at 17 | 18 | sen: 19 | /home/daniel/hunglish/data/experiments/1984.nonlemmatized.improved/Hungarian/1984.sen 20 | /home/daniel/hunglish/data/experiments/1984.nonlemmatized.improved/English/1984.sen 21 | 22 | preproc: 23 | /home/daniel/hunglish/data/experiments/1984.nonlemmatized.improved/Hungarian/1984.sen.low.rok.stem 24 | /home/daniel/hunglish/data/experiments/1984.nonlemmatized.improved/English/1984.sen.low.rok.stem 25 | 26 | hand: 27 | /home/daniel/hunglish/data/experiments/hand.indexes 28 | -------------------------------------------------------------------------------- /regtest/handaligns/1984.hu/en.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.hu/en.pre -------------------------------------------------------------------------------- /regtest/handaligns/1984.hu/hu.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.hu/hu.pre -------------------------------------------------------------------------------- /regtest/handaligns/1984.hu/hu.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.hu/hu.sen -------------------------------------------------------------------------------- /regtest/handaligns/1984.ro.utf8/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | Data obtained from the English-Romanian parallel Multext-East "1984" corpus. 5 | 6 | Note: The Romanian files are named hu.* , to make life easier for our scripts. We apologize for this approach. :) Basically, "hu" is a codeword for "source language" and "en" is a codeword for "target language". 
7 | 8 | .sen files contain the token-level information of that corpus, not including stem information and paragraph structure. 9 | 10 | Encoding is UTF-8. This is the only difference from ../1984.ro 11 | 12 | .pre files contain versions of the sen files with some very crude tokenization, and with punctuation marks deleted. 13 | 14 | ==== 15 | Notes for Hunglish developers: 16 | 17 | Originally at 18 | 19 | sen: 20 | /home/daniel/hunglish/data/experiments/roman2/ro.sen 21 | /home/daniel/hunglish/data/experiments/roman2/en.sen 22 | 23 | preproc: 24 | /home/daniel/hunglish/data/experiments/roman2/ro.sen.deent.low.rok 25 | /home/daniel/hunglish/data/experiments/roman2/en.sen.low.rok 26 | 27 | hand: 28 | /home/daniel/hunglish/data/experiments/roman2/hand.indexes 29 | -------------------------------------------------------------------------------- /regtest/handaligns/1984.ro.utf8/sgmlTolatin2.sed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.ro.utf8/sgmlTolatin2.sed -------------------------------------------------------------------------------- /regtest/handaligns/1984.ro/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | Data obtained from the English-Romanian parallel Multext-East "1984" corpus. 5 | 6 | Note: The Romanian files are named hu.* , to make life easier for our scripts. We apologize for this approach. :) Basically, "hu" is a codeword for "source language" and "en" is a codeword for "target language". 7 | 8 | .sen files contain the token-level information of that corpus, not including stem information and paragraph structure. 9 | 10 | Encoding is ISO Latin 2. 11 | 12 | .pre files contain versions of the sen files with some very crude tokenization, and with punctuation marks deleted. 
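A minimal sketch of what this crude preprocessing might look like, using only standard Unix tools (the exact commands are an assumption; the original preprocessing scripts are not included in this directory):

cat hu.sen | tr '[:upper:]' '[:lower:]' | tr -d '[:punct:]' > hu.pre

The real pipeline also decoded SGML entities first, as suggested by the .deent step in the preproc file names below.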
13 | 14 | ==== 15 | Notes for Hunglish developers: 16 | 17 | Originally at 18 | 19 | sen: 20 | /home/daniel/hunglish/data/experiments/roman2/ro.sen 21 | /home/daniel/hunglish/data/experiments/roman2/en.sen 22 | 23 | preproc: 24 | /home/daniel/hunglish/data/experiments/roman2/ro.sen.deent.low.rok 25 | /home/daniel/hunglish/data/experiments/roman2/en.sen.low.rok 26 | 27 | hand: 28 | /home/daniel/hunglish/data/experiments/roman2/hand.indexes 29 | -------------------------------------------------------------------------------- /regtest/handaligns/1984.ro/hu.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.ro/hu.pre -------------------------------------------------------------------------------- /regtest/handaligns/1984.ro/hu.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.ro/hu.sen -------------------------------------------------------------------------------- /regtest/handaligns/1984.ro/sgmlTolatin2.sed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/1984.ro/sgmlTolatin2.sed -------------------------------------------------------------------------------- /regtest/handaligns/dtm/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | 5 | From Diplomacy and Trade Magazine. (I'm not sure exactly which part of which issue.) 6 | 7 | We are grateful to the original copyright holder for the raw data. 8 | 9 | Sentence-level segmentation and manual alignment built at the 10 | Department of Corpus Linguistics at the Hungarian Research Institute for Linguistics. 11 | 12 | .sen files contain the sentence-level information. 13 | 14 | .pre files contain an automatically processed version of these, for aligner consumption. 15 | Processing steps are: rude tokenization and stemming by the hunstem tool with Hungarian and English resources. 16 | 17 | hand.ladder is the manual alignment of the bitext. 18 | 19 | ==== 20 | Notes for Hunglish developers: 21 | 22 | Based on the text-format alignment ./dtm.bi 23 | 24 | See ../steinbeck/README for details on the conversion from text format to .sen and .ladder files. 
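Judging from the conversion scripts quoted in ../steinbeck/README, a text-format alignment like ./dtm.bi holds one alignment segment per line, with the two language halves separated by a tab and multiple sentences within a half joined by " ~~~ ". A made-up illustration, not taken from dtm.bi (<TAB> stands for a tab character):

Hideg volt. ~~~ Esett az eső.<TAB>It was cold and it was raining.

This would be one 2-1 segment: two Hungarian sentences aligned to a single English sentence.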
25 | 26 | The .pre files were built like this: 27 | 28 | export BINDIR=/home/daniel/Bicorpus/scripts 29 | cat hu.sen | $BINDIR/tok.one.sh | $BINDIR/hu.stem.one.sh > hu.pre 30 | cat en.sen | $BINDIR/tok.one.sh | $BINDIR/en.stem.one.sh > en.pre 31 | -------------------------------------------------------------------------------- /regtest/handaligns/dtm/dtm.bi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/dtm/dtm.bi -------------------------------------------------------------------------------- /regtest/handaligns/dtm/en.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/dtm/en.pre -------------------------------------------------------------------------------- /regtest/handaligns/dtm/en.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/dtm/en.sen -------------------------------------------------------------------------------- /regtest/handaligns/dtm/hand.ladder: -------------------------------------------------------------------------------- 1 | 0 0 2 | 1 1 3 | 2 2 4 | 3 3 5 | 4 4 6 | 5 5 7 | 6 6 8 | 7 7 9 | 8 8 10 | 9 9 11 | 10 10 12 | 11 11 13 | 12 12 14 | 13 13 15 | 14 14 16 | 15 15 17 | 16 16 18 | 17 17 19 | 18 18 20 | 19 19 21 | 20 20 22 | 21 21 23 | 22 22 24 | 23 23 25 | 24 24 26 | 25 25 27 | 26 26 28 | 27 27 29 | 28 28 30 | 29 29 31 | 30 30 32 | 31 31 33 | 32 32 34 | 33 33 35 | 34 34 36 | 35 35 37 | 36 36 38 | 37 37 39 | 38 38 40 | 39 39 41 | 40 40 42 | 41 41 43 | 42 42 44 | 43 43 45 | 44 44 46 | 45 45 47 | 46 46 48 | 47 47 49 | 48 48 50 | 49 49 51 | 50 50 52 | 51 51 53 | 52 52 54 | 53 53 55 | 54 54 56 | 55 55 57 | 56 56 58 | 57 57 59 | 58 58 60 | 59 59 61 | 60 60 62 | 61 61 63 | 62 62 64 | 63 63 65 | 64 64 66 | 65 65 67 | 66 66 68 | 67 67 69 | 68 68 70 | 69 69 71 | 70 70 72 | 71 71 73 | 72 72 74 | 73 73 75 | 74 74 76 | 75 75 77 | 76 76 78 | 77 77 79 | 78 78 80 | 79 79 81 | 80 80 82 | 81 81 83 | 82 82 84 | 83 83 85 | 84 84 86 | 85 85 87 | 86 86 88 | 87 87 89 | 88 88 90 | 89 89 91 | 90 90 92 | 91 91 93 | 92 92 94 | 93 93 95 | 94 94 96 | 95 96 97 | 96 97 98 | 97 98 99 | 98 99 100 | 100 100 101 | 101 101 102 | 102 102 103 | 103 103 104 | 104 104 105 | 105 105 106 | 106 106 107 | 107 107 108 | 108 108 109 | 109 109 110 | 110 110 111 | 111 111 112 | 112 113 113 | 114 114 114 | 115 115 115 | 116 116 116 | 117 117 117 | 118 118 118 | 119 119 119 | 120 120 120 | 122 122 121 | 123 123 122 | 124 124 123 | 126 125 124 | 127 126 125 | 128 127 126 | 129 128 127 | 130 129 128 | 131 130 129 | 132 131 130 | 133 132 131 | 134 133 132 | 135 134 133 | 136 135 134 | 137 136 135 | 138 137 136 | 139 138 137 | 140 139 138 | 141 140 139 | 142 141 140 | 143 142 141 | 144 143 142 | 145 144 143 | 146 145 144 | 147 146 145 | 148 147 146 | 149 148 147 | 150 149 148 | 151 150 149 | 152 151 150 | 153 152 151 | 154 153 152 | 155 154 153 | 156 155 154 | 157 156 155 | 158 157 156 | 159 158 157 | 160 159 158 | 161 160 159 | 162 161 160 | 163 162 161 | 164 163 162 | 165 164 163 | 166 165 164 | 167 166 165 | 168 167 166 | 169 168 167 | 170 169 168 | 171 170 169 | 172 171 170 | 173 172 171 | 174 173 172 | 175 174 173 | 176 175 174 | 177 176 175 | 178 177 176 | 179 178 177 | 180 
179 178 | 181 180 179 | 182 181 180 | 183 182 181 | 184 183 182 | 185 184 183 | 186 185 184 | 187 186 185 | 188 187 186 | 189 188 187 | 191 190 188 | 192 192 189 | 193 193 190 | 194 194 191 | 195 195 192 | 196 196 193 | 197 197 194 | 198 198 195 | 199 199 196 | 200 200 197 | 201 201 198 | 202 202 199 | 203 203 200 | 204 204 201 | 205 205 202 | 206 206 203 | 207 207 204 | 208 208 205 | 209 209 206 | 210 210 207 | 211 211 208 | 212 212 209 | 213 213 210 | 214 214 211 | 215 215 212 | 216 216 213 | 217 218 214 | 218 219 215 | 219 220 216 | 220 221 217 | 221 222 218 | 222 223 219 | 223 224 220 | 224 225 221 | 225 226 222 | 226 227 223 | 227 228 224 | 228 229 225 | 229 230 226 | 231 231 227 | 232 232 228 | 233 233 229 | 234 234 230 | 235 235 231 | 236 236 232 | 237 237 233 | 238 238 234 | 239 239 235 | 240 240 236 | 241 241 237 | 242 242 238 | 243 243 239 | 244 244 240 | 245 245 241 | 246 246 242 | 247 247 243 | 248 248 244 | 249 249 245 | 250 250 246 | 251 251 247 | 252 252 248 | 253 253 249 | 254 254 250 | 255 255 251 | 256 256 252 | 257 257 253 | 258 258 254 | 259 259 255 | 260 260 256 | 261 261 257 | 262 262 258 | 263 263 259 | 264 264 260 | 265 265 261 | 266 266 262 | 267 267 263 | 268 268 264 | 269 269 265 | 270 270 266 | 271 271 267 | 272 272 268 | 273 273 269 | 274 274 270 | 275 275 271 | 276 276 272 | 277 277 273 | 278 278 274 | 279 279 275 | 280 280 276 | 281 281 277 | 282 282 278 | 283 283 279 | 284 284 280 | 285 285 281 | 286 286 282 | 287 287 283 | 288 288 284 | 289 289 285 | 290 290 286 | 291 291 287 | 292 292 288 | 293 293 289 | 295 295 290 | 296 296 291 | 297 297 292 | 298 298 293 | 300 300 294 | 301 301 295 | 302 302 296 | 303 303 297 | 304 304 298 | 305 305 299 | 306 306 300 | 307 307 301 | 308 308 302 | 309 309 303 | 310 310 304 | 311 311 305 | 312 312 306 | 313 313 307 | 314 314 308 | 315 315 309 | 316 316 310 | 317 317 311 | 318 318 312 | 319 319 313 | 320 320 314 | 321 321 315 | 322 322 316 | 323 323 317 | 324 324 318 | 325 325 319 | 326 326 320 | 327 327 321 | 328 328 322 | 329 329 323 | 330 330 324 | 331 331 325 | 332 332 326 | 333 333 327 | 334 334 328 | 335 335 329 | 336 336 330 | 337 337 331 | 338 338 332 | 339 339 333 | 340 340 334 | 341 341 335 | 342 342 336 | 343 343 337 | 344 344 338 | 345 345 339 | 346 346 340 | 347 347 341 | 348 348 342 | 349 349 343 | 350 350 344 | 351 351 345 | 352 352 346 | 353 353 347 | 354 354 348 | 355 355 349 | 356 356 350 | 357 357 351 | 358 358 352 | 359 359 353 | 360 360 354 | 361 361 355 | 362 362 356 | 363 363 357 | 364 364 358 | 365 365 359 | 366 366 360 | 367 367 361 | 368 369 362 | 369 370 363 | 370 371 364 | 371 372 365 | 372 373 366 | 373 374 367 | 374 375 368 | 375 376 369 | 376 377 370 | 377 378 371 | 378 379 372 | 379 380 373 | 380 381 374 | 381 382 375 | 382 383 376 | 383 384 377 | 384 385 378 | 385 386 379 | 386 387 380 | 387 388 381 | 388 389 382 | 389 390 383 | 390 391 384 | 391 392 385 | 392 393 386 | 393 394 387 | 394 395 388 | 395 396 389 | 396 397 390 | 397 398 391 | 398 399 392 | 399 400 393 | 400 401 394 | 401 402 395 | 402 403 396 | 403 404 397 | 404 405 398 | 405 406 399 | 407 407 400 | 408 408 401 | 409 410 402 | 410 411 403 | 411 412 404 | 412 413 405 | 413 414 406 | 414 415 407 | 415 416 408 | 416 417 409 | 417 418 410 | 418 419 411 | 419 420 412 | 420 421 413 | 421 422 414 | 422 423 415 | 423 424 416 | 424 425 417 | 425 426 418 | 426 427 419 | 427 428 420 | 428 429 421 | 429 430 422 | 430 431 423 | 431 432 424 | 432 433 425 | 433 434 426 | 434 435 427 | 435 436 428 | 436 437 429 | 437 438 430 | 438 439 431 | 
439 440 432 | 440 441 433 | 441 442 434 | 442 443 435 | 443 444 436 | 444 445 437 | 445 446 438 | 446 447 439 | 447 448 440 | 448 449 441 | 449 450 442 | 450 451 443 | 451 452 444 | 452 453 445 | 453 454 446 | 454 455 447 | 455 456 448 | 456 457 449 | 457 458 450 | 458 459 451 | 459 460 452 | 460 461 453 | 461 462 454 | 462 463 455 | 463 464 456 | 464 465 457 | 465 467 458 | 466 468 459 | 467 469 460 | 468 470 461 | 469 471 462 | 470 472 463 | 471 473 464 | 472 474 465 | 473 475 466 | 474 476 467 | 475 477 468 | 476 478 469 | 477 479 470 | 478 480 471 | 479 481 472 | 480 482 473 | 481 483 474 | 482 484 475 | 483 485 476 | 484 486 477 | 485 487 478 | 486 488 479 | 487 489 480 | 489 491 481 | 490 492 482 | 491 493 483 | 492 494 484 | 493 495 485 | 494 496 486 | 495 497 487 | 496 498 488 | 497 499 489 | 498 500 490 | 499 501 491 | 500 502 492 | 501 503 493 | 502 504 494 | 503 505 495 | 504 506 496 | 505 507 497 | 506 508 498 | 507 509 499 | 508 510 500 | 509 511 501 | 510 512 502 | 511 513 503 | 512 514 504 | 513 515 505 | 514 516 506 | 515 517 507 | 516 518 508 | 517 519 509 | 518 520 510 | 519 521 511 | 520 522 512 | 521 523 513 | 522 524 514 | 523 525 515 | 524 526 516 | 525 527 517 | 526 528 518 | 527 529 519 | 528 530 520 | 529 531 521 | 530 532 522 | 531 533 523 | 532 534 524 | 533 535 525 | 534 536 526 | 535 537 527 | 536 538 528 | 537 539 529 | 538 540 530 | 539 541 531 | 540 542 532 | 541 543 533 | 542 544 534 | 543 545 535 | 544 546 536 | 545 547 537 | 546 548 538 | 547 549 539 | 548 550 540 | 549 551 541 | 550 552 542 | 551 553 543 | 552 554 544 | 553 555 545 | 554 556 546 | 555 557 547 | 556 558 548 | 557 559 549 | 558 560 550 | 559 561 551 | 560 562 552 | 561 563 553 | 562 564 554 | 563 565 555 | 564 566 556 | 565 567 557 | 566 568 558 | 567 569 559 | 568 570 560 | 569 571 561 | 570 572 562 | 571 573 563 | 572 574 564 | 573 575 565 | 574 576 566 | 575 577 567 | 576 578 568 | 577 579 569 | 578 580 570 | 579 581 571 | 580 582 572 | 581 583 573 | -------------------------------------------------------------------------------- /regtest/handaligns/dtm/hu.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/dtm/hu.pre -------------------------------------------------------------------------------- /regtest/handaligns/dtm/hu.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/dtm/hu.sen -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken.nopara/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | John Steinbeck's "Cup of Gold". Raw data obtained from the Hungarian Electronic Library and Project Gutenberg. 5 | 6 | .sen files contain the sentence-level information. Sentence-level segmentation by our huntoken tool. (Note: Unedited by hand, so not perfect at all.) 7 | 8 | Although it strictly respects this imperfect sentence-level segmentation, the alignment itself is hand-edited, and should be error-free. 9 | 10 | IMPORTANT NOTE: This directory contains the same data as steinbeck.huntoken, with just one difference: This corpus was built by throwing away all the (automatically obtained) paragraph information from the steinbeck.huntoken corpus. 
11 | 12 | .pre files contain an automatically processed version of the .sen files, for aligner consumption. 13 | Processing steps are: rude tokenization, lowercasing, stemming by the hunstem tool with Hungarian and English resources. 14 | 15 | ==== 16 | Notes for Hunglish developers: 17 | 18 | Originally at 19 | 20 | sen: 21 | ~/hunglish/data/experiments/microsoft/steinbeck/kornaiscript/hu.really.sen 22 | ~/hunglish/data/experiments/microsoft/steinbeck/kornaiscript/en.really.sen 23 | 24 | preproc: 25 | ~/hunglish/data/experiments/microsoft/steinbeck/kornaiscript/hu.sen.low.rok.stem 26 | ~/hunglish/data/experiments/microsoft/steinbeck/kornaiscript/en.sen.low.rok.stem 27 | 28 | hand: 29 | ~/hunglish/data/experiments/microsoft/steinbeck/kornaiscript/autole1tra 30 | -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken.nopara/en.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken.nopara/en.pre -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken.nopara/en.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken.nopara/en.sen -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken.nopara/hu.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken.nopara/hu.pre -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken.nopara/hu.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken.nopara/hu.sen -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | John Steinbeck's "Cup of Gold". Raw data obtained from the Hungarian Electronic Library and Project Gutenberg. 5 | 6 | Sentence-level segmentation and paragraph-information by our huntoken tool. (Note: Unedited by hand, so not perfect at all.) 7 | 8 | Although it strictly respects this imperfect sentence-level segmentation, the alignment itself is hand-edited, and should be error-free. 9 | It was built at the Department of Corpus Linguistics at the Hungarian Research Institute for Linguistics. 10 | 11 | .sen files contain the sentence-level information. Paragraph structure is described by paragraph-delimiter quasi-sentences. 12 | 13 | .pre files contain an automatically processed version of these, for aligner consumption. 14 | Processing steps are: rude tokenization, lowercasing, stemming by the hunstem tool with Hungarian and English resources. 
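As far as we know, a paragraph-delimiter quasi-sentence is a line consisting of the single token <p>, following hunalign's usual convention (the exact marker is an assumption, not verified against these particular .sen files):

Last sentence of a paragraph.
<p>
First sentence of the next paragraph.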
15 | 16 | ==== 17 | Notes for Hunglish developers: 18 | 19 | Originally at 20 | 21 | sen: 22 | ~/hunglish/data/experiments/Steinbeck2/Steinbeck_1.hu.sen 23 | ~/hunglish/data/experiments/Steinbeck2/Steinbeck_1.en.sen 24 | 25 | preproc: 26 | ~/hunglish/data/experiments/Steinbeck.improve/Steinbeck_1.hu.sen.low.rok.stem 27 | ~/hunglish/data/experiments/Steinbeck.improve/Steinbeck_1.en.sen.low.rok.stem 28 | 29 | hand: 30 | ~/hunglish/data/experiments/Steinbeck.compare/ladder.hand.nostartendpara.txt 31 | -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken/en.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken/en.pre -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken/en.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken/en.sen -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken/hu.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken/hu.pre -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck.huntoken/hu.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck.huntoken/hu.sen -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck/README: -------------------------------------------------------------------------------- 1 | ==== 2 | Notes for users: 3 | 4 | John Steinbeck's "Cup of Gold". Raw data obtained from the Hungarian Electronic Library and Project Gutenberg. 5 | 6 | Sentence-level segmentation and paragraph information were hand-edited at the 7 | Department of Corpus Linguistics at the Hungarian Research Institute for Linguistics. 8 | 9 | .sen files contain the sentence-level information. Paragraph structure is described by paragraph-delimiter quasi-sentences. 10 | 11 | .pre files contain an automatically processed version of these, for aligner consumption. 12 | Processing steps are: rude tokenization and stemming by the hunstem tool with Hungarian and English resources. 13 | 14 | auto.ladder was built by hunalign from hu.pre and en.pre, with default arguments. 15 | 16 | hand.ladder is the manual alignment of the bitext, also by the 17 | Department of Corpus Linguistics at the Hungarian Research Institute for Linguistics. 18 | It is based on auto.ladder. 
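A note on the ladder format, as far as it can be reconstructed from the conversion script quoted below: each line of a .ladder file is a rung, a pair of cumulative sentence counts (source count, then target count) at a segment boundary, so two consecutive rungs delimit one aligned segment. For example, the rungs

0 0
1 1
3 2

describe a 1-1 segment (the first source sentence paired with the first target sentence) followed by a 2-1 segment (the next two source sentences paired with the next target sentence). hunalign's own output, such as auto.ladder, adds a third column holding a per-rung confidence score (this is our recollection of hunalign's behavior, so treat it as an assumption).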
19 | 20 | ==== 21 | Notes for Hunglish developers: 22 | 23 | Originally at 24 | 25 | sen: 26 | 27 | Originally from 28 | /home/daniel/hunglish/data/experiments/Steinbeck.handsegment/hu.sen 29 | /home/daniel/hunglish/data/experiments/Steinbeck.handsegment/en.sen 30 | 31 | , but some whitespaces were retroactively removed, based on the text version of the manual alignment: 32 | cat align.hand.txt | cut -f1 | awk 'BEGIN {FS=" ~~~ "} { for (i=1;i<=NF;++i) { print $(i) } }' | grep -v "^$" | sed "s/ $//" | sed "s/^ //" > hu.sen.hand 33 | cat align.hand.txt | cut -f2 | awk 'BEGIN {FS=" ~~~ "} { for (i=1;i<=NF;++i) { print $(i) } }' | grep -v "^$" | sed "s/ $//" | sed "s/^ //" > en.sen.hand 34 | 35 | preproc: 36 | /home/daniel/hunglish/data/experiments/Steinbeck.handsegment/hu.pre 37 | /home/daniel/hunglish/data/experiments/Steinbeck.handsegment/en.pre 38 | 39 | auto: 40 | /home/daniel/hunglish/data/experiments/Steinbeck.handsegment/auto.ladder 41 | 42 | hand: 43 | /home/joker/steinbeck/kesz.steinbeck_align_rev.v1.0.txt 44 | or 45 | /home/daniel/hunglish/data/handaligns/steinbeck/kesz.steinbeck_align_rev.v1.0.txt 46 | 47 | The text-to-ladder conversion script used: 48 | 49 | cat kesz.steinbeck_align_rev.v1.0.txt | sed "s/\([^ ]\)~~~ /\1 ~~~ /g" | sed "s/ ~~~\([^ ]\)/ ~~~ \1/g" | sed "s/\([^ ]\)~~~\([^ ]\)/\1 ~~~ \2/g" | grep -v "^$" > align.hand.txt 50 | mkdir tmp 51 | export file=align.hand.txt 52 | half=1 ; cat $file | grep -v "^.$" | cut -f$half | awk 'BEGIN {FS=" ~~~ "; s=0 } { print s; s+=NF } END { print s }' > tmp/ladder.$half 53 | half=2 ; cat $file | grep -v "^.$" | cut -f$half | awk 'BEGIN {FS=" ~~~ "; s=0 } { print s; s+=NF } END { print s }' > tmp/ladder.$half 54 | paste tmp/ladder.1 tmp/ladder.2 | tr '\t' ' ' > hand.ladder 55 | -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck/en.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck/en.pre -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck/en.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck/en.sen -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck/hu.pre: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck/hu.pre -------------------------------------------------------------------------------- /regtest/handaligns/steinbeck/hu.sen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/handaligns/steinbeck/hu.sen -------------------------------------------------------------------------------- /regtest/regtest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | bindir=../src/hunalign 4 | 5 | 6 | fscorer() { 7 | prec=`cat $1 | grep "^Precision" | tr -d ',' | cut -f2 -d' '` 8 | recall=`cat $1 | grep "^Precision" | tr -d ',' | cut -f4 -d' '` 9 | fscore=`echo "2/(1/$prec+1/$recall)" | bc -l | awk '{ print $0+0 }'` 10 | 
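# Note: the formula above is the harmonic mean of precision and recall,
# F1 = 2*P*R/(P+R), written as 2/(1/P+1/R) for bc; the awk '{ print $0+0 }'
# pass coerces bc's output (e.g. ".993488...") into a plain decimal number.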
echo "F-score: $fscore" 11 | echo 12 | } 13 | 14 | evaluator() { 15 | echo "==================================" 16 | echo "Expected:" 17 | cat $1 | tail -3 18 | fscorer $1 19 | targetfscore=$fscore 20 | echo "Achieved:" 21 | cat $2 | tail -3 22 | fscorer $2 23 | targetfscore=$targetfscore fscore=$fscore awk ' 24 | BEGIN{ 25 | print "Expected F-score:", ENVIRON["targetfscore"], " Achieved F-score:",ENVIRON["fscore"] 26 | if (ENVIRON["targetfscore"]>ENVIRON["fscore"]) 27 | { 28 | print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> REGRESSION in F-score <<<<<<<<<<<<<<<<<<<<<<<<<<<<<" 29 | } 30 | else 31 | { 32 | print "No regression found" 33 | } 34 | }' 35 | echo 36 | } 37 | 38 | name=1984.ro.utf8.realign 39 | echo "Testing $name ..." 40 | target=targets/$name.cerr 41 | file=results/$name.cerr 42 | ofile=results/$name.cout 43 | $bindir/hunalign -realign -hand=handaligns/1984.ro.utf8/hand.ladder ../data/null.dic handaligns/1984.ro.utf8/hu.pre handaligns/1984.ro.utf8/en.pre -bisent > $ofile 2> $file 44 | evaluator $target $file 45 | 46 | name=1984.hu 47 | echo "Testing $name ..." 48 | target=targets/$name.cerr 49 | file=results/$name.cerr 50 | ofile=results/$name.cout 51 | $bindir/hunalign -onebyteencoding -hand=handaligns/1984.hu/hand.ladder ../data/hu-en.dic handaligns/1984.hu.handstem/hu.pre handaligns/1984.hu.handstem/en.pre -bisent > $ofile 2> $file 52 | evaluator $target $file 53 | 54 | name=1984.hu.handstem.realign 55 | echo "Testing $name ..." 56 | target=targets/$name.cerr 57 | file=results/$name.cerr 58 | ofile=results/$name.cout 59 | $bindir/hunalign -onebyteencoding -realign -hand=handaligns/1984.hu/hand.ladder ../data/hu-en.dic handaligns/1984.hu.handstem/hu.pre handaligns/1984.hu.handstem/en.pre -bisent > $ofile 2> $file 60 | evaluator $target $file 61 | 62 | name=steinbeck.huntoken.nopara 63 | echo "Testing $name ..." 64 | target=targets/$name.cerr 65 | file=results/$name.cerr 66 | ofile=results/$name.cout 67 | $bindir/hunalign -onebyteencoding -hand=handaligns/steinbeck.huntoken.nopara/hand.ladder ../data/hu-en.dic handaligns/steinbeck.huntoken.nopara/hu.pre handaligns/steinbeck.huntoken.nopara/en.pre > $ofile 2> $file 68 | evaluator $target $file 69 | 70 | name=1984.ro.realign 71 | echo "Testing $name ..." 72 | target=targets/$name.cerr 73 | file=results/$name.cerr 74 | ofile=results/$name.cout 75 | $bindir/hunalign -onebyteencoding -realign -hand=handaligns/1984.ro/hand.ladder ../data/null.dic handaligns/1984.ro/hu.pre handaligns/1984.ro/en.pre -bisent > $ofile 2> $file 76 | evaluator $target $file 77 | 78 | name=dtm.realign 79 | echo "Testing $name ..." 80 | target=targets/$name.cerr 81 | file=results/$name.cerr 82 | ofile=results/$name.cout 83 | $bindir/hunalign -onebyteencoding -hand=handaligns/dtm/hand.ladder ../data/hu-en.dic handaligns/dtm/hu.pre handaligns/dtm/en.pre > $ofile 2> $file 84 | evaluator $target $file 85 | -------------------------------------------------------------------------------- /regtest/results/dummy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/regtest/results/dummy -------------------------------------------------------------------------------- /regtest/targets/1984.hu.cerr: -------------------------------------------------------------------------------- 1 | Reading dictionary... 2 | 6733 hungarian sentences read. 3 | 6738 english sentences read. 4 | quasiglobal_stopwordRemoval is set to 0 5 | Simplified dictionary ready. 
6 | Rough translation ready. 7 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 8 | Rough translation-based similarity matrix ready. 9 | Matrix built. 10 | Trail found. 11 | Align ready. 12 | Global quality of unfiltered align 1.06407 13 | quasiglobal_spaceOutBySentenceLength is set to 1 14 | Trail spaced out by sentence length. 15 | Global quality of unfiltered align after realign 1.06407 16 | 42 misaligned out of 6446 correct items, 6450 bets. 17 | Precision: 0.993488, Recall: 0.994105 18 | Quality 1.06407 19 | -------------------------------------------------------------------------------- /regtest/targets/1984.hu.handstem.realign.cerr: -------------------------------------------------------------------------------- 1 | Reading dictionary... 2 | 6733 hungarian sentences read. 3 | 6738 english sentences read. 4 | quasiglobal_stopwordRemoval is set to 0 5 | Simplified dictionary ready. 6 | Rough translation ready. 7 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 8 | Rough translation-based similarity matrix ready. 9 | Matrix built. 10 | Trail found. 11 | Align ready. 12 | Global quality of unfiltered align 1.06407 13 | 141409 items inside the border. 14 | Border of realign zone determined. 15 | 6558 bisentences collected. 16 | Plausible bisentences filtered. 17 | Removing stopwords...Removing identicals... 1900 1914 1920 1925 1930 1940 1944 1945 1960 1965 1968 1970 1973 1983 1984 2050 2713 6079 B Berlin Brazzaville Ceylon Colchester Emmanuel Goldstein India J. Kent London Martin's-in-the-Fields Minipax O'Brien Parsons Smith Swift Syme Tibet Winston Withers York agitprop album alibi atom cent film frigid front gallon gin hall hang lift memorandum mind minimum modern most overall park pint propaganda reflex reform reformer sport staccato stop tank tea terror times vitriol 18 | 73 identical translations found. 19 | Removing hapaxes...503 hapax-based dictionary items found. 20 | Building CorpusConstellation... Done. 21 | 3795 items left in original dictionary. 22 | Removing stopwords...Removing identicals... 23 | 0 identical translations found. 24 | Removing hapaxes...0 hapax-based dictionary items found. 25 | Building CorpusConstellation... Done. 26 | 2234 new dictionary items found. 27 | Simplified dictionary ready. 28 | Rough translation ready. 29 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 Matrix built. 30 | Trail found. 31 | Detail realign ready. 32 | Global quality of unfiltered align after realign 1.02754 33 | quasiglobal_spaceOutBySentenceLength is set to 1 34 | Trail spaced out by sentence length. 35 | Global quality of unfiltered align after realign 1.02754 36 | 49 misaligned out of 6446 correct items, 6446 bets. 
37 | Precision: 0.992398, Recall: 0.992398 38 | Quality 1.02754 39 | -------------------------------------------------------------------------------- /regtest/targets/1984.ro.realign.cerr: -------------------------------------------------------------------------------- 1 | Reading dictionary... 2 | 6481 hungarian sentences read. 3 | 6706 english sentences read. 4 | quasiglobal_stopwordRemoval is set to 0 5 | Simplified dictionary ready. 6 | Rough translation ready. 7 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 8 | Rough translation-based similarity matrix ready. 9 | Matrix built. 10 | Trail found. 11 | Align ready. 12 | Global quality of unfiltered align 0.318931 13 | 138326 items inside the border. 14 | Border of realign zone determined. 15 | 6151 bisentences collected. 16 | Plausible bisentences filtered. 17 | Removing stopwords...Removing identicals... ! ( ) - . 100 12 1900 1914 1920 1925 1930 1940 1944 1945 1960 1965 1968 1970 1973 1983 1984 20 2050 2713 3 6079 83 85 98 99 : ; ? aaronson absurd accent accident acid act adam adjectival adoptive adverb africa african agent agitprop ah album alibi amersham ampleforth ancestral animal anti-sex apologetic are artificial atlantic australasia avenue b baal bar barman berkhamsted boy brazzaville bumstead byron c canada cap capitalism capitalist care caricaturist caste cent central charrington chaucer china civil clang clement colchester comintern competent complex concept concrete conflict congo consider constant contact continue contralto control coral crime cromwell darwin de definitive demonstrative dickens din direct district doctor doctrine document elaborate electric emmanuel etc eurasia eurasian european evident exact exist face false familiar fanatic film force fragment franca frigid front general gestapo gin goldstein helicopter i ideal ignorant iii imagine important in incident independent india indian indirect individual individualism inferior inprecorr instinct instinctive instrument interval j java jefferson jones julia karl kipling leopoldville lift lingua malabar martin marx material metal milton minimum minipax minute moment mongolia monument mother motor murmur natural nazi negroid new noctis normal o'brien oceania ogilvy oliver omnipotent optimism or orator orgiastic osiris pacific paddington palimpsest pancras pardon paris parsons patriotic patriotism pedant perfect permanent persia pistol plan pneumatic popular pornosec post pretext primae primitive principal protector protest public pure raid rare real respect rest reverie rival robin romantic rutherford secret separate sex sexual shaftesbury shakespeare shoreditch siberia simple slogan smith socialism socialist solar solemn solid solipsism spasm special specialist spirit splendid sport standard stepney stop submarine superior surplus suspect swift syme tibet tic tillotson times tolerant tom total tour transparent trivial valet vast verb verbal versificator violent vistula vitriol w weeks wilsher winston withers york 18 | 294 identical translations found. 19 | Removing hapaxes...558 hapax-based dictionary items found. 20 | Building CorpusConstellation... Done. 21 | 0 items left in original dictionary. 22 | Removing stopwords...Removing identicals... 23 | 0 identical translations found. 
24 | Removing hapaxes...0 hapax-based dictionary items found. 25 | Building CorpusConstellation... Done. 26 | 2305 new dictionary items found. 27 | Simplified dictionary ready. 28 | Rough translation ready. 29 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 Matrix built. 30 | Trail found. 31 | Detail realign ready. 32 | Global quality of unfiltered align after realign 0.819735 33 | quasiglobal_spaceOutBySentenceLength is set to 1 34 | Trail spaced out by sentence length. 35 | Global quality of unfiltered align after realign 0.819735 36 | 119 misaligned out of 6015 correct items, 6060 bets. 37 | Precision: 0.980363, Recall: 0.987697 38 | Quality 0.819735 39 | -------------------------------------------------------------------------------- /regtest/targets/1984.ro.utf8.realign.cerr: -------------------------------------------------------------------------------- 1 | Reading dictionary... 2 | 6481 hungarian sentences read. 3 | 6706 english sentences read. 4 | quasiglobal_stopwordRemoval is set to 0 5 | Simplified dictionary ready. 6 | Rough translation ready. 7 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 8 | Rough translation-based similarity matrix ready. 9 | Matrix built. 10 | Trail found. 11 | Align ready. 12 | Global quality of unfiltered align 0.318931 13 | 138326 items inside the border. 14 | Border of realign zone determined. 15 | 6151 bisentences collected. 16 | Plausible bisentences filtered. 17 | Removing stopwords...Removing identicals... ! ( ) - . 100 12 1900 1914 1920 1925 1930 1940 1944 1945 1960 1965 1968 1970 1973 1983 1984 20 2050 2713 3 6079 83 85 98 99 : ; ? 
aaronson absurd accent accident acid act adam adjectival adoptive adverb africa african agent agitprop ah album alibi amersham ampleforth ancestral animal anti-sex apologetic are artificial atlantic australasia avenue b baal bar barman berkhamsted boy brazzaville bumstead byron c canada cap capitalism capitalist care caricaturist caste cent central charrington chaucer china civil clang clement colchester comintern competent complex concept concrete conflict congo consider constant contact continue contralto control coral crime cromwell darwin de definitive demonstrative dickens din direct district doctor doctrine document elaborate electric emmanuel etc eurasia eurasian european evident exact exist face false familiar fanatic film force fragment franca frigid front general gestapo gin goldstein helicopter i ideal ignorant iii imagine important in incident independent india indian indirect individual individualism inferior inprecorr instinct instinctive instrument interval j java jefferson jones julia karl kipling leopoldville lift lingua malabar martin marx material metal milton minimum minipax minute moment mongolia monument mother motor murmur natural nazi negroid new noctis normal o'brien oceania ogilvy oliver omnipotent optimism or orator orgiastic osiris pacific paddington palimpsest pancras pardon paris parsons patriotic patriotism pedant perfect permanent persia pistol plan pneumatic popular pornosec post pretext primae primitive principal protector protest public pure raid rare real respect rest reverie rival robin romantic rutherford secret separate sex sexual shaftesbury shakespeare shoreditch siberia simple slogan smith socialism socialist solar solemn solid solipsism spasm special specialist spirit splendid sport standard stepney stop submarine superior surplus suspect swift syme tibet tic tillotson times tolerant tom total tour transparent trivial valet vast verb verbal versificator violent vistula vitriol w weeks wilsher winston withers york 18 | 294 identical translations found. 19 | Removing hapaxes...558 hapax-based dictionary items found. 20 | Building CorpusConstellation... Done. 21 | 0 items left in original dictionary. 22 | Removing stopwords...Removing identicals... 23 | 0 identical translations found. 24 | Removing hapaxes...0 hapax-based dictionary items found. 25 | Building CorpusConstellation... Done. 26 | 2305 new dictionary items found. 27 | Simplified dictionary ready. 28 | Rough translation ready. 29 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 Matrix built. 30 | Trail found. 31 | Detail realign ready. 32 | Global quality of unfiltered align after realign 0.819839 33 | quasiglobal_spaceOutBySentenceLength is set to 1 34 | Trail spaced out by sentence length. 35 | Global quality of unfiltered align after realign 0.819839 36 | 119 misaligned out of 6015 correct items, 6060 bets. 37 | Precision: 0.980363, Recall: 0.987697 38 | Quality 0.819839 39 | -------------------------------------------------------------------------------- /regtest/targets/dtm.realign.cerr: -------------------------------------------------------------------------------- 1 | Reading dictionary... 2 | 582 hungarian sentences read. 3 | 584 english sentences read. 
4 | quasiglobal_stopwordRemoval is set to 0 5 | Simplified dictionary ready. 6 | Rough translation ready. 7 | 0 100 200 300 400 500 8 | Rough translation-based similarity matrix ready. 9 | Matrix built. 10 | Trail found. 11 | Align ready. 12 | Global quality of unfiltered align 1.31448 13 | quasiglobal_spaceOutBySentenceLength is set to 1 14 | Trail spaced out by sentence length. 15 | Global quality of unfiltered align after realign 1.31448 16 | 11 misaligned out of 572 correct items, 580 bets. 17 | Precision: 0.981034, Recall: 0.994755 18 | Quality 1.31448 19 | -------------------------------------------------------------------------------- /regtest/targets/steinbeck.huntoken.nopara.cerr: -------------------------------------------------------------------------------- 1 | Reading dictionary... 2 | 5487 hungarian sentences read. 3 | 5357 english sentences read. 4 | quasiglobal_stopwordRemoval is set to 0 5 | Simplified dictionary ready. 6 | Rough translation ready. 7 | 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 8 | Rough translation-based similarity matrix ready. 9 | Matrix built. 10 | Trail found. 11 | Align ready. 12 | Global quality of unfiltered align 0.951101 13 | quasiglobal_spaceOutBySentenceLength is set to 1 14 | Trail spaced out by sentence length. 15 | Global quality of unfiltered align after realign 0.951101 16 | 152 misaligned out of 5180 correct items, 5229 bets. 17 | Precision: 0.970931, Recall: 0.980116 18 | Quality 0.951101 19 | -------------------------------------------------------------------------------- /scripts/DCEP/README: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | # 3 | # DCEP sentence aligned corpora for 276 language pairs 4 | # 5 | ####################################################### 6 | 7 | ######## 8 | # Usage 9 | 10 | Example: How to get Danish-Latvian sentence-aligned text? 11 | 12 | 0. Enter a directory where the corpus building will take place. 13 | (You can build several language pairs in this same directory.) 14 | 15 | 1. Download and extract the two sentence-segmented monolingual corpora: 16 | 17 | wget http://optima.jrc.it/Resources/DCEP-2013/sentences/DCEP-sentence-DA-pub.tar.bz2 18 | wget http://optima.jrc.it/Resources/DCEP-2013/sentences/DCEP-sentence-LV-pub.tar.bz2 19 | (Note to coauthors: shame on me, maybe we should remove the year reference from the path?) 20 | tar jxf DCEP-sentence-DA-pub.tar.bz2 21 | tar jxf DCEP-sentence-LV-pub.tar.bz2 22 | 23 | The sentence-segmented text is now in the ./DCEP/sentence/(xml|sgml)/(DA|LV) subdirectories. 24 | 25 | 2. Download and extract the alignment information: 26 | 27 | wget http://people.mokk.bme.hu/~daniel/DCEP/langpairs/DCEP-DA-LV.tar.bz2 28 | tar jxf DCEP-DA-LV.tar.bz2 29 | 30 | The alignment information contains the correspondence between numerical indices 31 | of sentences; in the next step we will turn these into actual sentence pairs. 32 | 33 | Note that the order is alphabetical in language code: DA-LV is good, LV-DA is not. 34 | The alignment information is now in the aligns/DA-LV subdirectory, 35 | and the index describing the correspondence between text documents is in the indices/DA-LV text file. 36 | Bidocuments are identified by 6-digit numeric ids. 37 | 
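For orientation: each file under aligns/DA-LV is a so-called ladder file, with one rung per line. A rung is a pair of 0-based sentence indices (a third, confidence-score column may also be present; the tools accept both forms), and two consecutive rungs delimit one aligned block. A schematic example, with invented indices:

    0	0
    1	1
    2	3

Here the rungs (1,1) and (2,3) say that sentence 1 of the DA document corresponds to sentences 1 and 2 of the LV document. The final rung of a ladder only marks the total sizes of the two documents.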
38 | 3. Download, extract, and run the tool that generates the bicorpus from the above data: 39 | 40 | wget http://people.mokk.bme.hu/~daniel/DCEP/DCEP-tools.tgz 41 | tar zxvf DCEP-tools.tgz 42 | ./src/languagepair.py DA-LV > DA-LV-bisentences.txt 43 | 44 | You need Python installed, version 2.5, 2.6, or 2.7. 45 | 46 | The output is a tab-delimited UTF-8 text file with two columns. 47 | It contains all corresponding sentence pairs identified by hunalign, the 48 | automatic sentence aligner we used to create the alignment information. 49 | The information about the source document of the sentence pair is lost 50 | in this output format. See below for command line switches that can alter this 51 | behavior. 52 | 53 | If you don't roll your own filter, we recommend using the --numbering-filter 54 | switch, which drops most of the numberings and other lower-quality sentences: 55 | 56 | ./src/languagepair.py --numbering-filter DA-LV > DA-LV-bisentences.txt 57 | 58 | See below for more detail. 59 | 60 | 61 | ################# 62 | # Advanced usage 63 | 64 | ./src/languagepair.py -h shows the available command line options. 65 | Here we give a bit more background for them. 66 | 67 | The original document structure is preserved with the --no-merge command line switch. 68 | This will create aligned text documents in ./bitexts/DA-LV. 69 | The numeric ids are used as file names, e.g. bitexts/DA-LV/013563. 70 | The indices/DA-LV table can be used to find the correspondence between the bidocument and the 71 | original DCEP filenames. 72 | 73 | By default, the script takes the index file describing the document pairings 74 | from the indices/DA-LV file. This behavior can be changed with the --index-file argument. 75 | Here is a Unix example that only processes the first 10 documents of the index: 76 | ./src/languagepair.py --index-file <( head -10 indices/DA-LV ) > DA-LV-bisentences.txt 77 | 78 | With the --not-just-bisentences switch, the output format changes: 79 | It is one alignment unit per line, where an alignment unit consists of 80 | two tab-separated columns, one for each language. In each column, 81 | there is a " ~~~ "-separated list of sentences. It is possible that one 82 | of the columns is empty: that means that the aligner did not find a matching 83 | pair for the other column. The default " ~~~ " can be changed with the 84 | --delimiter command line argument. 85 | 86 | There are command line arguments that can be used to throw away suspicious 87 | bisentences if extra precision is required, at the expense of recall. 88 | 89 | The --numbering-filter is a crude but useful heuristic that attempts to drop numberings 90 | and short titles from the output. It works simply by matching sentences on both sides 91 | against a Unicode regex that looks for two alphabetic characters with a space between them. 92 | 93 | The --length-filter-level=LENGTH_FILTER_LEVEL argument is used to throw away as suspicious 94 | all bisentences where the ratio of the shorter and the longer sentence (in character length) 95 | is less than LENGTH_FILTER_LEVEL percent. 96 | 97 | The --topo-filter-level=TOPO_FILTER_LEVEL argument is used to throw away 98 | bisentences that appear in suspicious blocks of bisegments. A block of 99 | bisegments is determined to be suspicious if the ratio of 1-to-1 bisegments it contains 100 | is less than TOPO_FILTER_LEVEL percent. The heuristic works with blocks of size 100. 101 | This heuristic is useful to identify and remove segments of text where the original 102 | documents differed in larger parts. (Parts were left untranslated, different order of chapters, etc.)
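For concreteness, the length filter boils down to the test below, shown as a condensed
Python sketch of the isAcceptableLength helper shipped in the bundled ladder2text.py
(the helper name acceptable_length is ours; the +1 terms guard against empty sentences):

    def acceptable_length(sen1, sen2, length_filter_level):
        if length_filter_level is None:
            return True                # filtering is off by default
        h, e = len(sen1) + 1, len(sen2) + 1
        ratio = float(h) / e
        if ratio > 1:
            ratio = 1 / ratio          # shorter-to-longer ratio, in (0,1]
        return ratio >= length_filter_level / 100.0

A bisentence is kept only if this test returns True for its two sides.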
103 | 104 | -------------------------------------------------------------------------------- /scripts/DCEP/README.md: -------------------------------------------------------------------------------- 1 | # DCEP sentence aligned corpora for 276 language pairs 2 | 3 | ## Basic usage 4 | 5 | Example: How to get Danish-Latvian sentence-aligned text? 6 | 7 | ###### Get monolingual data 8 | 9 | Enter a directory where the corpus building will take place. 10 | (You can build several language pairs in this same directory.) 11 | 12 | Download and extract the two sentence-segmented monolingual corpora: 13 | 14 | ``` 15 | wget http://optima.jrc.it/Resources/DCEP-2013/sentences/DCEP-sentence-DA-pub.tar.bz2 16 | wget http://optima.jrc.it/Resources/DCEP-2013/sentences/DCEP-sentence-LV-pub.tar.bz2 17 | tar jxf DCEP-sentence-DA-pub.tar.bz2 18 | tar jxf DCEP-sentence-LV-pub.tar.bz2 19 | ``` 20 | 21 | The sentence-segmented text is now in the `./DCEP/sentence/(xml|sgml)/(DA|LV)` subdirectories. 22 | 23 | ###### Get alignment data 24 | 25 | Download and extract the alignment information: 26 | 27 | ``` 28 | wget http://people.mokk.bme.hu/~daniel/DCEP/langpairs/DCEP-DA-LV.tar.bz2 29 | tar jxf DCEP-DA-LV.tar.bz2 30 | ``` 31 | 32 | The alignment information contains the correspondence between numerical indices 33 | of sentences; in the next step we will turn these into actual sentence pairs. 34 | 35 | Note that the order is alphabetical in language code: `DA-LV` is good, `LV-DA` is not. 36 | The alignment information is now in the `aligns/DA-LV` subdirectory, 37 | and the index describing the correspondence between text documents is in the `indices/DA-LV` text file. 38 | Bidocuments are identified by 6-digit numeric ids. 39 | 40 | ###### Create bicorpus 41 | 42 | Now we download, extract, and run the tool that generates the bicorpus from the above data: 43 | 44 | ``` 45 | wget http://people.mokk.bme.hu/~daniel/DCEP/DCEP-tools.tgz 46 | tar zxvf DCEP-tools.tgz 47 | ./src/languagepair.py DA-LV > DA-LV-bisentences.txt 48 | ``` 49 | 50 | You need Python 2.5, 2.6, or 2.7 installed to run the tool. 51 | 52 | The output is a tab-delimited UTF-8 text file with two columns. 53 | It contains all corresponding sentence pairs identified by hunalign, the 54 | automatic sentence aligner we used to create the alignment information. 55 | The information about the source document of the sentence pair is lost 56 | in this output format. See below for command line switches that can alter this 57 | behavior. 58 | 59 | If you don't roll your own sentence filter, we recommend using the `--numbering-filter` 60 | option, which drops most of the numberings that are very common in the corpus: 61 | 62 | ``` 63 | ./src/languagepair.py --numbering-filter DA-LV > DA-LV-bisentences.txt 64 | ``` 65 | 66 | See below for more detail. 67 | 68 | 69 | ## Advanced usage 70 | 71 | `./src/languagepair.py -h` shows the available command line options. 72 | Here we give a bit more background for them. 73 | 74 | The original document structure is preserved with the `--no-merge` option. 75 | This will create aligned text documents in `./bitexts/DA-LV`. 76 | The numeric ids are used as file names, e.g. `bitexts/DA-LV/013563`. 77 | The `indices/DA-LV` table can be used to find the correspondence between the bidocument and the 78 | original DCEP filenames. 79 | 80 | By default, the script looks for the index file describing the document pairings 81 | at `indices/DA-LV`. This behavior can be changed with the `--index-file` argument.
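Each row of an index file is tab-separated: the 6-digit bidocument id, followed by the paths of the two sentence-segmented files. Schematically (the file names below are made up; real entries follow the DCEP directory structure):

```
013563	DCEP/sentence/xml/DA/somedoc.xml	DCEP/sentence/xml/LV/somedoc.xml
```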
82 | Here is a Unix example that only processes the first 10 documents of the index: 83 | 84 | ```./src/languagepair.py --index-file <( head -10 indices/DA-LV ) > DA-LV-bisentences.txt``` 85 | 86 | With the `--not-just-bisentences` switch, the output format changes: 87 | It is one alignment unit per line, where an alignment unit consists of 88 | two tab-separated columns, one for each language. In each column, 89 | there is a `" ~~~ "`-separated list of sentences. It is possible that one 90 | of the columns is empty: that means that the aligner did not find a matching 91 | pair for the other column. The default `" ~~~ "` can be changed with the 92 | `--delimiter` command line argument. 93 | 94 | There are command line arguments that can be used to throw away suspicious 95 | bisentences if extra precision is required, at the expense of recall. 96 | 97 | The `--numbering-filter` is a crude but useful heuristic that attempts to drop numberings 98 | and short titles from the output. It works simply by matching sentences on both sides 99 | against a Unicode regex that looks for two alphabetic characters with a space between them. 100 | 101 | The `--length-filter-level=LENGTH_FILTER_LEVEL` argument is used to throw away as suspicious 102 | all bisentences where the ratio of the shorter and the longer sentence (in character length) 103 | is less than `LENGTH_FILTER_LEVEL` percent. 104 | 105 | The `--topo-filter-level=TOPO_FILTER_LEVEL` argument is used to throw away 106 | bisentences that appear in suspicious blocks of bisegments. A block of 107 | bisegments is determined to be suspicious if the ratio of 1-to-1 bisegments it contains 108 | is less than `TOPO_FILTER_LEVEL` percent. The heuristic works with blocks of size 100. 109 | This heuristic is useful to identify and remove segments of text where the original 110 | documents differed in larger parts. (Parts were left untranslated, different order of chapters, etc.) 111 | 112 | -------------------------------------------------------------------------------- /scripts/DCEP/batchfilebylangpair.2ndcpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p langpairs/batch2 4 | 5 | awk 'BEGIN { 6 | langnum=0 7 | while ( getline < "langs.txt" ) 8 | { 9 | lang[langnum]=$0 10 | ++langnum; 11 | } 12 | 13 | for (i=0; i langpairs/batch2/" l1 "-" l2 ".batch") 21 | } 22 | } 23 | 24 | }' 25 | -------------------------------------------------------------------------------- /scripts/DCEP/batchfilebylangpair.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p langpairs/batch 4 | 5 | awk 'BEGIN { 6 | langnum=0 7 | while ( getline < "langs.txt" ) 8 | { 9 | lang[langnum]=$0 10 | ++langnum; 11 | } 12 | 13 | for (i=0; i langpairs/batch/" l1 "-" l2 ".batch") 21 | } 22 | } 23 | 24 | }' 25 | -------------------------------------------------------------------------------- /scripts/DCEP/dictforlanguagepair.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p langpairs/autodict 4 | mkdir -p langpairs/autodict.log 5 | 6 | l1=$1 7 | l2=$2 8 | 9 | bi=langpairs/biqf/$l1-$l2 10 | 11 | # I forgot to do this filtering step in the biqf creation step.
12 | # I'm lazy and do it with byte length 13 | cat $bi | awk '(length($0)<1000)' > tmp.bi 14 | 15 | cat tmp.bi | cut -f1 > tmp.l1 16 | cat tmp.bi | cut -f2 > tmp.l2 17 | 18 | ./acquisScripts/scripts/coocc.forAcquis -mc10 -ms40 tmp.l1 tmp.l2 2> langpairs/autodict.log/$l1-$l2 > langpairs/autodict/$l1-$l2.dic 19 | -------------------------------------------------------------------------------- /scripts/DCEP/dictsforalllanguagepairs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk 'BEGIN { 4 | langnum=0 5 | while ( getline < "langs.txt" ) 6 | { 7 | lang[langnum]=$0 8 | ++langnum; 9 | } 10 | 11 | for (i=0; i> langpairs/raw/$l1-$l2 2>> cerr.ladder2text 8 | done 9 | 10 | mkdir langpairs/biqf 11 | cat total.aligninfo.shuffled.limitin1000 | while read did l1 l2 tok1 tok2 ladder 12 | do 13 | hunalign/scripts/ladder2text.py $ladder $tok1 $tok2 | hunalign/scripts/DCEP/filteralign.sh >> langpairs/biqf/$l1-$l2 2> /dev/null 14 | done 15 | -------------------------------------------------------------------------------- /scripts/DCEP/filteralign.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | grep -v "~~~" | grep -v "
<p>
" | awk 'BEGIN {FS="\t"} { ra = ( length($2)>length($3) ? (length($2)+10)/(length($3)+10) : (length($3)+10)/(length($2)+10) ) ; if ((ra<1.5)&&($2!=$3)) print $2 "\t" $3 }' 4 | -------------------------------------------------------------------------------- /scripts/DCEP/finalpackage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | p=$1 4 | 5 | cat langpairs/aligninfo/$p.aligninfo | cut -f1,4,5 | sed "s/\.\/tree\/tok\///g" > final/indices/$p 6 | cd final 7 | tar jcf packages/DCEP-$p.tar.bz2 aligns/$p indices/$p 8 | # scp DCEP-$p.tar.bz2 kruso.mokk.bme.hu:./public_html/DCEP/langpairs/ 9 | cd .. 10 | -------------------------------------------------------------------------------- /scripts/DCEP/finalpackageforlangpairs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p final/indices 4 | mkdir -p final/packages 5 | 6 | awk 'BEGIN { 7 | langnum=0 8 | while ( getline < "langs.txt" ) 9 | { 10 | lang[langnum]=$0 11 | ++langnum; 12 | } 13 | 14 | for (i=0; i> tmp/cerr.$p | cut -f1,2 > $targ/$l1-$l2/$id 27 | done 28 | -------------------------------------------------------------------------------- /scripts/DCEP/flatladdertolangpairs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # With an uncomfortable tie-in, it does several things: 4 | # 1. creates the final directory structure. 5 | # 2. restructures the flatly structured ladder files into a language-pair based structure. 6 | # 3. gets rid of the unreliable quality values in the ladder files. (cut -f1,2) 7 | # 4. makes the final filenames as simple as possible, 8 | # from flat/ladder/89/033089.ES.LT.ladder to final/aligns/ES-LT/033089 9 | 10 | targ=final/aligns 11 | mkdir -p $targ 12 | ls langpairs/batch | cut -f1 -d'.' | while read p ; do mkdir $targ/$p ; done 13 | 14 | sub=ladder2 # This is the realign, not ladder22 that's only the second half, this one went through completely. 15 | find flat/$sub -type f | sed "s/^flat\/$sub\///" | sed "s/\.ladder$//" | tr '/.' ' ' |\ 16 | while read dig id l1 l2 17 | do 18 | cat flat/$sub/$dig/$id.$l1.$l2.ladder | cut -f1,2 > $targ/$l1-$l2/$id 19 | done 20 | -------------------------------------------------------------------------------- /scripts/DCEP/ladder2text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # In the code you can see lots of variables named hu* and en*, as 4 | # in Hungarian and English. This does not mean that the tool 5 | # is language-specific: it is completely language-agnostic. By convention, and 6 | # for obvious historical reasons, hu and en should be interpreted 7 | # as language #1 and language #2. 8 | 9 | import sys 10 | import itertools 11 | import re 12 | 13 | # An especially crude but quite useful heuristic for 14 | # detecting sentences (as opposed to numberings, separators etc). 15 | # Two alphabetic characters with a space between them. 16 | # See http://stackoverflow.com/a/2039476/383313 for an explanation. 17 | TWO_WORDS_REGEX = re.compile(r"""\w \w""", re.UNICODE) 18 | # TWO_WORDS_REGEX = re.compile(r"""[^\W\d_] [^\W\d_]""", re.UNICODE) 19 | 20 | def readfile(name): 21 | # Open the input files and read lines 22 | infile = file(name, 'r') 23 | lines = map( lambda s : s.strip("\n"), infile.readlines() ) 24 | return lines 25 | 26 | '''s -> (s0,s1), (s1,s2), (s2, s3), ...
27 | see http://docs.python.org/library/itertools.html''' 28 | def pairwise(iterable): 29 | a, b = itertools.tee(iterable) 30 | b.next() 31 | return itertools.izip(a, b) 32 | 33 | '''Create aligned text from two sentence files and hunalign's ladder-style output. 34 | Usage: ladder2text.py > aligned.txt 35 | See http://mokk.bme.hu/resources/hunalign for detailed format specification and more. 36 | The output file is tab-delimited, with two or three columns. 37 | The first and second columns are the chunks corresponding to each other. 38 | " ~~~ " is the sentence delimiter inside chunks. 39 | The third column is a probability score, if the input file had one. 40 | ''' 41 | 42 | def parseLadderLine(l): 43 | a = l.split() 44 | # We allow both scored and score-less input. 45 | assert 2<=len(a)<=3 46 | # The score we leave as a string, to avoid small diffs caused by different numerical representations. 47 | a[0],a[1] = int(a[0]),int(a[1]) 48 | return a 49 | 50 | # a hole is supposed to be two consecutive items in the array holding the lines of the ladder. /an array of holes is returned by pairwise(ladder)/ 51 | # the following segment returns an interval of sentences corresponding to a hole: 52 | # hulines[int(hole[0][0]):int(hole[1][0])] 53 | def holeToBisegment(hole,hulines,enlines) : 54 | if len(hole[0])==3 : 55 | quality = hole[0][2] 56 | else : 57 | quality = None 58 | 59 | huSens = hulines[hole[0][0]:hole[1][0]] 60 | enSens = enlines[hole[0][1]:hole[1][1]] 61 | return huSens,enSens,quality 62 | 63 | #serializeSens(huSens, enSens, quality, delimiter) 64 | 65 | def serializeBisegment(huSens,enSens,quality=None,delimiter=" ~~~ ") : 66 | huText = delimiter.join(huSens) 67 | enText = delimiter.join(enSens) 68 | text = huText+"\t"+enText 69 | if quality is not None : 70 | text += "\t"+str(quality) 71 | return text 72 | 73 | def isBisen(hole) : 74 | return (hole[1][0]-hole[0][0]==1) and (hole[1][1]-hole[0][1]==1) 75 | 76 | def isBisenPos(pos,ladder) : 77 | assert pos+2<=len(ladder) 78 | hole = ladder[pos:pos+2] 79 | return isBisen(hole) 80 | 81 | def crudeSentenceDetector(huSenUtf,enSenUtf) : 82 | return TWO_WORDS_REGEX.search(huSenUtf) is not None and TWO_WORDS_REGEX.search(enSenUtf) is not None 83 | 84 | def isAcceptableLength(huSenUtf,enSenUtf,lengthFilterLevel) : 85 | if lengthFilterLevel is None : 86 | return True 87 | lengthFilterRatio = float(lengthFilterLevel)/100 # TODO Casting in every inner loop, how lame is that. 88 | h = len(huSenUtf)+1 89 | e = len(enSenUtf)+1 90 | ratio = float(h)/e 91 | if ratio>1 : 92 | ratio = 1/ratio 93 | return ratio>=lengthFilterRatio 94 | 95 | def filterTopology(ladder, topoFilterLevel) : 96 | if topoFilterLevel is None : 97 | return ladder 98 | 99 | WINDOW = 100 100 | # the higher the stricter. 101 | topoFilterRatio = float(topoFilterLevel)/100 102 | rungsToKill = set() 103 | trailSize = len(ladder) 104 | for pos in range(1,trailSize-1-WINDOW) : 105 | huStart = ladder[pos][0] 106 | enStart = ladder[pos][1] 107 | huEnd = ladder[pos+WINDOW][0] 108 | enEnd = ladder[pos+WINDOW][1] 109 | deviation = float(huEnd-huStart+1)/(enEnd-enStart+1) # TODO We don't currently use it. 110 | if deviation>1 : 111 | deviation = 1/deviation 112 | bisenCnt = 0 113 | for pos2 in range(pos,pos+WINDOW) : 114 | if isBisenPos(pos2,ladder) : 115 | bisenCnt += 1 116 | ratio = float(bisenCnt)/WINDOW 117 | # sys.stderr.write("%f %f\n" % (ratio,deviation)) 118 | # TODO That's lame algorithmically, will switch to proper window-sliding when the basic algorithm is validated.
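# Windows whose 1-to-1 ratio falls below topoFilterRatio count as suspicious;
# the threshold comparison below collects all rungs of such windows in rungsToKill,
# and the surviving rungs are reassembled into the filtered ladder that the function returns.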
119 | if ratio > aligned.txt\n' ) 177 | sys.exit(-1) 178 | 179 | 180 | if __name__ == "__main__" : 181 | main() 182 | -------------------------------------------------------------------------------- /scripts/DCEP/languagepair.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import os.path 5 | import os, errno 6 | import optparse 7 | 8 | import ladder2text 9 | 10 | 11 | dcepLanguages = [ "BG", "CS", "DA", "DE", "EL", "EN", "ES", "ET", "FI", "FR", "GA", "HU", "IT", "LT", "LV", "MT", "NL", "PL", "PT", "RO", "SK", "SL", "SV", "TR" ] 12 | 13 | 14 | class InputError(Exception): 15 | pass 16 | 17 | def error(s) : 18 | sys.stderr.write("ERROR: "+s+"\n") 19 | sys.exit(-1) 20 | 21 | def mkdir_p(path) : 22 | try : 23 | os.makedirs(path) 24 | except OSError, exc : 25 | if exc.errno == errno.EEXIST and os.path.isdir(path) : 26 | pass 27 | else : 28 | raise 29 | 30 | def main(): 31 | 32 | parser = optparse.OptionParser() 33 | defaultDelimiter = " ~~~ " 34 | parser.add_option("--no-merge", action="store_true", dest="noMerge", help="Keep the output bidocuments in separate files under bitext/L1-L2/, instead of merging them and writing them to the standard output.") 35 | parser.add_option("--not-just-bisentences", action="store_false", dest="justBisen", default=True, help="Save all alignment units, not just 1-to-1 correspondences.") 36 | parser.add_option("--delimiter", dest="delimiter", type="string", default=defaultDelimiter, help="String for delimiting sentences within alignment units. Only meaningful when combined with --not-just-bisentences. Default value: '"+defaultDelimiter+"'.") 37 | 38 | # TODO Commented out, hardly finished. 39 | # parser.add_option("--topo-filter-level", action="store", type="int", dest="topoFilterLevel", metavar="TOPO_FILTER_LEVEL", 40 | # help="Aggressiveness of context-based bisentence filtering. Between 0 and 100. By default it is not employed. Cannot be combined with --not-just-bisentences.") 41 | 42 | parser.add_option("--length-filter-level", action="store", type="int", dest="lengthFilterLevel", metavar="LENGTH_FILTER_LEVEL", 43 | help="Aggressiveness of sentence character length based bisentence filtering. Between 0 and 100. By default it is not employed. Cannot be combined with --not-just-bisentences.") 44 | parser.add_option("--numbering-filter", action="store_true", dest="sentenceDetector", default=False, 45 | help="A crude heuristic that drops numberings and short titles from the output. Cannot be combined with --not-just-bisentences.") 46 | 47 | parser.add_option("--index-file", action="store", type="string", dest="indexFilename", metavar="INDEX_FILE", 48 | help="Use this file to decide which documents to process, instead of the default indices/L1-L2. Tab-separated file with rows containing document-id L1-sentence-segmented-file L2-sentence-segmented-file. When combined with --no-merge, the bitext/L1-L2 directory is deduced from the sentence file paths, assuming DCEP directory structure.") 49 | 50 | parser.usage = "%prog [options] L1-L2\nwhere L1-L2 is a language pair, and L1 and L2 are in alphabetical order. E.g. DE-EN.\n" 51 | parser.usage += "or\n%prog [options] --index-file INDEX_FILE." 52 | 53 | try : 54 | assert len(sys.argv)>1 55 | (options, args) = parser.parse_args(sys.argv[1:]) 56 | except : 57 | parser.print_help() 58 | sys.exit(-1) 59 | 60 | # TODO Remove after topoFilter is finished.
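# Note: the assignment below unconditionally disables the topology filter.
# Since the --topo-filter-level option is commented out above, filterTopology()
# in ladder2text.py receives None and returns the ladder unchanged.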
61 | options.topoFilterLevel = None 62 | 63 | if options.indexFilename : 64 | if len(args)>0 : 65 | error("Should not give a language pair when the --index-file argument is used.") 66 | l1 = None 67 | l2 = None 68 | else : 69 | try : 70 | assert len(args)==1 71 | lp = args[0] 72 | l1,l2 = lp.split("-") 73 | assert len(l1)==len(l2)==2 74 | assert l1docCounter+100 : 165 | error("Too many align/"+lp+" files missing, the directory structure was probably not set up properly.") 166 | 167 | main() 168 | -------------------------------------------------------------------------------- /scripts/DCEP/mergedicts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p langpairs/fulldict 4 | 5 | awk 'BEGIN { 6 | langnum=0 7 | while ( getline < "langs.txt" ) 8 | { 9 | lang[langnum]=$0 10 | ++langnum; 11 | } 12 | 13 | for (i=0; i langpairs/fulldict/" p ) 22 | system("bash hunalign/scripts/DCEP/normalizesztakidict.sh < langpairs/sztaki/" p " >> langpairs/fulldict/" p " 2> /dev/null" ) 23 | } 24 | } 25 | 26 | }' 27 | -------------------------------------------------------------------------------- /scripts/DCEP/normalizedict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cut -f1,2 | grep -v "@" | awk '{ print $2,"@",$1 }' 4 | -------------------------------------------------------------------------------- /scripts/DCEP/normalizesztakidict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk '{ print $2,"@",$1 }' 4 | -------------------------------------------------------------------------------- /scripts/DCEP/packaligninfobylangpair.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p langpairs/aligninfo 4 | 5 | awk 'BEGIN { 6 | langnum=0 7 | while ( getline < "langs.txt" ) 8 | { 9 | lang[langnum]=$0 10 | ++langnum; 11 | } 12 | 13 | for (i=0; i langpairs/aligninfo/" l1 "-" l2 ".aligninfo") 21 | } 22 | } 23 | 24 | }' 25 | -------------------------------------------------------------------------------- /scripts/DCEP/realignall.2ndcpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # flat/ladder2 already created. 4 | 5 | mkdir -p langpairs/realign2.log 6 | 7 | awk 'BEGIN { 8 | langnum=0 9 | while ( getline < "langs.txt" ) 10 | { 11 | lang[langnum]=$0 12 | ++langnum; 13 | } 14 | 15 | k=0 16 | 17 | for (i=0; i langpairs/realign2.log/" p ) 28 | } 29 | } 30 | 31 | }' 32 | -------------------------------------------------------------------------------- /scripts/DCEP/realignall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # flat/ladder2 already created. 
4 | 5 | mkdir -p langpairs/realign.log 6 | 7 | awk 'BEGIN { 8 | langnum=0 9 | while ( getline < "langs.txt" ) 10 | { 11 | lang[langnum]=$0 12 | ++langnum; 13 | } 14 | 15 | for (i=0; i langpairs/realign.log/" p ) 24 | } 25 | } 26 | 27 | }' 28 | -------------------------------------------------------------------------------- /scripts/DCEP/renamesztakidicts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p langpairs/sztaki 4 | 5 | awk 'BEGIN { 6 | langnum=0 7 | while ( getline < "langs.txt" ) 8 | { 9 | lang[langnum]=$0 10 | ++langnum; 11 | } 12 | 13 | for (i=0; i "$t" 8 | done 9 | 10 | echo "Starting sentence count verification" 11 | date 12 | 13 | find tree/sentence/ -type f | while read f 14 | do 15 | t=`echo $f | sed "s/^tree\/sentence/tree\/tok/"` 16 | lSen=`wc -l < $f` 17 | lStrip=`wc -l < $t` 18 | if [ "$lSen" -ne "$lStrip" ] 19 | then 20 | echo Mismatch: $f $lSen $t $lStrip 21 | fi 22 | done 23 | 24 | echo "Done." 25 | date 26 | -------------------------------------------------------------------------------- /scripts/DCEP/verifylangpair.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | # quantifies the difference between align and realign for a given language pair 6 | 7 | p=$1 8 | seed=$2 9 | if [ -z "$seed" ]; then seed=0 ; fi 10 | 11 | cat langpairs/aligninfo/$p.aligninfo | cut -f6 | awk -v seed=$seed '(NR%100==seed)' > tmp/$p.ladsam 12 | cat tmp/$p.ladsam | xargs cat | cut -f1,2 > tmp/$p.ladcat 13 | cat tmp/$p.ladsam | sed "s/ladder/ladder2/" | xargs cat | cut -f1,2 > tmp/$p.ladcat2 14 | l1=`wc -l < tmp/$p.ladcat` 15 | l2=`wc -l < tmp/$p.ladcat2` 16 | d=`diff tmp/$p.ladcat tmp/$p.ladcat2 | wc -l` 17 | echo $p $l1 $l2 $d 18 | -------------------------------------------------------------------------------- /scripts/en.sen.one.sh: -------------------------------------------------------------------------------- 1 | huntoken -e -b | grep "\(^\)\|\(^

\)" | sed "s/^//" | sed "s/^

/

/" 2 | -------------------------------------------------------------------------------- /scripts/hu.sen.one.sh: -------------------------------------------------------------------------------- 1 | huntoken -b | grep "\(^\)\|\(^

\)" | sed "s/^//" | sed "s/^

/

/" 2 | -------------------------------------------------------------------------------- /scripts/hunalignDriver.py: -------------------------------------------------------------------------------- 1 | from StringIO import StringIO 2 | 3 | from teed import * 4 | 5 | from partialAlign import * 6 | 7 | 8 | OUTPUT_FILENAME="tmp/tmp_chunk" 9 | BATCH_JOB_FILENAME = "tmp/tmp_batch" 10 | MAXIMAL_CHUNK_SIZE = 1000 11 | 12 | 13 | def parseLadderData(fileContent) : 14 | ls = fileContent.split("\n") 15 | if ls[-1]=="" : 16 | del ls[-1] 17 | 18 | result = [] 19 | for l in ls : 20 | a = l.split() 21 | if len(a)!=3 : 22 | raise Exception("hunalign should return 3-column data.") 23 | result.append((int(a[0]),int(a[1]),float(a[2]))) 24 | 25 | return result 26 | 27 | def serializeLadderData(ladder) : 28 | def serializeLine(l) : 29 | return "\t".join(map(str,l)) 30 | return "\n".join( map(serializeLine, ladder) ) + "\n" 31 | 32 | def hunalignDriver(hunalignExecutablePath, hunalignArgs) : 33 | 34 | cmd = [ hunalignExecutablePath ] + hunalignArgs 35 | 36 | fout, ferr = StringIO(), StringIO() 37 | exitcode = teed_call(cmd, stdout=fout, stderr=ferr) 38 | stdout = fout.getvalue() 39 | stderr = ferr.getvalue() 40 | 41 | if exitcode!=0 : 42 | raise Exception("hunalign returned with exit code "+str(exitcode)) 43 | 44 | result = parseLadderData(stdout) 45 | 46 | return result, stderr 47 | 48 | 49 | def batchHunalignDriver(hunalignExecutablePath, hunalignArgs) : 50 | cmd = [ hunalignExecutablePath, "-batch" ] + hunalignArgs 51 | 52 | fout, ferr = StringIO(), StringIO() 53 | exitcode = teed_call(cmd, stdout=fout, stderr=ferr) 54 | stdout = fout.getvalue() 55 | stderr = ferr.getvalue() 56 | 57 | if exitcode!=0 : 58 | raise Exception("hunalign returned with exit code "+str(exitcode)) 59 | 60 | assert len(stdout)==0 61 | 62 | 63 | def partialAlignDriver(huFilename, enFilename) : 64 | chain,stdout = partialAlignWithIO(huFilename, enFilename, outputFilename=OUTPUT_FILENAME, huLangName="a", enLangName="b", maximalChunkSize=MAXIMAL_CHUNK_SIZE) 65 | return chain,stdout 66 | 67 | def fullStack(hunalignExecutablePath, huFilename, enFilename, dictFilename) : 68 | chain,stdout = partialAlignDriver(huFilename, enFilename) 69 | 70 | with open(BATCH_JOB_FILENAME,'w') as f : 71 | f.write(stdout) 72 | 73 | extraCareful = True 74 | if extraCareful : 75 | # Output should arrive in files named OUTPUT_FILENAME +"_"+ str(number) +"."+ ("a" if hu else "b") 76 | chunkNumber = len(chain)-1 77 | huSenCnt = 0 78 | enSenCnt = 0 79 | for chunkId in range(1,chunkNumber+1) : 80 | chunkFilename = "%s_%d." 
% (OUTPUT_FILENAME, chunkId) 81 | huChunkFilename = chunkFilename+"a" 82 | enChunkFilename = chunkFilename+"b" 83 | with open(huChunkFilename) as huChunkFile : 84 | huSenCnt += len(huChunkFile.readlines()) 85 | with open(enChunkFilename) as enChunkFile : 86 | enSenCnt += len(enChunkFile.readlines()) 87 | assert chain[chunkId] == (huSenCnt,enSenCnt) 88 | 89 | hunalignArgs = [ dictFilename, BATCH_JOB_FILENAME ] 90 | batchHunalignDriver(hunalignExecutablePath, hunalignArgs) 91 | 92 | # Output should now arrive in files named OUTPUT_FILENAME +"_"+ str(number) + ".align" 93 | 94 | totalLadder = [] 95 | for chunkId in range(1,chunkNumber+1) : 96 | alignChunkFilename = "%s_%d.align" % (OUTPUT_FILENAME, chunkId) 97 | chunkStarts = chain[chunkId-1] 98 | huStart,enStart = chunkStarts 99 | if len(totalLadder)>0 : 100 | if totalLadder[-1][:2] != chunkStarts : 101 | log( "ERROR: In %s rung %s should match with %s" % (alignChunkFilename, str(totalLadder[-1]), chunkStarts) ) 102 | raise Exception("chunk aligns inconsistent with chunking data") 103 | 104 | # The last element of the ladder is only there to mark the size of the whole bidocument. 105 | # Supposedly its quality value is always 0.3. (We don't check this.) 106 | del totalLadder[-1] 107 | try : 108 | with open(alignChunkFilename) as f : 109 | chunkLadder = parseLadderData(f.read()) 110 | assert chunkLadder[0][:2] == (0,0) 111 | for rung in chunkLadder : 112 | huStep,enStep,quality = rung 113 | totalLadder.append( (huStart+huStep, enStart+enStep, quality) ) 114 | except IOError : 115 | log( "ERROR: %s missing, hunalign probably gave up on input" % alignChunkFilename) 116 | raise Exception("chunk align missing") 117 | 118 | sys.stdout.write(serializeLadderData(totalLadder)) 119 | 120 | def testBatchHunalign() : 121 | hunalignExecutablePath = '../src/hunalign/hunalign' 122 | hunalignArgs = ['../data/null.dic', 'batch.job'] 123 | batchHunalignDriver(hunalignExecutablePath, hunalignArgs) 124 | 125 | 126 | def testHunalign() : 127 | hunalignExecutablePath = '../src/hunalign/hunalign' 128 | ladderDir = '../regtest/handaligns/1984.ro.utf8/' 129 | hunalignArgs = [ '-hand='+ladderDir+'hand.ladder', '../data/null.dic', ladderDir+'/hu.pre', ladderDir+'/en.pre' ] 130 | 131 | result,stderr = hunalignDriver(hunalignExecutablePath, hunalignArgs) 132 | 133 | print "\n".join(map(str,result)) 134 | 135 | 136 | def testFullStack() : 137 | hunalignExecutablePath = '../src/hunalign/hunalign' 138 | ladderDir = '../regtest/handaligns/1984.ro.utf8/' 139 | huFilename, enFilename, dictFilename = ( ladderDir+'/hu.pre', ladderDir+'/en.pre', '../data/null.dic' ) 140 | 141 | fullStack(hunalignExecutablePath, huFilename, enFilename, dictFilename) 142 | 143 | def main() : 144 | # testHunalign() 145 | # testBatchHunalign() 146 | testFullStack() 147 | 148 | if __name__=='__main__' : 149 | main() 150 | -------------------------------------------------------------------------------- /scripts/ladder2text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | import itertools 4 | 5 | '''file -> array holding the lines of the file''' 6 | def readfile(name): 7 | # Open the input files and read lines 8 | infile = file(name, 'r') 9 | lines = map( lambda s : s.strip("\n"), infile.readlines() ) 10 | return lines 11 | 12 | '''s -> (s0,s1), (s1,s2), (s2, s3), ...
13 | see http://docs.python.org/library/itertools.html''' 14 | def pairwise(iterable): 15 | a, b = itertools.tee(iterable) 16 | b.next() 17 | return itertools.izip(a, b) 18 | 19 | '''Create aligned text from two sentence files and hunalign's ladder-style output. 20 | Usage: ladder2text.py > aligned.txt 21 | See http://mokk.bme.hu/resources/hunalign for detailed format specification and more. 22 | The output file is tab-delimited, with three columns. The first is a probability score. 23 | The second and third columns are the chunks corresponding to each other. 24 | " ~~~ " is the sentence delimiter inside chunks. 25 | ''' 26 | def main() : 27 | if len(sys.argv) == 4: 28 | ladderlines = readfile(sys.argv[1]) 29 | hulines = readfile(sys.argv[2]) 30 | enlines = readfile(sys.argv[3]) 31 | def parseLadderLine(l) : 32 | a = l.split() 33 | assert len(a)==3 34 | return ( int(a[0]), int(a[1]), a[2] ) # The score we leave as a string, to avoid small diffs caused by different numerical representations. 35 | ladder = map( parseLadderLine, ladderlines ) 36 | 37 | # the next map() does all the work, so here are some comments... 38 | # the map() iterates over the holes of the ladder. 39 | # a hole is supposed to be two consecutive items in the array holding the lines of the ladder. /an array of holes is returned by pairwise(ladder)/ 40 | # the following segment returns an interval of sentences corresponding to a hole: 41 | # hulines[int(hole[0][0]):int(hole[1][0])] 42 | outputlines = map( lambda hole: 43 | hole[0][2] + "\t" + 44 | " ~~~ ".join(hulines[int(hole[0][0]):int(hole[1][0])]) 45 | + "\t" + 46 | " ~~~ ".join(enlines[int(hole[0][1]):int(hole[1][1])]) 47 | , 48 | pairwise(ladder) 49 | ) 50 | 51 | for l in outputlines : 52 | print l 53 | else: 54 | print 'usage: ladder2text.py > aligned.txt' 55 | sys.exit(-1) 56 | 57 | 58 | if __name__ == "__main__" : 59 | main() 60 | -------------------------------------------------------------------------------- /scripts/process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | lang=Hungarian ; slang=hu ; cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do mkdir $lang/$au/sen 2>/dev/null ; echo $lang/$au/sen/${au}_$ind.$slang.sen ; $BINDIR/hu.sen.one.sh < $lang/$au/raw/${au}_$ind.$slang.raw > $lang/$au/sen/${au}_$ind.$slang.sen ; done 4 | 5 | lang=English ; slang=en ; cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do mkdir $lang/$au/sen 2>/dev/null ; echo $lang/$au/sen/${au}_$ind.$slang.sen ; $BINDIR/en.sen.one.sh < $lang/$au/raw/${au}_$ind.$slang.raw > $lang/$au/sen/${au}_$ind.$slang.sen ; done 6 | 7 | 8 | lang=Hungarian ; slang=hu ; cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do mkdir $lang/$au/sen.tok 2>/dev/null ; echo $lang/$au/sen.tok/${au}_$ind.$slang.sen.tok ; $BINDIR/tok.one.sh < $lang/$au/sen/${au}_$ind.$slang.sen > $lang/$au/sen.tok/${au}_$ind.$slang.sen.tok ; done 9 | 10 | lang=English ; slang=en ; cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do mkdir $lang/$au/sen.tok 2>/dev/null ; echo $lang/$au/sen.tok/${au}_$ind.$slang.sen.tok ; $BINDIR/tok.one.sh < $lang/$au/sen/${au}_$ind.$slang.sen > $lang/$au/sen.tok/${au}_$ind.$slang.sen.tok ; done 11 | 12 | 13 | lang=Hungarian ; slang=hu ; cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do mkdir $lang/$au/sen.tok.stem 2>/dev/null ; echo $lang/$au/sen.tok.stem/${au}_$ind.$slang.sen.tok.stem ; $BINDIR/$slang.stem.one.sh < $lang/$au/sen.tok/${au}_$ind.$slang.sen.tok > 
$lang/$au/sen.tok.stem/${au}_$ind.$slang.sen.tok.stem ; done 14 | 15 | lang=English ; slang=en ; cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do mkdir $lang/$au/sen.tok.stem 2>/dev/null ; echo $lang/$au/sen.tok.stem/${au}_$ind.$slang.sen.tok.stem ; $BINDIR/$slang.stem.one.sh < $lang/$au/sen.tok/${au}_$ind.$slang.sen.tok > $lang/$au/sen.tok.stem/${au}_$ind.$slang.sen.tok.stem ; done 16 | 17 | # ///////////////////////////////////////////////// 18 | # FINALLY THE ALIGNMENT, WITH POSTPROCESSORS: 19 | 20 | mkdir $BICDIR/Align 21 | 22 | cat $CATALOG | cut -f1 | sort -u | while read d ; do mkdir $BICDIR/Align/$d ; mkdir $BICDIR/Align/$d/ladder ; done 23 | 24 | 25 | cat $CATALOG | awk '{ print "Hungarian/"$1"/sen.tok.stem/"$1"_"$2".hu.sen.tok.stem" "\t" "English/"$1"/sen.tok.stem/"$1"_"$2".en.sen.tok.stem" "\t" "Align/"$1"/ladder/"$1"_"$2".ladder" }' > align.batch 26 | 27 | $BINDIR/alignerTool -batch -headerthresh=100 -ppthresh=30 $BINDIR/vonyo7.nojoker.stemmed align.batch > align.cout 2> align.cerr 28 | 29 | cat $CATALOG | cut -f1 | sort -u | while read d ; do mkdir $BICDIR/Align/$d/text ; done 30 | 31 | cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do lang=Hungarian ; slang=hu ; $BINDIR/ladder2text.sh $BICDIR/Align/$au/ladder/${au}_$ind.ladder $BICDIR/Hungarian/$au/sen/${au}_$ind.hu.sen $BICDIR/English/$au/sen/${au}_$ind.en.sen > $BICDIR/Align/$au/text/${au}_$ind.text ; done 32 | 33 | 34 | cat $CATALOG | cut -f1 | sort -u | while read d ; do mkdir $BICDIR/Align/$d/text.qf ; done 35 | 36 | cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do echo Align/$au/text.qf/${au}_$ind.text.qf ; cat Align/$au/text/${au}_$ind.text | grep -v "~~~" | grep -v "
<p>
" | awk 'BEGIN {FS="\t"} { ra = ( length($2)>length($3) ? (length($2)+10)/(length($3)+10) : (length($3)+10)/(length($2)+10) ) ; if (ra<1.5) print $0}' > Align/$au/text.qf/${au}_$ind.text.qf ; done 37 | 38 | 39 | cat $CATALOG | cut -f1 | sort -u | while read d ; do mkdir $BICDIR/Align/$d/shuffled ; done 40 | 41 | cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do echo Align/$au/shuffled/${au}_$ind.shuffled ; cat Align/$au/text.qf/${au}_$ind.text.qf | cut -f2,3 | sort > $BICDIR/Align/$au/shuffled/${au}_$ind.shuffled ; done 42 | 43 | 44 | mkdir measure 45 | 46 | cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do lang=Hungarian ; slang=hu ; hu=`cat $lang/$au/sen/${au}_$ind.$slang.sen | grep -v "
<p>
" | wc -l` ; lang=English ; slang=en ; en=`cat $lang/$au/sen/${au}_$ind.$slang.sen | grep -v "
<p>
" | wc -l` ; echo ${au}_$ind $hu $en ; done | awk '{ h=$2+1; e=$3+1; print (h measure/senratio.txt 47 | 48 | cat $CATALOG | tr '\t' '\n' | while read au ; read ind ; do i=$BICDIR/Align/$au/ladder/${au}_$ind.ladder ; echo -n "$i" ; cat $i | awk '{ if (($1-one==1)&&($2-two==1)) { ++bis } ; one=$1; two=$2 } END { print "\t" 0+bis "\t" 0+one "\t" 0+two "\t" bis/(1+(one measure/otor.txt 49 | -------------------------------------------------------------------------------- /scripts/release.howto.txt: -------------------------------------------------------------------------------- 1 | cvs co hunalign 2 | cd hunalign 3 | 4 | # Manually bump version number all over readme.html. 5 | ... 6 | 7 | # Remove CVS metadata. Commit before this! 8 | find . | grep "/CVS$" | while read x ; do rm -rf "$x" ; done 9 | 10 | # I have no rights to release the parallel corpora. 11 | rm -rf regtest 12 | 13 | cd .. 14 | ver=3 15 | mv hunalign hunalign-1.$ver 16 | cp -r hunalign-1.$ver /public/Hunglish/src/hunalign/latest/ 17 | tar zcvf hunalign-1.$ver.tgz hunalign-1.$ver 18 | cp hunalign-1.$ver.tgz /public/Hunglish/src/hunalign/latest/ 19 | 20 | # Build for Windows. Make a zip called hunalign-1.$ver-windows.zip. 21 | # The zip should contain a directory called hunalign-1.$ver-windows, containing 22 | # the hunalign.exe, and 23 | # msvcp100.dll, msvcr100.dll files for MSVC++ or 24 | # cygwin1.dll for CYGWIN make. 25 | # Copy the zip to /public/Hunglish/src/hunalign/latest/. 26 | 27 | # Manually copy the readme.html to the http://mokk.bme.hu/resources/hunalign plone page 28 | # in its html source edit mode. 29 | -------------------------------------------------------------------------------- /scripts/subprocessTest.py: -------------------------------------------------------------------------------- 1 | from StringIO import StringIO 2 | 3 | from teed import * 4 | 5 | cmd = ["./testProcess.sh"] 6 | cmd = ["./testProcess-RealHunalign.sh"] 7 | 8 | ladderDir = '../regtest/handaligns/1984.ro.utf8/' 9 | 10 | cmd = ['../src/hunalign/hunalign', '-utf', 11 | '-hand='+ladderDir+'hand.ladder', '../data/null.dic', ladderDir+'/hu.pre', ladderDir+'/en.pre', '-bisent', '-realign'] 12 | 13 | fout, ferr = StringIO(), StringIO() 14 | exitcode = teed_call(cmd, stdout=fout, stderr=ferr) 15 | stdout = fout.getvalue() 16 | stderr = ferr.getvalue() 17 | 18 | # print len(stdout),len(stderr) 19 | 20 | print stderr 21 | -------------------------------------------------------------------------------- /scripts/teed.py: -------------------------------------------------------------------------------- 1 | # http://stackoverflow.com/questions/4984428/python-subprocess-get-childrens-output-to-file-and-terminal/4985080#4985080 2 | # Thanks http://stackoverflow.com/users/4279/j-f-sebastian 3 | 4 | import sys 5 | from subprocess import Popen, PIPE 6 | from threading import Thread 7 | 8 | def tee(infile, *files): 9 | """Print `infile` to `files` in a separate thread.""" 10 | def fanout(infile, *files): 11 | for line in iter(infile.readline, ''): 12 | for f in files: 13 | f.write(line) 14 | infile.close() 15 | t = Thread(target=fanout, args=(infile,)+files) 16 | t.daemon = True 17 | t.start() 18 | return t 19 | 20 | def teed_call(cmd_args, **kwargs): 21 | stdout, stderr = [kwargs.pop(s, None) for s in 'stdout', 'stderr'] 22 | p = Popen(cmd_args, 23 | stdout=PIPE if stdout is not None else None, 24 | stderr=PIPE if stderr is not None else None, 25 | **kwargs) 26 | threads = [] 27 | # Here I changed Sebastian's original version, because I don't want to tee 
stdout, just stderr: 28 | # ORIGINAL: 29 | # if stdout is not None: threads.append(tee(p.stdout, stdout, sys.stdout)) 30 | # MINE: 31 | if stdout is not None: threads.append(tee(p.stdout, stdout)) 32 | 33 | if stderr is not None: threads.append(tee(p.stderr, stderr, sys.stderr)) 34 | for t in threads: t.join() # wait for IO completion 35 | return p.wait() 36 | 37 | 38 | if __name__ == '__main__': 39 | outf, errf = open('out.txt', 'w'), open('err.txt', 'w') 40 | assert not teed_call(["cat", __file__], stdout=None, stderr=errf) 41 | assert not teed_call(["echo", "abc"], stdout=outf, stderr=errf, bufsize=0) 42 | assert teed_call(["gcc", "a b"], close_fds=True, stdout=outf, stderr=errf) 43 | 44 | -------------------------------------------------------------------------------- /scripts/testProcess-RealHunalign.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ladderDir=../regtest/handaligns/1984.ro.utf8 4 | ../src/hunalign/hunalign -utf -hand=$ladderDir/hand.ladder ../data/null.dic $ladderDir/hu.pre $ladderDir/en.pre -bisent 5 | -------------------------------------------------------------------------------- /scripts/testProcess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo -n "Working... " > /dev/stderr 4 | for (( i=1 ; i<10 ; ++i )) ; do 5 | sleep 1 6 | echo -n "$i " > /dev/stderr 7 | done 8 | 9 | echo "Done." > /dev/stderr 10 | echo "End result 1" 11 | echo "End result 2" 12 | echo "End result 3" 13 | -------------------------------------------------------------------------------- /scripts/testProcess1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | awk 'BEGIN { for (i=1;i<100;++i) { if (i%20==0) { print "Working on number",i > "/dev/stderr" } ; s += i ; print s } }' 4 | -------------------------------------------------------------------------------- /scripts/testProcessWithInput.sh: -------------------------------------------------------------------------------- 1 | awk '{ if (NR%100==0) { print "Working on line",NR > "/dev/stderr" } ; s += $0 ; print s }' 2 | -------------------------------------------------------------------------------- /scripts/tok.one.sh: -------------------------------------------------------------------------------- 1 | sed 's/\([\.,:\/;()?\!\"]\)/ \1 /g' | sed "s/\([^ -]\)\(--\+\)/\1 \2/g" | sed "s/\(--\+\)\([^ -]\)/\1 \2/g" | sed 's/ \+/ /g' | sed 's/ $//' | sed "s/^ //" 2 | -------------------------------------------------------------------------------- /scripts/translate.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/scripts/translate.txt -------------------------------------------------------------------------------- /scripts/visualizeAlignQuality.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk -f 2 | 3 | BEGIN { 4 | n = 50 ; 5 | r = ""; 6 | for ( i=0; i. 
55 | */ 56 | 57 | /* 58 | * $Id: DOMTreeErrorReporter.cpp,v 1.1.1.1 2009-07-06 14:05:31 daniel Exp $ 59 | */ 60 | 61 | // --------------------------------------------------------------------------- 62 | // Includes 63 | // --------------------------------------------------------------------------- 64 | #include 65 | #include "DOMTreeErrorReporter.hpp" 66 | 67 | #include 68 | 69 | #include 70 | #include 71 | 72 | XERCES_CPP_NAMESPACE_USE 73 | 74 | void DOMTreeErrorReporter::warning(const SAXParseException&) 75 | { 76 | // 77 | // Ignore all warnings. 78 | // 79 | } 80 | 81 | void DOMTreeErrorReporter::error(const SAXParseException& toCatch) 82 | { 83 | fSawErrors = true; 84 | std::cerr << "Error at file \"" << StrX(toCatch.getSystemId()) 85 | << "\", line " << toCatch.getLineNumber() 86 | << ", column " << toCatch.getColumnNumber() 87 | << "\n Message: " << StrX(toCatch.getMessage()) << std::endl; 88 | } 89 | 90 | void DOMTreeErrorReporter::fatalError(const SAXParseException& toCatch) 91 | { 92 | fSawErrors = true; 93 | std::cerr << "Fatal Error at file \"" << StrX(toCatch.getSystemId()) 94 | << "\", line " << toCatch.getLineNumber() 95 | << ", column " << toCatch.getColumnNumber() 96 | << "\n Message: " << StrX(toCatch.getMessage()) << std::endl; 97 | } 98 | 99 | void DOMTreeErrorReporter::resetErrors() 100 | { 101 | fSawErrors = false; 102 | } 103 | 104 | std::ostream& operator<<( std::ostream& target, const StrX& toDump) 105 | { 106 | target << toDump.localForm(); 107 | return target; 108 | } 109 | -------------------------------------------------------------------------------- /src/hunalign/Makefile: -------------------------------------------------------------------------------- 1 | sources = alignerTool.cpp alignment.cpp bloom.cpp bookToMatrix.cpp cooccurrence.cpp cooccurrenceTool.cpp dictionary.cpp main.cpp networkFlow.cpp oldAlignTest.cpp trailPostprocessors.cpp translate.cpp wordAlignment.cpp ../utils/stringsAndStreams.cpp ../utils/argumentsParser.cpp ../utils/timer.cpp 2 | 3 | headers = alignment.h bloom.h bookToMatrix.h cooccurrence.h dictionary.h dicTree.h help.h networkFlow.h quasiDiagonal.h similarityEvaluator.h TEIReader.h trailPostprocessors.h translate.h wordAlignment.h words.h 4 | 5 | objects = alignerTool.o alignment.o bloom.o bookToMatrix.o cooccurrence.o cooccurrenceTool.o dictionary.o main.o networkFlow.o oldAlignTest.o trailPostprocessors.o translate.o wordAlignment.o ../utils/stringsAndStreams.o ../utils/argumentsParser.o ../utils/timer.o 6 | 7 | SHELL = /bin/bash 8 | CXX = g++ 9 | CPPFLAGS = -O9 -ffast-math -funroll-loops -I ../include 10 | LIBS = -lstdc++ 11 | RM = rm -f 12 | 13 | hunalign: $(objects) 14 | $(CXX) $(CPPFLAGS) $(LIBS) -o hunalign $(objects) 15 | 16 | depend: 17 | makedepend -Y -s"# DO NOT DELETE THIS LINE -- hunaligndep" $(sources) 18 | 19 | clean: 20 | $(RM) hunalign $(objects) 21 | 22 | # DO NOT DELETE THIS LINE -- hunaligndep 23 | 24 | alignerTool.o: alignment.h quasiDiagonal.h words.h bookToMatrix.h translate.h 25 | alignerTool.o: dictionary.h cooccurrence.h trailPostprocessors.h help.h 26 | alignment.o: alignment.h quasiDiagonal.h words.h dictionary.h 27 | bloom.o: bloom.h words.h 28 | bookToMatrix.o: bookToMatrix.h words.h alignment.h quasiDiagonal.h 29 | bookToMatrix.o: dictionary.h 30 | cooccurrence.o: cooccurrence.h words.h networkFlow.h dictionary.h translate.h 31 | cooccurrenceTool.o: cooccurrence.h words.h networkFlow.h dictionary.h 32 | dictionary.o: dictionary.h words.h 33 | networkFlow.o: networkFlow.h 34 | oldAlignTest.o: dictionary.h 
35 | oldAlignTest.o: quasiDiagonal.h bookToMatrix.h dicTree.h
36 | trailPostprocessors.o: trailPostprocessors.h alignment.h quasiDiagonal.h
37 | trailPostprocessors.o: words.h bookToMatrix.h
38 | translate.o: translate.h words.h dictionary.h dicTree.h
39 | wordAlignment.o: wordAlignment.h words.h dictionary.h
40 | -------------------------------------------------------------------------------- /src/hunalign/TEIReader.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_TEIREADER_TEIREADER_H
12 | #define __HUNGLISH_TEIREADER_TEIREADER_H
13 |
14 | #include "words.h"
15 |
16 | #include <xercesc/dom/DOM.hpp>
17 |
18 | #include <iostream>
19 |
20 | namespace Hunglish
21 | {
22 |
23 | std::string toString( const XMLCh* wstr );
24 |
25 | std::ostream& operator<<( std::ostream& os, const XMLCh* wstr );
26 |
27 | void traverseDOMTree( const XERCES_CPP_NAMESPACE::DOMNode* doc, int depth );
28 |
29 | void trivialSerializeSubTree( const XERCES_CPP_NAMESPACE::DOMNode* node, std::ostream& os );
30 |
31 | const XERCES_CPP_NAMESPACE::DOMNode* findFirstSubTree( const XERCES_CPP_NAMESPACE::DOMNode* parent, const String& key );
32 |
33 | const XERCES_CPP_NAMESPACE::DOMNode* findNextSubTree( const XERCES_CPP_NAMESPACE::DOMNode* root, const XERCES_CPP_NAMESPACE::DOMNode* current, String& key );
34 |
35 | void buildWordFromDOMTree_Hu( const XERCES_CPP_NAMESPACE::DOMNode* parent, Word& word, bool lemma );
36 |
37 | void buildWordFromDOMTree_En( const XERCES_CPP_NAMESPACE::DOMNode* parent, Word& word, bool lemma );
38 |
39 | String getIdOfSentence( const XERCES_CPP_NAMESPACE::DOMNode* parent );
40 |
41 | void buildSentenceFromDOMTree_Hu( const XERCES_CPP_NAMESPACE::DOMNode* parent, Sentence& sentence );
42 |
43 | void buildSentenceFromDOMTree_En( const XERCES_CPP_NAMESPACE::DOMNode* parent, Sentence& sentence );
44 |
45 | void buildSentenceFromDOMTree( const XERCES_CPP_NAMESPACE::DOMNode* parent, Sentence& sentence, bool english );
46 |
47 | void buildSentenceListFromDOMTree( const XERCES_CPP_NAMESPACE::DOMNode* parent, SentenceList& seg, bool english );
48 |
49 | // If this interface were a class, parseTEI would be its only public method:
50 | int parseTEI( const char* xmlFile, bool english, SentenceList& sentenceList );
51 |
52 | // ...And this would be the test for the class:
53 | int main_TEIReader( int argC, char* argV[] );
54 |
55 | } // namespace Hunglish
56 |
57 | #endif // #define __HUNGLISH_TEIREADER_TEIREADER_H
58 | -------------------------------------------------------------------------------- /src/hunalign/alignment.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_ALIGNMENT_H
12 | #define __HUNGLISH_ALIGNMENT_ALIGNMENT_H
13 |
14 | #include "quasiDiagonal.h"
15 |
16 | #include <vector>
17 | #include <set>
18 |
19 | namespace Hunglish
20 | {
21 |
22 | // Simply double values for each sentence. Right now we store sentence lengths in them.
23 | typedef std::vector<double> SentenceValues;
24 |
25 | // See quasiDiagonal.h
26 | typedef QuasiDiagonal<double> AlignMatrix;
27 |
28 | // Contains directions, a bit like a force field.
29 | typedef QuasiDiagonal<unsigned char> TrelliMatrix;
30 |
31 | // A Rundle (x,y) cuts the bitext into two sub-bitexts:
32 | // [0,x)+[0,y) and [x,huSize)+[y,enSize).
33 | typedef std::pair<int,int> Rundle;
34 |
35 | // A Trail is a strictly ordered list of Rundles.
36 | // It cuts the bitext into small bitexts.
37 | // Such a small bitext is called a hole or segmentum.
38 | // A hole can contain zero Hungarian sentences,
39 | // it can contain zero English sentences, but not both.
40 | // A Trail is sometimes referred to as a Ladder.
41 | typedef std::vector<Rundle> Trail;
42 |
43 | // A BisentenceList is formally identical to a Trail, but semantically very different.
44 | // It represents an ordered list of bisentences.
45 | // There are some functions which utilize the formal identity,
46 | // manipulating both structures.
47 | typedef std::vector< std::pair<int,int> > BisentenceList;
48 |
49 | // OBSOLETE:
50 | // TrailValues gives scores to the Rundles of a Trail (of the same size).
51 | // Conceptually TrailValues should be attached to Trails.
52 | // A TrailValues structure always accompanies a Trails list,
53 | // but their consistency must be maintained by hand, pre-OO-style. (TODO)
54 | // typedef std::vector<double> TrailValues;
55 |
56 | // OBSOLETE:
57 | // Has exactly the same relation to BisentenceList as
58 | // a TrailValues has to a Trail. But note that these
59 | // scores mark the confidence in a bisentence. This is
60 | // very different from the confidence in a rundle.
61 | // typedef std::vector<double> BisentenceValues;
62 |
63 | double closeness( double twoSentenceLength, double oneSentenceLength );
64 |
65 | const double skipScore = -0.3;
66 |
67 |
68 | // The main align function.
69 | // Gets a confidence value for every sentence-pair,
70 | // and sentence lengths for each sentence (for a Gale-Church-like scoring).
71 | // Returns a trail with the best total score, and the computed dynMatrix matrix:
72 | // dynMatrix[huPos][enPos] gives the similarity of the [0,huPos) and [0,enPos) intervals.
73 | void align( const AlignMatrix& w, const SentenceValues& huLength, const SentenceValues& enLength,
74 | Trail& bestTrail, AlignMatrix& dynMatrix );
75 |
76 |
77 | bool oneToOne( const Trail& bestTrail, int pos );
78 |
79 | // Collect bisentences.
80 | void trailToBisentenceList( const Trail& bestTrail,
81 | BisentenceList& bisentenceList );
82 |
83 | // Score precision-recall of a BisentenceList according to a hand-aligned bicorpus.
84 | // For best results, zero-to-many holes of the hand-alignment should be subdivided to zero-to-ones.
85 | // Builds the manual bisentencelist. The compared sets consist of Bisentences.
86 | double scoreBisentenceList( const BisentenceList& bisentenceList, const Trail& trailHand );
87 |
88 | // The same precision-recall calculation for Trails. The compared sets consist of Rundles.
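// Illustrative example (annotation, not part of the original header):
// if trailHand = { (0,0), (1,1), (2,3), (4,5) } and
//    trailAuto = { (0,0), (1,1), (2,2), (4,5) },
// then three of the four auto rundles occur in the hand alignment, so
// scoreTrail reports precision = 3/4 and recall = 3/4.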
89 | double scoreTrail ( const Trail& trailAuto, const Trail& trailHand );
90 |
91 |
92 | const int outsideOfRadiusValue = -1000000;
93 | const int insideOfRadiusValue = 0;
94 |
95 | // Fills the complement of the radius of the trail with minus infinities.
96 | // The return value true means success. Failure means that during the fill,
97 | // we intersected the outside of the quasidiagonal area.
98 | // In this case, the operation is not finished.
99 | bool borderDetailedAlignMatrix( AlignMatrix& m, const Trail& trail, int radius );
100 |
101 | // What the name implies.
102 | void dumpAlignMatrix( const AlignMatrix& m, bool graphical );
103 |
104 | template <class T>
105 | void dumpAlignMatrix( const QuasiDiagonal<T>& alignMatrix );
106 |
107 | void dumpAlignMatrix( const QuasiDiagonal& alignMatrix, bool graphical );
108 |
109 | void dumpTrelliMatrix( const TrelliMatrix& trellis );
110 |
111 |
112 | } // namespace Hunglish
113 |
114 | #endif // #define __HUNGLISH_ALIGNMENT_ALIGNMENT_H
115 | -------------------------------------------------------------------------------- /src/hunalign/bloom.cpp: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #pragma warning ( disable : 4786 )
12 |
13 | #include "bloom.h"
14 |
15 | namespace Hunglish
16 | {
17 |
18 | int BloomFilter::hash( const Word& w )
19 | {
20 | unsigned int v=0;
21 |
22 | for ( Word::const_iterator it=w.begin(); it!=w.end(); ++it )
23 | {
24 | unsigned int top = v >> 24;
25 | // This was designed assuming that (unsigned int)(*it) is a real two-byte value.
26 | // The more correct solution would be to walk through it two bytes at a time.
27 | // But that is overkill: when xerces hashes unicode, it does exactly
28 | // the same thing as this code does.
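// Annotation (not original source): each step folds the top byte of the
// running state back in and mixes in the next character,
//   v_new = v_old + 37*v_old + (v_old >> 24) + (unsigned int)c,
// and hash() finally reduces v modulo bloomSize (512), so set() and test()
// always address one of the 512 bits of the filter.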
29 | v += (v * 37) + top + (unsigned int)(*it);
30 | }
31 |
32 | // Divide by modulus
33 | return v % bloomSize;
34 | }
35 |
36 | void BloomFilter::set( const Word& w )
37 | {
38 | ((std::bitset<bloomSize>*)this)->set( hash(w) % bloomSize );
39 |
40 | }
41 |
42 | bool BloomFilter::test ( const Word& w ) const
43 | {
44 | return ((std::bitset<bloomSize>*)this)->test( hash(w) % bloomSize );
45 | }
46 |
47 | int BloomFilter::count() const
48 | {
49 | return ((std::bitset<bloomSize>*)this)->count();
50 | }
51 |
52 | std::bitset<bloomSize>& BloomFilter::getBitset()
53 | {
54 | return * (std::bitset<bloomSize>*)this ;
55 | }
56 |
57 | const std::bitset<bloomSize>& BloomFilter::getBitset() const
58 | {
59 | return * (const std::bitset<bloomSize>*)this ;
60 | }
61 |
62 | int intersectionSize( const BloomFilter& b1, const BloomFilter& b2 )
63 | {
64 | int count(0);
65 | for ( int i=0; i<bloomSize; ++i )
66 | {
67 | // (The rest of this function and the head of bloom.h were lost in the
68 | // archived copy; the following lines are a reconstruction from the interface.)
69 | if ( b1.getBitset().test(i) && b2.getBitset().test(i) )
70 | ++count;
71 | }
72 | return count;
73 | }
74 |
75 | } // namespace Hunglish
76 | -------------------------------------------------------------------------------- /src/hunalign/bloom.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_BLOOM_H
12 | #define __HUNGLISH_ALIGNMENT_BLOOM_H
13 |
14 | #include <bitset>
15 |
16 | // TODO
17 | #include "words.h"
18 |
19 | namespace Hunglish
20 | {
21 |
22 | const int bloomSize=512;
23 |
24 | class BloomFilter : private std::bitset<bloomSize>
25 | {
26 | public:
27 | void set( const Word& w );
28 | bool test ( const Word& w ) const;
29 | int count() const;
30 |
31 | public:
32 | std::bitset<bloomSize>& getBitset();
33 | const std::bitset<bloomSize>& getBitset() const;
34 |
35 | public:
36 | friend int intersectionSize( const BloomFilter& b1, const BloomFilter& b2 );
37 |
38 | public:
39 | static int hash( const Word& w );
40 | };
41 |
42 | int intersectionSize( const BloomFilter& b1, const BloomFilter& b2 );
43 |
44 | typedef std::vector<BloomFilter> BloomBook;
45 |
46 | } // namespace Hunglish
47 |
48 | #endif // #define __HUNGLISH_ALIGNMENT_BLOOM_H
49 | -------------------------------------------------------------------------------- /src/hunalign/bookToMatrix.cpp: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/src/hunalign/bookToMatrix.cpp
-------------------------------------------------------------------------------- /src/hunalign/bookToMatrix.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_BOOKTOMATRIX_H
12 | #define __HUNGLISH_ALIGNMENT_BOOKTOMATRIX_H
13 |
14 | #include "words.h"
15 | #include "alignment.h"
16 |
17 | namespace Hunglish
18 | {
19 |
20 | const double scoreOfParagraphMatch = 0.31;
21 |
22 | const double scoreOfParagraphMisMatch = -1.0;
23 |
24 | bool isParagraph( const Phrase& phrase );
25 |
26 | // (!!!) We assert that sx and sy are ordered sets of Word-s!
27 | int intersectionSize( const WordList& sx, const WordList& sy );
28 |
29 | void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix );
30 |
31 | class TransLex;
32 |
33 | double scoreByIdentity( const Phrase& hu, const Phrase& en );
34 |
35 | double scoreByTranslation( const Phrase& hu, const Phrase& en, const TransLex& transLex );
36 |
37 | // This is much-much slower, but instead of identity, uses a many-to-many dictionary.
38 | // For performance reasons, by convention it does not calculate the similarity if the
39 | // alignMatrix element contains outsideOfRadiusValue, a big negative number.
40 | void sentenceListsToAlignMatrixTranslation(
41 | const SentenceList& huSentenceListPretty, const SentenceList& enSentenceList,
42 | const TransLex& transLex,
43 | AlignMatrix& alignMatrixDetailed );
44 |
45 | class IBMModelOne;
46 |
47 | void sentenceListsToAlignMatrixIBMModelOne(
48 | const SentenceList& huSentenceList, const SentenceList& enSentenceList,
49 | const IBMModelOne& modelOne,
50 | AlignMatrix& alignMatrix );
51 |
52 | int characterLength( const Word& words, bool utfCharCountingMode );
53 |
54 | double characterLength( const Phrase& words, bool utfCharCountingMode );
55 |
56 | double characterLength( int start, int end, const SentenceList& sentenceList, bool utfCharCountingMode );
57 |
58 | void setSentenceValues( const SentenceList& sentences, SentenceValues& lengths, bool utfCharCountingMode );
59 |
60 | } // namespace Hunglish
61 |
62 | #endif // #define __HUNGLISH_ALIGNMENT_BOOKTOMATRIX_H
63 | -------------------------------------------------------------------------------- /src/hunalign/cooccurrence.cpp: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/src/hunalign/cooccurrence.cpp
-------------------------------------------------------------------------------- /src/hunalign/cooccurrence.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_COOCCURRENCE_H
12 | #define __HUNGLISH_ALIGNMENT_COOCCURRENCE_H
13 |
14 | #include "words.h"
15 |
16 | namespace Hunglish
17 | {
18 |
19 | void cooccurenceAnalysis( SentenceList& huSentenceList, SentenceList& enSentenceList,
20 | double minScore, int minCoocc );
21 |
22 | void flowBuilderXml( const SentenceList& huSentenceList, const SentenceList& enSentenceList,
23 | std::ostream& flowStream );
24 |
25 | void lexiconByEdmondsKarp( const SentenceList& huSentenceListC, const SentenceList& enSentenceListC );
26 |
27 |
28 | typedef std::pair<Word,Word> BiWord;
29 | typedef std::vector<BiWord> BiWords;
30 |
31 | // This should be done after removeStopwords, simply because of bilanguage words like
32 | // "a","is","be". We absolutely don't care about rare bilanguage words like "petty".
33 | void removeIdenticals( SentenceList& huSentenceList, SentenceList& enSentenceList,
34 | BiWords& idTranslations );
35 |
36 | void removeHapaxes( SentenceList& huSentenceList, SentenceList& enSentenceList,
37 | BiWords& hapaxTranslations );
38 |
39 | class DictionaryItems;
40 |
41 | void filterBicorpusByLexicon
42 | ( SentenceList& huSentenceList, SentenceList& enSentenceList,
43 | const DictionaryItems& dictionaryItems );
44 |
45 | // Adds plausible items to the dictionary it receives as input.
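// Rough sketch of the realign loop that calls this (annotation, not
// original code; variable names are hypothetical):
//   DictionaryItems dictionary;          // seed dictionary, possibly empty
//   // ...first alignment pass, collect the identified bisentences...
//   autoDictionaryForRealign( huSentences, enSentences, dictionary,
//                             minScore, minCoocc );
//   // ...second alignment pass with the enlarged dictionary...
// This is the mechanism behind the aligner tool's -realign switch.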
46 | void autoDictionaryForRealign( SentenceList& huSentenceList, SentenceList& enSentenceList,
47 | DictionaryItems& dictionary,
48 | double minScore, int minCoocc );
49 |
50 | // Removes dictionary items for which it doesn't find cooccurrences in the bicorpus.
51 | // Typically, the bicorpus is built from a primary alignment.
52 | void filterDictionaryForRealign( SentenceList& huSentenceList, SentenceList& enSentenceList,
53 | DictionaryItems& dictionary );
54 |
55 | } // namespace Hunglish
56 |
57 | #endif // #define __HUNGLISH_ALIGNMENT_COOCCURRENCE_H
58 | -------------------------------------------------------------------------------- /src/hunalign/dicTree.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_TEIREADER_DICTIONARIES_H
12 | #define __HUNGLISH_TEIREADER_DICTIONARIES_H
13 |
14 | #include <map>
15 | #include <set>
16 | #include <vector>
17 | #include <iostream>
18 |
19 | namespace Hunglish
20 | {
21 |
22 | // A simple tree class.
23 | //
24 | template <class Atom, class Identifier>
25 | class DicTree
26 | {
27 | public:
28 | // Gets value a bit below. Ugly C++.
29 | static const bool WarnOnConflict;
30 |
31 | DicTree() : id(0) {}
32 | DicTree( const Identifier& id_ ) : id(id_) {}
33 |
34 | ~DicTree();
35 |
36 | const Identifier& getIdentifier() const { return id; }
37 | void setIdentifier( const Identifier& id_) { id=id_; }
38 | DicTree* lookup( const Atom& word ) const;
39 | DicTree& add( const Atom& word, const Identifier& id );
40 | bool empty() const { return children.empty(); }
41 |
42 | void dump( std::ostream& os ) const;
43 |
44 | private:
45 | typedef std::map< Atom, DicTree* > DicTreeMap;
46 | DicTreeMap children;
47 | Identifier id;
48 | };
49 |
50 | template <class Atom, class Identifier>
51 | const bool DicTree<Atom,Identifier>::WarnOnConflict = false;
52 |
53 | // This structure stores a very sparse set-system of words.
54 | // (A dictionary of complex expressions.)
55 | //
56 | // It supports the following query:
57 | // It receives a set of words S. It gives back the sets
58 | // of the set system that are contained in this set S.
59 | //
60 | // For it to be effective, we must be careful during the building phase:
61 | // words in vector 'words' must be ordered by INCREASING frequency. Rare words first.
62 |
63 | template <class Atom, class Identifier>
64 | class SubsetLookup
65 | {
66 | public:
67 |
68 | typedef std::vector<Atom> Atoms;
69 |
70 | void add( const Atoms& words, const Identifier& id );
71 |
72 | void lookup( const Atoms& words, std::set<Identifier>& results ) const;
73 |
74 | void dump( std::ostream& os ) const;
75 |
76 | private:
77 | DicTree<Atom,Identifier> tree;
78 | };
79 |
80 | // Implementation. F.ck C++ for having to put this in a header.
81 |
82 | template <class Atom, class Identifier>
83 | DicTree<Atom,Identifier>::~DicTree()
84 | {
85 | for ( typename DicTreeMap::iterator it=children.begin(); it!=children.end(); ++it )
86 | {
87 | delete it->second;
88 | }
89 | }
90 |
91 | // Never overwrites a nonzero id with zero.
92 | // If it overwrites a nonzero id with another nonzero one, it warns first.
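// Example of these semantics (annotation, not original code):
//   tree.add( atom, 0 );   // creates the node, id stays 0
//   tree.add( atom, 7 );   // zero -> nonzero: silently fills in the id
//   tree.add( atom, 9 );   // nonzero -> nonzero: warns (if enabled), then overwrites
// so after the three calls the node carries id 9.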
93 | template <class Atom, class Identifier>
94 | DicTree<Atom,Identifier>& DicTree<Atom,Identifier>::add( const Atom& word, const Identifier& id )
95 | {
96 | DicTree* v = lookup(word);
97 | if (!v)
98 | {
99 | v = new DicTree();
100 | v->id = id;
101 | children[word] = v;
102 | }
103 | else
104 | {
105 | if ( ( v->id != 0 ) && ( id != 0 ) )
106 | {
107 | if (WarnOnConflict)
108 | std::cerr << "warning: conflict in tree" << std::endl;
109 | }
110 | if ( id != 0 )
111 | {
112 | v->id = id;
113 | }
114 | }
115 |
116 | return (*v);
117 | }
118 |
119 | template <class Atom, class Identifier>
120 | DicTree<Atom,Identifier>* DicTree<Atom,Identifier>::lookup( const Atom& word ) const
121 | {
122 | typename DicTreeMap::const_iterator ft = children.find(word);
123 |
124 | if (ft==children.end())
125 | {
126 | return 0;
127 | }
128 | else
129 | {
130 | return ft->second;
131 | }
132 | }
133 |
134 | template <class Atom, class Identifier>
135 | void DicTree<Atom,Identifier>::dump( std::ostream& os ) const
136 | {
137 | if (id!=0)
138 | {
139 | os << id << " ";
140 | }
141 | os << "{" << std::endl;
142 | for ( typename DicTreeMap::const_iterator it=children.begin(); it!=children.end(); ++it )
143 | {
144 | os << it->first << " ";
145 | it->second->dump(os);
146 | }
147 | os << "}" << std::endl;
148 | }
149 |
150 | template <class Atom, class Identifier>
151 | void SubsetLookup<Atom,Identifier>::add( const Atoms& words, const Identifier& id )
152 | {
153 | DicTree<Atom,Identifier>* v = &tree;
154 |
155 | for ( typename Atoms::const_iterator it=words.begin(); it!=words.end(); ++it )
156 | {
157 | DicTree<Atom,Identifier>& newv = v->add(*it,0);
158 | v = &newv;
159 | }
160 | if ( v->getIdentifier() == 0 )
161 | {
162 | v->setIdentifier(id);
163 | }
164 | else
165 | {
166 | if (DicTree<Atom,Identifier>::WarnOnConflict)
167 | std::cerr << "warning: conflict in tree" << std::endl;
168 | }
169 | }
170 |
171 | template <class Atom, class Identifier>
172 | void SubsetLookup<Atom,Identifier>::lookup( const Atoms& words, std::set<Identifier>& results ) const
173 | {
174 | typedef std::set< const DicTree<Atom,Identifier>* > Pebbles;
175 | Pebbles pebbles;
176 | pebbles.insert(&tree);
177 |
178 | results.clear();
179 |
180 | for ( typename Atoms::const_iterator it=words.begin(); it!=words.end(); ++it )
181 | {
182 | const Atom& word = *it;
183 |
184 | for ( typename Pebbles::const_iterator jt=pebbles.begin(); jt!=pebbles.end(); ++jt )
185 | {
186 | const DicTree<Atom,Identifier>* subTree = (*jt)->lookup(word) ;
187 |
188 | if (!subTree)
189 | continue;
190 |
191 | const Identifier& id = subTree->getIdentifier();
192 | if (id!=0)
193 | {
194 | results.insert(id);
195 | }
196 |
197 | if (!subTree->empty())
198 | {
199 | pebbles.insert(subTree);
200 | }
201 | }
202 | }
203 | }
204 |
205 | template <class Atom, class Identifier>
206 | void SubsetLookup<Atom,Identifier>::dump( std::ostream& os ) const
207 | {
208 | tree.dump(os);
209 | }
210 |
211 | } // namespace Hunglish
212 |
213 |
214 | #endif // #define __HUNGLISH_TEIREADER_DICTIONARIES_H
215 | -------------------------------------------------------------------------------- /src/hunalign/dictionary.cpp: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/src/hunalign/dictionary.cpp
-------------------------------------------------------------------------------- /src/hunalign/dictionary.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_DICTIONARY_H
12 | #define __HUNGLISH_ALIGNMENT_DICTIONARY_H
13 |
14 | #include "words.h"
15 |
16 | #include <map>
17 | #include <set>
18 | #include <vector>
19 | #include <iostream>
20 |
21 |
22 | namespace Hunglish
23 | {
24 |
25 | typedef std::pair<Phrase,Phrase> DictionaryItem;
26 |
27 | class DictionaryItems : public std::vector<DictionaryItem>
28 | {
29 | public:
30 | void read( std::istream& is );
31 | };
32 |
33 | class HalfDictionary : public std::vector<Phrase>
34 | {
35 | public:
36 | void read( std::istream& is );
37 | };
38 |
39 |
40 | // After reading, this dictionary cannot be altered.
41 | // Also, this is a strictly one-directional dictionary.
42 | // If the other direction is needed, build another dictionary with reverse( const Dictionary& dic ).
43 | class Dictionary
44 | {
45 | public:
46 | void read( const char* dictionaryFile );
47 | void reverse( const Dictionary& dic );
48 | void build( const DictionaryItems& dictionaryItems );
49 |
50 | bool lookupWord( const Word& word, DictionaryItems& results ) const;
51 | bool lookupWordSet( const WordList& words, DictionaryItems& results ) const;
52 |
53 | private:
54 | void buildWordLookupTable();
55 |
56 | private:
57 | DictionaryItems dictionaryItems;
58 |
59 | typedef std::map wordLookupTable;
60 | };
61 |
62 | class FrequencyMap : public std::map<Word,int>
63 | {
64 | public:
65 | void add( const Word& word );
66 | void remove( const Word& word );
67 | void build( const WordList& wordList );
68 | void remove( const WordList& wordList );
69 | void build( const SentenceList& sentenceList ); // Just for convenience.
70 | int total() const;
71 | void dump( std::ostream& os, int itemNum ) const;
72 | void lowPassFilter( WordList& allowedWords, double ratio ) const;
73 | void highPassFilter( WordList& allowedWords, double ratio ) const;
74 |
75 | private:
76 | typedef std::multimap<int,Word> ReFrequencyMap;
77 | void reverseMap( ReFrequencyMap& reFrequencyMap ) const;
78 | };
79 |
80 |
81 | void filterSentences( SentenceList& sentenceList, const WordList& words );
82 |
83 | void removeHungarianStopwords( SentenceList& huSentenceList );
84 | void removeEnglishStopwords ( SentenceList& enSentenceList );
85 | void removeStopwords ( SentenceList& huSentenceList, SentenceList& enSentenceList );
86 |
87 |
88 | typedef std::pair<Word,Word> WordPair;
89 |
90 | class TransLex
91 | {
92 | public:
93 |
94 | typedef std::multimap<Word,Word> WordMultimap;
95 | typedef WordMultimap::const_iterator WordMultimapIt;
96 | typedef std::pair<WordMultimapIt,WordMultimapIt> DictInterval;
97 |
98 | void add( const Word& huWord, const Word& enWord );
99 | void build( const DictionaryItems& dictionaryItems );
100 |
101 | DictInterval lookupLeftWord ( const Word& huWord ) const;
102 | DictInterval lookupRightWord( const Word& enWord ) const;
103 | bool isPresent( const Word& huWord, const Word& enWord ) const;
104 |
105 | private:
106 | WordMultimap forward;
107 | WordMultimap backward;
108 | };
109 |
110 | class IBMModelOne
111 | {
112 | public:
113 | double lookup( const Word& hu, const Word& en ) const;
114 |
115 | double distance( const Phrase& hu, const Phrase& en ) const;
116 |
117 | void build( const SentenceList& huSentenceList, const SentenceList& enSentenceList );
118 |
119 | void reestimate( const SentenceList& huSentenceList, const SentenceList& enSentenceList );
120 |
121 | public:
122 | typedef std::pair<Word,Word> WordPair;
123 | typedef std::map<WordPair,double> TransProbs;
124 |
125 | TransProbs transProbs;
126 | };
127 |
128 | } // namespace Hunglish
129 |
130 | #endif // #define __HUNGLISH_ALIGNMENT_DICTIONARY_H
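// Usage sketch (annotation, not part of the original header; the variable
// names are hypothetical, hu-en.dic is the dictionary shipped in data/):
//   Hunglish::DictionaryItems items;
//   std::ifstream dictFile( "data/hu-en.dic" );
//   items.read( dictFile );
//   Hunglish::TransLex transLex;
//   transLex.build( items );
//   bool known = transLex.isPresent( huWord, enWord );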
131 | -------------------------------------------------------------------------------- /src/hunalign/help.h: --------------------------------------------------------------------------------
1 | #include <string>
2 |
3 | std::string helpString = "Usage (either):\n\
4 | alignerTool [ common_arguments ] [ -hand=hand_align_file ] dictionary_file source_text target_text\n\
5 | \n\
6 | or:\n\
7 | alignerTool [ common_arguments ] -batch dictionary_file batch_file\n\
8 | \n\
9 | where\n\
10 | common_arguments ::= [ -text ] [ -bisent ] [ -utf ] [ -cautious ] [ -realign [ -autodict=filename ] ]\n\
11 | [ -thresh=n ] [ -ppthresh=n ] [ -headerthresh=n ] [ -topothresh=n ]\n\
12 | \n\
13 | Arguments:\n\
14 | \n\
15 | -text\n\
16 | The output should be in text format, rather than the default (numeric) ladder format.\n\
17 | \n\
18 | -bisent\n\
19 | Only bisentences (one-to-one alignment segments) are printed. In non-text mode, their\n\
20 | starting rung is printed.\n\
21 | \n\
22 | -cautious\n\
23 | In -bisent mode, only bisentences for which both the preceding and the following\n\
24 | segments are one-to-one are printed. In the default non-bisent mode, only rungs\n\
25 | for which both the preceding and the following segments are one-to-one are printed.\n\
26 | \n\
27 | -hand=file\n\
28 | When this argument is given, the precision and recall of the alignment is calculated\n\
29 | based on the manually built ladder file. Information like the following is written\n\
30 | on the standard error: \n\
31 | 53 misaligned out of 6446 correct items, 6035 bets.\n\
32 | Precision: 0.991218, Recall: 0.928017\n\
33 | \n\
34 | Note that by default, 'item' means rung. The switch -bisent also changes the semantics\n\
35 | of the scoring from rung-based to bisentence-based and in this case 'item' means bisentences.\n\
36 | See File formats about the format of this input align file.\n\
37 | \n\
38 | -realign\n\
39 | If this option is set, the alignment is built in three phases.\n\
40 | After an initial alignment, the algorithm heuristically adds items\n\
41 | to the dictionary based on cooccurrences in the identified bisentences.\n\
42 | Then it re-runs the alignment process based on this larger dictionary.\n\
43 | This option is recommended to achieve the highest possible alignment quality.\n\
44 | It is not set by default because it approximately triples the running time\n\
45 | while the quality improvements it yields are typically small.\n\
46 | \n\
47 | -autodict=filename\n\
48 | The dictionary built during realign is saved to this file. By default, it is not saved.\n\
49 | \n\
50 | \n\
51 | -onebyteencoding\n\
52 | The system uses the character counts of the sentences as information for the\
53 | pairing of sentences. By default, it assumes UTF-8 encoding.\
54 | With this switch, it treats byte count as character count.\n\
55 | This should be used for ISO encodings.\n\
56 | -utf\n\
57 | This switch is obsolete, UTF-8 is the default input encoding in later versions.\n\
58 | Note: UTF-16 input is not supported.\n\
59 | \n\
60 | Postfiltering options:\n\
61 | There are various postprocessors which remove implausible rungs based on various heuristics.\n\
62 | \n\
63 | -thresh=n\n\
64 | Don't print out segments with score lower than n/100.\n\
65 | \n\
66 | -ppthresh=n\n\
67 | Filter rungs with less than n/100 average score in their vicinity.\n\
68 | \n\
69 | -headerthresh=n\n\
70 | Filter all rungs at the start and end of texts until finding a reliably\n\
71 | plausible region.\n\
72 | \n\
73 | -topothresh=n\n\
74 | Filter rungs with less than n percent of one-to-one segments in their vicinity.\n\
75 | \n\
76 | ";
77 | -------------------------------------------------------------------------------- /src/hunalign/main.cpp: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #pragma warning ( disable : 4786 )
12 |
13 |
14 | #include <string>
15 |
16 |
17 | namespace Hunglish
18 | {
19 |
20 | #ifdef WIN32
21 | const std::string globalHome = "/";
22 | #else
23 | const std::string globalHome = "/home/daniel/";
24 | #endif
25 |
26 | std::string hunglishHome = globalHome + "hunglish/";
27 | std::string hunglishExperimentsHome = hunglishHome + "data/experiments/";
28 | std::string hunglishDictionaryHome = hunglishHome + "data/szotar/";
29 |
30 | } // namespace Hunglish
31 |
32 |
33 | namespace Hunglish
34 | {
35 | ///////////////////////////////////
36 | // Entry points of important tools:
37 |
38 | // Implemented in alignerTool.cpp
39 | int main_alignerTool(int argC, char* argV[]);
40 |
41 | // Implemented in cooccurrenceTool.cpp
42 | int main_cooccurrenceTool(int argC, char* argV[]);
43 |
44 | // Implemented in cooccurrenceTool.cpp
45 | int main_bicorpusProcessor(int argC, char* argV[]);
46 |
47 | ///////////////////////////////////
48 | // Just Tests:
49 |
50 | // We don't want to include DOM just for this function.
51 | // On the other hand, we don't want to create a header file just for this function. :)
52 | // Implemented in TEIReader.cpp
53 | int main_TEIReader( int argC, char* argV[] );
54 |
55 | // Implemented in networkFlow.cpp
56 | void main_edmondsKarpTest();
57 |
58 | // Implemented in oldAlignTest.cpp
59 | void main_alignTest();
60 |
61 | // Implemented in oldAlignTest.cpp
62 | void main_scoreByHandAlign();
63 |
64 | // Implemented in oldAlignTest.cpp
65 | void main_SmallSubsetLookupTest();
66 |
67 | // Implemented in oldAlignTest.cpp
68 | void main_HunHalfTest();
69 |
70 | // Implemented in oldAlignTest.cpp
71 | void main_translationTest();
72 |
73 | // Implemented in wordAlignment.cpp
74 | void main_wordAlignmentTest();
75 |
76 | // Implemented in bookToMatrix.cpp
77 | void main_similarityEvaluatorTool(int argC, char* argV[]);
78 |
79 | } // namespace Hunglish
80 |
81 | #include <iostream>
82 | #include <timer.h>
83 |
84 | void rectangleCacheTest()
85 | {
86 | int xmax = 5000;
87 |
88 | const int ymaxmax=10000;
89 | const int step=100;
90 |
91 | char* a = new char[xmax*ymaxmax];
92 |
93 | {
94 | for ( int ymax=step; ymax<=ymaxmax; ymax+=step )
95 | {
96 | Hunglish::Ticker ticker;
97 | for ( int i=0; i<xmax*ymax; ++i )
// (The remainder of main.cpp -- the rest of this cache test and the program
// entry point -- was lost in the archived copy; the loop bound above and the
// two include targets at lines 81-82 are likewise reconstructions.)
-------------------------------------------------------------------------------- /src/hunalign/networkFlow.h: --------------------------------------------------------------------------------
// (The head of networkFlow.h was lost in the archived copy; the banner,
// the header guard, and the include targets below are reconstructed.)
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_NETWORKFLOW_H
12 | #define __HUNGLISH_ALIGNMENT_NETWORKFLOW_H
13 | #include <map>
14 | #include <set>
15 | #include <vector>
16 | #include <utility>
17 | #include <iostream>
18 |
19 | namespace Hunglish
20 | {
21 |
22 | // It cannot represent graphs with isolated vertices.
23 | // But you don't really need them, do you?
24 | // I could add an addNode class, and throw an exception
25 | // when forwardNeighbours-ing a non-node.
26 | class DiGraph
27 | {
28 | public:
29 | void addEdge( int a, int b );
30 | bool isEdge ( int a, int b ) const;
31 | void clear();
32 |
33 | public:
34 | typedef std::set<int> Nodes;
35 |
36 | const Nodes& forwardNeighbours ( int a ) const;
37 | const Nodes& backwardNeighbours( int a ) const;
38 |
39 | private:
40 | typedef std::map< int, Nodes > ToNodes;
41 |
42 | ToNodes forward;
43 | ToNodes backward;
44 | };
45 |
46 |
47 | class NetworkWithFlow : public DiGraph
48 | {
49 | public:
50 | typedef std::pair<int,int> Edge;
51 | typedef std::map<Edge,double> Valuation;
52 |
53 | public:
54 | void addEdge( int a, int b, double v );
55 | void edmondsKarp( int s, int t );
56 | const Valuation& getFlow() const
57 | {
58 | return flow;
59 | }
60 | const Valuation& getCapacity() const
61 | {
62 | return capacity;
63 | }
64 | void dumpFlow( std::ostream& os, int s ) const;
65 |
66 | private:
67 |
68 | double dfs( int s, int t, std::vector<int>& path, bool justWithForwards );
69 | void augment( const std::vector<int>& path, const double& excess );
70 | double evaluateAugmentation( const std::vector<int>& path );
71 | DiGraph::Nodes::const_iterator nextFwd
72 | ( int x, DiGraph::Nodes::const_iterator it, DiGraph::Nodes::const_iterator end, double& excess );
73 | DiGraph::Nodes::const_iterator nextBwd
74 | ( int x, DiGraph::Nodes::const_iterator it, DiGraph::Nodes::const_iterator end, double& excess );
75 |
76 | private:
77 | Valuation capacity;
78 | Valuation flow;
79 | };
80 |
81 | } // namespace Hunglish
82 |
83 | #endif // #define __HUNGLISH_ALIGNMENT_NETWORKFLOW_H
84 | -------------------------------------------------------------------------------- /src/hunalign/oldAlignTest.cpp: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/danielvarga/hunalign/2fbcbc976495738321089e13cb728051350b84cd/src/hunalign/oldAlignTest.cpp
-------------------------------------------------------------------------------- /src/hunalign/quasiDiagonal.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_QUASIDIAGONAL_H
12 | #define __HUNGLISH_ALIGNMENT_QUASIDIAGONAL_H
13 |
14 | #include <vector>
15 |
16 | namespace Hunglish
17 | {
18 |
19 | template <class T>
20 | class QuasiDiagonal
21 | {
22 | public:
23 |
24 | // Quite slow, because of the many bounds checks.
25 | class QuasiDiagonalRow
26 | {
27 | public:
28 |
29 | // QuasiDiagonalRow is similar to a vector of size size_. The difference is
30 | // that only the [offset_,offset_+thickness) subinterval can be written.
31 | // Reading from outside this interval yields the default T().
32 | // Reading from outside the [0,size) interval yields a throw.
33 | // It is NOT asserted that [offset_,offset_+thickness)
34 | // should be a subset of [0,size).
35 | //
36 | QuasiDiagonalRow( int size_=0, int offset_=0, int thickness=0, T outsideDefault_=T() )
37 | : offset(offset_), size(size_), data(thickness,T()), outsideDefault(outsideDefault_) {}
38 |
39 | //x T operator[](int k) const
40 | //x {
41 | //x if ( ! ((k>=0) && (k<size)) )
42 | //x {
43 | //x throw "out of quasidiagonal";
44 | //x }
45 | //x int d = k-offset;
46 | //x if ( (d>=0) && (d<(int)data.size()) )
47 | //x {
48 | //x return data[k-offset];
49 | //x }
50 | //x else
51 | //x {
52 | //x return outsideDefault;
53 | //x }
54 | //x }
55 |
56 | enum ZoneType
57 | {
58 | DiagZone = 1,
59 | MatrixZone = 2,
60 | OutsideZone = 3
61 | };
62 |
63 | ZoneType zone(int k) const
64 | {
65 | if ( ! ((k>=0) && (k<size)) )
66 | {
67 | return OutsideZone;
68 | }
69 | int d = k-offset;
70 | if ( (d>=0) && (d<(int)data.size()) )
71 | {
72 | return DiagZone;
73 | }
74 | else
75 | {
76 | return MatrixZone;
77 | }
78 | }
79 |
80 | const T& operator[](int k) const
81 | {
82 | if ( ! ((k>=0) && (k<size)) )
83 | {
84 | throw "out of quasidiagonal";
85 | }
86 | int d = k-offset;
87 | if ( (d>=0) && (d<(int)data.size()) )
88 | {
89 | return data[k-offset];
90 | }
91 | else
92 | {
93 | return outsideDefault;
94 | }
95 | }
96 |
97 | T& cell(int k)
98 | {
99 | if ( ! ((k>=0) && (k<size)) )
100 | {
101 | throw "out of quasidiagonal";
102 | }
103 | int d = k-offset;
104 | if ( (d>=0) && (d<(int)data.size()) )
105 | {
106 | return data[k-offset];
107 | }
108 | else
109 | {
110 | throw "out of quasidiagonal";
111 | }
112 | }
113 |
114 | private:
115 | std::vector<T> data;
116 | int offset;
117 | int size;
118 | T outsideDefault;
119 | };
120 |
121 | QuasiDiagonal( int height_, int width_, int thickness_, T outsideDefault_=T() )
122 | : height(height_), width(width_), thicknes(thickness_)
123 | {
124 | for ( int i=0; i<height_; ++i )
125 | {
126 | rows.push_back( QuasiDiagonalRow( width_, offset(i), thickness_, outsideDefault_ ) );
127 | }
128 | }
129 |
130 | // (Lines 124-139 of the original were lost in the archived copy; the loop
131 | // above and the two members below are reconstructions. Only the closing
132 | // return statement at line 140 survived. The computation of offset(),
133 | // which skews the band along the quasidiagonal, could not be recovered.)
134 |
135 | // Leftmost column of the writable band in the given row.
136 | int offset( int row ) const;
137 |
138 | int rowStart( int row ) const
139 | {
140 | int s = offset(row);
141 | return ( s>0 ? s : 0 );
142 | }
143 |
144 | int rowEnd( int row ) const
145 | {
146 | int e=offset(row)+thicknes;
147 | return ( e<width ? e : width );
148 | }
149 |
150 | // (Reconstructed; lost in the archived copy.) Read access to whole rows:
151 | // m[y][x] is legal anywhere in the matrix.
152 | const QuasiDiagonalRow& operator[]( int y ) const
153 | {
154 | return rows[y];
155 | }
156 |
157 | T& cell( int y, int x )
158 | {
159 | if ((y<0)||(y>=height))
160 | {
161 | throw "out of matrix";
162 | }
163 |
164 | return rows[y].cell(x);
165 | }
166 |
167 | bool setCell( int y, int x, const T& t )
168 | {
169 | cell(y,x) = t;
170 | return true;
171 | }
172 |
173 | int size() const { return height; }
174 | // Yes, I know it's a stupid name. The reason is, I don't want to
175 | // put width/height on the interface, because usually
176 | // the first coord is the columns, but not here.
177 | // This could lead to confusion.
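// Illustrative annotation (not original source): for a
// QuasiDiagonal<int> m( 4 /*height*/, 6 /*width*/, 3 /*thickness*/ ),
// m.size() is 4 and m.otherSize() is 6; in each row y only the 3 cells of
// [ offset(y), offset(y)+3 ) are writable through cell(y,x), while m[y][x]
// outside the band just reads back the outsideDefault value.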
178 | int otherSize() const { return width; }
179 |
180 | int thickness() const { return thicknes; }
181 |
182 | private:
183 | std::vector<QuasiDiagonalRow> rows;
184 | int height,width,thicknes;
185 | };
186 |
187 | } // namespace Hunglish
188 |
189 | #endif // #define __HUNGLISH_ALIGNMENT_QUASIDIAGONAL_H
190 | -------------------------------------------------------------------------------- /src/hunalign/similarityEvaluator.cpp: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #pragma warning ( disable : 4786 )
12 |
13 | #include "bookToMatrix.h"
14 | #include "translate.h"
15 |
16 | #include
17 | #include
18 |
19 | #include
20 | #include
21 | #include
22 |
23 | namespace Hunglish
24 | {
25 |
26 |
27 | void bisentenceListToBicorpus(
28 | const SentenceList& huSentenceListC, const SentenceList& enSentenceListC,
29 | const BisentenceList& bisentenceList,
30 | SentenceList& huBisentenceHalves, SentenceList& enBisentenceHalves
31 | )
32 | {
33 | huBisentenceHalves.clear();
34 | enBisentenceHalves.clear();
35 |
36 | for ( int i=0; ifirst << "\t" << it->second << std::endl;
// (Lines 36-120 of the original -- the body of bisentenceListToBicorpus, the
// scorer classes, and most of averageSimilarity -- were lost in the archived
// copy; the fused fragment above and the bare includes at lines 16-21 are
// what survived.)
121 | }
122 | }
123 |
124 | return ( sum / huSentenceList.size() );
125 | }
126 |
127 |
128 | void similarityEvaluator( const DictionaryItems& dictionary,
129 | const SentenceList& huSentenceListPretty, const SentenceList& enSentenceListPretty )
130 | {
131 | SentenceList huSentenceList, enSentenceList;
132 |
133 | normalizeTextsForIdentity( dictionary, huSentenceListPretty, enSentenceListPretty, huSentenceList, enSentenceList );
134 |
135 | for ( int i=0; i<5; ++i )
136 | {
137 | std::cout << huSentenceList[i].words << " --- " << enSentenceList[i].words << std::endl;
138 | }
139 |
140 | DiscreteDoubleMap distribution;
141 |
142 | IdentityScorer identityScorer;
143 | GaleScorer galeScorer;
144 |
145 | CombinatorScorer similarityScorer( identityScorer, galeScorer, 1.0 );
146 |
147 | double realSimilarity = averageSimilarity( huSentenceList, enSentenceList, similarityScorer, distribution );
148 |
149 | std::cerr << "Real similarity " << realSimilarity << std::endl;
150 |
151 | SentenceList huSentenceListWarped(huSentenceList);
152 | SentenceList enSentenceListWarped(enSentenceList);
153 |
154 | huSentenceListWarped.insert( huSentenceListWarped.begin(), huSentenceListWarped.back() );
155 | huSentenceListWarped.resize( huSentenceListWarped.size()-1 );
156 | double warpedSimilarity1 = averageSimilarity( huSentenceListWarped, enSentenceListWarped, similarityScorer, distribution );
157 |
158 | std::cerr << "Placebo similarity #1 " << warpedSimilarity1 << std::endl;
159 |
160 | //x huSentenceListWarped.insert( huSentenceListWarped.begin(), huSentenceListWarped.back() );
161 | //x huSentenceListWarped.resize( huSentenceListWarped.size()-1 );
162 | //x double warpedSimilarity2 = averageSimilarity( huSentenceListWarped, enSentenceListWarped, similarityScorer, distribution );
163 | //x
164 | //x std::cerr << "Placebo similarity #2 " << warpedSimilarity2 << std::endl;
165 | //x
166 | //x huSentenceListWarped.insert( huSentenceListWarped.begin(), huSentenceListWarped.back() );
167 | //x huSentenceListWarped.resize( huSentenceListWarped.size()-1 );
168 | //x double warpedSimilarity3 = averageSimilarity( huSentenceListWarped, enSentenceListWarped, similarityScorer, distribution );
169 | //x
170 | //x std::cerr << "Placebo similarity #3 " << warpedSimilarity3 << std::endl;
171 | //x
172 | //x std::random_shuffle( huSentenceListWarped.begin(), huSentenceListWarped.end() );
173 | //x double randomSimilarity = averageSimilarity( huSentenceListWarped, enSentenceListWarped, similarityScorer, distribution );
174 | //x
175 | //x std::cerr << "Random similarity " << randomSimilarity << std::endl;
176 | }
177 |
178 | void main_similarityEvaluatorTool(int argC, char* argV[])
179 | {
180 | if (argC!=4)
181 | throw "argument error";
182 |
183 | const char* dicFilename = argV[1];
184 | const char* huFilename = argV[2];
185 | const char* enFilename = argV[3];
186 |
187 | DictionaryItems dictionary;
188 | std::ifstream dis(dicFilename);
189 | dictionary.read(dis);
190 |
191 | SentenceList huSentenceList;
192 | SentenceList enSentenceList;
193 |
194 | std::ifstream hus(huFilename);
195 | huSentenceList.readNoIds(hus);
196 | std::ifstream ens(enFilename);
197 | enSentenceList.readNoIds(ens);
198 |
199 | if (huSentenceList.size()!=enSentenceList.size())
200 | {
201 | std::cerr << "Number of sentences not matching: "
202 | << huSentenceList.size() << " versus " << enSentenceList.size() << "."
203 | << std::endl;
204 | throw "data error";
205 | }
206 | else
207 | {
208 | std::cerr << huSentenceList.size() << " bisentences read." << std::endl;
209 | }
210 |
211 | similarityEvaluator( dictionary, huSentenceList, enSentenceList );
212 | }
213 |
214 | } // namespace Hunglish
215 | -------------------------------------------------------------------------------- /src/hunalign/similarityEvaluator.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_SIMILARITYEVALUATOR_H
12 | #define __HUNGLISH_ALIGNMENT_SIMILARITYEVALUATOR_H
13 |
14 | namespace Hunglish
15 | {
16 |
17 | } // namespace Hunglish
18 |
19 | #endif // #define __HUNGLISH_ALIGNMENT_SIMILARITYEVALUATOR_H
20 | -------------------------------------------------------------------------------- /src/hunalign/trailPostprocessors.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_TRAILPOSTPROCESSORS_H
12 | #define __HUNGLISH_ALIGNMENT_TRAILPOSTPROCESSORS_H
13 |
14 | #include "alignment.h"
15 |
16 | namespace Hunglish
17 | {
18 |
19 | // Helper class that calculates scores of holes.
20 | class TrailScores
21 | {
22 | public:
23 | TrailScores( const Trail& trail_, const AlignMatrix& dynMatrix_ );
24 | // The score of the jth segmentum. The bigger the better.
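// Illustrative annotation (not original source): given
//   TrailScores ts( trail, dynMatrix );
//   double gain = ts(j);
// the score of segmentum j is, in spirit, the dynMatrix value at one of its
// bounding rundles minus the value at the other: what the aligner earned
// while crossing that hole.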
25 | double operator()( int j ) const;
26 |
27 | private:
28 | const Trail& trail;
29 | const AlignMatrix& dynMatrix;
30 | };
31 |
32 |
33 | class SentenceList;
34 |
35 |
36 | // Helper class that calculates scores of segmentums.
37 | class TrailScoresInterval
38 | {
39 | public:
40 | TrailScoresInterval( const Trail& trail_, const AlignMatrix& dynMatrix_,
41 | const SentenceList& huSentenceList_, const SentenceList& enSentenceList_ );
42 |
43 | // The average score of the jth segmentum. The bigger the better.
44 | // Division is by the maximum of the Hungarian and English intervals.
45 | // This is a somewhat arbitrary decision, and goes very badly with the
46 | // scoring of the knight's moves. But we really have no better choice.
47 | //
48 | // Also, the method applies some very ugly hacks to avoid the effect of
49 | // paragraph-delimiters. It strips both intervals of <p>s, and
50 | // modifies the dynMatrix-based score assuming that all <p>s got paired,
51 | // except surplus <p>s.
52 | double scoreSegmentum( const Rundle& start, const Rundle& end ) const;
53 |
54 | // The score of a segment identified by its index.
55 | double operator()( int j ) const;
56 | // The score of a union of segments identified by its start and end rundles' index.
57 | // Both these methods rely on scoreSegmentum():
58 | // This means an important thing: the score only depends
59 | // on the start and end rundle, not the rundles in between.
60 | double operator()( int j, int k ) const;
61 |
62 | private:
63 | const Trail& trail;
64 | const AlignMatrix& dynMatrix;
65 | const SentenceList& huSentenceList;
66 | const SentenceList& enSentenceList;
67 | };
68 |
69 | // Helper class that calculates scores of one-to-one holes.
70 | class BisentenceListScores
71 | {
72 | public:
73 | BisentenceListScores( const BisentenceList& bisentenceList_, const AlignMatrix& dynMatrix_ );
74 | // The score of the jth bisentence. The bigger the better.
75 | double operator()( int j ) const;
76 |
77 | private:
78 | const BisentenceList& bisentenceList;
79 | const AlignMatrix& dynMatrix;
80 | };
81 |
82 | void removeRundles( Trail& trail, const std::set<int>& rundlesToKill );
83 |
84 | // In cautious mode, auto-aligned rundles are thrown away if
85 | // their left or right neighbour holes are not one-to-one.
86 | // From the point of view of the resultant bisentences:
87 | // In cautious mode, one-to-one bisentences are thrown away if
88 | // they have left or right neighbours which are not one-to-one.
89 | // This of course dramatically improves precision while slightly degrading recall.
90 | void cautiouslyFilterTrail( Trail& bestTrail );
91 |
92 | void spaceOutBySentenceLength( Trail& bestTrail,
93 | const SentenceList& huSentenceListPretty,
94 | const SentenceList& enSentenceList,
95 | bool utfCharCountingMode );
96 |
97 | // The function gets a nonconst reference to bestTrail.
98 | // On the other hand, it gets a const reference to bestTrail, through trailScoresInterval.
99 | // Therefore, the function may only modify bestTrail after it has finished reading trailScoresInterval.
100 | void postprocessTrailStart( Trail& bestTrail,
101 | const TrailScoresInterval& trailScoresInterval,
102 | const double& qualityThreshold );
103 |
104 | // The function gets a nonconst reference to bestTrail.
105 | // On the other hand, it gets a const reference to bestTrail, through trailScoresInterval.
106 | // Therefore, the function may only modify bestTrail after it has finished reading trailScoresInterval.
107 | void postprocessTrailStartAndEnd( Trail& bestTrail,
108 | const TrailScoresInterval& trailScoresInterval,
109 | double qualityThreshold );
110 |
111 | // The function gets a nonconst reference to bestTrail.
112 | // On the other hand, it gets a const reference to bestTrail, through trailScoresInterval.
113 | // Therefore, the function may only modify bestTrail after it has finished reading trailScoresInterval.
114 | void postprocessTrail( Trail& bestTrail,
115 | const TrailScoresInterval& trailScoresInterval,
116 | double qualityThreshold );
117 |
118 |
119 | // Throws away rundles which are predominantly surrounded by not-one-to-one holes.
120 | void postprocessTrailByTopology( Trail& bestTrail, double qualityThreshold );
121 |
122 |
123 | // Only collect bisentences with score at least qualityThreshold.
124 | void trailToBisentenceList( const Trail& bestTrail, const TrailScores& trailScores, double qualityThreshold,
125 | BisentenceList& bisentenceList );
126 |
127 | // This is basically incorrect.
128 | // Here we use the score of the right-hand segment to decide about the rundle.
129 | //
130 | // The function gets a nonconst reference to bestTrail.
131 | // On the other hand, it gets a const reference to bestTrail, through trailScoresInterval.
132 | // Therefore, the function may only modify bestTrail after it has finished reading trailScoresInterval.
133 | void filterTrailByQuality( Trail& trail, const TrailScoresInterval& trailScoresInterval,
134 | const double& qualityThreshold );
135 |
136 | void filterBisentenceListByQuality( BisentenceList& bisentenceList, const AlignMatrix& dynMatrix,
137 | const double& qualityThreshold );
138 |
139 | } // namespace Hunglish
140 |
141 | #endif // #define __HUNGLISH_ALIGNMENT_TRAILPOSTPROCESSORS_H
142 | -------------------------------------------------------------------------------- /src/hunalign/translate.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_TRANSLATE_H
12 | #define __HUNGLISH_ALIGNMENT_TRANSLATE_H
13 |
14 | #include "words.h"
15 | #include "dictionary.h"
16 |
17 | namespace Hunglish
18 | {
19 |
20 | typedef std::map< std::string, Phrase > DumbDictionary;
21 |
22 | // This will become a class, with dictionary initialization, and a translate method.
23 | // It will have various implementations.
24 |
25 | void buildDumbDictionary( const DictionaryItems& dictionary, DumbDictionary& dumbDictionary );
26 |
27 | void buildDumbDictionaryUsingFrequencies(
28 | const DictionaryItems& dictionary,
29 | FrequencyMap& enFreq,
30 | DumbDictionary& dumbDictionary );
31 |
32 | void buildDumbDictionary( Hunglish::DumbDictionary& dumbDictionary,
33 | const std::string& dictionaryFilename,
34 | const Hunglish::SentenceList& enSentenceList = Hunglish::SentenceList()
35 | );
36 |
37 | void trivialTranslateWord(
38 | const DumbDictionary& dumbDictionary,
39 | const Word& originalWord,
40 | Phrase& words
41 | );
42 |
43 | void trivialTranslate(
44 | const DumbDictionary& dumbDictionary,
45 | const Sentence& sentence,
46 | Sentence& translatedSentence
47 | );
48 |
49 | void trivialTranslateSentenceList(
50 | const DumbDictionary& dumbDictionary,
51 | const SentenceList& sentenceList,
52 | SentenceList& translatedSentenceList
53 | );
54 |
55 | void naiveTranslate(
56 | const DictionaryItems& dictionary,
57 | const SentenceList& sentenceList,
58 | SentenceList& translatedSentenceList
59 | );
60 |
61 | typedef std::multimap< std::string, Phrase > DumbMultiDictionary;
62 |
63 | void buildDumbMultiDictionary( const DictionaryItems& dictionary, DumbMultiDictionary& dumbMultiDictionary, bool reverse );
64 |
65 | void sortNormalizeSentences( Hunglish::SentenceList& sentenceList );
66 |
67 | // This function preprocesses the sentences so that sentenceListsToAlignMatrixIdentity can be applied to them.
68 | // It does a rough translation and an alphabetic sort of words.
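// Illustrative annotation (not original source): "rough translation plus
// alphabetic sort" means that a bisentence such as the (hypothetical)
//   hu: "a kutya latta a macskat"  /  en: "the dog saw the cat"
// is garbled into two sorted token lists that share surface forms wherever
// the dictionary maps one side onto the other, so the cheap ordered-set
// intersection of sentenceListsToAlignMatrixIdentity can count the overlap.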
69 | void normalizeTextsForIdentity( const DictionaryItems& dictionary,
70 | const SentenceList& huSentenceListPretty, const SentenceList& enSentenceListPretty,
71 | SentenceList& huSentenceListGarbled, SentenceList& enSentenceListGarbled );
72 |
73 | } // namespace Hunglish
74 |
75 | #endif // #define __HUNGLISH_ALIGNMENT_TRANSLATE_H
76 | -------------------------------------------------------------------------------- /src/hunalign/wordAlignment.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_WORDALIGNMENT_H
12 | #define __HUNGLISH_ALIGNMENT_WORDALIGNMENT_H
13 |
14 | #include "words.h"
15 | #include "dictionary.h"
16 |
17 | #include <set>
18 | #include <vector>
19 |
20 |
21 | namespace Hunglish
22 | {
23 |
24 | const int NullWord = -3 ;
25 |
26 | typedef int WordIndex;
27 |
28 | typedef std::pair<WordIndex,WordIndex> WordRelation;
29 |
30 | typedef std::vector<WordRelation> WordRelations;
31 |
32 | typedef std::set<WordIndex> WordSet;
33 |
34 | // Describes the word-to-word structure of a bisentence. Many-to-one and one-to-NIL relationships are allowed.
35 | // Many-to-many is currently supported but not encouraged: disjoint complete bigraphs are allowed.
36 | // The bisentence itself is not stored. It is referred into by integer word indices.
37 | // Word-to-NIL relations must be made explicit. An initial empty WordAlignment means no knowledge, not knowledge of NIL.
38 | // TODO confidence values may be incorporated later.
39 | // TODO maybe even more importantly, flags to denote kinds of relations:
40 | // - suitable as dictionary item
41 | // - the result of ellipsis, not suitable as dictionary item
42 | // - ?
43 | //
44 | class WordAlignment
45 | {
46 | public:
47 | const WordRelations& getWordRelations() const ;
48 | void addWordRelation( const WordRelation& wordRelation ) ;
49 |
50 | // Under the current, unindexed implementation this is an O(n) operation.
51 | // leftSide refers to the argument being on the left side, not the result! Major f.ck up possibility!
52 | WordSet relation( WordIndex wordIndex, bool leftSide ) const ;
53 |
54 | // Under the current, unindexed implementation this is implemented by two *Friends operations, so it is very very slow.
55 | // leftSide refers to the argument being on the left side, not the result! Major f.ck up possibility!
56 | WordSet group ( WordIndex wordIndex, bool leftSide ) const ;
57 |
58 | // Inconsistency can be caused by the following:
59 | // - word connected to NIL and other.
60 | // - two words connected twice.
61 | // - graph is not a disjoint union of stars. (Or complete bigraphs, if many-to-many is supported.)
62 | bool isConsistent() const;
63 |
64 | // Reorders the data lexicographically, without changing its semantics in any way.
65 | void resort();
66 |
67 | void clear();
68 |
69 | private:
70 | WordSet rightFriends( WordIndex wordIndex ) const;
71 | WordSet leftFriends ( WordIndex wordIndex ) const;
72 |
73 | private:
74 | WordRelations wordRelations;
75 | };
76 |
77 | // BiSentence::first is the source (Hungarian) sentence.
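// Illustrative annotation (not original source): word indices point into the
// two sentences of the bisentence, so for the (hypothetical) pair
//   hu: [0]="kek" [1]="haz"  /  en: [0]="blue" [1]="house"
// the relations { (0,0), (1,1) } link "kek"-"blue" and "haz"-"house", and an
// untranslated word i is recorded explicitly as the relation (i, NullWord).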
78 | typedef std::pair<Sentence,Sentence> BiSentence;
79 |
80 | class WordAlignedBisentence : public BiSentence // Inheritance from nonvirtual. It sounds so strange but it feels so good.
81 | {
82 | public:
83 | void markDictionaryItem( const DictionaryItem& dictionaryItem );
84 |
85 | void findDictionaryItemsByGaps( DictionaryItems& dictionaryItems ); // Not const because it resorts.
86 |
87 | // Removes all words that the align can account for.
88 | void elimination();
89 |
90 | public:
91 | WordAlignment wordAlignment;
92 | };
93 |
94 | class WordAlignedBisentences : public std::vector<WordAlignedBisentence> // Inheritance from nonvirtual. It sounds so strange but it feels so good.
95 | {
96 | public:
97 |
98 | void markDictionaryItem( const DictionaryItem& dictionaryItem );
99 |
100 | void importBicorpus( SentenceList& huSentenceList, SentenceList& enSentenceList );
101 |
102 | void findDictionaryItemsByGaps( DictionaryItems& dictionaryItems ); // Not const because it resorts.
103 |
104 | // Removes all words that the align can account for.
105 | void elimination();
106 | };
107 |
108 | } // namespace Hunglish
109 |
110 | #endif // #define __HUNGLISH_ALIGNMENT_WORDALIGNMENT_H
111 | -------------------------------------------------------------------------------- /src/hunalign/words.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
8 | * *
9 | *************************************************************************/
10 |
11 | #ifndef __HUNGLISH_ALIGNMENT_WORDS_H
12 | #define __HUNGLISH_ALIGNMENT_WORDS_H
13 |
14 | #include <string>
15 | #include <vector>
16 | #include <iostream>
17 |
18 | namespace Hunglish
19 | {
20 |
21 | typedef std::string String;
22 |
23 | typedef String Word;
24 |
25 | typedef std::vector<Word> WordList;
26 |
27 | typedef WordList Phrase;
28 |
29 | typedef std::vector<Phrase> Book;
30 |
31 | struct Sentence
32 | {
33 | WordList words;
34 | String sentence;
35 | String id;
36 | };
37 |
38 | // Implemented in dictionary.cpp
39 | class SentenceList : public std::vector<Sentence>
40 | {
41 | public:
42 | void read ( std::istream& is );
43 | void readNoIds( std::istream& is );
44 | void write( std::ostream& os ) const;
45 | void writeNoIds( std::ostream& os ) const;
46 | };
47 |
48 | // Implemented in dictionary.cpp
49 | void readBicorpus( std::istream& is, SentenceList& huSentenceList, SentenceList& enSentenceList);
50 | void writeBicorpus( std::ostream& os, const SentenceList& huSentenceList, const SentenceList& enSentenceList);
51 |
52 | } // namespace Hunglish
53 |
54 | #endif // #define __HUNGLISH_ALIGNMENT_WORDS_H
55 | -------------------------------------------------------------------------------- /src/include/argumentsParser.h: --------------------------------------------------------------------------------
1 | /*************************************************************************
2 | * *
3 | * (C) Copyright 2004. Media Research Centre at the *
4 | * Sociology and Communications Department of the *
5 | * Budapest University of Technology and Economics. *
6 | * *
7 | * Developed by Daniel Varga. *
} // namespace Hunglish

#endif // #define __HUNGLISH_ALIGNMENT_WORDS_H

-------------------------------------------------------------------------------- /src/include/argumentsParser.h: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#ifndef __ARGUMENTSPARSER_H
#define __ARGUMENTSPARSER_H

#include <map>
#include <string>
#include <vector>

// Current usage and limitations:
// Every argument starts with a '-'.
// It is a key/value pair. The delimiter
// is either the first '=' (erased), or the
// first nonalphabetic character (not erased).

class AnyData
{
public:
  enum Kind { Int, String, Float, Set };

public:
  AnyData() : kind(String), dInt(-1) {}
  AnyData( const int& d ) : kind(Int), dInt(d) {}
  AnyData( const std::string& d ) : kind(String), dInt(-1), dString(d) {}
  // AnyData( const float& d ) : kind(Float), dFloat(d) {}
  // AnyData( const std::set<int>& d ) : kind(Set), dSet(d), dInt(-1) {}

public:
  Kind kind;
  int dInt;
  std::string dString;
  // float dFloat;
  // std::set<int> dSet;
};

typedef std::string ArgName;
typedef std::map< ArgName, AnyData > ArgumentMap;

class Arguments : public ArgumentMap
{
public:
  // Very important note: when read finds a numeric/set argument,
  // it sets anyData.kind to Int. But it STILL fills anyData.dString,
  // just in case. So if the ArgumentMap was built by Arguments::read,
  // the dString fields are all filled.
  bool read( int argc, char **argv );

  // remains is filled with the arguments not starting with '-'.
  bool read( int argc, char **argv, std::vector<std::string>& remains );

  // Stays const if it fails, erases the argument if it succeeds.
  bool getNumericParam( const ArgName& name, int& num );

  // sw is true if the switch is present. The function
  // returns false if the argument value is not empty.
  bool getSwitch( const ArgName& name, bool& sw );

  bool getSwitchConst( const ArgName& name, bool& sw ) const;

  // Returns true if the switch is present. Throws an error message
  // if the argument value is not empty.
  bool getSwitchCompact( const ArgName& name );

  void checkEmptyArgs() const;
};
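// Illustrative sketch, added for exposition (not part of the original header),
// of the intended calling convention. The flag names are made up.
inline int argumentsUsageSketch( int argc, char **argv )
{
  Arguments args;
  std::vector<std::string> remains; // Collects the non-'-' arguments, e.g. filenames.
  if (!args.read( argc, argv, remains ))
    return 1;

  int thresh = 0;
  args.getNumericParam( "thresh", thresh ); // Consumes -thresh=N if present.

  bool verbose = args.getSwitchCompact( "verbose" ); // Consumes -verbose; throws on -verbose=x.
  (void)verbose;

  args.checkEmptyArgs(); // Throws if any argument was left unconsumed.
  return 0;
}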
#endif // #define __ARGUMENTSPARSER_H

-------------------------------------------------------------------------------- /src/include/histogram.h: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Applied Logic Laboratory, Ltd.                    *
*      All rights reserved.                                              *
*                                                                        *
*  Developed by Daniel Varga                                             *
*                                                                        *
*************************************************************************/

#ifndef __PRIVATE_DANIEL_NEIGHBOURS_HISTOGRAM_H
#define __PRIVATE_DANIEL_NEIGHBOURS_HISTOGRAM_H

#include <iosfwd>
#include <map>
#include <vector>

class Histogram : public std::vector<double>
{
public:
  void add( int x, double val = 1 );

  void write( std::ostream& os ) const;
  void write_othernonull( std::ostream& os ) const;
  void read( std::istream& is );

  double sumFromOne() const;
  void setZeroByTotal( double total );
};

// Semantic chaos: two semantically radically different structures can be
// stored in a DoubleMap.
// One is a function that is zero everywhere we did not say otherwise.
// Histograms are like that.
// The other is one we would interpolate between the explicitly given values.
// Binning results are like that.

// TODO: Derive an Interpolable class.
class DoubleMap : public std::map<double,double>
{
public:

  void read ( std::istream& is );
  void write( std::ostream& os ) const;

};

class SmoothDoubleMap : public DoubleMap
{
public:
  void from( const Histogram& h );

  // Interpolating methods will go here eventually.
};

class DiscreteDoubleMap : public DoubleMap
{
public:
  // Zero appears implicitly (i.e., not at all) in the DoubleMap!
  void from( const Histogram& h );

  void binning( bool logBin, bool dontShowZeros, double step, SmoothDoubleMap& binned ) const;
};

#endif // #define __PRIVATE_DANIEL_NEIGHBOURS_HISTOGRAM_H

-------------------------------------------------------------------------------- /src/include/portableHash.h: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#ifndef __HUNGLISH_INCLUDE_PORTABLEHASH_H
#define __HUNGLISH_INCLUDE_PORTABLEHASH_H


#ifdef WIN32

#define EXTNAMESPACE std
#include <hash_map>

#else

#define EXTNAMESPACE __gnu_cxx
#include <ext/hash_map>

#endif

#endif // #define __HUNGLISH_INCLUDE_PORTABLEHASH_H

-------------------------------------------------------------------------------- /src/include/serializeImpl.h: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#ifndef __HUNGLISH_INCLUDE_SERIALIZEIMPL_H
#define __HUNGLISH_INCLUDE_SERIALIZEIMPL_H

#include <ostream>
#include <set>
#include <vector>

template <class T>
std::ostream& operator<<( std::ostream& os, const std::vector<T>& v )
{
  for ( typename std::vector<T>::const_iterator it=v.begin(); it!=v.end(); ++it )
  {
    os << *it;
    if (it+1!=v.end())
      os << " ";
  }
  return os;
}

template <class T>
std::ostream& operator<<( std::ostream& os, const std::set<T>& v )
{
  // Guard against the empty set: the loop below dereferences the iterator
  // before testing for the end.
  if (v.empty())
    return os;

  typename std::set<T>::const_iterator it=v.begin();
  while (true)
  {
    os << *it;

    typename std::set<T>::const_iterator itplus = it;
    ++itplus;

    if (itplus == v.end())
      break;
    else
      os << " ";

    it = itplus;
  }
  return os;
}
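// Illustrative sketch, added for exposition (not part of the original header):
// both overloads print the elements space-separated, with no trailing space.
inline void serializeUsageSketch( std::ostream& os )
{
  std::vector<int> v;
  v.push_back(1);
  v.push_back(2);
  v.push_back(3);
  os << v; // Prints "1 2 3".

  std::set<int> s( v.begin(), v.end() );
  os << s; // Also prints "1 2 3".
}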
#endif // #define __HUNGLISH_INCLUDE_SERIALIZEIMPL_H

-------------------------------------------------------------------------------- /src/include/stringsAndStreams.h: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#ifndef __HUNGLISH_INCLUDE_STRINGSANDSTREAMS_H
#define __HUNGLISH_INCLUDE_STRINGSANDSTREAMS_H

#include <string>
#include <vector>

namespace Hunglish
{

void split( const std::string line, std::vector<std::string>& words, char delim='\t' );

} // namespace Hunglish

#endif // #define __HUNGLISH_INCLUDE_STRINGSANDSTREAMS_H

-------------------------------------------------------------------------------- /src/include/timer.h: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#ifndef __HUNGLISH_INCLUDE_TIMER_H
#define __HUNGLISH_INCLUDE_TIMER_H

// Don't use this for anything important:
// The Windows version overflows at 2^32 = 4294967296 milliseconds (49.71 days) after boot time.
// The Unix version wraps around every seven weeks (3600*24*49*1000 = 4233600000 ms).

namespace Hunglish
{

// In milliseconds.
class Timer
{
public:
  static int getTick();
};

class Ticker
{
public:
  Ticker() { start(); }

  void start() { time = Timer::getTick(); }
  int get()    { return Timer::getTick()-time; }
  int next()   { int t=get(); start(); return t; }

private:
  int time;
};
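// Illustrative sketch, added for exposition (not part of the original header):
// timing a block of work with Ticker. Mind the wraparound caveat above.
inline int tickerUsageSketch()
{
  Ticker ticker;

  // ... do some work here ...

  int elapsedMs = ticker.get();     // Milliseconds since construction.
  int sinceStartMs = ticker.next(); // Reports the elapsed time, then restarts.
  return elapsedMs + sinceStartMs;
}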
} // namespace Hunglish

#ifndef WIN32

void itoa( int n, char* s, int radix );

#endif

#endif // #define __HUNGLISH_INCLUDE_TIMER_H

-------------------------------------------------------------------------------- /src/utils/argumentsParser.cpp: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#pragma warning ( disable : 4786 )

#include "argumentsParser.h"

#include <iostream>
#include <stdlib.h>

// Could be better.
bool alphabetic( char c )
{
  return ((c>='a')&&(c<='z')) || ((c>='A')&&(c<='Z')) || (c=='_');
}

bool Arguments::read( int argc, char **argv )
{
  std::vector<std::string> remains;
  bool ok = read( argc, argv, remains );

  if (!remains.empty())
  {
    std::cerr << "Invalid argument: " << remains[0] << std::endl;
    return false;
  }
  return ok;
}

bool Arguments::read( int argc, char **argv, std::vector<std::string>& remains )
{
  remains.clear();

  for ( int i=1; i<argc; ++i )
  {
    std::string arg = argv[i];

    if (arg.empty())
      continue;

    if (arg[0]!='-')
    {
      remains.push_back(arg);
      continue;
    }

    // The name ends at the first '=' (erased) or at the first
    // nonalphabetic character (not erased).
    std::string name, value;
    std::string::size_type j;
    for ( j=1; j<arg.size(); ++j )
    {
      char c = arg[j];
      if (c=='=')
      {
        value = arg.substr(j+1);
        break;
      }
      if (!alphabetic(c))
      {
        value = arg.substr(j);
        break;
      }
      name += c;
    }

    AnyData anyData(value);

    // For a numeric value, set kind to Int, but keep dString filled as well.
    if ( !value.empty() && value.find_first_not_of("-0123456789")==std::string::npos )
    {
      anyData.kind = AnyData::Int;
      anyData.dInt = atoi(value.c_str());
    }

    operator[](name) = anyData;
  }
  return true;
}

bool Arguments::getNumericParam( const ArgName& name, int& num )
{
  iterator it = find(name);
  if (it==end())
  {
    return false;
  }

  if (it->second.kind != AnyData::Int)
  {
    std::cerr << "Argument -" << name << ": integer expected.\n";
    throw "argument error";
  }

  num = it->second.dInt;
  erase(name);
  return true;
}

bool Arguments::getSwitchConst( const ArgName& name, bool& sw ) const
{
  const_iterator it=find(name);
  if (it==end())
  {
    sw = false;
    return true;
  }
  else if (! it->second.dString.empty())
  {
    std::cerr << "Argument -" << name << ": value is not allowed.\n";
    return false;
  }
  else
  {
    sw = true;
    return true;
  }
}

bool Arguments::getSwitch( const ArgName& name, bool& sw )
{
  bool ok = getSwitchConst(name, sw);
  if (ok)
    erase(name);

  return ok;
}

bool Arguments::getSwitchCompact( const ArgName& name )
{
  bool sw(false);
  bool ok = getSwitchConst(name, sw);
  if (ok)
  {
    erase(name);
    return sw;
  }
  else
  {
    std::cerr << "No value is allowed for argument -" << name << ".\n";
    throw "argument error";
  }
}

void Arguments::checkEmptyArgs() const
{
  if (!empty())
  {
    std::cerr << "Invalid argument: ";

    for ( Arguments::const_iterator it=begin(); it!=end(); ++it )
    {
      std::cerr << "-" << it->first;
      if (!it->second.dString.empty())
        std::cerr << "=" << it->second.dString;
      std::cerr << " ";
    }
    std::cerr << std::endl;

    throw "argument error";
  }
}
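// Illustrative sketch, added for exposition (not part of the original file),
// of the splitting rule implemented by Arguments::read above. The flag names
// are made up.
void argvSplittingSketch()
{
  char prog[] = "prog", a1[] = "-thresh=50", a2[] = "-D0.5", a3[] = "-verbose";
  char* argv[] = { prog, a1, a2, a3 };

  Arguments args;
  args.read( 4, argv );

  // "-thresh=50" : name "thresh", numeric value 50 (the '=' is erased).
  // "-D0.5"      : name "D", string value "0.5" (the delimiter is kept).
  // "-verbose"   : name "verbose", empty value, i.e. a switch.
}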
-------------------------------------------------------------------------------- /src/utils/histogram.cpp: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2002. Daniel Varga                                      *
*      All rights reserved by the author.                                *
*                                                                        *
*************************************************************************/

#pragma warning ( disable : 4786 )

#include <histogram.h>

#include <iostream>

void DiscreteDoubleMap::binning( bool logBin, bool dontShowZeros, double step, SmoothDoubleMap& binned ) const
{
  const DiscreteDoubleMap& m = *this;

  binned.clear();

  if (m.empty())
    return;

  double leftestValue = m.begin()->first;

  double leftFloat = 0.0;
  if (logBin)
  {
    if ( leftestValue < 0 )
    {
      std::cerr << "Logbinning currently does not work for values smaller than 0." << std::endl;
      throw "data error";
    }
    else if ( leftestValue < 1 )
    {
      // A very primitive, basically incorrect way to get something for sub-one values. Not a real logbin.
      double left = 0.0;
      double right = 1.0;

      DoubleMap::const_iterator leftit = m.lower_bound(left);
      DoubleMap::const_iterator rightit = m.lower_bound(right);

      if (leftit!=m.end())
      {
        double sum=0;

        for ( ; leftit!=rightit; ++leftit )
        {
          sum += leftit->second;
        }

        // It is not clear what the right policy is for choosing x. Here are two primitive ones:
        // double adHocCenter = left;
        double adHocCenter = (left+right-1)/2;

        if ( (!dontShowZeros) || (sum>0) )
        {
          binned[adHocCenter] = sum/(right-left);
        }
      }
    }

    leftFloat = 1.0;
  }
  else
  {
    leftFloat = 0.0;
    if ( leftestValue < 0 )
    {
      std::cerr << "Binning currently does not work for values smaller than 0." << std::endl;
      throw "data error";
    }
  }

  while (true)
  {
    double rightFloat;
    if (logBin)
      rightFloat = leftFloat * step;
    else
      rightFloat = leftFloat + step;

    // Zero-length bin interval.
    if ((int)leftFloat==(int)rightFloat)
    {
      leftFloat = rightFloat;
      continue;
    }

    double left = (int)leftFloat;
    double right = (int)rightFloat;

    DoubleMap::const_iterator leftit = m.lower_bound(left);
    DoubleMap::const_iterator rightit = m.lower_bound(right);

    if (leftit==m.end())
      break;

    double sum=0;

    for ( ; leftit!=rightit; ++leftit )
    {
      sum += leftit->second;
    }

    // It is not clear what the right policy is for choosing x. Here are two primitive ones:
    // double adHocCenter = left;
    double adHocCenter = (left+right-1)/2;

    if ( (!dontShowZeros) || (sum>0) )
    {
      binned[adHocCenter] = sum/(right-left);
    }
    leftFloat = rightFloat;
  }
}

void DoubleMap::read( std::istream& is )
{
  clear();

  while ( !is.eof() && (is.good()) )
  {
    double x(-1024), y(-1024);
    is >> x >> y;
    is.ignore(); // New line.

    if (!is.good())
      break;

    operator[](x) = y;
  }
}

void DoubleMap::write( std::ostream& os ) const
{
  for ( DoubleMap::const_iterator it=begin(); it!=end(); ++it )
  {
    os << it->first << "\t" << it->second << std::endl;
  }
}

// Zero appears implicitly (i.e., not at all) in the DiscreteDoubleMap!
void DiscreteDoubleMap::from( const Histogram& h )
{
  clear();
  for ( int i=0; i<(int)h.size(); ++i )
  {
    if (h[i]!=0)
      operator[](i) = h[i];
  }
}

void SmoothDoubleMap::from( const Histogram& h )
{
  clear();
  for ( int i=0; i<(int)h.size(); ++i )
  {
    operator[](i) = h[i];
  }
}

void Histogram::add( int x, double val /*=1*/ )
{
  if (x>=(int)size())
  {
    resize(x+1);
  }
  operator[](x) += val;
}

void Histogram::write( std::ostream& os ) const
{
  for ( int i=0; i<(int)size(); ++i )
  {
    os << i << "\t" << operator[](i) << std::endl;
  }
}

// Writes bin zero unconditionally, the other bins only if they are nonzero.
void Histogram::write_othernonull( std::ostream& os ) const
{
  for ( int i=0; i<(int)size(); ++i )
  {
    if ( (i==0) || (operator[](i)!=0) )
      os << i << "\t" << operator[](i) << std::endl;
  }
}

void Histogram::read( std::istream& is )
{
  clear();

  while ( !is.eof() && (is.good()) )
  {
    int x(-1);
    double y(-1024);
    is >> x >> y;
    is.ignore(); // New line.

    if (!is.good())
      break;

    if (x>=(int)size())
    {
      resize(x+1);
    }
    operator[](x) = y;
  }
}

double Histogram::sumFromOne() const
{
  double n=0;
  for ( int i=1; i<(int)size(); ++i )
  {
    n += operator[](i);
  }
  return n;
}

// Sets bin zero so that the total weight of the histogram becomes total.
void Histogram::setZeroByTotal( double total )
{
  if (empty())
    resize(1);
  operator[](0) = total - sumFromOne();
}

-------------------------------------------------------------------------------- /src/utils/stringsAndStreams.cpp: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#pragma warning ( disable : 4786 )

#include "stringsAndStreams.h"

#include <string>

namespace Hunglish
{

void split( const std::string line, std::vector<std::string>& words, char delim /*='\t'*/ )
{
  words.clear();

  std::string current;
  int i;
  for ( i=0; i<(int)line.size(); ++i )
  {
    if (line[i]==delim)
    {
      words.push_back(current);
      current = "";
    }
    else
    {
      current += line[i];
    }
  }
  words.push_back(current);
}

} // namespace Hunglish
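// Illustrative sketch, added for exposition (not part of the original file):
// split() with the default tab delimiter; empty fields are preserved.
void splitUsageSketch()
{
  std::vector<std::string> words;
  Hunglish::split( "alma\t\tapple", words );
  // words is now { "alma", "", "apple" }.
}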
-------------------------------------------------------------------------------- /src/utils/timer.cpp: --------------------------------------------------------------------------------

/*************************************************************************
*                                                                        *
*  (C) Copyright 2004. Media Research Centre at the                      *
*      Sociology and Communications Department of the                    *
*      Budapest University of Technology and Economics.                  *
*                                                                        *
*  Developed by Daniel Varga.                                            *
*                                                                        *
*************************************************************************/

#pragma warning ( disable : 4786 )

#include "timer.h"

#include <assert.h>
#include <string.h>

//#include <iostream> // Just for testing.

#ifdef WIN32
#include <windows.h>
#else
#include <sys/time.h>
#include <unistd.h>
#endif

#ifndef WIN32
#include <sstream> // For the itoa implementation.
#endif

namespace Hunglish
{

int Timer::getTick()
{
#ifdef WIN32

  return GetTickCount();

#else

  timeval tv;

  // The call is kept outside assert(), so that it still runs under NDEBUG.
  int rc = gettimeofday( &tv, 0 );
  assert( rc == 0 );
  (void)rc;

  // std::cerr << "sec:" << tv.tv_sec << " usec:" << tv.tv_usec << std::endl;

  return (tv.tv_sec % (3600*24*49))*1000 + tv.tv_usec/1000;

#endif
}

} // namespace Hunglish

// Ugly portability layer:

#ifndef WIN32

void itoa( int n, char* s, int radix )
{
  assert( radix==10 );
  std::ostringstream ss;
  ss << n;
  strcpy(s,ss.str().c_str());
}

#endif

--------------------------------------------------------------------------------