├── LICENSE ├── README.md ├── dist └── invitationmodel-1.0.jar ├── pom.xml └── src └── main ├── java └── nl │ └── uva │ └── illc │ ├── AlignmentCalculator.java │ ├── PerplexityCalculator.java │ └── dataselection │ ├── InvitationModel.java │ └── TranslationTable.java └── resources └── log4j2.xml /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 
30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. 
The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 
97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 
128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. 
However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. 
Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. 
Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. 
For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 
279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 
319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # InvitationModel 2 | Implementation of domain adaptation algorithm based on the paper "Latent Domain Translation Models in Mix-of-Domains Haystack" http://www.aclweb.org/anthology/C14-1182 3 | 4 | This work was supported by "STW Open Technologieprogramma" grant under project name "Data-Powered Domain-Specific Translation Services On Demand". 
5 | 6 | ### Compilation 7 | 8 | `mvn package` 9 | 10 | This will generate target/invitationmodel-1.0.jar 11 | 12 | 13 | ### Usage 14 | 15 | ``` 16 | java -cp target/invitationmodel-1.0.jar nl.uva.illc.dataselection.InvitationModel 17 | 18 | -cin,--in-domain-corpus In-domain corpus name 19 | -cmix,--mix-domain-corpus Mix-domain corpus name 20 | -i,--max-iterations Maximum Iterations 21 | -src,--src-language Source Language 22 | -trg,--trg-language Target Language 23 | -th,--threshold This threshold decides which sentences 24 | update the translation tables. Default is 25 | 0.5 26 | -cf,--conv_threshold This threshold decides if the 27 | convergence is reached. Default is 28 | 0.00001 29 | ``` 30 | 31 | ##### Example 32 | 33 | If you have a parallel in-domain corpus indomain.l1, indomain.l2 and a parallel mix-domain corpus mixdomain.l1, mixdomain.l2, 34 | then you can execute this utility as follows: 35 | 36 | `java -cp target/invitationmodel-1.0.jar nl.uva.illc.dataselection.InvitationModel -cin indomain -cmix mixdomain -src l1 -trg l2 -i 10 -th 0.5 -cf 0.00001` 37 | -------------------------------------------------------------------------------- /dist/invitationmodel-1.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amirkamran/InvitationModel/605b17835db6e9b0350d7f33d28160ef27d56132/dist/invitationmodel-1.0.jar -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | 4.0.0 5 | nl.uva.illc.dataselection 6 | invitationmodel 7 | 1.0 8 | InvitationModel 9 | Implementation of domain adaptation algorithm based on the paper "Latent Domain Translation Models in Mix-of-Domains Haystack" http://www.aclweb.org/anthology/C14-1182 10 | jar 11 | 12 | 13 | 14 | edu.berkeley.nlp 15 | berkeleylm 16 | 1.1.2 17 | 18 | 19 | commons-cli 20 | commons-cli 21 | 1.2 22 | 23 | 24 | net.openhft 25 | 
koloboke-api-jdk6-7 26 | 0.6.7 27 | 28 | 29 | net.openhft 30 | koloboke-impl-jdk6-7 31 | 0.6.7 32 | 33 | 34 | org.apache.logging.log4j 35 | log4j-core 36 | 2.3 37 | 38 | 39 | org.apache.logging.log4j 40 | log4j-api 41 | 2.3 42 | 43 | 44 | 45 | 46 | 47 | 48 | org.apache.maven.plugins 49 | maven-shade-plugin 50 | 2.4 51 | 52 | 53 | package 54 | 55 | shade 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /src/main/java/nl/uva/illc/AlignmentCalculator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * (C) Copyright 2015 ILLC University of Amsterdam (http://www.illc.uva.nl) 3 | * 4 | * This work was supported by "STW Open Technologieprogramma" grant 5 | * under project name "Data-Powered Domain-Specific Translation Services On Demand" 6 | * 7 | * All rights reserved. This program and the accompanying materials 8 | * are made available under the terms of the GNU Lesser General Public License 9 | * (LGPL) version 2.1 which accompanies this distribution, and is available at 10 | * http://www.gnu.org/licenses/lgpl-2.1.html 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 
16 | * 17 | */ 18 | 19 | 20 | package nl.uva.illc; 21 | 22 | import java.util.ArrayList; 23 | import java.util.List; 24 | import java.util.concurrent.CountDownLatch; 25 | import java.util.concurrent.ExecutorService; 26 | import java.util.concurrent.Executors; 27 | 28 | import nl.uva.illc.dataselection.TranslationTable; 29 | 30 | 31 | /** Dispatches per-chunk alignment jobs over a corpus to a shared thread pool; each job prints the result of intersection(a1, a2) to stdout for the sentences it visits. NOTE(review): this listing has lost every span between a '<' and the next '>' (loop conditions, generic type arguments, parts of method bodies); the embedded numbering jumps 39 to 42, 56 to 62 and 72 to 81, so restore the original file before changing any logic. */ public class AlignmentCalculator { 32 | 33 | /** Completion latch; process() replaces it on every call. */ public static CountDownLatch latch = null; 34 | /** Shared cached thread pool. process() shuts it down when it finishes, so a later execute() on it will be rejected. */ public static ExecutorService jobs = Executors.newCachedThreadPool(); 35 | 36 | /** Creates a latch with ceil(src.length / 100000) counts, calls calculateAlignment once per [start, end) range with end capped at src.length, blocks until the latch reaches zero, then shuts down jobs. The loop step and the initial value of end are in the lost text. @throws InterruptedException if interrupted while waiting on the latch */ public static void process(int src[][], int trg[][], TranslationTable t2s, TranslationTable s2t) throws InterruptedException { 37 | int split = (int)Math.ceil(src.length / 100000d); 38 | latch = new CountDownLatch(split); 39 | for(int start=0;start src.length) { 42 | end = src.length; 43 | } 44 | calculateAlignment(src, trg, t2s, s2t, start, end); 45 | } 46 | latch.await(); 47 | jobs.shutdown(); 48 | } 49 | 50 | /** Queues one Runnable on jobs that iterates sentence indices beginning at start and prints intersection(a1, a2) for each one. How a1 and a2 are computed, the loop bound, and any latch.countDown() call are not visible in this listing; presumably they use t2s and s2t (confirm against the original file). */ public static void calculateAlignment(final int src[][], final int trg[][], final TranslationTable t2s, final TranslationTable s2t, final int start, final int end) { 51 | 52 | jobs.execute(new Runnable() { 53 | 54 | @Override 55 | public void run() { 56 | for(int sent=start;sent alignments = intersection(a1, a2); 62 | System.out.println(alignments); 63 | } 64 | 65 | } 66 | }); 67 | 68 | } 69 | 70 | /** Starts from an empty ArrayList and loops from index 1. The loop body and the return statement are in the lost text, as is (apparently) the header of a following method whose only surviving content is a commented-out loop. */ public static List intersection(int a1[], int a2[]) { 71 | List alignments = new ArrayList(); 72 | for(int i=1;i alignments) { 81 | /*for(Alignment alignment : alignments) { 82 | 83 | }*/ 84 | } 85 | } 86 | 87 | /** Plain mutable holder for one (source, target) int pair; presumably word positions in the two sentences (confirm against the original intersection body). */ class Alignment { 88 | 89 | public int source; 90 | public int target; 91 | 92 | public Alignment(int source, int target) { 93 | this.source = source; 94 | this.target = target; 95 | } 96 | 97 | } -------------------------------------------------------------------------------- /src/main/java/nl/uva/illc/PerplexityCalculator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * (C) Copyright 2015 ILLC University
of Amsterdam (http://www.illc.uva.nl) 3 | * 4 | * This work was supported by "STW Open Technologieprogramma" grant 5 | * under project name "Data-Powered Domain-Specific Translation Services On Demand" 6 | * 7 | * All rights reserved. This program and the accompanying materials 8 | * are made available under the terms of the GNU Lesser General Public License 9 | * (LGPL) version 2.1 which accompanies this distribution, and is available at 10 | * http://www.gnu.org/licenses/lgpl-2.1.html 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 16 | * 17 | */ 18 | 19 | package nl.uva.illc; 20 | 21 | import java.io.BufferedReader; 22 | import java.io.File; 23 | import java.io.FileInputStream; 24 | import java.io.FileOutputStream; 25 | import java.io.InputStreamReader; 26 | import java.io.OutputStreamWriter; 27 | import java.io.PrintWriter; 28 | import java.util.concurrent.CountDownLatch; 29 | import java.util.concurrent.ExecutorService; 30 | import java.util.concurrent.Executors; 31 | import java.util.concurrent.Future; 32 | 33 | /** Command-line driver that splits selected{i}.{lang} files, runs ./ngram-count and ./ngram on every split through /bin/sh, and passes each resulting .ppl file to readPpl. NOTE(review): this listing has lost every span between a '<' and the next '>'; the embedded numbering jumps from 62 to 112, which removes the end of main, all of readPpl and the head of splitFile. Restore the original file before changing any logic. */ public class PerplexityCalculator { 34 | 35 | /** Fixed pool of 20 threads shared by every submitted task. */ public static ExecutorService jobs = Executors.newFixedThreadPool(20); 36 | /** Sized to files * 19 in main; the countDown() calls are presumably in the lost readPpl body. */ public static CountDownLatch latch = null; 37 | 38 | /** One row per input file, 19 columns; it is filled outside the visible text. */ public static double [][]perp = null; 39 | 40 | /** Entry point. args[0] is the number of input files and args[1] the language suffix. For file i (1-based) it calls splitFile on selected{i}.{lang} with 100000 as the second argument, then for each j in 1..19 chains three tasks that each wait on the previous Future: ngram-count training on split j, ngram -ppl scoring against ./test.{lang} with output redirected to a .ppl file, and readPpl for cell (i-1, j-1). It then waits on the latch and shuts down the pool. NOTE(review): an args[0] of 0 makes perp[0] throw ArrayIndexOutOfBoundsException. */ @SuppressWarnings("rawtypes") 41 | public static void main(String args[]) throws InterruptedException { 42 | 43 | int files = Integer.parseInt(args[0]); 44 | String lang = args[1]; 45 | 46 | perp = new double[files][19]; 47 | 48 | new File("./temp").mkdir(); 49 | 50 | latch = new CountDownLatch(files*perp[0].length); 51 | for(int i=1;i<=perp.length;i++) { 52 | String fileName = "selected" + i + "." 
+ lang; 53 | Future f1 = splitFile(fileName, 100000); 54 | for(int j=1;j<=perp[i-1].length;j++) { 55 | Future f2 = runCommand("./ngram-count -order 5 -interpolate -kndiscount3 -kndiscount5 -lm ./temp/" + fileName+"."+j+".lm -text ./temp/" + fileName+"."+j , f1); 56 | Future f3 = runCommand("./ngram -lm ./temp/" + fileName+"."+j +".lm -ppl ./test." + lang + " > ./temp/" + fileName+"."+j + ".ppl", f2); 57 | readPpl("./temp/" + fileName+"."+j + ".ppl", i-1, j-1, f3); 58 | } 59 | } 60 | latch.await(); 61 | jobs.shutdown(); 62 | /* NOTE(review): text is lost here (embedded numbering jumps 62 to 112). What follows is apparently the tail of splitFile: on reaching splitSize it resets s to 1, closes the current writer and opens the next ./temp split file as UTF8; any exception prints a stack trace and exits the JVM with status 1. */ for(int i=0;i= splitSize) { 112 | s = 1; 113 | out.close(); 114 | out = new PrintWriter(new OutputStreamWriter(new FileOutputStream("./temp/" + fileName + "." + ++i), "UTF8")); 115 | } 116 | } 117 | out.close(); 118 | reader.close(); 119 | } catch (Exception e) { 120 | e.printStackTrace(); 121 | System.exit(1); 122 | } 123 | } 124 | }); 125 | } 126 | 127 | 128 | /** Submits a task to jobs and returns its Future. The task first blocks on future (when non-null), echoes command to stdout, runs it as /bin/sh -c command, waits for it to exit, then prints the child's stderr followed by its stdout. Any exception prints its message and terminates the JVM with status 1. NOTE(review): waitFor() runs before either stream is read, so a child that fills its output pipe can block indefinitely; the exit code is ignored; the readers use the platform default charset. */ @SuppressWarnings("rawtypes") 129 | public static Future runCommand(final String command, final Future future) { 130 | return jobs.submit(new Runnable() { 131 | @Override 132 | public void run() { 133 | try { 134 | 135 | if(future!=null) { 136 | future.get(); 137 | } 138 | 139 | System.out.println(command); 140 | 141 | String [] cmd = {"/bin/sh" , "-c", command}; 142 | Process p = Runtime.getRuntime().exec(cmd); 143 | p.waitFor(); 144 | BufferedReader reader = new BufferedReader(new InputStreamReader(p.getErrorStream())); 145 | String line = ""; 146 | while ((line = reader.readLine()) != null) { 147 | System.out.println(line); 148 | } 149 | reader.close(); 150 | reader = new BufferedReader(new InputStreamReader(p.getInputStream())); 151 | while ((line = reader.readLine()) != null) { 152 | System.out.println(line); 153 | } 154 | reader.close(); 155 | 156 | } catch (Exception e) { 157 | System.out.println(e.getMessage()); 158 | System.exit(1); 159 | } 160 | } 161 | }); 162 | } 163 | 164 | /** Same as runCommand(command, null): the task has no prerequisite. */ @SuppressWarnings("rawtypes") 165 | public static Future runCommand(String command) { 166 | return 
runCommand(command, null); 167 | } 168 | 169 | 170 | } 171 | -------------------------------------------------------------------------------- /src/main/java/nl/uva/illc/dataselection/InvitationModel.java: -------------------------------------------------------------------------------- 1 | /* 2 | * (C) Copyright 2015 ILLC University of Amsterdam (http://www.illc.uva.nl) 3 | * 4 | * This work was supported by "STW Open Technologieprogramma" grant 5 | * under project name "Data-Powered Domain-Specific Translation Services On Demand" 6 | * 7 | * All rights reserved. This program and the accompanying materials 8 | * are made available under the terms of the GNU Lesser General Public License 9 | * (LGPL) version 2.1 which accompanies this distribution, and is available at 10 | * http://www.gnu.org/licenses/lgpl-2.1.html 11 | * 12 | * This program is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | * GNU General Public License for more details. 
16 | * 17 | */ 18 | 19 | package nl.uva.illc.dataselection; 20 | 21 | import edu.berkeley.nlp.lm.ConfigOptions; 22 | import edu.berkeley.nlp.lm.NgramLanguageModel; 23 | import edu.berkeley.nlp.lm.StringWordIndexer; 24 | import edu.berkeley.nlp.lm.io.ArpaLmReader; 25 | import edu.berkeley.nlp.lm.io.LmReaders; 26 | 27 | import java.io.BufferedReader; 28 | import java.io.BufferedWriter; 29 | import java.io.File; 30 | import java.io.FileInputStream; 31 | import java.io.FileNotFoundException; 32 | import java.io.FileOutputStream; 33 | import java.io.FileReader; 34 | import java.io.IOException; 35 | import java.io.InputStreamReader; 36 | import java.io.LineNumberReader; 37 | import java.io.OutputStreamWriter; 38 | import java.io.PrintWriter; 39 | import java.nio.charset.Charset; 40 | import java.util.ArrayList; 41 | import java.util.Collections; 42 | import java.util.List; 43 | import java.util.concurrent.CountDownLatch; 44 | import java.util.concurrent.ExecutorService; 45 | import java.util.concurrent.Executors; 46 | import java.util.concurrent.TimeUnit; 47 | 48 | import net.openhft.koloboke.collect.map.hash.HashIntFloatMap; 49 | import net.openhft.koloboke.collect.map.hash.HashIntFloatMaps; 50 | import net.openhft.koloboke.collect.map.hash.HashIntIntMap; 51 | import net.openhft.koloboke.collect.map.hash.HashIntIntMaps; 52 | import net.openhft.koloboke.collect.map.hash.HashIntObjMap; 53 | import net.openhft.koloboke.collect.map.hash.HashIntObjMaps; 54 | import net.openhft.koloboke.collect.map.hash.HashObjIntMap; 55 | import net.openhft.koloboke.collect.map.hash.HashObjIntMaps; 56 | 57 | import org.apache.commons.cli.CommandLine; 58 | import org.apache.commons.cli.CommandLineParser; 59 | import org.apache.commons.cli.GnuParser; 60 | import org.apache.commons.cli.HelpFormatter; 61 | import org.apache.commons.cli.Options; 62 | import org.apache.commons.cli.ParseException; 63 | import org.apache.logging.log4j.LogManager; 64 | import org.apache.logging.log4j.Logger; 65 | 66 
| /** 67 | * Invitation based data selection approach exploits in-domain data (both 68 | * monolingual and bilingual) as prior to guide word alignment and phrase pair 69 | * estimates in the large mix-domain corpus. As a by-product, accurate estimates 70 | * for P(D|e,f) of the mixed-domain sentences are produced (with D being either 71 | * in-domain or out-of-domain), which can be used to rank the sentences in Dmix 72 | * according to their relevance to Din. 73 | * 74 | * For more information see: Hoang, Cuong and Sima'an, Khalil (2014): Latent 75 | * Domain Translation Models in Mix-of-Domains Haystack, Proceedings of COLING 76 | * 2014, the 25th International Conference on Computational Linguistics 77 | * http://www.aclweb.org/anthology/C14-1182.pdf 78 | * 79 | * @author Amir Kamran 80 | */ 81 | 82 | public class InvitationModel { 83 | 84 | private static Logger log = LogManager.getLogger(InvitationModel.class); 85 | 86 | static String IN = null; 87 | static String MIX = null; 88 | static String SRC = null; 89 | static String TRG = null; 90 | 91 | static int iMAX = 10; 92 | 93 | static int src_indomain[][] = null; 94 | static int trg_indomain[][] = null; 95 | static int src_mixdomain[][] = null; 96 | static int trg_mixdomain[][] = null; 97 | static int src_outdomain[][] = null; 98 | static int trg_outdomain[][] = null; 99 | 100 | static HashObjIntMap src_codes = null; 101 | static HashObjIntMap trg_codes = null; 102 | 103 | static float lm[][] = null; 104 | 105 | static float LOG_0_5 = (float) Math.log(0.5); 106 | 107 | // default confidence threshold: use to decide which sentences 108 | // will update the translation table 109 | static float CONF_THRESHOLD = (float) Math.log(0.5); 110 | 111 | // default convergence threshold: How much change in PD1 is significant 112 | // to continue to next iteration 113 | static float CONV_THRESHOLD = 0.00001f; 114 | 115 | 116 | static float PD1 = LOG_0_5; 117 | static float PD0 = LOG_0_5; 118 | 119 | static TranslationTable 
ttable[] = new TranslationTable[4]; 120 | 121 | public static CountDownLatch latch = null; 122 | public static ExecutorService jobs = Executors.newCachedThreadPool(); 123 | 124 | public static HashIntIntMap ignore = HashIntIntMaps.newMutableMap(); 125 | 126 | public static float n = 0.5f; 127 | public static float V = 500000f; 128 | public static float nV = n * V; 129 | public static float p = -(float) Math.log(V); 130 | 131 | public static void main(String args[]) throws IOException, 132 | InterruptedException { 133 | log.info("Start ..."); 134 | processCommandLineArguments(args); 135 | readFiles(); 136 | initialize(); 137 | burnIN(); 138 | createLM(); 139 | training(); 140 | 141 | jobs.shutdown(); 142 | 143 | jobs.awaitTermination(10, TimeUnit.MINUTES); 144 | 145 | log.info("END"); 146 | } 147 | 148 | public static void processCommandLineArguments(String args[]) { 149 | Options options = new Options(); 150 | options.addOption("cmix", "mix-domain-corpus", true, 151 | "Mix-domain corpus name"); 152 | options.addOption("cin", "in-domain-corpus", true, 153 | "In-domain corpus name"); 154 | options.addOption("src", "src-language", true, "Source Language"); 155 | options.addOption("trg", "trg-language", true, "Target Language"); 156 | options.addOption("i", "max-iterations", true, "Maximum Iterations"); 157 | options.addOption("th", "threshold", true, "This threshold deicdes which sentences updates translation tables. Default is 0.5"); 158 | options.addOption("cf", "conv_threshold", true, "This threshold decide if the convergence is reached. 
Default is 0.00001"); 159 | 160 | CommandLineParser parser = new GnuParser(); 161 | try { 162 | CommandLine cmd = parser.parse(options, args); 163 | if (cmd.hasOption("cmix") && cmd.hasOption("cin") 164 | && cmd.hasOption("src") && cmd.hasOption("trg")) { 165 | MIX = cmd.getOptionValue("cmix"); 166 | IN = cmd.getOptionValue("cin"); 167 | SRC = cmd.getOptionValue("src"); 168 | TRG = cmd.getOptionValue("trg"); 169 | 170 | if (cmd.hasOption("i")) { 171 | iMAX = Integer.parseInt(cmd.getOptionValue("i")); 172 | } 173 | 174 | if (cmd.hasOption("th")) { 175 | CONF_THRESHOLD = (float) Math.log(Double.parseDouble(cmd.getOptionValue("th"))); 176 | } 177 | 178 | if (cmd.hasOption("cf")) { 179 | CONV_THRESHOLD = (float) Float.parseFloat(cmd.getOptionValue("cf")); 180 | } 181 | 182 | 183 | } else { 184 | System.out.println("Missing required argumetns!"); 185 | printHelp(options); 186 | } 187 | } catch (ParseException e) { 188 | printHelp(options); 189 | } 190 | } 191 | 192 | private static void printHelp(Options options) { 193 | HelpFormatter formatter = new HelpFormatter(); 194 | formatter.printHelp("java " + InvitationModel.class.getName(), options); 195 | System.exit(1); 196 | } 197 | 198 | public static void initialize() throws InterruptedException { 199 | 200 | log.info("Initializing Translaiton Tables"); 201 | 202 | for (int i = 0; i < ttable.length; i++) { 203 | ttable[i] = new TranslationTable(); 204 | } 205 | 206 | latch = new CountDownLatch(4); 207 | 208 | initializeTranslationTable(src_indomain, trg_indomain, ttable[0]); 209 | initializeTranslationTable(trg_indomain, src_indomain, ttable[1]); 210 | initializeTranslationTable(src_mixdomain, trg_mixdomain, ttable[2]); 211 | initializeTranslationTable(trg_mixdomain, src_mixdomain, ttable[3]); 212 | 213 | latch.await(); 214 | 215 | log.info("DONE"); 216 | } 217 | 218 | public static void initializeTranslationTable(final int src[][], 219 | final int trg[][], final TranslationTable ttable) { 220 | 221 | jobs.execute(new 
Runnable() { 222 | 223 | @Override 224 | public void run() { 225 | 226 | HashIntFloatMap totals = HashIntFloatMaps.newMutableMap(); 227 | 228 | for (int sent = 0; sent < src.length; sent++) { 229 | 230 | if (sent % 100000 == 0) 231 | log.debug("Sentence " + sent); 232 | 233 | int ssent[] = src[sent]; 234 | int tsent[] = trg[sent]; 235 | for (int t = 1; t < tsent.length; t++) { 236 | int tw = tsent[t]; 237 | for (int s = 0; s < ssent.length; s++) { 238 | int sw = ssent[s]; 239 | ttable.increas(tw, sw, 1f); 240 | totals.addValue(sw, 1f, 0f); 241 | } 242 | } 243 | } 244 | 245 | // normalizing and smoothing 246 | for (int tw : ttable.ttable.keySet()) { 247 | HashIntFloatMap tMap = ttable.ttable.get(tw); 248 | for (int sw : tMap.keySet()) { 249 | float prob = (float) (Math.log(ttable.get(tw, sw) + n) - Math 250 | .log(totals.get(sw) + nV)); 251 | ttable.put(tw, sw, prob); 252 | } 253 | } 254 | 255 | log.info("."); 256 | 257 | InvitationModel.latch.countDown(); 258 | } 259 | 260 | }); 261 | 262 | } 263 | 264 | public static void createLM() throws InterruptedException { 265 | 266 | log.info("Creating Language Models ..."); 267 | 268 | lm = new float[4][]; 269 | 270 | latch = new CountDownLatch(4); 271 | 272 | createLM(IN + "." + SRC + ".encoded", lm, 0, src_mixdomain); 273 | createLM(IN + "." + TRG + ".encoded", lm, 1, trg_mixdomain); 274 | createLM("outdomain." + SRC + ".encoded", lm, 2, src_mixdomain); 275 | createLM("outdomain." + TRG + ".encoded", lm, 3, trg_mixdomain); 276 | 277 | latch.await(); 278 | 279 | log.info("DONE"); 280 | 281 | } 282 | 283 | public static void burnIN() throws IOException, InterruptedException { 284 | 285 | log.info("BurnIN started ... 
"); 286 | 287 | HashIntObjMap results = null; 288 | 289 | for (int i = 1; i <= 1; i++) { 290 | 291 | log.info("Iteration " + i); 292 | 293 | results = HashIntObjMaps.newMutableMap(); 294 | 295 | float sPD[][] = new float[2][src_mixdomain.length]; 296 | 297 | int split = (int) Math.ceil(src_mixdomain.length / 100000d); 298 | 299 | latch = new CountDownLatch(split); 300 | for (int sent = 0; sent < src_mixdomain.length; sent += 100000) { 301 | int end = sent + 100000; 302 | if (end > src_mixdomain.length) { 303 | end = src_mixdomain.length; 304 | } 305 | calcualteBurnInScore(sent, end, sPD); 306 | } 307 | latch.await(); 308 | 309 | float countPD[] = new float[2]; 310 | countPD[0] = Float.NEGATIVE_INFINITY; 311 | countPD[1] = Float.NEGATIVE_INFINITY; 312 | 313 | for (int sent = 0; sent < src_mixdomain.length; sent++) { 314 | 315 | if (ignore.containsKey(sent)) 316 | continue; 317 | 318 | if (Float.isNaN(sPD[0][sent]) || Float.isNaN(sPD[1][sent])) { 319 | ignore.put(sent, sent); 320 | log.info("Ignoring " + (sent + 1)); 321 | continue; 322 | } 323 | 324 | countPD[0] = logAdd(countPD[0], sPD[0][sent]); 325 | countPD[1] = logAdd(countPD[1], sPD[1][sent]); 326 | 327 | results.put(sent, new Result(sent, sPD[0][sent])); 328 | 329 | } 330 | 331 | } 332 | 333 | log.info("BurnIN DONE"); 334 | 335 | log.info("Writing outdomain corpus ... "); 336 | 337 | ArrayList sortedResult = new ArrayList(results.values()); 338 | Collections.sort(sortedResult); 339 | 340 | PrintWriter src_out = new PrintWriter("outdomain." + SRC + ".encoded"); 341 | PrintWriter trg_out = new PrintWriter("outdomain." 
+ TRG + ".encoded"); 342 | 343 | PrintWriter out_score = new PrintWriter("outdomain.scores"); 344 | 345 | src_outdomain = new int[src_indomain.length][]; 346 | trg_outdomain = new int[trg_indomain.length][]; 347 | 348 | int j = 0; 349 | 350 | for (Result r : sortedResult) { 351 | 352 | int sentIndex = r.sentenceNumber - 1; 353 | 354 | int ssent[] = src_mixdomain[sentIndex]; 355 | int tsent[] = trg_mixdomain[sentIndex]; 356 | 357 | out_score.println(r.sentenceNumber + "\t" + r.score); 358 | 359 | src_outdomain[j] = ssent; 360 | trg_outdomain[j] = tsent; 361 | 362 | for (int w = 1; w < ssent.length; w++) { 363 | src_out.print(ssent[w]); 364 | src_out.print(" "); 365 | } 366 | src_out.println(); 367 | for (int w = 1; w < tsent.length; w++) { 368 | trg_out.print(tsent[w]); 369 | trg_out.print(" "); 370 | } 371 | trg_out.println(); 372 | 373 | j++; 374 | 375 | if (j == src_indomain.length) { 376 | break; 377 | } 378 | } 379 | 380 | out_score.close(); 381 | 382 | src_out.close(); 383 | trg_out.close(); 384 | 385 | log.info("DONE"); 386 | 387 | } 388 | 389 | public static void training() throws FileNotFoundException, 390 | InterruptedException { 391 | 392 | log.info("Starting Invitation EM ..."); 393 | 394 | latch = new CountDownLatch(2); 395 | ttable[2] = new TranslationTable(); 396 | ttable[3] = new TranslationTable(); 397 | initializeTranslationTable(src_outdomain, trg_outdomain, ttable[2]); 398 | initializeTranslationTable(trg_outdomain, src_outdomain, ttable[3]); 399 | latch.await(); 400 | 401 | for (int i = 1; i <= iMAX; i++) { 402 | log.info("Iteration " + i); 403 | HashIntObjMap results = HashIntObjMaps.newMutableMap(); 404 | 405 | float sPD[][] = new float[2][src_mixdomain.length]; 406 | 407 | int splits = 10; 408 | int split_size = src_mixdomain.length / splits; 409 | 410 | latch = new CountDownLatch(splits); 411 | for (int s=0;s1 && Math.abs(Math.exp(newPD1) - Math.exp(PD1)) <= CONV_THRESHOLD) { 453 | log.info("Convergence threshold reached."); 454 | break; 455 
| } 456 | 457 | PD1 = newPD1; 458 | PD0 = newPD0; 459 | 460 | if (i < iMAX) { 461 | 462 | latch = new CountDownLatch(4); 463 | updateTranslationTable(src_mixdomain, trg_mixdomain, ttable[0], sPD[1]); 464 | updateTranslationTable(trg_mixdomain, src_mixdomain, ttable[1], sPD[1]); 465 | updateTranslationTable(src_mixdomain, trg_mixdomain, ttable[2], sPD[0]); 466 | updateTranslationTable(trg_mixdomain, src_mixdomain, ttable[3], sPD[0]); 467 | latch.await(); 468 | } 469 | 470 | } 471 | } 472 | 473 | public static void calcualteScore(final int start, final int end, 474 | final float sPD[][]) { 475 | 476 | jobs.execute(new Runnable() { 477 | 478 | @Override 479 | public void run() { 480 | for (int sent = start; sent < end; sent++) { 481 | 482 | if (ignore.containsKey(sent)) 483 | continue; 484 | 485 | int ssent[] = src_mixdomain[sent]; 486 | int tsent[] = trg_mixdomain[sent]; 487 | 488 | float sProb[] = new float[4]; 489 | 490 | sProb[0] = calculateProb(ssent, tsent, ttable[0]); 491 | sProb[1] = calculateProb(tsent, ssent, ttable[1]); 492 | sProb[2] = calculateProb(ssent, tsent, ttable[2]); 493 | sProb[3] = calculateProb(tsent, ssent, ttable[3]); 494 | 495 | float in_score = PD1 + logAdd(sProb[0] + lm[1][sent], sProb[1] + lm[0][sent]); 496 | float mix_score = PD0 + logAdd(sProb[2] + lm[3][sent], sProb[3] + lm[2][sent]); 497 | 498 | sPD[1][sent] = in_score - logAdd(in_score, mix_score); 499 | sPD[0][sent] = mix_score - logAdd(in_score, mix_score); 500 | 501 | } 502 | InvitationModel.latch.countDown(); 503 | } 504 | }); 505 | 506 | } 507 | 508 | public static void calcualteBurnInScore(final int start, final int end, 509 | final float sPD[][]) { 510 | 511 | jobs.execute(new Runnable() { 512 | 513 | @Override 514 | public void run() { 515 | for (int sent = start; sent < end; sent++) { 516 | 517 | if (ignore.containsKey(sent)) 518 | continue; 519 | 520 | int ssent[] = src_mixdomain[sent]; 521 | int tsent[] = trg_mixdomain[sent]; 522 | 523 | float sProb[] = new float[4]; 524 | 
525 | sProb[0] = calculateProb(ssent, tsent, ttable[0]); 526 | sProb[1] = calculateProb(tsent, ssent, ttable[1]); 527 | sProb[2] = calculateProb(ssent, tsent, ttable[2]); 528 | sProb[3] = calculateProb(tsent, ssent, ttable[3]); 529 | 530 | float in_score = PD1 + logAdd(sProb[0], sProb[1]); 531 | float mix_score = PD0 + logAdd(sProb[2], sProb[3]); 532 | 533 | sPD[1][sent] = in_score - logAdd(in_score, mix_score); 534 | sPD[0][sent] = mix_score - logAdd(in_score, mix_score); 535 | 536 | } 537 | InvitationModel.latch.countDown(); 538 | } 539 | }); 540 | 541 | } 542 | 543 | public static void writeResult(final int iterationNumber, 544 | final HashIntObjMap results) { 545 | 546 | jobs.execute(new Runnable() { 547 | 548 | @Override 549 | public void run() { 550 | ArrayList sortedResult = new ArrayList(results 551 | .values()); 552 | Collections.sort(sortedResult); 553 | try { 554 | PrintWriter output = new PrintWriter("output_" 555 | + iterationNumber + ".txt"); 556 | for (Result r : sortedResult) { 557 | output.println(r.sentenceNumber + "\t" 558 | + Math.exp(r.score) + "\t" 559 | + Math.exp(r.lm_score)); 560 | } 561 | output.close(); 562 | } catch (FileNotFoundException e) { 563 | e.printStackTrace(); 564 | } 565 | } 566 | }); 567 | 568 | } 569 | 570 | public static float calculateProb(final int ssent[], final int tsent[], 571 | final TranslationTable ttable) { 572 | float prob = 0; 573 | for (int t = 1; t < tsent.length; t++) { 574 | int tw = tsent[t]; 575 | float sum = Float.NEGATIVE_INFINITY; 576 | for (int s = 0; s < ssent.length; s++) { 577 | int sw = ssent[s]; 578 | sum = logAdd(sum, ttable.get(tw, sw, p)); 579 | } 580 | prob += sum; 581 | } 582 | return prob - (float)Math.log(Math.pow(ssent.length, tsent.length-1)); 583 | } 584 | 585 | public static void updateTranslationTable(final int src[][], 586 | final int trg[][], final TranslationTable ttable, final float sPD[]) { 587 | 588 | jobs.execute(new Runnable() { 589 | 590 | @Override 591 | public void run() { 
592 | log.info("Updating translation table ... "); 593 | 594 | TranslationTable counts = new TranslationTable(); 595 | HashIntFloatMap totals = HashIntFloatMaps.newMutableMap(); 596 | 597 | for (int sent = 0; sent < src.length; sent++) { 598 | 599 | if (sent % 100000 == 0) 600 | log.debug("Sentence " + sent); 601 | 602 | if (ignore.containsKey(sent)) 603 | continue; 604 | 605 | if(sPD[sent] < CONF_THRESHOLD) continue; 606 | 607 | int ssent[] = src[sent]; 608 | int tsent[] = trg[sent]; 609 | 610 | HashIntFloatMap s_total = HashIntFloatMaps.newMutableMap(); 611 | 612 | // calculating normalization 613 | for (int t = 1; t < tsent.length; t++) { 614 | int tw = tsent[t]; 615 | for (int s = 0; s < ssent.length; s++) { 616 | int sw = ssent[s]; 617 | s_total.put( 618 | tw, 619 | logAdd(s_total.getOrDefault(tw, 620 | Float.NEGATIVE_INFINITY), ttable 621 | .get(tw, sw, p))); 622 | } 623 | } 624 | 625 | // collect counts 626 | for (int t = 1; t < tsent.length; t++) { 627 | int tw = tsent[t]; 628 | for (int s = 0; s < ssent.length; s++) { 629 | int sw = ssent[s]; 630 | float in_count = sPD[sent] 631 | + (ttable.get(tw, sw, p) - s_total.get(tw)); 632 | counts.put( 633 | tw, 634 | sw, 635 | logAdd(counts.get(tw, sw, 636 | Float.NEGATIVE_INFINITY), in_count)); 637 | totals.put( 638 | sw, 639 | logAdd(totals.getOrDefault(sw, 640 | Float.NEGATIVE_INFINITY), in_count)); 641 | } 642 | } 643 | 644 | } 645 | 646 | // maximization 647 | for (int tw : counts.ttable.keySet()) { 648 | HashIntFloatMap tMap = counts.ttable.get(tw); 649 | for (int sw : tMap.keySet()) { 650 | float newProb = counts.get(tw, sw) - totals.get(sw); 651 | ttable.put(tw, sw, newProb); 652 | } 653 | } 654 | log.info("Updating translation table DONE"); 655 | InvitationModel.latch.countDown(); 656 | } 657 | 658 | }); 659 | 660 | } 661 | 662 | public static void readFiles() throws IOException, InterruptedException { 663 | 664 | log.info("Reading files"); 665 | 666 | src_codes = HashObjIntMaps.newMutableMap(); 667 | 
trg_codes = HashObjIntMaps.newMutableMap(); 668 | src_codes.put(null, 0); 669 | trg_codes.put(null, 0); 670 | 671 | LineNumberReader lr = new LineNumberReader(new FileReader(IN + "." 672 | + SRC)); 673 | lr.skip(Long.MAX_VALUE); 674 | int indomain_size = lr.getLineNumber(); 675 | lr.close(); 676 | 677 | lr = new LineNumberReader(new FileReader(MIX + "." + SRC)); 678 | lr.skip(Long.MAX_VALUE); 679 | int mixdomain_size = lr.getLineNumber(); 680 | lr.close(); 681 | 682 | src_indomain = new int[indomain_size][]; 683 | trg_indomain = new int[indomain_size][]; 684 | src_mixdomain = new int[mixdomain_size][]; 685 | trg_mixdomain = new int[mixdomain_size][]; 686 | 687 | latch = new CountDownLatch(2); 688 | readFile(IN + "." + SRC, src_codes, src_indomain); 689 | readFile(IN + "." + TRG, trg_codes, trg_indomain); 690 | latch.await(); 691 | 692 | latch = new CountDownLatch(2); 693 | readFile(MIX + "." + SRC, src_codes, src_mixdomain); 694 | readFile(MIX + "." + TRG, trg_codes, trg_mixdomain); 695 | latch.await(); 696 | 697 | } 698 | 699 | public static void readFile(final String fileName, 700 | final HashObjIntMap codes, final int lines[][]) 701 | throws IOException { 702 | jobs.execute(new Runnable() { 703 | @Override 704 | public void run() { 705 | try { 706 | BufferedReader reader = new BufferedReader( 707 | new InputStreamReader( 708 | new FileInputStream(fileName), Charset 709 | .forName("UTF8"))); 710 | String line = null; 711 | int i = 0; 712 | while ((line = reader.readLine()) != null) { 713 | String words[] = line.split("\\s+"); 714 | lines[i] = new int[words.length + 1]; 715 | lines[i][0] = 0; 716 | int j = 1; 717 | for (String word : words) { 718 | int code = 0; 719 | if (!codes.containsKey(word)) { 720 | code = codes.size() + 1; 721 | codes.put(word, code); 722 | } else { 723 | code = codes.getInt(word); 724 | } 725 | lines[i][j++] = code; 726 | } 727 | i++; 728 | } 729 | reader.close(); 730 | } catch (IOException e) { 731 | e.printStackTrace(); 732 | 
System.exit(1); 733 | } 734 | writeEncodedFile(fileName + ".encoded", lines); 735 | log.info(fileName + " ... DONE"); 736 | InvitationModel.latch.countDown(); 737 | } 738 | }); 739 | } 740 | 741 | public static void writeEncodedFile(final String fileName, 742 | final int lines[][]) { 743 | 744 | jobs.execute(new Runnable() { 745 | 746 | @Override 747 | public void run() { 748 | try { 749 | BufferedWriter encodedWriter = new BufferedWriter( 750 | new OutputStreamWriter(new FileOutputStream( 751 | fileName), Charset.forName("UTF8"))); 752 | for (int i = 0; i < lines.length; i++) { 753 | for (int j = 1; j < lines[i].length; j++) { 754 | int word = lines[i][j]; 755 | encodedWriter.write("" + word); 756 | encodedWriter.write(" "); 757 | } 758 | encodedWriter.write("\n"); 759 | } 760 | encodedWriter.close(); 761 | } catch (IOException e) { 762 | e.printStackTrace(); 763 | } 764 | 765 | } 766 | 767 | }); 768 | 769 | } 770 | 771 | public static float getLMProb(NgramLanguageModel lm, int sent[]) { 772 | List words = new ArrayList(); 773 | for (int i = 1; i < sent.length; i++) { 774 | words.add("" + sent[i]); 775 | } 776 | return lm.getLogProb(words); 777 | } 778 | 779 | public static void createLM(final String fileName, final float lm[][], 780 | final int index, final int corpus[][]) { 781 | 782 | jobs.execute(new Runnable() { 783 | 784 | @Override 785 | public void run() { 786 | log.info("Creating language model"); 787 | 788 | NgramLanguageModel createdLM = null; 789 | final int lmOrder = 4; 790 | final List inputFiles = new ArrayList(); 791 | inputFiles.add(fileName); 792 | final StringWordIndexer wordIndexer = new StringWordIndexer(); 793 | wordIndexer.setStartSymbol(ArpaLmReader.START_SYMBOL); 794 | wordIndexer.setEndSymbol(ArpaLmReader.END_SYMBOL); 795 | wordIndexer.setUnkSymbol(ArpaLmReader.UNK_SYMBOL); 796 | 797 | createdLM = LmReaders 798 | .readContextEncodedKneserNeyLmFromTextFile(inputFiles, 799 | wordIndexer, lmOrder, new ConfigOptions(), 800 | new File(fileName 
+ ".lm")); 801 | 802 | lm[index] = new float[corpus.length]; 803 | 804 | for (int i = 0; i < corpus.length; i++) { 805 | int sent[] = corpus[i]; 806 | lm[index][i] = getLMProb(createdLM, sent); 807 | } 808 | 809 | log.info("."); 810 | 811 | InvitationModel.latch.countDown(); 812 | } 813 | 814 | }); 815 | } 816 | 817 | public static float logAdd(float a, float b) { 818 | float max, negDiff; 819 | if (a > b) { 820 | max = a; 821 | negDiff = b - a; 822 | } else { 823 | max = b; 824 | negDiff = a - b; 825 | } 826 | if (max == Float.NEGATIVE_INFINITY) { 827 | return max; 828 | } else if (negDiff < -20.0f) { 829 | return max; 830 | } else { 831 | return max + (float) Math.log(1.0 + Math.exp(negDiff)); 832 | } 833 | } 834 | 835 | } 836 | 837 | class Result implements Comparable { 838 | 839 | int sentenceNumber; 840 | float score = 1; 841 | float lm_score = 1; 842 | 843 | public Result(int sentenceNumber, float score) { 844 | this.sentenceNumber = sentenceNumber + 1; 845 | this.score = score; 846 | } 847 | 848 | public Result(int sentenceNumber, float score, float lm_score) { 849 | this.sentenceNumber = sentenceNumber + 1; 850 | this.score = score; 851 | this.lm_score = lm_score; 852 | } 853 | 854 | @Override 855 | public int compareTo(Result result) { 856 | int cmp = Float.compare(result.score, this.score); 857 | if (cmp == 0) { 858 | cmp = Float.compare(result.lm_score, this.lm_score); 859 | } 860 | return cmp; 861 | } 862 | 863 | } -------------------------------------------------------------------------------- /src/main/java/nl/uva/illc/dataselection/TranslationTable.java: -------------------------------------------------------------------------------- 1 | /* 2 | * (C) Copyright 2015 ILLC University of Amsterdam (http://www.illc.uva.nl) 3 | * 4 | * This work was supported by "STW Open Technologieprogramma" grant 5 | * under project name "Data-Powered Domain-Specific Translation Services On Demand" 6 | * 7 | * All rights reserved. 
This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License
 * (LGPL) version 2.1 which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl-2.1.html
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

package nl.uva.illc.dataselection;

import net.openhft.koloboke.collect.map.hash.HashIntFloatMap;
import net.openhft.koloboke.collect.map.hash.HashIntFloatMaps;
import net.openhft.koloboke.collect.map.hash.HashIntObjMap;
import net.openhft.koloboke.collect.map.hash.HashIntObjMaps;

/**
 * Sparse two-level table of float values addressed by a pair of word ids.
 *
 * The outer map is keyed by the first id ({@code tw}); each entry is a row
 * that maps the second id ({@code sw}) to the stored value. The class does
 * not interpret the values itself: callers decide whether they are counts,
 * probabilities or log-probabilities.
 *
 * A pair that has never been stored is reported as {@link Float#NaN} by
 * {@link #get(int, int)}.
 */
public class TranslationTable {

    /** Rows keyed by {@code tw}; each row maps {@code sw} to its value. */
    public HashIntObjMap<HashIntFloatMap> ttable = null;

    /** Creates an empty table. */
    public TranslationTable() {
        ttable = HashIntObjMaps.newMutableMap();
    }

    /**
     * Creates an independent copy of another table.
     *
     * Every row is copied, so later writes to this table never show up in
     * {@code toCopy} (and vice versa).
     *
     * @param toCopy the table whose entries are duplicated
     */
    public TranslationTable(TranslationTable toCopy) {
        this.ttable = HashIntObjMaps.newMutableMap();
        for (int tw : toCopy.ttable.keySet()) {
            // Copy the row itself; sharing it would alias the two tables.
            HashIntFloatMap rowCopy = HashIntFloatMaps.newMutableMap();
            rowCopy.putAll(toCopy.ttable.get(tw));
            this.ttable.put(tw, rowCopy);
        }
    }

    /** Returns the row for {@code tw}, creating and registering it if absent. */
    private HashIntFloatMap rowFor(int tw) {
        HashIntFloatMap row = ttable.get(tw);
        if (row == null) {
            row = HashIntFloatMaps.newMutableMap();
            ttable.put(tw, row);
        }
        return row;
    }

    /**
     * Stores {@code value} for the pair, replacing any previous value.
     *
     * @param tw    first (row) id
     * @param sw    second (column) id
     * @param value value to store
     */
    public void put(int tw, int sw, float value) {
        rowFor(tw).put(sw, value);
    }

    /**
     * Adds {@code value} to the entry for the pair; a missing entry is
     * treated as {@code 0}.
     *
     * @param tw    first (row) id
     * @param sw    second (column) id
     * @param value amount to add
     */
    public void increas(int tw, int sw, float value) {
        rowFor(tw).addValue(sw, value, 0f);
    }

    /**
     * Looks up the value stored for the pair.
     *
     * @param tw first (row) id
     * @param sw second (column) id
     * @return the stored value, or {@link Float#NaN} when the row or the
     *         entry does not exist
     */
    public float get(int tw, int sw) {
        HashIntFloatMap row = ttable.get(tw);
        return row == null ? Float.NaN : row.getOrDefault(sw, Float.NaN);
    }

    /**
     * Looks up the value stored for the pair, with a fallback.
     *
     * @param tw first (row) id
     * @param sw second (column) id
     * @param d  value returned when the lookup yields NaN
     * @return the stored value, or {@code d} when it is missing (or NaN)
     */
    public float get(int tw, int sw, float d) {
        float value = get(tw, sw);
        return Float.isNaN(value) ? d : value;
    }

    /**
     * Removes the entry for the pair if its row exists; the (possibly empty)
     * row itself is kept.
     *
     * @param tw first (row) id
     * @param sw second (column) id
     */
    public void remove(int tw, int sw) {
        HashIntFloatMap row = ttable.get(tw);
        if (row != null) {
            row.remove(sw);
        }
    }

    /**
     * Divides every value by the sum of its row, so each row sums to one.
     *
     * Rows whose values sum to zero (including empty rows) are left
     * untouched: dividing would turn their entries into NaN, which
     * {@link #get(int, int)} uses to signal a missing entry.
     */
    public void normalize() {
        for (int tw : ttable.keySet()) {
            HashIntFloatMap row = ttable.get(tw);
            float sum = 0;
            for (int sw : row.keySet()) {
                sum += row.get(sw);
            }
            if (sum == 0f) {
                continue;
            }
            // Only existing keys are overwritten, so the key set is stable.
            for (int sw : row.keySet()) {
                row.put(sw, row.get(sw) / sum);
            }
        }
    }

    /**
     * Picks, for each position of {@code tsent} from index 1 on, the position
     * in {@code ssent} with the highest table value.
     *
     * Missing entries count as {@code 0}, and the running maximum also starts
     * at {@code 0}, so a position is only chosen when its value is at least
     * zero; on ties the later position wins. If no position qualifies the
     * result for that word is {@code -1}. Index 0 of the result is never
     * written and stays {@code 0}.
     *
     * @param ssent ids scanned for the best match (all positions, from 0)
     * @param tsent ids to align (positions from 1)
     * @return array of length {@code tsent.length} holding the chosen
     *         {@code ssent} index per {@code tsent} position
     */
    public int[] getAlignment(int ssent[], int tsent[]) {
        int alignments[] = new int[tsent.length];
        for (int t = 1; t < tsent.length; t++) {
            int tw = tsent[t];
            float best = 0f;
            int bestIndex = -1;
            for (int s = 0; s < ssent.length; s++) {
                float score = this.get(tw, ssent[s], 0f);
                if (score >= best) {
                    best = score;
                    bestIndex = s;
                }
            }
            alignments[t] = bestIndex;
        }
        return alignments;
    }

}

--------------------------------------------------------------------------------
/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |
--------------------------------------------------------------------------------