├── .gitignore ├── A Generalized Language Model as the Combination of Skipped n-grams and Modified Kneser-Ney Smoothing.pdf ├── README.md ├── bachelor-thesis-martin-koerner.pdf ├── calcentropy.py ├── config.sample.txt ├── lib ├── README ├── apache │ ├── commons-compress-1.5 │ │ ├── LICENSE.txt │ │ ├── NOTICE.txt │ │ ├── README.txt │ │ ├── RELEASE-NOTES.txt │ │ ├── commons-compress-1.5-javadoc.jar │ │ ├── commons-compress-1.5-sources.jar │ │ └── commons-compress-1.5.jar │ ├── commons-io-2.4 │ │ ├── LICENSE.txt │ │ ├── NOTICE.txt │ │ ├── RELEASE-NOTES.txt │ │ ├── commons-io-2.4-javadoc.jar │ │ ├── commons-io-2.4-sources.jar │ │ └── commons-io-2.4.jar │ └── log4j-2.0-beta9 │ │ ├── LICENSE.txt │ │ ├── NOTICE.txt │ │ ├── RELEASE-NOTES.txt │ │ ├── log4j-api-2.0-beta9-javadoc.jar │ │ ├── log4j-api-2.0-beta9-sources.jar │ │ ├── log4j-api-2.0-beta9.jar │ │ ├── log4j-core-2.0-beta9-javadoc.jar │ │ ├── log4j-core-2.0-beta9-sources.jar │ │ └── log4j-core-2.0-beta9.jar ├── lucene-analyzers-common-4.0.0.jar ├── lucene-core-4.0.0.jar ├── lucene-queryparser-4.0.0.jar ├── mongo-2.9.1.jar └── mysql │ └── mysql-connector-java-5.1.24-bin.jar ├── metriken.sh ├── mvn.sh ├── pom.xml ├── scripts ├── README.md ├── combineLM.sh ├── combineTypo.sh ├── combinefiles.sh ├── createPlot.sh ├── index-glm.sh ├── index-kneser-ney.sh ├── index-lm.sh ├── index-typo.sh ├── preparequery.sh ├── runpreparequery.sh ├── stats.txt ├── table.sh └── test.sh ├── src ├── de │ └── typology │ │ ├── executables │ │ ├── KneserNeyBuilder.java │ │ └── MultiKneserNeyBuilder.java │ │ ├── indexes │ │ ├── WordIndex.java │ │ └── WordIndexer.java │ │ ├── patterns │ │ ├── PatternBuilder.java │ │ └── PatternTransformer.java │ │ ├── smoother │ │ ├── KneserNeySmoother.java │ │ └── ModifiedKneserNeySmoother.java │ │ ├── splitter │ │ ├── AbsoluteSplitter.java │ │ ├── Aggregator.java │ │ ├── DataSetSplitter.java │ │ ├── LineCounterTask.java │ │ ├── SequenceModifier.java │ │ ├── Sequencer.java │ │ ├── SmoothingSplitter.java │ │ └── SplitterTask.java │ │ ├── tester │ │ ├── SequenceExtractorTask.java │ │ └── TestSequenceExtractor.java │ │ └── utils │ │ ├── Config.java │ │ ├── Counter.java │ │ ├── DecimalFormatter.java │ │ └── SequenceFormatter.java ├── log4j2.xml └── main │ └── resources │ └── log4j2.xml ├── testDataset ├── testDataset.txt └── training.txt └── tests └── de └── typology ├── indexes ├── WordIndexTest.java └── WordIndexerTest.java ├── smoother └── KneserNeySmootherTest.java └── splitter ├── AggregatorTest.java ├── SequenceModifierTest.java └── SequencerTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | /.classpath 2 | /.project 3 | /.settings 4 | /bin 5 | /config.txt 6 | /testDataset 7 | /logs 8 | /target 9 | 10 | .ser 11 | mod* 12 | kneser* 13 | normalized* 14 | training.txt 15 | learning.txt 16 | test* 17 | absolute 18 | continuation 19 | parsed.txt -------------------------------------------------------------------------------- /A Generalized Language Model as the Combination of Skipped n-grams and Modified Kneser-Ney Smoothing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/A Generalized Language Model as the Combination of Skipped n-grams and Modified Kneser-Ney Smoothing.pdf -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
1 | # Generalized Language Model Toolkit
2 | 
3 | This software can be used to compute a Generalized Language Model, which is yet another means of computing a [Language Model](http://en.wikipedia.org/wiki/Language_model). As shown [in this publication](http://arxiv.org/pdf/1404.3377v1.pdf), Generalized Language Models can outperform Modified Kneser-Ney Smoothing by 10 to 25% in terms of perplexity.
4 | 
5 | ## Getting started
6 | ```
7 | git clone git@github.com:renepickhardt/generalized-language-modeling-toolkit.git
8 | sudo chmod a+x mvn.sh
9 | ```
10 | You will need to install Maven in order to build the project.
11 | ```
12 | sudo apt-get install maven2
13 | ```
14 | 
15 | You need to copy config.sample.txt to config.txt and read the instructions in config.sample.txt.
16 | ```
17 | cp config.sample.txt config.txt
18 | emacs config.txt
19 | ```
20 | 
21 | After you have set all your directories in config.txt you can run the project:
22 | ```
23 | ./mvn.sh
24 | ```
25 | 
26 | ## Disk and main memory requirements
27 | Since Generalized Language Models can become very large, the software is written to work from the hard disk. In this sense you can theoretically run the program with very little memory. Still, we recommend 16 GB of main memory for the large English Wikipedia data sets.
28 | 
29 | We tried to avoid frequent disk hits. Still, the program will execute much faster if you store your data on a solid-state disk.
30 | 
31 | ## Download the test data sets
32 | You need to have a file called `normalized.txt` which serves as your input. This file should contain one sentence per line. Language models will be learned based on this file.
33 | 
34 | Please refer to http://glm.rene-pickhardt.de/data in order to download preprocessed and formatted data sets.
35 | 
36 | If you wish to parse the data yourself (e.g. because you want to use a newer Wikipedia dump), refer to https://github.com/mkrnr/lexer-parser
37 | 
38 | ## Processing pipeline of the GLM toolkit
39 | 
40 | You have to start with a file called `normalized.txt` which has to be stored in your data directory (according to `config.txt`). `mvn.sh` will compile the program and start the flow of the following steps (which can be configured by switching the fields in `config.txt` from `true` to `false`):
41 | 
42 | * splitting `normalized.txt` into `training.txt` and `testing.txt` according to the data-split parameters in `config.txt`
43 | * building a word index `index.txt`; this index is used to split the language models into files of equal size
44 | * creating absolute counts and continuation counts in the directories `absolute` and `continuation`
45 |   * the various models are stored in folders like `11111`, meaning a regular 5-gram, or `11011`, meaning a 5-gram with a skip at the third position
46 | * creating testing samples from `testing.txt`: `testing-samples-4.txt`, for example, contains about 100k sequences of 4 words to be tested
47 | * calculating the D and N values for Modified Kneser-Ney Smoothing and making them persistent in the two *.ser files (for speeding up various tests)
48 | * running the experiments by creating files like `mod-kneser-ney-complex-backoffToCont-3.txt`: depending on your configuration the files could be named with a `simple` instead of a `complex` (complex meaning GLM, simple meaning LM). Exchanging the `3` gives you different model lengths. These files contain the testing samples with the log of their probabilities.
49 | * finally, you have to calculate the entropy manually by running the Python script `calcentropy.py`; as an argument you might want to pass `mod*.txt`, so that you calculate the entropy for all files and experiments at once (see the example below)
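
For example, to score all result files of the modified Kneser-Ney runs in one go (this sketch assumes a Python 2 interpreter on the PATH and that you invoke the script from the directory holding the result files):

```
python calcentropy.py mod*.txt
```

For each file the script prints the per-word entropy and the number of sequences that received a probability of zero.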
50 | 
51 | ## Citing the paper
52 | If this software or data is of any help to your research, please be so fair as to cite the [original publication](http://arxiv.org/pdf/1404.3377v1.pdf), which is also in the home directory of [this git repository](https://github.com/renepickhardt/generalized-language-modeling-toolkit/raw/master/A Generalized Language Model as the Combination of Skipped n-grams and Modified Kneser-Ney Smoothing.pdf).
53 | You might want to use the following BibTeX entry:
54 | ```
55 | @inproceedings{Pickhardt:2014:GLM,
56 | author = {Pickhardt, Rene and Gottron, Thomas and Körner, Martin and Wagner, Paul Georg and Speicher, Till and Staab, Steffen},
57 | title = {A Generalized Language Model as the Combination of Skipped n-grams and Modified Kneser Ney Smoothing},
58 | year = {2014},
59 | booktitle = {ACL'14: Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics},
60 | }
61 | ```
62 | 
63 | ## History
64 | The Generalized Language Models evolved from Paul Georg Wagner's and Till Speicher's Young Scientists project called [Typology](http://www.typology.de), which I advised in 2012.
65 | The Typology project played with and evaluated an idea I had (inspired by [the PhD thesis of Adam Schenker](http://scholarcommons.usf.edu/cgi/viewcontent.cgi?article=2466&context=etd)) of representing text as a graph in which the edges encode relationships (nowadays known as skipped bi-grams). The graph was used to produce answers to the next-word prediction problem, applied to word suggestions in the keyboards of modern smartphones.
66 | From the convincing results I developed the theory of Generalized Language Models.
67 | Most of the code was written by my student assistant [Martin Körner](http://mkoerner.de/), who also wrote his [bachelor thesis](https://github.com/renepickhardt/generalized-language-modeling-toolkit/raw/master/bachelor-thesis-martin-koerner.pdf) about the implementation of a preliminary version of the Generalized Language Models. This thesis is a nice reference if you want to get an understanding of Modified Kneser-Ney Smoothing for standard language models. In terms of notation and the construction of Generalized Language Models it is outdated.
68 | 
69 | ## Questions, Feedback, Bugs
70 | If you have questions, feel free to contact me via the issue tracker. On [my blog](http://www.rene-pickhardt.de) or in the paper you can find my mail address.
71 | 
--------------------------------------------------------------------------------
/bachelor-thesis-martin-koerner.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/bachelor-thesis-martin-koerner.pdf
--------------------------------------------------------------------------------
/calcentropy.py:
--------------------------------------------------------------------------------
1 | """
2 | author: rene pickhardt
3 | 
4 | Give a list of files as arguments. The files contain test sequences
5 | with probabilities according to a trained language model.
6 | 
7 | This code is GPLv3.
8 | """
9 | from math import log
10 | import sys
11 | 
12 | def calc(arg):
13 |     f = open(arg, "r")
14 |     res = 0   # sum of log2 probabilities of the scored sequences
15 |     zero = 0  # number of sequences with probability 0
16 |     wc = 0    # total word count of the scored sequences
17 |     for l in f:
18 |         # each line holds a word sequence, a tab, and the probability
19 |         fl = float(l.split("\t")[1])
20 |         if fl == 0:
21 |             zero = zero + 1
22 |             continue
23 |         res = res + log(fl, 2)
24 |         wc = wc + len(l.split(" "))
25 |     f.close()
26 |     print arg + "\t entropy: " + str((res * -1) / wc) + "\tsequences with zeros: " + str(zero)
27 | 
28 | 
29 | for arg in sys.argv:
30 |     # skip the script name itself
31 |     if arg == sys.argv[0]:
32 |         continue
33 |     calc(arg)
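
# A note on the input format (this example line is hypothetical; real files
# are produced by the experiment stage of the toolkit): each line consists of
# a word sequence, a tab, and the probability the model assigned to it, e.g.
#
#   maybe i was wrong<TAB>0.000341
#
# where <TAB> stands for a literal tab character. The reported entropy is
# -(sum of log2 probabilities) divided by the total word count; sequences with
# probability 0 are skipped and only counted in the final statistic.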
--------------------------------------------------------------------------------
/config.sample.txt:
--------------------------------------------------------------------------------
1 | ##################################################################################################
2 | #
3 | # Welcome to the Generalized Language Model toolkit config file.
4 | #
5 | # !!!!!!!!!! copy config.sample.txt to config.txt !!!!!!!!
6 | #
7 | # this is the configuration file for the generalized language modeling toolkit
8 | # you can configure everything such as model length and the place of your training data.
9 | #
10 | # this software can also be run in several stages: if your data set is large and your machine
11 | # got killed for some reason, you don't have to redo the whole calculation.
12 | #
13 | # if you have questions please send a mail to rene@rene-pickhardt.de
14 | ##################################################################################################
15 | 
16 | ##################################################################################################
17 | ### basic settings which you will most certainly need to change
18 | ##################################################################################################
19 | 
20 | #directory from which we will start to work
21 | outputDirectory = /media/mssd/datasets/glm/out/
22 | 
23 | #length of the model to be trained
24 | modelLength = 5
25 | 
26 | #number of threads that should be concurrently assigned to the program
27 | numberOfCores = 4
28 | 
29 | #name of the input data set (this is supposed to be a subfolder of outputDirectory); in this folder the training file should be named normalized.txt and should contain one sentence per line.
30 | inputDataSet = wiki
31 | 
32 | #can be used for multiple languages
33 | languages = en
34 | 
35 | ##################################################################################################
36 | ### stages of the entire calculation in the order they are being processed
37 | ### useful for big data sets: if something goes wrong you don't have to start over again
38 | ### set the following values to false for the stages of processing you wish to skip
39 | ##################################################################################################
40 | 
41 | ### first the data sets are split into training and test data
42 | splitData = true
43 | 
44 | ### states if the index of words should be built. The index is used to create subfiles for counting and aggregating sequences
45 | buildIndex = true
46 | 
47 | ### states if the absolute values for skipped sequences should be built
48 | buildGLM = true
49 | 
50 | ### states if all the continuation values should also be built.
51 | buildContinuationGLM = true
52 | 
53 | ### the absolute counts and continuation counts from the entire LM which are needed for the testing samples
54 | ### will be extracted and stored in testing-samples/. Pay attention: if your testing samples are too large
55 | ### you might run out of memory when running the experiment since all the data needed will be kept in main
56 | ### memory
57 | extractContinuationGLM = true
58 | 
59 | ### set this to true if you want to build a standard kneser ney (generalized) language model
60 | buildKneserNey = true
61 | 
62 | ### set this to true if you want to build a modified kneser ney (generalized) language model
63 | buildModKneserNey = true
64 | 
65 | # was not used for the paper since there is currently an accompanying Python script for the task
66 | calculateEntropy = false
67 | 
68 | ### calculate a standard language model
69 | kneserNeySimple = true
70 | 
71 | ### calculate a generalized language model
72 | kneserNeyComplex = true
73 | 
74 | ### use absolute discounting for interpolated probabilities (this should be set to false for the standard (modified) kneser ney implementation)
75 | backoffAbsolute = false
76 | 
77 | ### don't use any smoothing but just calculate conditional probabilities.
78 | conditionalProbabilityOnly = false
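
### example: resuming after a crash. Suppose a run died after the counting
### stages had already finished; a restart could then skip the finished stages
### and redo only the later ones. The lines are commented out here so this
### sample config stays unchanged:
#
# splitData = false
# buildIndex = false
# buildGLM = false
# buildContinuationGLM = false
# extractContinuationGLM = true
# buildKneserNey = true
# buildModKneserNey = true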
79 | 
80 | ##################################################################################################
81 | ### misc
82 | ##################################################################################################
83 | 
84 | ### should be used to save space
85 | deleteTempFiles = true
86 | 
87 | ### is useful for modified kneser ney smoothing
88 | addSentenceTags = true
89 | addFakeStartTag = true
90 | 
91 | ### number of decimal places that will be used for the calculation of the smoothing algorithms
92 | decimalPlaces = 30
93 | 
94 | ##################################################################################################
95 | ### configuration of training data
96 | ##################################################################################################
97 | 
98 | ### number of test queries which will be sampled from the test query set
99 | numberOfQueries = 100000
100 | 
101 | ### used for splitting the files in which the skipped n-grams are stored and for index building
102 | maxCountDivider = 1000
103 | 
104 | ##################################################################################################
105 | ### the following numbers are for the creation of training, learning and testing data splits.
106 | ################################################################################################## 107 | 108 | # 20 means that only 20% of the input data will be thrown away 109 | sampleRate = 0 110 | 111 | # 90 means that 90% of data will be training data 112 | splitDataRatio = 2 113 | 114 | splitTestRatio = 100 115 | 116 | -------------------------------------------------------------------------------- /lib/README: -------------------------------------------------------------------------------- 1 | origin of bzip2.jar: 2 | http://www.kohsuke.org/bzip2/ 3 | 4 | origin of wikimlj 5 | https://code.google.com/p/wikixmlj/downloads/detail?name=wikixmlj-r43.jar 6 | -------------------------------------------------------------------------------- /lib/apache/commons-compress-1.5/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /lib/apache/commons-compress-1.5/NOTICE.txt: -------------------------------------------------------------------------------- 1 | Apache Commons Compress 2 | Copyright 2002-2013 The Apache Software Foundation 3 | 4 | This product includes software developed at 5 | The Apache Software Foundation (http://www.apache.org/). 
6 | -------------------------------------------------------------------------------- /lib/apache/commons-compress-1.5/README.txt: -------------------------------------------------------------------------------- 1 | Apache Commons Compress was derived from various sources, including: 2 | 3 | Original BZip2 classes contributed by Keiron Liddle 4 | , Aftex Software to the Apache Ant project 5 | They are based on a port of Julian Seward's libbzip2. 6 | 7 | Original Tar classes from contributors of the Apache Ant project 8 | 9 | Original Zip classes from contributors of the Apache Ant project 10 | 11 | Original CPIO classes contributed by Markus Kuss and the jRPM project 12 | (jrpm.sourceforge.net) 13 | -------------------------------------------------------------------------------- /lib/apache/commons-compress-1.5/RELEASE-NOTES.txt: -------------------------------------------------------------------------------- 1 | Apache Commons Compress 1.5 RELEASE NOTES 2 | 3 | Apache Commons Compress software defines an API for working with compression and archive formats. 4 | These include: bzip2, gzip, pack200 and ar, cpio, jar, tar, zip, dump. 5 | 6 | Release 1.5 7 | 8 | Changes in this version include: 9 | 10 | New features: 11 | 12 | o CompressorStreamFactory has an option to create decompressing 13 | streams that decompress the full input for formats that support 14 | multiple concatenated streams. 15 | Issue: COMPRESS-220. 16 | 17 | Fixed Bugs: 18 | 19 | o Typo in CompressorStreamFactory Javadoc 20 | Issue: COMPRESS-218. 21 | Thanks to Gili. 22 | o ArchiveStreamFactory's tar stream detection created false positives 23 | for AIFF files. 24 | Issue: COMPRESS-191. 25 | Thanks to Jukka Zitting. 26 | o XZ for Java didn't provide an OSGi bundle. Compress' dependency on 27 | it has now been marked optional so Compress itself can still be used 28 | in an OSGi context. 29 | Issue: COMPRESS-199. 30 | Thanks to Jukka Zitting. 31 | o When specifying the encoding explicitly TarArchiveOutputStream would 32 | write unreadable names in GNU mode or even cause errors in POSIX 33 | mode for file names longer than 66 characters. 34 | Issue: COMPRESS-200. 35 | Thanks to Christian Schlichtherle. 36 | o Writing TAR PAX headers failed if the generated entry name ended 37 | with a "/". 38 | Issue: COMPRESS-203. 39 | o ZipArchiveInputStream sometimes failed to provide input to the 40 | Inflater when it needed it, leading to reads returning 0. 41 | Issue: COMPRESS-189. 42 | Thanks to Daniel Lowe. 43 | o TarArchiveInputStream ignored the encoding for GNU long name 44 | entries. 45 | Issue: COMPRESS-212. 46 | o TarArchiveInputStream could leave the second EOF record inside the 47 | stream it had just finished reading. 48 | Issue: COMPRESS-206. 49 | Thanks to Peter De Maeyer. 50 | o DumpArchiveInputStream no longer implicitly closes the original 51 | input stream when it reaches the end of the archive. 52 | o ZipArchiveInputStream now consumes the remainder of the archive when 53 | getNextZipEntry returns null. 54 | o Unit tests could fail if the source tree was checked out to a 55 | directory tree containign spaces. 56 | Issue: COMPRESS-205. 57 | Thanks to Daniel Lowe. 58 | o Fixed a potential ArrayIndexOutOfBoundsException when reading STORED 59 | entries from ZipArchiveInputStream. 60 | Issue: COMPRESS-219. 61 | o CompressorStreamFactory can now be used without XZ for Java being 62 | available. 63 | Issue: COMPRESS-221. 
64 | 65 | Changes: 66 | 67 | o Improved exception message if a zip archive cannot be read because 68 | of an unsupported compression method. 69 | Issue: COMPRESS-188. 70 | Thanks to Harald Kuhn. 71 | o ArchiveStreamFactory has a setting for file name encoding that sets 72 | up encoding for ZIP and TAR streams. 73 | Issue: COMPRESS-192. 74 | Thanks to Jukka Zitting. 75 | o TarArchiveEntry now has a method to verify its checksum. 76 | Issue: COMPRESS-191. 77 | Thanks to Jukka Zitting. 78 | o Split/spanned ZIP archives are now properly detected by 79 | ArchiveStreamFactory but will cause an 80 | UnsupportedZipFeatureException when read. 81 | o ZipArchiveInputStream now reads archives that start with a "PK00" 82 | signature. Archives with this signatures are created when the 83 | archiver was willing to split the archive but in the end only needed 84 | a single segment - so didn't split anything. 85 | Issue: COMPRESS-208. 86 | o TarArchiveEntry has a new constructor that allows setting linkFlag 87 | and preserveLeadingSlashes at the same time. 88 | Issue: COMPRESS-201. 89 | o ChangeSetPerformer has a new perform overload that uses a ZipFile 90 | instance as input. 91 | Issue: COMPRESS-159. 92 | o Garbage collection pressure has been reduced by reusing temporary 93 | byte arrays in classes. 94 | Issue: COMPRESS-172. 95 | Thanks to Thomas Mair. 96 | o Can now handle zip extra field 0x5455 - Extended Timestamp. 97 | Issue: COMPRESS-210. 98 | Thanks to Julius Davies. 99 | o handle zip extra field 0x7875 - Info Zip New Unix Extra Field. 100 | Issue: COMPRESS-211. 101 | Thanks to Julius Davies. 102 | o ZipShort, ZipLong, ZipEightByteInteger should implement Serializable 103 | Issue: COMPRESS-213. 104 | Thanks to Julius Davies. 105 | o better support for unix symlinks in ZipFile entries. 106 | Issue: COMPRESS-214. 107 | Thanks to Julius Davies. 108 | o ZipFile's initialization has been improved for non-Zip64 archives. 109 | Issue: COMPRESS-215. 110 | Thanks to Robin Power. 111 | o Updated XZ for Java dependency to 1.2 as this version provides 112 | proper OSGi manifest attributes. 
113 | 114 | For complete information on Commons Compress, including instructions on how to submit bug reports, 115 | patches, or suggestions for improvement, see the Apache Commons Compress website: 116 | 117 | http://commons.apache.org/compress/ 118 | -------------------------------------------------------------------------------- /lib/apache/commons-compress-1.5/commons-compress-1.5-javadoc.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/commons-compress-1.5/commons-compress-1.5-javadoc.jar -------------------------------------------------------------------------------- /lib/apache/commons-compress-1.5/commons-compress-1.5-sources.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/commons-compress-1.5/commons-compress-1.5-sources.jar -------------------------------------------------------------------------------- /lib/apache/commons-compress-1.5/commons-compress-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/commons-compress-1.5/commons-compress-1.5.jar -------------------------------------------------------------------------------- /lib/apache/commons-io-2.4/NOTICE.txt: -------------------------------------------------------------------------------- 1 | Apache Commons IO 2 | Copyright 2002-2012 The Apache Software Foundation 3 | 4 | This product includes software developed by 5 | The Apache Software Foundation (http://www.apache.org/). 
6 | 7 | -------------------------------------------------------------------------------- /lib/apache/commons-io-2.4/RELEASE-NOTES.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/commons-io-2.4/RELEASE-NOTES.txt -------------------------------------------------------------------------------- /lib/apache/commons-io-2.4/commons-io-2.4-javadoc.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/commons-io-2.4/commons-io-2.4-javadoc.jar -------------------------------------------------------------------------------- /lib/apache/commons-io-2.4/commons-io-2.4-sources.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/commons-io-2.4/commons-io-2.4-sources.jar -------------------------------------------------------------------------------- /lib/apache/commons-io-2.4/commons-io-2.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/commons-io-2.4/commons-io-2.4.jar -------------------------------------------------------------------------------- /lib/apache/log4j-2.0-beta9/LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 1999-2005 The Apache Software Foundation 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /lib/apache/log4j-2.0-beta9/NOTICE.txt: -------------------------------------------------------------------------------- 1 | Apache Log4j 2 | Copyright 1999-2012 Apache Software Foundation 3 | 4 | This product includes software developed at 5 | The Apache Software Foundation (http://www.apache.org/). 
6 | 7 | ResolverUtil.java 8 | Copyright 2005-2006 Tim Fennell 9 | 10 | Dumbster SMTP test server 11 | Copyright 2004 Jason Paul Kitchen 12 | -------------------------------------------------------------------------------- /lib/apache/log4j-2.0-beta9/RELEASE-NOTES.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache Log4j 2.0-beta9 RELEASE NOTES 3 | 4 | The Apache Log4j 2 team is pleased to announce the Log4j 2.0-beta9 release! 5 | 6 | Apache log4j is a well known framework for logging application behavior. Log4j 2 is an upgrade to 7 | Log4j that provides significant improvements over its predecessor, Log4j 1.x, and provides 8 | many of the improvements available in Logback while fixing some inherent problems in Logback's 9 | architecture. 10 | 11 | This is the eleventh release of Log4j 2 and is being made available to encourage use and feedback from the community. 12 | 13 | Bug fixes and enhancements 14 | 15 | Changes in this version include: 16 | 17 | New features: 18 | o LOG4J2-399: Allow the default file rollover strategy to define the compression level. 19 | o LOG4J2-338: Add TLSAppender. Also added missing license headers to several files. Thanks to Tibor Benke. 20 | o LOG4J2-253: Added FAQ page to the site. 21 | o LOG4J2-362: Add a diagram to the site (FAQ page) that explains when to use which jar. 22 | o LOG4J2-374: Add more options to PatternLayout to display more detailed information about a Throwable. Thanks to Tibor Benke. 23 | o LOG4J2-383: [Pattern Layout] Customize level names by length. 24 | o LOG4J2-384: [Pattern Layout] Customize level names to lower-case. 25 | o LOG4J2-364: Add WebLookup to retrieve information from the ServletContext. Thanks to David Nault. 26 | o LOG4J2-360: Allow Plugins to have aliases. 27 | o LOG4J2-356: Create a JSON Layout. 28 | o LOG4J2-341: Enable XInclude for XML configurations. 29 | o LOG4J2-313: Add JNDILookup plugin. Thanks to Woonsan Ko. 30 | o LOG4J2-305: Ease porting from 1.x Logger.getRootLogger(): add LogManager.getRootLogger(). 31 | 32 | Fixed Bugs: 33 | o LOG4J2-226: Fix table of contents generation in pdf. 34 | o LOG4J2-395: Allow classpath scheme when specifying configuration file location as a system property. Thanks to Abhinav Shah. 35 | o LOG4J2-393: Initialize PluginManager once during configuration. Move advertisement setup into BaseConfiguration. 36 | o LOG4J2-391: FlumePersistentManager now handles LockConflictExceptions in Berkeley Db. Thanks to Kamal Bahadur. 37 | o LOG4J2-380: Use rollover date when substituting ${date} in the filePattern. 38 | o LOG4J2-322: Centralized reflective use of Reflection#getCallerClass and properly handled its instability in various versions 39 | of Java. 40 | o LOG4J2-293: Reset the Configuration if the ClassLoaderContextSelector creates a LoggerContext without a configuration 41 | location and then is later provided one. 42 | o LOG4J2-293: Changed the ConfigurationFactory to recognize and properly use the classpath: URI scheme in addition to the 43 | classloader: URI scheme. Thanks to Abhinav Shah. 44 | o LOG4J2-359: Changed the Servlet 3.0 auto-initializer so that it does nothing in a Servlet 2.5 or older application. This 45 | ensures behavioral consistency across containers. Thanks to Abhinav Shah. 46 | o LOG4J2-310: Fixed issue where SMTPAppender did not send mails with error or fatal level without prior info event. Thanks to Olivier Lemasle. 47 | o LOG4J2-368: Add PatternLayout constructor to Log4j 1.2 bridge for Velocity. 
48 | o LOG4J2-333: Match artifact ids with Maven module names. Thanks to Hervé Boutemy. 49 | o LOG4J2-367: JMS appenders send two messages for one append. Thanks to David Parry. 50 | o LOG4J2-319: Double stack trace logging when using %throwable in %style and %highlight. 51 | o LOG4J2-358: NoSQLAppender using MongoDB provider ignores username and password attributes 52 | o LOG4J2-343: Removed unnecessary generics from Appender interface and implementing classes. Thanks to Henning Schmiedehausen. 53 | o LOG4J2-351: [OSGi] wrong Fragment-Host in manifest files. Thanks to Roland Weiglhofer. 54 | o LOG4J2-336: AsyncLogger errors after multiple calls to LoggerContext.reconfigure(). Thanks to Andre Bogus. 55 | o LOG4J2-347: Give the AsyncAppender thread a more descriptive name for easier debugging/profiling. Thanks to David Phillips. 56 | o LOG4J2-332: Modified documentation to refer to SLF4J Binding instead of SLF4J Bridge. Thanks to Hervé Boutemy. 57 | o LOG4J2-342: Ignore xml:base attributes. 58 | o LOG4J2-309: Insure jars and distributions only have a single License and Notice file. 59 | o LOG4J2-320: JPAAppender stops logging because META-INF/log4j-provider.properties is left open. 60 | o LOG4J2-335: FlumePersistentManager's writer thread had high CPU usage. 61 | o LOG4J2-331: Removed erroneous check for affected MongoDB records, which always returns zero on inserts. 62 | o LOG4J2-330: Added a BSON Transformer so that MongoDB can persist Log4j events. 63 | o LOG4J2-329: StatusLogger now only creates StatusData objects if they are the appropriate logging level. 64 | o LOG4J2-328: FlumePersistentManager was calling Berkeley DB's count method too frequently. 65 | o LOG4J2-280: Additional fix to make AsyncAppender threads daemon threads and improve their thread name. 66 | o LOG4J2-165: The slf4j-ext jar is now an optional dependency of the SLF4J bridge. 67 | o LOG4J2-166: RoutingAppender's default Route can now be an appender reference. 68 | o LOG4J2-299: Add getThrowable method to ThrowableProxy. 69 | o LOG4J2-216: ThrowableProxy no longer extends Throwable. 70 | o LOG4J2-311: Synchronized flush() and close() methods in the XxxFileManager and OutputStreamManager classes. 71 | o LOG4J2-304: Fixed Async Loggers memory leak. 72 | o LOG4J2-291: Fixed JDBC, JPA, and NoSQL appenders so that the failover appender properly fails over on error. 73 | o LOG4J2-397: Logger.info(Message) Javadoc is incorrect. Thanks to Yonatan Graber. 74 | 75 | Changes: 76 | o LOG4J2-317: Renamed FastFileAppender and FastRollingFileAppender to RandomAccessFileAppender 77 | and RollingRandomAccessFileAppender. Configurations using the Fast(Rolling)File element 78 | no longer work and should be modified to use the (Rolling)RandomAccessFile element. 79 | o Changed the "suppressExceptions" configuration attribute for all Appenders to "ignoreExceptions" to avoid 80 | confusion with Java 7 suppressed exceptions. Also renamed the Appender#isExceptionSuppressed() method to 81 | Appender#ignoreExceptions() to avoid the same confusion. All Appenders by default internally log and then ignore 82 | exceptions encountered while logging. Setting "ignoreExceptions" to "false" on an Appender causes it to allow 83 | exceptions to propagate to the caller. You must set "ignoreExceptions" to "false" for Appenders you are wrapping 84 | in the Failover Appender. 85 | o Changed the (relatively new) PatternLayout configuration attribute "suppressExceptions" to 86 | "alwaysWriteExceptions" to more correctly indicate what it does. 
As such, the meaning of this attribute has 87 | reversed (previous "true"s should become "false"s, and vice versa). Since this was an undocumented attribute up 88 | until now, it's unlikely this change will affect any users. 89 | o LOG4J2-355: Add support for multiple SD-ELEMENTs in a RFC 5424 syslog message. Thanks to Tibor Benke. 90 | o Cleaned up tests and cleared up documentation for the JPA appender following the resolution of EclipseLink 91 | issue #412454. 92 | o LOG4J2-318: Allow shutdown hook to be disabled in the configuration. 93 | o LOG4J2-312: XML layout improvements (compact vs. pretty, namespace, namespace prefix, root element). 94 | o LOG4J2-388: Update Java Mail dependency to 1.5.0 from 1.4.7. 95 | o LOG4J2-325: Update JDBC tests to use H2 database 1.3.173 from 1.3.172. 96 | o LOG4J2-366: Update commons-logging to 1.1.3 from 1.1.1. 97 | o LOG4J2-390: Update HSQLDB dependency to 2.3.0 from 2.2.9. 98 | o LOG4J2-308: Clarified which library versions were used in Async Loggers performance test. 99 | o LOG4J2-307: Updated Async Loggers' LMAX Disruptor library from 3.0.1 to 3.2.0. 100 | o LOG4J2-306: Update JSON Jackson library to 2.2.2 from 2.2.1. 101 | o LOG4J2-387: Update Jackson dependency to 1.9.13 from 1.9.11. 102 | o Improved site by adding quick jump-off page and menu for Javadoc links for all components. 103 | 104 | 105 | Apache Log4j 2.0-beta9 requires a minimum of Java 6 to build and run. Basic compatibility with 106 | Log4j 1.x is provided through the log4j-1.2-api component, however it does not implement some of the 107 | very implementation specific classes and methods. The package names and Maven groupId have been changed to 108 | org.apache.logging.log4j to avoid any conflicts with log4j 1.x. 109 | 110 | For complete information on Apache Log4j 2, including instructions on how to submit bug reports, 111 | patches, or suggestions for improvement, see the Apache Apache Log4j 2 website: 112 | 113 | http://logging.apache.org/log4j/2.x/ 114 | 115 | 116 | -------------------------------------------------------------------------------- /lib/apache/log4j-2.0-beta9/log4j-api-2.0-beta9-javadoc.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/log4j-2.0-beta9/log4j-api-2.0-beta9-javadoc.jar -------------------------------------------------------------------------------- /lib/apache/log4j-2.0-beta9/log4j-api-2.0-beta9-sources.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/log4j-2.0-beta9/log4j-api-2.0-beta9-sources.jar -------------------------------------------------------------------------------- /lib/apache/log4j-2.0-beta9/log4j-api-2.0-beta9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/log4j-2.0-beta9/log4j-api-2.0-beta9.jar -------------------------------------------------------------------------------- /lib/apache/log4j-2.0-beta9/log4j-core-2.0-beta9-javadoc.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/log4j-2.0-beta9/log4j-core-2.0-beta9-javadoc.jar -------------------------------------------------------------------------------- /lib/apache/log4j-2.0-beta9/log4j-core-2.0-beta9-sources.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/log4j-2.0-beta9/log4j-core-2.0-beta9-sources.jar -------------------------------------------------------------------------------- /lib/apache/log4j-2.0-beta9/log4j-core-2.0-beta9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/log4j-2.0-beta9/log4j-core-2.0-beta9.jar -------------------------------------------------------------------------------- /lib/lucene-analyzers-common-4.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/lucene-analyzers-common-4.0.0.jar -------------------------------------------------------------------------------- /lib/lucene-core-4.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/lucene-core-4.0.0.jar -------------------------------------------------------------------------------- /lib/lucene-queryparser-4.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/lucene-queryparser-4.0.0.jar -------------------------------------------------------------------------------- /lib/mongo-2.9.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/mongo-2.9.1.jar -------------------------------------------------------------------------------- /lib/mysql/mysql-connector-java-5.1.24-bin.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/mysql/mysql-connector-java-5.1.24-bin.jar -------------------------------------------------------------------------------- /metriken.sh: -------------------------------------------------------------------------------- 1 | grep "KSS: 19" typo-4-7095.log | wc -l 2 | grep "KSS: 18" typo-4-7095.log | wc -l 3 | grep "KSS: 17" typo-4-7095.log | wc -l 4 | grep "KSS: 16" typo-4-7095.log | wc -l 5 | grep "KSS: 15" typo-4-7095.log | wc -l 6 | grep "KSS: 14" typo-4-7095.log | wc -l 7 | grep "KSS: 13" typo-4-7095.log | wc -l 8 | grep "KSS: 12" typo-4-7095.log | wc -l 9 | grep "KSS: 11" typo-4-7095.log | wc -l 10 | grep "KSS: 10" typo-4-7095.log | wc -l 11 | grep "KSS: 9" typo-4-7095.log | wc -l 12 | grep "KSS: 8" typo-4-7095.log | wc -l 13 | grep "KSS: 7" typo-4-7095.log | wc -l 14 | grep "KSS: 6" typo-4-7095.log | wc -l 
15 | grep "KSS: 5" typo-4-7095.log | wc -l 16 | grep "KSS: 4" typo-4-7095.log | wc -l 17 | grep "KSS: 3" typo-4-7095.log | wc -l 18 | grep "KSS: 2" typo-4-7095.log | wc -l 19 | grep "KSS: 1" typo-4-7095.log | wc -l 20 | 21 | 22 | #grep "KSS" typo-4-7095.log | grep "PREFIXLENGHT: 3" 23 | 24 | echo "no matches:" 25 | grep "NOTHING" typo-4-7095.log | wc -l 26 | echo "matches:" 27 | grep "HIT" typo-4-7095.log | wc -l 28 | 29 | echo "matches on rank 1 any prefix" 30 | grep "HIT" typo-4-7095.log | grep "RANK: 1" | wc -l 31 | 32 | echo "matches on rank 2 any prefix" 33 | grep "HIT" typo-4-7095.log | grep "RANK: 2" | wc -l 34 | 35 | echo "matches on rank 3 any prefix" 36 | grep "HIT" typo-4-7095.log | grep "RANK: 3" | wc -l 37 | 38 | echo "matches on rank 4 any prefix" 39 | grep "HIT" typo-4-7095.log | grep "RANK: 4" | wc -l 40 | 41 | echo "matches on rank 5 any prefix" 42 | grep "HIT" typo-4-7095.log | grep "RANK: 5" | wc -l 43 | 44 | 45 | echo "total number of hits with prefix 0:" 46 | HIT=`grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 0" | wc -l` 47 | echo $HIT 48 | echo "total number of NO HITS with prefix 0:" 49 | NO=`grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 0" | wc -l` 50 | echo $NO 51 | SUM=$((HIT + NO)) 52 | HIT=$((HIT * 1000)) 53 | frac=$((HIT / SUM)) 54 | echo $frac "Promille hits" 55 | 56 | echo "matches with on rank 1 and prefix 0:" 57 | grep "HIT" typo-4-7095.log | grep "RANK: 1" | grep "PREFIXLENGHT: 0" | wc -l 58 | 59 | echo "matches with on rank 2 and prefix 0:" 60 | grep "HIT" typo-4-7095.log | grep "RANK: 2" | grep "PREFIXLENGHT: 0" | wc -l 61 | 62 | echo "matches with on rank 3 and prefix 0:" 63 | grep "HIT" typo-4-7095.log | grep "RANK: 3" | grep "PREFIXLENGHT: 0" | wc -l 64 | 65 | echo "matches with on rank 4 and prefix 0:" 66 | grep "HIT" typo-4-7095.log | grep "RANK: 4" | grep "PREFIXLENGHT: 0" | wc -l 67 | 68 | echo "matches with on rank 5 and prefix 0:" 69 | grep "HIT" typo-4-7095.log | grep "RANK: 5" | grep "PREFIXLENGHT: 0" | wc -l 70 | 71 | 72 | 73 | 74 | echo "total number of hits with prefix 1:" 75 | grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 1" | wc -l 76 | echo "total number of NO HITS with prefix 1:" 77 | grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 1" | wc -l 78 | 79 | 80 | echo "matches with on rank 1 and prefix 1:" 81 | grep "HIT" typo-4-7095.log | grep "RANK: 1" | grep "PREFIXLENGHT: 1" | wc -l 82 | 83 | echo "matches with on rank 2 and prefix 1:" 84 | grep "HIT" typo-4-7095.log | grep "RANK: 2" | grep "PREFIXLENGHT: 1" | wc -l 85 | 86 | echo "matches with on rank 3 and prefix 1:" 87 | grep "HIT" typo-4-7095.log | grep "RANK: 3" | grep "PREFIXLENGHT: 1" | wc -l 88 | 89 | echo "matches with on rank 4 and prefix 1:" 90 | grep "HIT" typo-4-7095.log | grep "RANK: 4" | grep "PREFIXLENGHT: 1" | wc -l 91 | 92 | echo "matches with on rank 5 and prefix 1:" 93 | grep "HIT" typo-4-7095.log | grep "RANK: 5" | grep "PREFIXLENGHT: 1" | wc -l 94 | 95 | 96 | 97 | 98 | 99 | echo "total number of hits with prefix 2:" 100 | grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 2" | wc -l 101 | echo "total number of NO HITS with prefix 2:" 102 | grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 2" | wc -l 103 | 104 | 105 | echo "matches with on rank 1 and prefix 2:" 106 | grep "HIT" typo-4-7095.log | grep "RANK: 1" | grep "PREFIXLENGHT: 2" | wc -l 107 | 108 | echo "matches with on rank 2 and prefix 2:" 109 | grep "HIT" typo-4-7095.log | grep "RANK: 2" | grep "PREFIXLENGHT: 2" | wc -l 110 | 111 | echo "matches with on rank 3 and prefix 2:" 112 | grep 
"HIT" typo-4-7095.log | grep "RANK: 3" | grep "PREFIXLENGHT: 2" | wc -l 113 | 114 | echo "matches with on rank 4 and prefix 2:" 115 | grep "HIT" typo-4-7095.log | grep "RANK: 4" | grep "PREFIXLENGHT: 2" | wc -l 116 | 117 | echo "matches with on rank 5 and prefix 2:" 118 | grep "HIT" typo-4-7095.log | grep "RANK: 5" | grep "PREFIXLENGHT: 2" | wc -l 119 | 120 | 121 | 122 | 123 | echo "total number of hits with prefix 3:" 124 | grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 3" | wc -l 125 | echo "total number of NO HITS with prefix 3:" 126 | grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 3" | wc -l 127 | 128 | 129 | echo "matches with on rank 1 and prefix 3:" 130 | grep "HIT" typo-4-7095.log | grep "RANK: 1" | grep "PREFIXLENGHT: 3" | wc -l 131 | 132 | echo "matches with on rank 2 and prefix 3:" 133 | grep "HIT" typo-4-7095.log | grep "RANK: 2" | grep "PREFIXLENGHT: 3" | wc -l 134 | 135 | echo "matches with on rank 3 and prefix 3:" 136 | grep "HIT" typo-4-7095.log | grep "RANK: 3" | grep "PREFIXLENGHT: 3" | wc -l 137 | 138 | echo "matches with on rank 4 and prefix 3:" 139 | grep "HIT" typo-4-7095.log | grep "RANK: 4" | grep "PREFIXLENGHT: 3" | wc -l 140 | 141 | echo "matches with on rank 5 and prefix 3:" 142 | grep "HIT" typo-4-7095.log | grep "RANK: 5" | grep "PREFIXLENGHT: 3" | wc -l 143 | 144 | 145 | 146 | echo "total number of hits with prefix 0:" 147 | HIT=`grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 0" | wc -l` 148 | echo $HIT 149 | echo "total number of NO HITS with prefix 0:" 150 | NO=`grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 0" | wc -l` 151 | echo $NO 152 | SUM=$((HIT + NO)) 153 | HIT=$((HIT * 1000)) 154 | frac=$((HIT / SUM)) 155 | echo $frac "Promille hits" 156 | 157 | echo "total number of hits with prefix 1:" 158 | HIT=`grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 1" | wc -l` 159 | echo $HIT 160 | echo "total number of NO HITS with prefix 1:" 161 | NO=`grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 1" | wc -l` 162 | echo $NO 163 | SUM=$((HIT + NO)) 164 | HIT=$((HIT * 1000)) 165 | frac=$((HIT / SUM)) 166 | echo $frac "Promille hits" 167 | 168 | echo "total number of hits with prefix 2:" 169 | HIT=`grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 2" | wc -l` 170 | echo $HIT 171 | echo "total number of NO HITS with prefix 2:" 172 | NO=`grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 2" | wc -l` 173 | echo $NO 174 | SUM=$((HIT + NO)) 175 | HIT=$((HIT * 1000)) 176 | frac=$((HIT / SUM)) 177 | echo $frac "Promille hits" 178 | 179 | echo "total number of hits with prefix 3:" 180 | HIT=`grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 3" | wc -l` 181 | echo $HIT 182 | echo "total number of NO HITS with prefix 3:" 183 | NO=`grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 3" | wc -l` 184 | echo $NO 185 | SUM=$((HIT + NO)) 186 | HIT=$((HIT * 1000)) 187 | frac=$((HIT / SUM)) 188 | echo $frac "Promille hits" 189 | -------------------------------------------------------------------------------- /mvn.sh: -------------------------------------------------------------------------------- 1 | ulimit -v 20000000 2 | mvn clean 3 | mvn compile 4 | nice mvn exec:java -Dexec.mainClass="de.typology.executables.KneserNeyBuilder" -Dfile.encoding=UTF-8 5 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | UTF-8 4 | 5 | 4.0.0 6 | de.typology 7 | typology 8 | 0.0.1-SNAPSHOT 9 | 10 | 11 | src 12 | 13 | 14 | maven-compiler-plugin 15 | 
2.3.2 16 | 17 | 1.6 18 | 1.6 19 | UTF-8 20 | 21 | 22 | 23 | org.codehaus.mojo 24 | exec-maven-plugin 25 | 1.2.1 26 | 27 | 28 | maven-surefire-plugin 29 | 2.16 30 | 31 | -Xmx6024m 32 | 33 | 34 | 35 | maven-assembly-plugin 36 | 37 | 38 | 39 | de.typology.executables.KneserNeyBuilder 40 | 41 | 42 | 43 | jar-with-dependencies 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | mysql 52 | mysql-connector-java 53 | 5.1.22 54 | 55 | 56 | org.apache.commons 57 | commons-io 58 | 1.3.2 59 | 60 | 61 | org.apache.logging.log4j 62 | log4j-api 63 | 2.0-beta9 64 | 65 | 66 | org.apache.logging.log4j 67 | log4j-core 68 | 2.0-beta9 69 | 70 | 71 | org.apache.commons 72 | commons-lang3 73 | 3.2 74 | 75 | 76 | junit 77 | junit 78 | 4.11 79 | test 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | #Scripts 2 | ##Content 3 | Contains scripts to perform to following tasks: 4 | + Insert the different language models into a MySQL and build indices 5 | + Aggregate the evaluation data 6 | + Plot the aggregated evaluation data 7 | 8 | ##Structure 9 | To be continued... 10 | 11 | ##Configuration 12 | To be continued... 13 | -------------------------------------------------------------------------------- /scripts/combineLM.sh: -------------------------------------------------------------------------------- 1 | # combine ngrams 2 | cd $1 3 | for n in 2 3 4 5 4 | do 5 | cd ${n}/ 6 | for l in A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 7 | do 8 | cat ${l}* > ../${l}.${n}n 9 | rm ${l}* 10 | mv ../${l}.${n}n . 11 | done 12 | 13 | 14 | for l in a b c d e f g h i j k l m n p q r s t u v w x y z 15 | do 16 | cat ${l}* > ../${l}.${n}n 17 | rm ${l}* 18 | mv ../${l}.${n}n . 19 | done 20 | 21 | 22 | mv other.${n}n ../ 23 | cat o* > ../o.${n}n 24 | rm o* 25 | mv ../o.${n}n . 26 | mv ../other.${n}n . 27 | cd ../ 28 | done 29 | -------------------------------------------------------------------------------- /scripts/combineTypo.sh: -------------------------------------------------------------------------------- 1 | # combine typology 2 | cd $1 3 | for n in 1 2 3 4 4 | do 5 | cd ${n}/ 6 | for l in A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 7 | do 8 | cat ${l}* > ../${l}.${n}es 9 | rm ${l}* 10 | mv ../${l}.${n}es . 11 | done 12 | 13 | 14 | for l in a b c d e f g h i j k l m n p q r s t u v w x y z 15 | do 16 | cat ${l}* > ../${l}.${n}es 17 | rm ${l}* 18 | mv ../${l}.${n}es . 19 | done 20 | 21 | 22 | mv other.${n}n ../ 23 | cat o* > ../o.${n}es 24 | rm o* 25 | mv ../o.${n}es . 26 | mv ../other.${n}es . 27 | cd ../ 28 | done 29 | -------------------------------------------------------------------------------- /scripts/combinefiles.sh: -------------------------------------------------------------------------------- 1 | # combine ngrams 2 | cd $1 3 | for n in 2 3 4 5 4 | do 5 | cd ${n}/ 6 | for l in A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 7 | do 8 | cat ${l}* > ../${l}.${n}n 9 | rm ${l}* 10 | mv ../${l}.${n}n . 11 | done 12 | 13 | 14 | for l in a b c d e f g h i j k l m n p q r s t u v w x y z 15 | do 16 | cat ${l}* > ../${l}.${n}n 17 | rm ${l}* 18 | mv ../${l}.${n}n . 19 | done 20 | 21 | 22 | mv other.${n}n ../ 23 | cat o* > ../o.${n}n 24 | rm o* 25 | mv ../o.${n}n . 26 | mv ../other.${n}n . 
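# note: other.${n}n starts with "o", so it is moved out of the directory above to keep "cat o* > ../o.${n}n" from swallowing it, and moved back afterwards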
27 | cd ../ 28 | done 29 | 30 | 31 | 32 | # combine typology 33 | cd $1 34 | for n in 1 2 3 4 35 | do 36 | cd ${n}/ 37 | for l in A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 38 | do 39 | cat ${l}* > ../${l}.${n}es 40 | rm ${l}* 41 | mv ../${l}.${n}es . 42 | done 43 | 44 | 45 | for l in a b c d e f g h i j k l m n p q r s t u v w x y z 46 | do 47 | cat ${l}* > ../${l}.${n}es 48 | rm ${l}* 49 | mv ../${l}.${n}es . 50 | done 51 | 52 | 53 | mv other.${n}es ../ 54 | cat o* > ../o.${n}es 55 | rm o* 56 | mv ../o.${n}es . 57 | mv ../other.${n}es . 58 | cd ../ 59 | done 60 |
-------------------------------------------------------------------------------- /scripts/index-glm.sh: --------------------------------------------------------------------------------
1 | dbUser="importer" 2 | modelLength=5 3 | topK=5 4 | 5 | echo $1 6 | inputPath=${1/\/glm*/} 7 | dbLang=${inputPath##*/} 8 | inputPath=${inputPath%/*} 9 | dbType=${inputPath##*/} 10 | echo $inputPath 11 | echo $dbLang 12 | echo $dbType 13 | 14 | dbName=$dbType"_"$dbLang 15 | echo $dbName 16 | 17 | 18 | #dbPath="/mnt/vdb/typoeval/mysql/${dbName}/" #server 19 | dbPath=/var/lib/mysql/${dbName}/ #local machine 20 | 21 | mysql -u ${dbUser} -e "drop database ${dbName};" 22 | mysql -u ${dbUser} -e "create database ${dbName};" 23 | 24 | twoPowerModelLength=2**$modelLength 25 | 26 | for (( sequence=1 ; sequence < $twoPowerModelLength ; sequence++ )); do 27 | if [[ $sequence%2 -eq 0 ]]; then 28 | continue 29 | fi 30 | 31 | echo $sequence 32 | sequenceBinary=`echo "obase=2;$sequence" | bc` 33 | echo $sequenceBinary 34 | 35 | path=$1$sequenceBinary"/*" 36 | for file in $path 37 | do 38 | xpath=${file%/*} 39 | xbase=${file##*/} 40 | xfext=${xbase##*.} 41 | xpref=${xbase%.*} 42 | tableName=$xfext"_"$xpref 43 | echo "tableName: "$tableName 44 | echo "xpath: "$xpath 45 | echo "xbase: "$xbase 46 | echo "xfext: "$xfext 47 | echo "xpref: "$xpref 48 | 49 | indexQuery="create table "$tableName" (" 50 | indexSuffix="(" 51 | importQuery="load data local infile '"$file"' into table "$tableName" fields terminated by '\t' enclosed by '' lines terminated by '\n' (" 52 | sequenceLengthMinusOne=`expr ${#sequenceBinary} - 1` 53 | for (( sequencePointer=0; sequencePointer<$sequenceLengthMinusOne; sequencePointer++ )); do 54 | currentBit=${sequenceBinary:$sequencePointer:1} 55 | if [ $currentBit -eq 1 ]; then 56 | indexQuery=$indexQuery"source"$sequencePointer" varchar(60), " 57 | indexSuffix=$indexSuffix"source"$sequencePointer", " 58 | importQuery=$importQuery"source"$sequencePointer", " 59 | fi 60 | done; 61 | indexQuery=$indexQuery"target varchar(60), score float) engine=myisam character set utf8 collate utf8_bin;" 62 | if [ $sequence -ne 1 ]; then 63 | indexQuery=$indexQuery" create index "$tableName"_0_ix on "$tableName$indexSuffix"score desc);" 64 | fi 65 | 66 | for (( i=1 ; i <= $topK ; i++ )); do 67 | if [ $sequence -eq 1 ]; then 68 | indexQuery=$indexQuery" create index "$tableName"_"$i"_ix on "$tableName" (target("$i"), score desc);" 69 | else 70 | indexQuery=$indexQuery" create index "$tableName"_"$i"_ix on "$tableName" "$indexSuffix"target("$i"));" 71 | fi 72 | done; 73 | 74 | importQuery=$importQuery"target, score);" 75 | 76 | #create tables and indices 77 | mysql -u ${dbUser} $dbName --local-infile=1 -e "$indexQuery" 78 | 79 | #disable indices 80 | myisamchk --keys-used=0 -rq ${dbPath}${tableName} 81 | mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;" 82 | 83 | #import data 84 | mysql -u ${dbUser} $dbName --local-infile=1 -e "$importQuery" 85
| 86 | #compress table / really necessary? 87 | myisampack ${dbPath}${tableName} 88 | 89 | #enable index 90 | myisamchk -rq ${dbPath}${tableName} --tmpdir="/mnt/vdb/tmp" --sort_buffer=3G #--sort-index --sort-records=1 91 | 92 | #and flush index again 93 | mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;" 94 | done; 95 | done; 96 | 97 | -------------------------------------------------------------------------------- /scripts/index-kneser-ney.sh: -------------------------------------------------------------------------------- 1 | dbUser="importer" 2 | modelLength=5 3 | topK=5 4 | 5 | # call with e.g.: 6 | # ./index-kneser-ney.sh out/wiki/test/ kneser-ney 7 | # or 8 | # ./index-kneser-ney.sh out/wiki/test/ mod-kneser-ney 9 | 10 | echo $1 11 | echo $2 12 | inputPath=${1%/} 13 | dbLang=${inputPath##*/} 14 | inputPath=${inputPath%/*} 15 | dbDataSet=${inputPath##*/} 16 | dbType=${2//-/_} 17 | echo "inputpath: "$inputPath 18 | echo "lang: "$dbLang 19 | echo "dbDataSet: "$dbDataSet 20 | echo "dbType: "$dbType 21 | 22 | 23 | 24 | buildResultTable () { 25 | xpath=${file%/*} 26 | xbase=${file##*/} 27 | xfext=${xbase##*.} 28 | xpref=${xbase%.*} 29 | tableName=$xfext"_"$xpref 30 | echo "tableName: "$tableName 31 | echo "xpath: "$xpath 32 | echo "xbase: "$xbase 33 | echo "xfext: "$xfext 34 | echo "xpref: "$xpref 35 | 36 | indexQuery="create table "$tableName" (" 37 | indexSuffix="(" 38 | importQuery="load data local infile '"$file"' into table "$tableName" fields terminated by '\t' enclosed by '' lines terminated by '\n' (" 39 | sequenceLengthMinusOne=`expr ${#sequenceBinary} - 1` 40 | for (( sequencePointer=0; sequencePointer<$sequenceLengthMinusOne; sequencePointer++ )); do 41 | currentBit=${sequenceBinary:$sequencePointer:1} 42 | if [ $currentBit -eq 1 ]; then 43 | indexQuery=$indexQuery"source"$sequencePointer" varchar(60), " 44 | indexSuffix=$indexSuffix"source"$sequencePointer", " 45 | importQuery=$importQuery"source"$sequencePointer", " 46 | fi 47 | done 48 | indexQuery=$indexQuery"target varchar(60), score float) engine=myisam character set utf8 collate utf8_bin;" 49 | if [ $sequence -ne 1 ]; then 50 | indexQuery=$indexQuery" create index "$tableName"_0_ix on "$tableName$indexSuffix"score desc);" 51 | fi 52 | 53 | for (( i=1 ; i <= $topK ; i++ )); do 54 | if [ $sequence -eq 1 ]; then 55 | indexQuery=$indexQuery" create index "$tableName"_"$i"_ix on "$tableName" (target("$i"), score desc);" 56 | else 57 | indexQuery=$indexQuery" create index "$tableName"_"$i"_ix on "$tableName" "$indexSuffix"target("$i"));" 58 | fi 59 | done 60 | importQuery=$importQuery"target, score);" 61 | } 62 | 63 | buildDiscountTable () { 64 | xpath=${file%/*} 65 | xbase=${file##*/} 66 | xfext=${xbase##*.} 67 | xpref=${xbase%.*} 68 | tableName=$xfext"_"$xpref 69 | echo "tableName: "$tableName 70 | echo "xpath: "$xpath 71 | echo "xbase: "$xbase 72 | echo "xfext: "$xfext 73 | echo "xpref: "$xpref 74 | 75 | indexQuery="create table "$tableName" (" 76 | indexSuffix="(" 77 | importQuery="load data local infile '"$file"' into table "$tableName" fields terminated by '\t' enclosed by '' lines terminated by '\n' (" 78 | sequenceLengthMinusOne=`expr ${#sequenceBinary} - 1` 79 | for (( sequencePointer=0; sequencePointer<=$sequenceLengthMinusOne; sequencePointer++ )); do 80 | currentBit=${sequenceBinary:$sequencePointer:1} 81 | if [ $currentBit -eq 1 ]; then 82 | indexQuery=$indexQuery"source"$sequencePointer" varchar(60), " 83 | indexSuffix=$indexSuffix"source"$sequencePointer", " 84 | 
importQuery=$importQuery"source"$sequencePointer", " 85 | fi 86 | done 87 | indexQuery=$indexQuery"score float) engine=myisam character set utf8 collate utf8_bin;" 88 | importQuery=$importQuery"score);" 89 | 90 | # remove ", " from indexSuffix 91 | indexSuffix=${indexSuffix%?} 92 | indexSuffix=${indexSuffix%?} 93 | indexSuffix=$indexSuffix")" 94 | indexQuery=$indexQuery" create index "$tableName" on "$tableName$indexSuffix";" 95 | echo $indexQuery 96 | } 97 | 98 | buildIndices () { 99 | 100 | #create tables and indices 101 | mysql -u ${dbUser} $dbName --local-infile=1 -e "$indexQuery" 102 | 103 | #disable indices 104 | myisamchk --keys-used=0 -rq ${dbPath}${tableName} 105 | mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;" 106 | 107 | #import data 108 | mysql -u ${dbUser} $dbName --local-infile=1 -e "$importQuery" 109 | 110 | #compress table / really necessary? 111 | myisampack ${dbPath}${tableName} 112 | 113 | #enable index 114 | myisamchk -rq ${dbPath}${tableName} --tmpdir="/mnt/vdb/tmp" --sort_buffer=3G #--sort-index --sort-records=1 115 | 116 | #and flush index again 117 | mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;" 118 | } 119 | 120 | buildDatabase () { 121 | echo "dbName: "$dbName 122 | #dbPath="/mnt/vdb/typoeval/mysql/${dbName}/" #server 123 | dbPath=/var/lib/mysql/${dbName}/ #local machine 124 | echo "create database ${dbName};" 125 | mysql -u ${dbUser} -e "drop database \`${dbName}\`;" 126 | mysql -u ${dbUser} -e "create database \`${dbName}\`;" 127 | } 128 | 129 | #dbName=$dbDataSet"_"$dbLang"_"$dbType"_high" 130 | #buildDatabase 131 | #twoPowerModelLength=2**$modelLength 132 | # 133 | #for (( sequence=1 ; sequence < $twoPowerModelLength ; sequence++ )); do 134 | # if [[ $sequence%2 -eq 0 ]]; then 135 | # continue 136 | # fi 137 | # 138 | # echo $sequence 139 | # sequenceBinary=`echo "obase=2;$sequence" | bc` 140 | # echo $sequenceBinary 141 | # 142 | # path=$1"/kneser-ney-high/"$sequenceBinary"/*" 143 | # for file in $path; do 144 | # buildResultTable 145 | # buildIndices 146 | # done 147 | #done 148 | # 149 | #dbName=$dbDataSet"_"$dbLang"_"$dbType"_low" 150 | #buildDatabase 151 | # 152 | #modelLengthMinusOne=`expr $modelLength - 1` 153 | #twoPowerModelLengthMinusOne=2**$modelLengthMinusOne 154 | #for (( sequence=1 ; sequence < $twoPowerModelLengthMinusOne ; sequence++ )); do 155 | # if [[ $sequence%2 -eq 0 ]]; then 156 | # continue 157 | # fi 158 | # 159 | # echo $sequence 160 | # sequenceBinary=`echo "obase=2;$sequence" | bc` 161 | # echo $sequenceBinary 162 | # 163 | # path=$1"/kneser-ney-low/"$sequenceBinary"/*" 164 | # for file in $path; do 165 | # buildResultTable 166 | # buildIndices 167 | # done 168 | #done 169 | 170 | dbName=$dbDataSet"_"$dbLang"_"$dbType"_high_discount" 171 | buildDatabase 172 | 173 | modelLengthMinusOne=`expr $modelLength - 1` 174 | twoPowerModelLengthMinusOne=2**$modelLengthMinusOne 175 | for (( sequence=1 ; sequence < $twoPowerModelLengthMinusOne ; sequence++ )); do 176 | if [[ $sequence%2 -eq 0 ]]; then 177 | continue 178 | fi 179 | 180 | echo $sequence 181 | sequenceBinary=`echo "obase=2;$sequence" | bc` 182 | echo $sequenceBinary 183 | 184 | path=$1"/kneser-ney-high-discount/"$sequenceBinary"/*" 185 | for file in $path; do 186 | buildDiscountTable 187 | buildIndices 188 | done 189 | done 190 | 191 | dbName=$dbDataSet"_"$dbLang"_"$dbType"_low_discount" 192 | buildDatabase 193 | modelLengthMinusTwo=`expr $modelLength - 2` 194 | twoPowerModelLengthMinusTwo=2**$modelLengthMinusTwo 195 | for (( sequence=1 ; sequence < 
$twoPowerModelLengthMinusTwo ; sequence++ )); do 196 | if [[ $sequence%2 -eq 0 ]]; then 197 | continue 198 | fi 199 | 200 | echo $sequence 201 | sequenceBinary=`echo "obase=2;$sequence" | bc` 202 | echo $sequenceBinary 203 | 204 | path=$1"/kneser-ney-low-discount/"$sequenceBinary"/*" 205 | for file in $path; do 206 | buildDiscountTable 207 | buildIndices 208 | done 209 | done 210 | -------------------------------------------------------------------------------- /scripts/index-lm.sh: -------------------------------------------------------------------------------- 1 | dbUser="rene" 2 | echo $1 3 | inputPath=${1/\/ngrams*/} 4 | dbLang=${inputPath##*/} 5 | inputPath=${inputPath%/*} 6 | dbType=${inputPath##*/} 7 | echo $inputPath 8 | echo $dbLang 9 | echo $dbType 10 | 11 | dbName=$dbType"_"$dbLang"_ngram" 12 | echo $dbName 13 | #exit 1 14 | 15 | 16 | dbPath="/mnt/vdb/typoeval/mysql/${dbName}/" #server 17 | #dbPath=/var/lib/mysql/${dbName}/ #local machine 18 | 19 | mysql -u ${dbUser} -e "drop database ${dbName};" 20 | mysql -u ${dbUser} -e "create database ${dbName};" 21 | 22 | for (( i = 1 ; i <= 5; i++ )) 23 | do 24 | path=$1$i"gs/*" 25 | for file in $path 26 | do 27 | xpath=${file%/*} 28 | xbase=${file##*/} 29 | xfext=${xbase##*.} 30 | xpref=${xbase%.*} 31 | tablename=$xfext"_"$xpref 32 | echo "tablename: "$tablename 33 | echo "xpath: "$xpath 34 | echo "xbase: "$xbase 35 | echo "xfext: "$xfext 36 | echo "xpref: "$xpref 37 | 38 | #create tables and indices 39 | if [ $i -eq 1 ]; 40 | then 41 | mysql -u ${dbUser} $dbName --local-infile=1 -e "create table ${tablename} (target varchar(60),score float) engine=myisam character set utf8 collate utf8_bin; 42 | create index ${tablename}_ix on ${tablename} (target(1), score desc); 43 | create index ${tablename}_2_ix on ${tablename} (target(2), score desc); 44 | create index ${tablename}_3_ix on ${tablename} (target(3), score desc); 45 | create index ${tablename}_4_ix on ${tablename} (target(4), score desc); 46 | create index ${tablename}_5_ix on ${tablename} (target(5), score desc);" 47 | fi 48 | 49 | if [ $i -eq 2 ]; 50 | then 51 | mysql -u ${dbUser} $dbName --local-infile=1 -e "create table ${tablename} (source1 varchar(60),target varchar(60),score float) engine=myisam character set utf8 collate utf8_bin; 52 | create index ${tablename}_ix on ${tablename} (source1(60), score desc); 53 | create index ${tablename}_2_ix on ${tablename} (source1(60), target(2)); 54 | create index ${tablename}_3_ix on ${tablename} (source1(60), target(3)); 55 | create index ${tablename}_4_ix on ${tablename} (source1(60), target(4)); 56 | create index ${tablename}_5_ix on ${tablename} (source1(60), target(5));" 57 | fi 58 | 59 | if [ $i -eq 3 ]; 60 | then 61 | mysql -u ${dbUser} $dbName --local-infile=1 -e "create table ${tablename} (source1 varchar(60),source2 varchar(60),target varchar(60),score float) engine=myisam character set utf8 collate utf8_bin; 62 | create index ${tablename}_ix on ${tablename} (source1(60), source2(60), score desc); 63 | create index ${tablename}_2_ix on ${tablename} (source1(60), source2(60), target(2)); 64 | create index ${tablename}_3_ix on ${tablename} (source1(60), source2(60), target(3)); 65 | create index ${tablename}_4_ix on ${tablename} (source1(60), source2(60), target(4)); 66 | create index ${tablename}_5_ix on ${tablename} (source1(60), source2(60), target(5));" 67 | fi 68 | 69 | if [ $i -eq 4 ]; 70 | then 71 | mysql -u ${dbUser} $dbName --local-infile=1 -e "create table ${tablename} (source1 varchar(60),source2 varchar(60), source3 
varchar(60), target varchar(60),score float) engine=myisam character set utf8 collate utf8_bin; 72 | create index ${tablename}_ix on ${tablename} (source1(60), source2(60), source3(60), score desc); 73 | create index ${tablename}_2_ix on ${tablename} (source1(60), source2(60), source3(60), target(2)); 74 | create index ${tablename}_3_ix on ${tablename} (source1(60), source2(60), source3(60), target(3)); 75 | create index ${tablename}_4_ix on ${tablename} (source1(60), source2(60), source3(60), target(4)); 76 | create index ${tablename}_5_ix on ${tablename} (source1(60), source2(60), source3(60), target(5));" 77 | fi 78 | 79 | if [ $i -eq 5 ]; 80 | then 81 | mysql -u ${dbUser} $dbName --local-infile=1 -e "create table ${tablename} (source1 varchar(60),source2 varchar(60), source3 varchar(60), source4 varchar(60), target varchar(60),score float) engine=myisam character set utf8 collate utf8_bin; 82 | create index ${tablename}_ix on ${tablename} (source1(60), source2(60), source3(60), source4(60), score desc); 83 | create index ${tablename}_2_ix on ${tablename} (source1(60), source2(60), source3(60), source4(60), target(2)); 84 | create index ${tablename}_3_ix on ${tablename} (source1(60), source2(60), source3(60), source4(60), target(3)); 85 | create index ${tablename}_4_ix on ${tablename} (source1(60), source2(60), source3(60), source4(60), target(4)); 86 | create index ${tablename}_5_ix on ${tablename} (source1(60), source2(60), source3(60), source4(60), target(5));" 87 | fi 88 | 89 | #disable indices 90 | myisamchk --keys-used=0 -rq ${dbPath}${tablename} 91 | mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;" 92 | 93 | #import data 94 | if [ $i -eq 1 ]; 95 | then 96 | mysql -u ${dbUser} $dbName --local-infile=1 -e "load data local infile '$file' into table ${tablename} fields terminated by '\t' enclosed by '' lines terminated by '\n' (target, score);" 97 | fi 98 | 99 | if [ $i -eq 2 ]; 100 | then 101 | mysql -u ${dbUser} $dbName --local-infile=1 -e "load data local infile '$file' into table ${tablename} fields terminated by '\t' enclosed by '' lines terminated by '\n' (source1, target, score);" 102 | fi 103 | 104 | if [ $i -eq 3 ]; 105 | then 106 | mysql -u ${dbUser} $dbName --local-infile=1 -e "load data local infile '$file' into table ${tablename} fields terminated by '\t' enclosed by '' lines terminated by '\n' (source1, source2, target, score);" 107 | fi 108 | 109 | if [ $i -eq 4 ]; 110 | then 111 | mysql -u ${dbUser} $dbName --local-infile=1 -e "load data local infile '$file' into table ${tablename} fields terminated by '\t' enclosed by '' lines terminated by '\n' (source1, source2, source3, target, score);" 112 | fi 113 | 114 | if [ $i -eq 5 ]; 115 | then 116 | mysql -u ${dbUser} $dbName --local-infile=1 -e "load data local infile '$file' into table ${tablename} fields terminated by '\t' enclosed by '' lines terminated by '\n' (source1, source2, source3, source4, target, score);" 117 | fi 118 | 119 | #compress table / really necessary? 
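# -> not strictly necessary: myisampack rewrites the MyISAM data file into a compressed, read-only format.
#    It saves disk space and can speed up reads, but the table cannot be written to afterwards,
#    so skip this step if you still need to load more data.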
120 | myisampack ${dbPath}${tablename} 121 | 122 | #enable index 123 | myisamchk -rq ${dbPath}${tablename} --tmpdir="/mnt/vdb/tmp" --sort_buffer=3G #--sort-index --sort-records=1 124 | 125 | #and flush index again 126 | mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;" 127 | done; 128 | done; 129 | -------------------------------------------------------------------------------- /scripts/index-typo.sh: -------------------------------------------------------------------------------- 1 | dbUser="rene" 2 | echo $1 3 | inputPath=${1/\/typos*/} 4 | dbLang=${inputPath##*/} 5 | inputPath=${inputPath%/*} 6 | dbType=${inputPath##*/} 7 | echo $inputPath 8 | echo $dbLang 9 | echo $dbType 10 | 11 | dbName=$dbType"_"$dbLang"_typo" 12 | echo $dbName 13 | #exit 1 14 | dbPath="/mnt/vdb/typoeval/mysql/${dbName}/" #server 15 | #dbPath=/var/lib/mysql/${dbName}/ #local machine 16 | 17 | mysql -u ${dbUser} -e "drop database ${dbName};" 18 | mysql -u ${dbUser} -e "create database ${dbName};" 19 | 20 | for (( i = 0; i < 5; i++ )) 21 | do 22 | path=$1$i"es/*" 23 | for file in $path 24 | do 25 | xpath=${file%/*} 26 | xbase=${file##*/} 27 | xfext=${xbase##*.} 28 | xpref=${xbase%.*} 29 | tablename=$xfext"_"$xpref; 30 | echo "tablename: "$tablename; 31 | echo "xpath: "$xpath; 32 | echo "xbase: "$xbase; 33 | echo "xfext: "$xfext; 34 | echo "xpref: "$xpref; 35 | 36 | #create tables and indices 37 | if [ $i -eq 0 ]; 38 | then 39 | mysql -u ${dbUser} $dbName --local-infile=1 -e "create table ${tablename} (target varchar(60),score float) engine=myisam character set utf8 collate utf8_bin; 40 | create index ${tablename}_ix on ${tablename} (target(1), score desc); 41 | create index ${tablename}_2_ix on ${tablename} (target(2), score desc); 42 | create index ${tablename}_3_ix on ${tablename} (target(3), score desc); 43 | create index ${tablename}_4_ix on ${tablename} (target(4), score desc); 44 | create index ${tablename}_5_ix on ${tablename} (target(5), score desc);" 45 | else 46 | mysql -u ${dbUser} $dbName --local-infile=1 -e "create table ${tablename} (source varchar(60),target varchar(60),score float) engine=myisam character set utf8 collate utf8_bin; 47 | create index ${tablename}_ix on ${tablename} (source(60), score desc); 48 | create index ${tablename}_2_ix on ${tablename} (source(60), target(2), score desc); 49 | create index ${tablename}_3_ix on ${tablename} (source(60), target(3), score desc); 50 | create index ${tablename}_4_ix on ${tablename} (source(60), target(4), score desc); 51 | create index ${tablename}_5_ix on ${tablename} (source(60), target(5), score desc);" 52 | fi 53 | 54 | 55 | #disable indices 56 | myisamchk --keys-used=0 -rq ${dbPath}${tablename} 57 | 58 | mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;" 59 | 60 | #import data 61 | if [ $i -eq 0 ]; 62 | then 63 | mysql -u ${dbUser} $dbName --local-infile=1 -e "load data local infile '$file' into table ${tablename} fields terminated by '\t' enclosed by '' lines terminated by '\n' (@dummy, target, score);" 64 | else 65 | mysql -u ${dbUser} $dbName --local-infile=1 -e "load data local infile '$file' into table ${tablename} fields terminated by '\t' enclosed by '' lines terminated by '\n' (source, target, score);" 66 | fi 67 | 68 | #compress table / really necessary? 69 | myisampack ${dbPath}${tablename} 70 | 71 | #enable index 72 | myisamchk -rq ${dbPath}${tablename} --tmpdir="/mnt/vdb/tmp" --sort_buffer=3G #--sort-index --sort-records=1 73 | 74 | #and flush index again. 
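# (myisamchk modifies the index file directly on disk, behind the running server's back,
#  so the table has to be flushed before MySQL sees the rebuilt index)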
75 | mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;" 76 | 77 | done 78 | done 79 | -------------------------------------------------------------------------------- /scripts/preparequery.sh: -------------------------------------------------------------------------------- 1 | fSource=$1 2 | #fSource="Die" 3 | fEdge="4es${fSource:0:1}" 4 | tSource=$2 5 | #tSource="deutschen" 6 | tEdge="3es${tSource:0:1}" 7 | sSource=$3 8 | #sSource="Bauern" 9 | sEdge="2es${sSource:0:1}" 10 | oSource=$4 11 | #oSource="haben" 12 | oEdge="1es${oSource:0:1}" 13 | prefix="${5}%" 14 | #prefix="d%" 15 | 16 | query="select ${fEdge}.source, ${tEdge}.source, ${sEdge}.source, ${oEdge}.source, ${fEdge}.target, (IFNULL(${fEdge}.score, 0) + IFNULL(${tEdge}.score, 0) + IFNULL(${sEdge}.score, 0) + IFNULL(${oEdge}.score, 0)) as count from ${fEdge} right outer join ${tEdge} on ${tEdge}.target=${fEdge}.target right outer join ${sEdge} on ${sEdge}.target=${fEdge}.target right outer join ${oEdge} on ${oEdge}.target like ${fEdge}.target where ${fEdge}.source like \"${fSource}\" AND ${fEdge}.target like \"${prefix}\" and ${tEdge}.source like \"${tSource}\" AND ${tEdge}.target like \"${prefix}\" and ${sEdge}.source like \"${sSource}\" AND ${sEdge}.target like \"${prefix}\" and ${oEdge}.source like \"${oSource}\" AND ${oEdge}.target like \"${prefix}\" order by count desc limit 5;" 17 | 18 | 19 | #query="select source, target, score from ${fEdge} where ${fEdge}.source like \"${fSource}\" AND ${fEdge}.target like \"${prefix}\" order by score desc limit 5; 20 | #select source, target, score from ${tEdge} where ${tEdge}.source like \"${tSource}\" AND ${tEdge}.target like \"${prefix}\" order by score desc limit 5; 21 | #select source, target, score from ${sEdge} where ${sEdge}.source like \"${sSource}\" AND ${sEdge}.target like \"${prefix}\" order by score desc limit 5; 22 | #select source, target, score from ${oEdge} where ${oEdge}.source like \"${oSource}\" AND ${oEdge}.target like \"${prefix}\" order by score desc limit 5;" 23 | 24 | echo $query; 25 | 26 | #mysql -u importer typology --local-infile=1 -e "${query}" 27 | -------------------------------------------------------------------------------- /scripts/runpreparequery.sh: -------------------------------------------------------------------------------- 1 | fSource=$1 2 | #fSource="Die" 3 | fEdge="4es${fSource:0:1}" 4 | tSource=$2 5 | #tSource="deutschen" 6 | tEdge="3es${tSource:0:1}" 7 | sSource=$3 8 | #sSource="Bauern" 9 | sEdge="2es${sSource:0:1}" 10 | oSource=$4 11 | #oSource="haben" 12 | oEdge="1es${oSource:0:1}" 13 | prefix="${5}%" 14 | #prefix="d%" 15 | 16 | query="select ${fEdge}.source, ${tEdge}.source, ${sEdge}.source, ${oEdge}.source, ${fEdge}.target, (IFNULL(${fEdge}.score, 0) + IFNULL(${tEdge}.score, 0) + IFNULL(${sEdge}.score, 0) + IFNULL(${oEdge}.score, 0)) as count from ${fEdge} right outer join ${tEdge} on ${tEdge}.target=${fEdge}.target right outer join ${sEdge} on ${sEdge}.target=${fEdge}.target right outer join ${oEdge} on ${oEdge}.target like ${fEdge}.target where ${fEdge}.source like \"${fSource}\" AND ${fEdge}.target like \"${prefix}\" and ${tEdge}.source like \"${tSource}\" AND ${tEdge}.target like \"${prefix}\" and ${sEdge}.source like \"${sSource}\" AND ${sEdge}.target like \"${prefix}\" and ${oEdge}.source like \"${oSource}\" AND ${oEdge}.target like \"${prefix}\" order by count desc limit 5;" 17 | 18 | mysql -u importer typology --local-infile=1 -e "${query}" 19 | 
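For orientation, a minimal invocation sketch for the two query scripts above. It reuses the sample words from the comments inside the scripts; the `typology` database and `importer` user are whatever your indexing run created, so adjust both to your setup:
```
# prints the assembled ranking query for the 4-word context "Die deutschen Bauern haben"
# and the prefix "d" (the table names 4esD, 3esd, 2esB, 1esh are derived from the
# first letter of each context word)
./scripts/preparequery.sh Die deutschen Bauern haben d

# same query, but actually executed against the local MySQL server
./scripts/runpreparequery.sh Die deutschen Bauern haben d
```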
-------------------------------------------------------------------------------- /scripts/stats.txt: --------------------------------------------------------------------------------
1 | google-de total: 19959412114 2 | google-en total: 122004194540 3 | google-es total: 23061292567 4 | google-fr total: 19925969585 5 | wiki-de total: 423089409 6 | wiki-en total: 745287250 7 | wiki-es total: 283966401 8 | wiki-fr total: 253536262 9 | wiki-it total: 146107024 10 | enron-en total: 82566655 11 | dgttm-de total: 24426630 12 | dgttm-en total: 29172197 13 | dgttm-es total: 27878291 14 | dgttm-fr total: 27666297 15 | dgttm-it total: 27600138 16 | google-de unique: 3685172 17 | google-en unique: 7379221 18 | google-es unique: 2233403 19 | google-fr unique: 1978551 20 | wiki-de unique: 9823118 21 | wiki-en unique: 11702124 22 | wiki-es unique: 4178681 23 | wiki-fr unique: 4058752 24 | wiki-it unique: 3085208 25 | enron-en unique: 588279 26 | dgttm-de unique: 664546 27 | dgttm-en unique: 458622 28 | dgttm-es unique: 446314 29 | dgttm-fr unique: 459495 30 | dgttm-it unique: 471740 31 |
-------------------------------------------------------------------------------- /scripts/table.sh: --------------------------------------------------------------------------------
1 | #AUTHOR: Martin Koerner 2 | #Purpose: put results into a LaTeX table 3 | 4 | #run with: pfl=x k=x weight=x modelParameter=x ./table.sh 5 | 6 | #e.g.: 7 | #pfl=1 k=5 weight=no modelParameter=5 ./table.sh 8 | 9 | 10 | #storage directory for res.*.log files 11 | #LOGDIR="/var/lib/datasets/results/" 12 | LOGDIR="/home/martin/results/" 13 | #storage directory for return files 14 | #RETURNDIR="/var/lib/datasets/plots/" 15 | RETURNDIR="/home/martin/plots/" 16 | STATS="stats.txt" 17 | SECNKSS=1 18 | 19 | if [[ ${#pfl} == 0 || ${#k} == 0 || ${#weight} == 0 || ${#modelParameter} == 0 ]] 20 | then echo "set values for k, pfl, modelParameter and weight" 21 | exit 22 | fi 23 | 24 | #temp: 25 | #res.trainedOn-wiki-de-testedOn-wiki-de-lm-pic-modelParameter2-sam0-split95-joinlength10-nQ100000 26 | 27 | 28 | 29 | 30 | #general declarations 31 | PF1="res.trainedOn-" 32 | PF2="-testedOn-" 33 | PF3="-modelParameter" 34 | PF4="-sam0-split95-joinlength10-nQ100000.log" 35 | LANGS=(de en es fr it) 36 | 37 | LM="-lm-" 38 | TYPO="-typolgy-" 39 | FILENAME="table-k$k-pfl$pfl-modelParameter$modelParameter.txt" 40 | RETURN=$RETURNDIR$FILENAME 41 | 42 | #reset result file 43 | echo -n "" | tee "$RETURN" 44 | 45 | echo "\begin{table*}[bth]" | tee -a "$RETURN" 46 | echo "\begin{center}" | tee -a "$RETURN" 47 | echo "\begin{tabular}{lllllll}" | tee -a "$RETURN" 48 | echo "Corpus & total words & unique words & MRR pfl=$pfl & Top $k Precision pfl=$pfl & NKSS@$SECNKSS & NKSS@$k \\\\" | tee -a "$RETURN" 49 | echo "\hline" | tee -a "$RETURN" 50 | 51 | CALC () { 52 | #echo -n "( CNTTYPO / $CNTLM - 1 ) * 100" 53 | 54 | RESULT=`echo "($CNTTYPO/$CNTLM-1)*100" | bc -l` 55 | echo -n " $RESULT" | awk '{ printf "%.1f", $0 }' | tee -a "$RETURN" 56 | #echo -n " $RESULT" | tee -a "$RETURN" 57 | } 58 | 59 | PRINTLN () { 60 | echo -n "$CORPUS & " | tee -a "$RETURN" 61 | TOTALWORDS=`grep "$TYP1$LANG total" $STATS` 62 | TOTALWORDS=${TOTALWORDS[0]//$TYP1$LANG total: /} 63 | echo -n "$TOTALWORDS & " | tee -a "$RETURN" 64 | 65 | UNIQUEWORDS=`grep "$TYP1$LANG unique" $STATS` 66 | UNIQUEWORDS=${UNIQUEWORDS[0]//$TYP1$LANG unique: /} 67 | echo -n "$UNIQUEWORDS & " | tee -a "$RETURN" 68 | 69 | CNTTYPO=`grep "MRR with pfl=$pfl" $FILETYPO` 70 | CNTTYPO=${CNTTYPO[0]//MRR with pfl=$pfl: /} 71 | echo $CNTTYPO 72 |
CNTLM=`grep "MRR with pfl=$pfl" $FILELM` 73 | CNTLM=${CNTLM[0]//MRR with pfl=$pfl: /} 74 | echo $CNTLM 75 | CALC 76 | echo -n " & " | tee -a "$RETURN" 77 | CNTTYPO=`grep "Precision at k=$k with pfl=$pfl" $FILETYPO` 78 | CNTTYPO=${CNTTYPO[0]//Precision at k=$k with pfl=$pfl: /} 79 | echo $CNTTYPO 80 | CNTLM=`grep "Precision at k=$k with pfl=$pfl" $FILELM` 81 | CNTLM=${CNTLM[0]//Precision at k=$k with pfl=$pfl: /} 82 | echo $CNTLM 83 | CALC 84 | echo -n " & " | tee -a "$RETURN" 85 | 86 | 87 | CNTTYPO=`grep "NKSS at k=$SECNKSS" $FILETYPO` 88 | CNTTYPO=${CNTTYPO[0]//NKSS at k=$SECNKSS: /} 89 | echo $CNTTYPO 90 | CNTLM=`grep "NKSS at k=$SECNKSS" $FILELM` 91 | CNTLM=${CNTLM[0]//NKSS at k=$SECNKSS: /} 92 | echo $CNTLM 93 | CALC 94 | echo -n " & " | tee -a "$RETURN" 95 | 96 | CNTTYPO=`grep "NKSS at k=$k" $FILETYPO` 97 | CNTTYPO=${CNTTYPO[0]//NKSS at k=$k: /} 98 | echo $CNTTYPO 99 | CNTLM=`grep "NKSS at k=$k" $FILELM` 100 | CNTLM=${CNTLM[0]//NKSS at k=$k: /} 101 | echo $CNTLM 102 | CALC 103 | 104 | echo " \\\\" | tee -a "$RETURN" 105 | } 106 | 107 | #google 108 | for LANG in ${LANGS[@]} 109 | do 110 | if [[ $LANG != "it" ]] 111 | then 112 | TYP1="google-" 113 | TYP2="wiki-" 114 | CORPUS="$TYP1$TYP2$LANG" 115 | FILETYPO=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$TYPO$weight$PF3$modelParameter$PF4 116 | FILELM=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$LM$weight$PF3$modelParameter$PF4 117 | echo $FILETYPO 118 | echo $FILELM 119 | PRINTLN 120 | fi 121 | 122 | #wiki 123 | 124 | TYP1="wiki-" 125 | TYP2="wiki-" 126 | CORPUS="$TYP1$TYP2$LANG" 127 | FILETYPO=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$TYPO$weight$PF3$modelParameter$PF4 128 | FILELM=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$LM$weight$PF3$modelParameter$PF4 129 | echo $FILETYPO 130 | echo $FILELM 131 | PRINTLN 132 | 133 | #dgttm 134 | TYP1="dgttm-" 135 | TYP2="dgttm-" 136 | CORPUS="$TYP1$TYP2$LANG" 137 | FILETYPO=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$TYPO$weight$PF3$modelParameter$PF4 138 | FILELM=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$LM$weight$PF3$modelParameter$PF4 139 | echo $FILETYPO 140 | echo $FILELM 141 | PRINTLN 142 | 143 | 144 | #enron 145 | if [[ $LANG == "en" ]] 146 | then 147 | TYP1="enron-" 148 | TYP2="enron-" 149 | CORPUS="$TYP1$TYP2$LANG" 150 | FILETYPO=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$TYPO$weight$PF3$modelParameter$PF4 151 | FILELM=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$LM$weight$PF3$modelParameter$PF4 152 | echo $FILETYPO 153 | echo $FILELM 154 | PRINTLN 155 | fi 156 | if [[ $LANG == "en" ]] 157 | then 158 | TYP1="enron-" 159 | TYP2="wiki-" 160 | CORPUS="$TYP1$TYP2$LANG" 161 | FILETYPO=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$TYPO$weight$PF3$modelParameter$PF4 162 | FILELM=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$LM$weight$PF3$modelParameter$PF4 163 | echo $FILETYPO 164 | echo $FILELM 165 | PRINTLN 166 | fi 167 | if [[ $LANG == "en" ]] 168 | then 169 | TYP1="google-" 170 | TYP2="enron-" 171 | CORPUS="$TYP1$TYP2$LANG" 172 | FILETYPO=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$TYPO$weight$PF3$modelParameter$PF4 173 | FILELM=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$LM$weight$PF3$modelParameter$PF4 174 | echo $FILETYPO 175 | echo $FILELM 176 | PRINTLN 177 | fi 178 | 179 | done 180 | 181 | echo "\end{tabular}" | tee -a "$RETURN" 182 | echo "\label{tab:corporaStats}" | tee -a "$RETURN" 183 | echo "\caption{Statistics of our evaluation corpora}" | tee -a "$RETURN" 184 | echo "\end{center}" | tee -a "$RETURN" 185 | echo "\end{table*}" | tee -a "$RETURN" 186 | -------------------------------------------------------------------------------- /scripts/test.sh: 
--------------------------------------------------------------------------------
1 | dbUser="importer" 2 | testName="hybridtypology" 3 | 4 | #dbPath="/mnt/vdb/typoeval/mysql/${testName}/" #server 5 | dbPath=/var/lib/mysql/${testName}/ #local machine 6 | 7 | #mysql -u ${dbUser} -e "create database ${testName};" 8 | 9 | 10 | for (( i = 2 ; i <= 5; i++ )) 11 | do 12 | path="$1$i/*" 13 | for file in $path 14 | do 15 | : # no-op placeholder so the empty loop parses; per-file handling was never implemented in this stub 16 | 17 | done; 18 | done; 19 |
-------------------------------------------------------------------------------- /src/de/typology/executables/KneserNeyBuilder.java: --------------------------------------------------------------------------------
1 | package de.typology.executables; 2 | 3 | import java.io.File; 4 | import java.util.ArrayList; 5 | import java.util.HashMap; 6 | 7 | import org.apache.logging.log4j.LogManager; 8 | import org.apache.logging.log4j.Logger; 9 | 10 | import de.typology.indexes.WordIndex; 11 | import de.typology.indexes.WordIndexer; 12 | import de.typology.patterns.PatternBuilder; 13 | import de.typology.smoother.KneserNeySmoother; 14 | import de.typology.smoother.ModifiedKneserNeySmoother; 15 | import de.typology.splitter.AbsoluteSplitter; 16 | import de.typology.splitter.DataSetSplitter; 17 | import de.typology.splitter.SmoothingSplitter; 18 | import de.typology.tester.TestSequenceExtractor; 19 | import de.typology.utils.Config; 20 | 21 | public class KneserNeyBuilder { 22 | 23 | static Logger logger = LogManager.getLogger(KneserNeyBuilder.class 24 | .getName()); 25 | 26 | public static void main(String[] args) { 27 | 28 | // TODO: parameters as arguments 29 | File inputDirectory = new File(Config.get().outputDirectory 30 | + Config.get().inputDataSet); 31 | File inputFile = new File(inputDirectory.getAbsolutePath() 32 | + "/training.txt"); 33 | File indexFile = new File(inputDirectory.getAbsolutePath() 34 | + "/index.txt"); 35 | File absoluteDirectory = new File(inputDirectory.getAbsolutePath() 36 | + "/absolute"); 37 | File continuationDirectory = new File(inputDirectory.getAbsolutePath() 38 | + "/continuation"); 39 | if (Config.get().splitData) { 40 | DataSetSplitter dss = new DataSetSplitter(inputDirectory, 41 | "normalized.txt"); 42 | dss.split("training.txt", "learning.txt", "testing.txt", 43 | Config.get().modelLength); 44 | dss.splitIntoSequences(new File(inputDirectory.getAbsolutePath() 45 | + "/testing.txt"), Config.get().modelLength, 46 | Config.get().numberOfQueries); 47 | } 48 | if (Config.get().buildIndex) { 49 | logger.info("build word index: " + indexFile.getAbsolutePath()); 50 | WordIndexer wordIndexer = new WordIndexer(); 51 | wordIndexer.buildIndex(inputFile, indexFile, 52 | Config.get().maxCountDivider, " ", " "); 53 | } 54 | if (Config.get().buildGLM) { 55 | ArrayList<boolean[]> glmForSmoothingPatterns = PatternBuilder 56 | .getReverseGLMForSmoothingPatterns(Config.get().modelLength); 57 | AbsoluteSplitter absoluteSplitter = new AbsoluteSplitter(inputFile, 58 | indexFile, absoluteDirectory, "\t", 59 | Config.get().deleteTempFiles, " ", " "); 60 | logger.info("split into GLM sequences: " 61 | + inputFile.getAbsolutePath()); 62 | absoluteSplitter.split(glmForSmoothingPatterns, 63 | Config.get().numberOfCores); 64 | } 65 | if (Config.get().buildContinuationGLM) { 66 | ArrayList<boolean[]> lmPatterns = PatternBuilder 67 | .getReverseLMPatterns(Config.get().modelLength); 68 | SmoothingSplitter smoothingSplitter = new SmoothingSplitter( 69 | absoluteDirectory, continuationDirectory, indexFile, "\t", 70 | Config.get().deleteTempFiles); 71 | logger.info("split into continuation sequences: " 72 | + inputFile.getAbsolutePath()); 73 | smoothingSplitter.split(lmPatterns, Config.get().numberOfCores); 74 | } 75 |
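// (the testing-samples-<n>.txt files consumed below are the test sequences written by
// dss.splitIntoSequences(...) in the splitData step above)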
76 | File testExtractOutputDirectory = new File( 77 | inputDirectory.getAbsolutePath() + "/testing-samples"); 78 | if (Config.get().extractContinuationGLM) { 79 | File testSequences = new File(inputDirectory.getAbsolutePath() 80 | + "/testing-samples-" + Config.get().modelLength + ".txt"); 81 | testExtractOutputDirectory.mkdir(); 82 | 83 | TestSequenceExtractor tse = new TestSequenceExtractor( 84 | testSequences, absoluteDirectory, continuationDirectory, 85 | testExtractOutputDirectory, "\t", new WordIndex(indexFile)); 86 | tse.extractSequences(Config.get().modelLength, 87 | Config.get().numberOfCores); 88 | tse.extractContinuationSequences(Config.get().modelLength, 89 | Config.get().numberOfCores); 90 | 91 | } 92 | 93 | HashMap<String, HashMap<String, Long>> absoluteTypeSequenceValueMap = null; 94 | HashMap<String, HashMap<String, Long[]>> continuationTypeSequenceValueMap = null; 95 | if (Config.get().buildKneserNey) { 96 | KneserNeySmoother kns = new KneserNeySmoother( 97 | testExtractOutputDirectory, absoluteDirectory, 98 | continuationDirectory, "\t"); 99 | 100 | // read absolute and continuation values into HashMaps 101 | logger.info("read absolute and continuation values into HashMaps for kneser ney"); 102 | absoluteTypeSequenceValueMap = kns 103 | .readAbsoluteValuesIntoHashMap(kns.extractedAbsoluteDirectory); 104 | 105 | continuationTypeSequenceValueMap = kns 106 | .readContinuationValuesIntoHashMap(kns.extractedContinuationDirectory); 107 | kns.absoluteTypeSequenceValueMap = absoluteTypeSequenceValueMap; 108 | kns.continuationTypeSequenceValueMap = continuationTypeSequenceValueMap; 109 | 110 | for (int i = Config.get().modelLength; i >= 1; i--) { 111 | File inputSequenceFile = new File( 112 | inputDirectory.getAbsolutePath() + "/testing-samples-" 113 | + i + ".txt"); 114 | File resultFile; 115 | // smooth simple 116 | if (Config.get().kneserNeySimple) { 117 | resultFile = new File(inputDirectory.getAbsolutePath() 118 | + "/kneser-ney-simple-backoffToCont-" + i + ".txt"); 119 | kns.smooth(inputSequenceFile, resultFile, i, false, 120 | Config.get().conditionalProbabilityOnly); 121 | } 122 | // smooth complex 123 | if (Config.get().kneserNeyComplex) { 124 | resultFile = new File(inputDirectory.getAbsolutePath() 125 | + "/kneser-ney-complex-backoffToCont-" + i + ".txt"); 126 | kns.smooth(inputSequenceFile, resultFile, i, true, 127 | Config.get().conditionalProbabilityOnly); 128 | } 129 | } 130 | } 131 | if (Config.get().buildModKneserNey) { 132 | ModifiedKneserNeySmoother mkns = new ModifiedKneserNeySmoother( 133 | testExtractOutputDirectory, absoluteDirectory, 134 | continuationDirectory, "\t", Config.get().decimalPlaces); 135 | 136 | if (absoluteTypeSequenceValueMap == null) { 137 | // read absolute and continuation values into HashMaps 138 | 139 | logger.info("read absolute and continuation values into HashMaps for mod kneser ney"); 140 | absoluteTypeSequenceValueMap = mkns 141 | .readAbsoluteValuesIntoHashMap(mkns.extractedAbsoluteDirectory); 142 | 143 | continuationTypeSequenceValueMap = mkns 144 | .readContinuationValuesIntoHashMap(mkns.extractedContinuationDirectory); 145 | } 146 | 147 | mkns.absoluteTypeSequenceValueMap = absoluteTypeSequenceValueMap; 148 | mkns.continuationTypeSequenceValueMap = continuationTypeSequenceValueMap; 149 | 150 | for (int i = Config.get().modelLength; i >= 1; i--) { 151 | File inputSequenceFile = new File( 152 | inputDirectory.getAbsolutePath() + "/testing-samples-"
153 | + i + ".txt"); 154 | File resultFile; 155 | // smooth simple 156 | if (Config.get().kneserNeySimple) { 157 | resultFile = new File(inputDirectory.getAbsolutePath() 158 | + "/mod-kneser-ney-simple-backoffToCont-" + i 159 | + ".txt"); 160 | mkns.smooth(inputSequenceFile, resultFile, i, false, 161 | Config.get().conditionalProbabilityOnly); 162 | } 163 | // smooth complex 164 | if (Config.get().kneserNeyComplex) { 165 | resultFile = new File(inputDirectory.getAbsolutePath() 166 | + "/mod-kneser-ney-complex-backoffToCont-" + i 167 | + ".txt"); 168 | mkns.smooth(inputSequenceFile, resultFile, i, true, 169 | Config.get().conditionalProbabilityOnly); 170 | } 171 | } 172 | } 173 | logger.info("done"); 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /src/de/typology/executables/MultiKneserNeyBuilder.java: -------------------------------------------------------------------------------- 1 | package de.typology.executables; 2 | 3 | import de.typology.utils.Config; 4 | 5 | public class MultiKneserNeyBuilder { 6 | 7 | public static void main(String[] args) { 8 | String[] languages = Config.get().languages.split(","); 9 | String inputDataSet = Config.get().inputDataSet; 10 | for (String language : languages) { 11 | Config.get().inputDataSet = inputDataSet + "/" + language; 12 | KneserNeyBuilder.main(args); 13 | } 14 | 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/de/typology/indexes/WordIndex.java: -------------------------------------------------------------------------------- 1 | package de.typology.indexes; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.FileReader; 7 | import java.io.FileWriter; 8 | import java.io.IOException; 9 | import java.util.Arrays; 10 | import java.util.HashMap; 11 | import java.util.Iterator; 12 | import java.util.Map.Entry; 13 | 14 | import org.apache.commons.io.FileUtils; 15 | 16 | /** 17 | * A class that is based on the text file produced by WordIndexer. 
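* Each line of that file has the form "word<tab>fileNumber"; WordIndexer writes it from a sorted TreeMap, so the entries are in alphabetical order, which is what rank() relies on for its binary search.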
18 | * 19 | * @author Martin Koerner 20 | * 21 | */ 22 | public class WordIndex implements Iterable<String> { 23 | protected String[] index; 24 | 25 | public WordIndex(File indexFile) { 26 | // count total number of lines in the index file 27 | int lineCount = 0; 28 | try { 29 | BufferedReader br = new BufferedReader(new FileReader(indexFile)); 30 | while (br.readLine() != null) { 31 | lineCount++; 32 | } 33 | br.close(); 34 | } catch (IOException e) { 35 | e.printStackTrace(); 36 | } 37 | 38 | this.index = new String[lineCount]; 39 | int currentLineCount = 0; 40 | 41 | // read the index file 42 | try { 43 | BufferedReader br = new BufferedReader(new FileReader(indexFile)); 44 | String line; 45 | String[] lineSplit; 46 | while ((line = br.readLine()) != null) { 47 | lineSplit = line.split("\t"); 48 | this.index[currentLineCount] = lineSplit[0]; 49 | currentLineCount++; 50 | } 51 | br.close(); 52 | } catch (IOException e) { 53 | e.printStackTrace(); 54 | } 55 | } 56 | 57 | public int getLength() { 58 | return this.index.length; 59 | } 60 | 61 | /** 62 | * returns the file in which word should be stored based on this.index 63 | * 64 | * @param word 65 | * @return 66 | */ 67 | public int rank(String word) { 68 | int lo = 0; 69 | int hi = this.index.length - 1; 70 | while (lo <= hi) { 71 | int mid = lo + (hi - lo) / 2; 72 | if (word.compareTo(this.index[mid]) < 0) { 73 | hi = mid - 1; 74 | } else if (word.compareTo(this.index[mid]) > 0) { 75 | lo = mid + 1; 76 | } else { 77 | return mid; 78 | } 79 | } 80 | // not the standard binary-search miss result: when word is not in the 81 | // index, return the surrounding slot so the word still maps to a file 82 | return (lo + hi) / 2; 83 | } 84 | 85 | @Override 86 | public Iterator<String> iterator() { 87 | return Arrays.asList(this.index).iterator(); 88 | } 89 | 90 | public HashMap<Integer, BufferedWriter> openWriters(File outputDirectory) { 91 | HashMap<Integer, BufferedWriter> writers = new HashMap<Integer, BufferedWriter>(); 92 | 93 | File currentOutputDirectory = new File( 94 | outputDirectory.getAbsolutePath()); 95 | if (currentOutputDirectory.exists()) { 96 | try { 97 | FileUtils.deleteDirectory(currentOutputDirectory); 98 | } catch (IOException e) { 99 | // TODO Auto-generated catch block 100 | e.printStackTrace(); 101 | } 102 | } 103 | currentOutputDirectory.mkdir(); 104 | 105 | // calculate buffer size for writers 106 | // TODO: bufferSize calculation 107 | for (int fileCount = 0; fileCount < this.index.length; fileCount++) { 108 | try { 109 | writers.put(fileCount, new BufferedWriter(new FileWriter( 110 | currentOutputDirectory.getAbsolutePath() + "/" 111 | + fileCount), 10 * 8 * 1024)); 112 | } catch (IOException e) { 113 | // TODO Auto-generated catch block 114 | e.printStackTrace(); 115 | } 116 | } 117 | return writers; 118 | } 119 | 120 | public void closeWriters(HashMap<Integer, BufferedWriter> writers) { 121 | for (Entry<Integer, BufferedWriter> entry : writers.entrySet()) { 122 | try { 123 | entry.getValue().close(); 124 | } catch (IOException e) { 125 | // TODO Auto-generated catch block 126 | e.printStackTrace(); 127 | } 128 | } 129 | } 130 | } 131 |
-------------------------------------------------------------------------------- /src/de/typology/indexes/WordIndexer.java: --------------------------------------------------------------------------------
1 | package de.typology.indexes; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.FileNotFoundException; 7 | import java.io.FileReader; 8 | import java.io.FileWriter; 9 | import java.io.IOException; 10 | import java.util.Comparator; 11 | import java.util.Iterator; 12 | import java.util.Map; 13 | import java.util.Map.Entry;
java.util.Map.Entry; 14 | import java.util.TreeMap; 15 | 16 | /** 17 | * A class for building a text file containing an index representation for a 18 | * given text file based on the alphabetical distribution of its words. 19 | * 20 | * @author Martin Koerner 21 | * 22 | */ 23 | public class WordIndexer { 24 | 25 | private TreeMap<String, Long> buildMap(File inputFile, 26 | String addBeforeSentence, String addAfterSentence) { 27 | BufferedReader reader; 28 | try { 29 | reader = new BufferedReader(new FileReader(inputFile)); 30 | } catch (FileNotFoundException e1) { 31 | // TODO Auto-generated catch block 32 | e1.printStackTrace(); 33 | return null; 34 | } 35 | 36 | // a comparator for wordMap 37 | Comparator<String> stringComparator = new Comparator<String>() { 38 | @Override 39 | public int compare(String s1, String s2) { 40 | return s1.compareTo(s2); 41 | } 42 | }; 43 | 44 | TreeMap<String, Long> wordMap = new TreeMap<String, Long>( 45 | stringComparator); 46 | String line; 47 | // long lineCount=0L; 48 | try { 49 | while ((line = reader.readLine()) != null) { 50 | line = addBeforeSentence + line + addAfterSentence; 51 | String[] words = line.split("\\s+"); 52 | for (String word : words) { 53 | if (wordMap.containsKey(word)) { 54 | wordMap.put(word, wordMap.get(word) + 1); 55 | } else { 56 | wordMap.put(word, 1L); 57 | } 58 | } 59 | } 60 | reader.close(); 61 | 62 | } catch (IOException e) { 63 | e.printStackTrace(); 64 | } 65 | return wordMap; 66 | } 67 | 68 | /** 69 | * 70 | * @param inputFile 71 | * @param maxCountDivider 72 | * @return Long: maxCountPerFile 73 | */ 74 | public long buildIndex(File inputFile, File indexOutputFile, 75 | int maxCountDivider, String addBeforeSentence, 76 | String addAfterSentence) { 77 | 78 | // build WordMap 79 | TreeMap<String, Long> wordMap = this.buildMap(inputFile, 80 | addBeforeSentence, addAfterSentence); 81 | 82 | // summarize all word counts 83 | Long totalCount = 0L; 84 | for (Entry<String, Long> word : wordMap.entrySet()) { 85 | totalCount += word.getValue(); 86 | } 87 | 88 | // calculate max count per file 89 | Long maxCountPerFile = totalCount / maxCountDivider; 90 | // System.out.println("maxCountPerFile: " + maxCountPerFile); 91 | if (maxCountPerFile < 1L) { 92 | maxCountPerFile = 1L; 93 | } 94 | 95 | // build index 96 | BufferedWriter indexWriter; 97 | try { 98 | indexWriter = new BufferedWriter(new FileWriter(indexOutputFile)); 99 | Long currentFileCount = 0L; 100 | int fileCount = 0; 101 | Iterator<Entry<String, Long>> wordMapIterator = wordMap 102 | .entrySet().iterator(); 103 | Entry<String, Long> word; 104 | 105 | while (wordMapIterator.hasNext()) { 106 | // get next word 107 | word = wordMapIterator.next(); 108 | if (fileCount == 0 109 | || currentFileCount + word.getValue() > maxCountPerFile) { 110 | indexWriter.write(word.getKey() + "\t" + fileCount + "\n"); 111 | currentFileCount = word.getValue(); 112 | fileCount++; 113 | } else { 114 | currentFileCount += word.getValue(); 115 | } 116 | } 117 | indexWriter.close(); 118 | } catch (IOException e) { 119 | // make sure that no corrupted index file is stored 120 | if (indexOutputFile.exists()) { 121 | indexOutputFile.delete(); 122 | } 123 | // TODO Auto-generated catch block 124 | e.printStackTrace(); 125 | } 126 | return maxCountPerFile; 127 | } 128 | 129 | } 130 | -------------------------------------------------------------------------------- /src/de/typology/patterns/PatternBuilder.java: -------------------------------------------------------------------------------- 1 | package de.typology.patterns; 2 | 3 | import java.util.ArrayList; 4 | 5 | public class PatternBuilder {
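
    // Hypothetical helper, not part of the original class: each integer below
    // is read as a binary word in which 1 keeps a word and 0 skips it; even
    // integers are dropped because their trailing 0 would skip the target.
    // For maxModelLength = 3, getGLMPatterns(3) yields 1, 11, 101 and 111.
    public static void printPatterns(int maxModelLength) {
        for (boolean[] pattern : getGLMPatterns(maxModelLength)) {
            System.out.println(PatternTransformer.getStringPattern(pattern));
        }
    }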
6 | 7 | public static ArrayList getGLMPatterns(int maxModelLength) { 8 | ArrayList patterns = new ArrayList(); 9 | for (int intPattern = 1; intPattern < Math.pow(2, maxModelLength); intPattern++) { 10 | // leave out even sequences since they don't contain a 11 | // target 12 | if (intPattern % 2 == 0) { 13 | continue; 14 | } 15 | patterns.add(PatternTransformer.getBooleanPattern(intPattern)); 16 | } 17 | return patterns; 18 | } 19 | 20 | public static ArrayList getReverseGLMPatterns(int maxModelLength) { 21 | ArrayList patterns = new ArrayList(); 22 | for (int intPattern = (int) (Math.pow(2, maxModelLength) - 1); intPattern > 0; intPattern--) { 23 | // leave out even sequences since they don't contain a 24 | // target 25 | if (intPattern % 2 == 0) { 26 | continue; 27 | } 28 | patterns.add(PatternTransformer.getBooleanPattern(intPattern)); 29 | } 30 | return patterns; 31 | } 32 | 33 | /** 34 | * Also returns sequences that are greater than maxModelLength but are 35 | * needed to calculate kneser ney smoothed values 36 | * 37 | * @param maxModelLength 38 | * @return 39 | */ 40 | public static ArrayList getGLMForSmoothingPatterns( 41 | int maxModelLength) { 42 | ArrayList patterns = new ArrayList(); 43 | for (int intPattern = 1; intPattern < Math.pow(2, maxModelLength); intPattern++) { 44 | // // leave out even sequences since they don't contain a 45 | // // target 46 | // if (intPattern % 2 == 0) { 47 | // continue; 48 | // } 49 | patterns.add(PatternTransformer.getBooleanPattern(intPattern)); 50 | } 51 | return patterns; 52 | } 53 | 54 | public static ArrayList getReverseGLMForSmoothingPatterns( 55 | int maxModelLength) { 56 | ArrayList patterns = new ArrayList(); 57 | for (int intPattern = (int) Math.pow(2, maxModelLength) - 1; intPattern > 0; intPattern--) { 58 | // // leave out even sequences since they don't contain a 59 | // // target 60 | // if (intPattern % 2 == 0) { 61 | // continue; 62 | // } 63 | patterns.add(PatternTransformer.getBooleanPattern(intPattern)); 64 | } 65 | return patterns; 66 | } 67 | 68 | public static ArrayList getLMPatterns(int maxModelLength) { 69 | ArrayList patterns = new ArrayList(); 70 | for (int intPattern = 1; intPattern < Math.pow(2, maxModelLength); intPattern++) { 71 | String stringPattern = Integer.toBinaryString(intPattern); 72 | if (Integer.bitCount(intPattern) == stringPattern.length()) { 73 | patterns.add(PatternTransformer.getBooleanPattern(intPattern)); 74 | } 75 | } 76 | return patterns; 77 | } 78 | 79 | public static ArrayList getReverseLMPatterns(int maxModelLength) { 80 | ArrayList patterns = new ArrayList(); 81 | for (int intPattern = (int) (Math.pow(2, maxModelLength) - 1); intPattern > 0; intPattern--) { 82 | String stringPattern = Integer.toBinaryString(intPattern); 83 | if (Integer.bitCount(intPattern) == stringPattern.length()) { 84 | patterns.add(PatternTransformer.getBooleanPattern(intPattern)); 85 | } 86 | } 87 | return patterns; 88 | } 89 | 90 | public static ArrayList getTypologyPatterns(int maxModelLength) { 91 | ArrayList patterns = new ArrayList(); 92 | for (int intPattern = 1; intPattern < Math.pow(2, maxModelLength); intPattern++) { 93 | String stringPattern = Integer.toBinaryString(intPattern); 94 | if (Integer.bitCount(intPattern) <= 2 95 | && stringPattern.startsWith("1") 96 | && stringPattern.endsWith("1")) { 97 | patterns.add(PatternTransformer.getBooleanPattern(intPattern)); 98 | } 99 | } 100 | return patterns; 101 | } 102 | 103 | public static ArrayList getReverseTypologyPatterns( 104 | int maxModelLength) { 105 | 
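        // typology patterns keep at most two words (bitCount <= 2) and must
        // start and end with 1; for maxModelLength = 5 these are, in
        // descending order, 10001, 1001, 101, 11 and 1, i.e. the target word
        // plus at most one predecessor at a growing skip distance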
ArrayList patterns = new ArrayList(); 106 | for (int intPattern = (int) (Math.pow(2, maxModelLength) - 1); intPattern > 0; intPattern--) { 107 | String stringPattern = Integer.toBinaryString(intPattern); 108 | if (Integer.bitCount(intPattern) <= 2 109 | && stringPattern.startsWith("1") 110 | && stringPattern.endsWith("1")) { 111 | patterns.add(PatternTransformer.getBooleanPattern(intPattern)); 112 | } 113 | } 114 | return patterns; 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/de/typology/patterns/PatternTransformer.java: -------------------------------------------------------------------------------- 1 | package de.typology.patterns; 2 | 3 | import java.util.Arrays; 4 | 5 | /** 6 | * A class for transforming the used boolean pattern into different formats like 7 | * a binary string representation. 8 | * 9 | * @author Martin Koerner 10 | * 11 | */ 12 | public class PatternTransformer { 13 | 14 | public static void main(String[] args) { 15 | boolean[] bool1 = { true, false, true, true }; 16 | System.out.println(getStringPattern(bool1)); 17 | int i = 8; 18 | boolean[] bool2 = PatternTransformer.getBooleanPattern(Integer 19 | .toBinaryString(i)); 20 | System.out.println(Arrays.toString(bool2)); 21 | } 22 | 23 | public static String getStringPattern(boolean[] booleanPattern) { 24 | String stringPattern = new String(); 25 | for (boolean bool : booleanPattern) { 26 | if (bool) { 27 | stringPattern += 1; 28 | } else { 29 | stringPattern += 0; 30 | } 31 | } 32 | return stringPattern; 33 | } 34 | 35 | public static boolean[] getBooleanPattern(int intPattern) { 36 | return PatternTransformer.getBooleanPattern(Integer 37 | .toBinaryString(intPattern)); 38 | } 39 | 40 | public static boolean[] getBooleanPattern(String stringPattern) { 41 | boolean[] booleanPattern = new boolean[stringPattern.length()]; 42 | for (int i = 0; i < stringPattern.length(); i++) { 43 | if (stringPattern.charAt(i) == '1') { 44 | booleanPattern[i] = true; 45 | } else { 46 | booleanPattern[i] = false; 47 | } 48 | } 49 | return booleanPattern; 50 | } 51 | 52 | public static boolean[] getBooleanPatternWithOnes(int length) { 53 | boolean[] booleanPattern = new boolean[length]; 54 | for (int i = 0; i < booleanPattern.length; i++) { 55 | booleanPattern[i] = true; 56 | } 57 | return booleanPattern; 58 | } 59 | 60 | public static int getIntPattern(boolean[] booleanPattern) { 61 | String stringPattern = PatternTransformer 62 | .getStringPattern(booleanPattern); 63 | if (stringPattern.length() == 0) { 64 | return 0; 65 | } else { 66 | return Integer.parseInt(stringPattern, 2); 67 | } 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/de/typology/smoother/ModifiedKneserNeySmoother.java: -------------------------------------------------------------------------------- 1 | package de.typology.smoother; 2 | 3 | import java.io.File; 4 | import java.util.HashMap; 5 | 6 | import de.typology.patterns.PatternTransformer; 7 | import de.typology.utils.Counter; 8 | 9 | public class ModifiedKneserNeySmoother extends KneserNeySmoother { 10 | 11 | public ModifiedKneserNeySmoother(File extractedSequenceDirectory, 12 | File absoluteDirectory, File continuationDirectory, 13 | String delimiter, int decimalPlaces) { 14 | super(extractedSequenceDirectory, absoluteDirectory, 15 | continuationDirectory, delimiter); 16 | 17 | this.discountTypesValuesMapFile = new File(this.absoluteDirectory 18 | .getParentFile().getAbsolutePath() 19 | + 
"/discount-values-mod-kneser-ney.ser"); 20 | 21 | } 22 | 23 | private double d1; 24 | private double d2; 25 | private double d3plus; 26 | 27 | /** 28 | * @param args 29 | */ 30 | 31 | @Override 32 | protected HashMap> calculateDiscountValues( 33 | HashMap> discountTypeValuesMap, 34 | File inputDirectory) { 35 | for (File absoluteTypeDirectory : inputDirectory.listFiles()) { 36 | if (absoluteTypeDirectory.getName().contains("split")) { 37 | continue; 38 | } 39 | HashMap discountValuesMap = new HashMap(); 40 | long n1 = Counter.countCountsInDirectory(1, absoluteTypeDirectory, 41 | ""); 42 | long n2 = Counter.countCountsInDirectory(2, absoluteTypeDirectory, 43 | ""); 44 | long n3 = Counter.countCountsInDirectory(3, absoluteTypeDirectory, 45 | ""); 46 | long n4 = Counter.countCountsInDirectory(4, absoluteTypeDirectory, 47 | ""); 48 | this.logger.info("n1 for " + absoluteTypeDirectory.getName() + ":" 49 | + n1); 50 | this.logger.info("n2 for " + absoluteTypeDirectory.getName() + ":" 51 | + n2); 52 | this.logger.info("n3 for " + absoluteTypeDirectory.getName() + ":" 53 | + n3); 54 | this.logger.info("n4 for " + absoluteTypeDirectory.getName() + ":" 55 | + n4); 56 | double y = n1 / ((double) n1 + 2 * n2); 57 | this.d1 = 1 - 2 * y * ((double) n2 / (double) n1); 58 | this.d2 = 2 - 3 * y * ((double) n3 / (double) n2); 59 | this.d3plus = 3 - 4 * y * ((double) n4 / (double) n3); 60 | // this.d1plus = 0.5; 61 | this.logger.info("D1 for " + absoluteTypeDirectory.getName() + ":" 62 | + this.d1); 63 | this.logger.info("D2 for " + absoluteTypeDirectory.getName() + ":" 64 | + this.d2); 65 | this.logger.info("D3+ for " + absoluteTypeDirectory.getName() + ":" 66 | + this.d3plus); 67 | discountValuesMap.put("D1", this.d1); 68 | discountValuesMap.put("D2", this.d2); 69 | discountValuesMap.put("D3+", this.d3plus); 70 | 71 | discountTypeValuesMap.put(absoluteTypeDirectory.getName(), 72 | discountValuesMap); 73 | } 74 | return discountTypeValuesMap; 75 | 76 | } 77 | 78 | /** 79 | * 80 | * @param sequenceStringPattern 81 | * @param sequenceCount 82 | * @return 83 | */ 84 | @Override 85 | protected double getDiscountValue(String sequenceStringPattern, 86 | long sequenceCount) { 87 | String stringPatternForBitcount = sequenceStringPattern.replaceAll("_", 88 | "0"); 89 | if (Integer.bitCount(PatternTransformer 90 | .getIntPattern(PatternTransformer 91 | .getBooleanPattern(stringPatternForBitcount))) > 1) { 92 | // not lowest order 93 | if (sequenceCount == 1) { 94 | return this.discountTypeValuesMap.get(sequenceStringPattern) 95 | .get("D1"); 96 | } 97 | if (sequenceCount == 2) { 98 | return this.discountTypeValuesMap.get(sequenceStringPattern) 99 | .get("D2"); 100 | } 101 | if (sequenceCount >= 3) { 102 | return this.discountTypeValuesMap.get(sequenceStringPattern) 103 | .get("D3+"); 104 | } 105 | // count < 1 106 | return 0; 107 | } else { 108 | // lowest order 109 | return 0; 110 | } 111 | } 112 | 113 | @Override 114 | protected double calculateWeightNumerator(String continuationPattern, 115 | String sequence, int sequenceLength, String sequenceStringPattern) { 116 | // [0]=1+ 117 | // [1]=1 118 | // [2]=2 119 | // [3]=3+ 120 | return this.getDiscountValue(continuationPattern, 1) 121 | * this.calculateContinuationLast(sequence, sequenceLength, 122 | sequenceStringPattern, 1) 123 | + this.getDiscountValue(continuationPattern, 2) 124 | * this.calculateContinuationLast(sequence, sequenceLength, 125 | sequenceStringPattern, 2) 126 | + this.getDiscountValue(continuationPattern, 3) 127 | * 
this.calculateContinuationLast(sequence, sequenceLength, 128 | sequenceStringPattern, 3); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/de/typology/splitter/AbsoluteSplitter.java: -------------------------------------------------------------------------------- 1 | package de.typology.splitter; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileNotFoundException; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.util.ArrayList; 9 | import java.util.concurrent.ExecutorService; 10 | import java.util.concurrent.Executors; 11 | import java.util.concurrent.TimeUnit; 12 | 13 | import org.apache.commons.io.FileUtils; 14 | import org.apache.logging.log4j.LogManager; 15 | import org.apache.logging.log4j.Logger; 16 | 17 | import de.typology.indexes.WordIndex; 18 | import de.typology.patterns.PatternTransformer; 19 | 20 | /** 21 | * Split 22 | * 23 | * @author Martin Koerner 24 | * 25 | */ 26 | public class AbsoluteSplitter { 27 | private File inputFile; 28 | private File indexFile; 29 | private File outputDirectory; 30 | private String delimiter; 31 | protected boolean deleteTempFiles; 32 | protected String addBeforeSentence; 33 | protected String addAfterSentence; 34 | 35 | Logger logger = LogManager.getLogger(this.getClass().getName()); 36 | 37 | public AbsoluteSplitter(File inputFile, File indexFile, 38 | File outputDirectory, String delimiter, boolean deleteTempFiles, 39 | String addBeforeSentence, String addAfterSentence) { 40 | this.inputFile = inputFile; 41 | this.indexFile = indexFile; 42 | this.outputDirectory = outputDirectory; 43 | this.delimiter = delimiter; 44 | this.deleteTempFiles = deleteTempFiles; 45 | this.addBeforeSentence = addBeforeSentence; 46 | this.addAfterSentence = addAfterSentence; 47 | // delete old directory 48 | if (outputDirectory.exists()) { 49 | try { 50 | FileUtils.deleteDirectory(outputDirectory); 51 | } catch (IOException e) { 52 | // TODO Auto-generated catch block 53 | e.printStackTrace(); 54 | } 55 | } 56 | outputDirectory.mkdir(); 57 | } 58 | 59 | public void split(ArrayList patterns, int cores) { 60 | 61 | this.logger 62 | .info("read word index: " + this.indexFile.getAbsolutePath()); 63 | WordIndex wordIndex = new WordIndex(this.indexFile); 64 | 65 | // initialize executerService 66 | // int cores = Runtime.getRuntime().availableProcessors(); 67 | ExecutorService executorService = Executors.newFixedThreadPool(cores); 68 | for (boolean[] pattern : patterns) { 69 | this.logger.debug("execute SplitterTask for: " 70 | + PatternTransformer.getStringPattern(pattern) 71 | + " sequences"); 72 | 73 | try { 74 | InputStream inputFileInputStream = new FileInputStream( 75 | this.inputFile); 76 | SplitterTask splitterTask = new SplitterTask( 77 | inputFileInputStream, this.outputDirectory, wordIndex, 78 | pattern, PatternTransformer.getStringPattern(pattern), 79 | this.delimiter, 0, this.deleteTempFiles, 80 | this.addBeforeSentence, this.addAfterSentence, false, 81 | false, false); 82 | executorService.execute(splitterTask); 83 | } catch (FileNotFoundException e) { 84 | // TODO Auto-generated catch block 85 | e.printStackTrace(); 86 | this.logger.error("inputFile not found: " 87 | + this.inputFile.getAbsolutePath()); 88 | return; 89 | } 90 | } 91 | executorService.shutdown(); 92 | try { 93 | executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS); 94 | } catch (InterruptedException e) { 95 | // TODO Auto-generated catch block 96 | 
e.printStackTrace(); 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/de/typology/splitter/Aggregator.java: -------------------------------------------------------------------------------- 1 | package de.typology.splitter; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.FileReader; 7 | import java.io.FileWriter; 8 | import java.io.IOException; 9 | import java.util.Comparator; 10 | import java.util.Map.Entry; 11 | import java.util.SortedMap; 12 | import java.util.SortedSet; 13 | import java.util.TreeMap; 14 | import java.util.TreeSet; 15 | 16 | import org.apache.logging.log4j.LogManager; 17 | import org.apache.logging.log4j.Logger; 18 | 19 | /** 20 | * A class for aggregating sequences by counting their occurrences. Expects an 21 | * inputStream with a size that is 30% of the allocated main memory. 22 | * 23 | * @author Martin Koerner 24 | * 25 | */ 26 | public class Aggregator { 27 | File inputFile; 28 | File outputFile; 29 | String delimiter; 30 | int startSortAtColumn; 31 | boolean additionalCounts; 32 | 33 | Logger logger = LogManager.getLogger(this.getClass().getName()); 34 | 35 | // this comparator is based on the value of startSortAtColumn 36 | private Comparator stringComparator = new Comparator() { 37 | @Override 38 | public int compare(String string1, String string2) { 39 | if (Aggregator.this.startSortAtColumn == 0) { 40 | return string1.compareTo(string2); 41 | } else { 42 | String[] string1Split = string1.split("\\s"); 43 | String[] string2Split = string2.split("\\s"); 44 | String newString1 = ""; 45 | String newString2 = ""; 46 | for (int i = Aggregator.this.startSortAtColumn; i < string1Split.length; i++) { 47 | newString1 += string1Split[i] + " "; 48 | newString2 += string2Split[i] + " "; 49 | } 50 | newString1 = newString1.replaceFirst(" $", ""); 51 | newString2 = newString2.replaceFirst(" $", ""); 52 | int result = newString1.compareTo(newString2); 53 | if (result != 0) { 54 | // not equal 55 | return result; 56 | } else { 57 | int i = 0; 58 | while (i < Aggregator.this.startSortAtColumn) { 59 | String newNewString1 = newString1; 60 | String newNewString2 = newString2; 61 | for (int j = i; j >= 0; j--) { 62 | newNewString1 = string1Split[j] + " " 63 | + newNewString1; 64 | newNewString2 = string2Split[j] + " " 65 | + newNewString2; 66 | } 67 | result = newNewString1.compareTo(newNewString2); 68 | if (result != 0) { 69 | // not equal 70 | return result; 71 | } 72 | // equal 73 | i++; 74 | } 75 | // final result: equal 76 | return 0; 77 | } 78 | } 79 | } 80 | }; 81 | 82 | /** 83 | * @param inputStream 84 | * @param outputStream 85 | * @param delimiter 86 | * @param startSortAtColumn 87 | * : First column is zero 88 | */ 89 | public Aggregator(File inputFile, File outputFile, String delimiter, 90 | int startSortAtColumn, boolean additionalCounts) { 91 | this.inputFile = inputFile; 92 | this.outputFile = outputFile; 93 | this.delimiter = delimiter; 94 | this.startSortAtColumn = startSortAtColumn; 95 | this.additionalCounts = additionalCounts; 96 | 97 | } 98 | 99 | public void aggregateCounts() { 100 | try { 101 | BufferedReader inputFileReader = new BufferedReader(new FileReader( 102 | this.inputFile)); 103 | 104 | SortedMap wordMapAdditionalCounts = new TreeMap( 105 | this.stringComparator); 106 | SortedMap wordMapNoAdditionalCounts = new TreeMap( 107 | this.stringComparator); 108 | String inputLine; 109 | 110 | while ((inputLine = 
inputFileReader.readLine()) != null) { 111 | String[] inputLineSplit = inputLine.split(this.delimiter); 112 | String words = inputLineSplit[0]; 113 | long count = Long.parseLong(inputLineSplit[1]); 114 | if (words.length() == 0) { 115 | // TODO: understand the following comment 116 | // logger.error("empty row in " + this.inputFile + ": \"" 117 | // + inputLine + "\""); 118 | // logger.error("exiting JVM"); 119 | // System.exit(1); 120 | continue; 121 | } 122 | 123 | if (this.additionalCounts) { 124 | this.addCountWithAdditional(wordMapAdditionalCounts, words, 125 | count); 126 | } else { 127 | this.addCountWithNoAdditional(wordMapNoAdditionalCounts, 128 | words, count); 129 | } 130 | } 131 | 132 | inputFileReader.close(); 133 | BufferedWriter outputFileWriter = new BufferedWriter( 134 | new FileWriter(this.outputFile)); 135 | if (this.additionalCounts) { 136 | for (Entry entry : wordMapAdditionalCounts 137 | .entrySet()) { 138 | String words = entry.getKey(); 139 | // [0]=1+ 140 | // [1]=1 141 | // [2]=2 142 | // [3]=3+ 143 | outputFileWriter.write(words + this.delimiter 144 | + entry.getValue()[0] + this.delimiter 145 | + entry.getValue()[1] + this.delimiter 146 | + entry.getValue()[2] + this.delimiter 147 | + entry.getValue()[3] + "\n"); 148 | } 149 | } else { 150 | for (Entry entry : wordMapNoAdditionalCounts 151 | .entrySet()) { 152 | String words = entry.getKey(); 153 | outputFileWriter.write(words + this.delimiter 154 | + entry.getValue() + "\n"); 155 | } 156 | } 157 | outputFileWriter.close(); 158 | } catch (IOException e) { 159 | // TODO Auto-generated catch block 160 | e.printStackTrace(); 161 | } 162 | } 163 | 164 | private void addCountWithNoAdditional( 165 | SortedMap wordMapNoAdditionalCounts, String words, 166 | long count) { 167 | if (wordMapNoAdditionalCounts.containsKey(words)) { 168 | wordMapNoAdditionalCounts.put(words, 169 | wordMapNoAdditionalCounts.get(words) + count); 170 | } else { 171 | wordMapNoAdditionalCounts.put(words, count); 172 | } 173 | } 174 | 175 | private void addCountWithAdditional(SortedMap wordMap, 176 | String words, long count) { 177 | if (wordMap.containsKey(words)) { 178 | Long[] countTypeArray = wordMap.get(words); 179 | countTypeArray[0] = countTypeArray[0] + count; 180 | if (count == 1) { 181 | countTypeArray[1] = countTypeArray[1] + count; 182 | } 183 | if (count == 2) { 184 | countTypeArray[2] = countTypeArray[2] + count; 185 | } 186 | if (count >= 3) { 187 | countTypeArray[3] = countTypeArray[3] + count; 188 | } 189 | } else { 190 | Long[] countTypeArray = new Long[4]; 191 | countTypeArray[0] = count; 192 | if (count == 1) { 193 | countTypeArray[1] = count; 194 | } else { 195 | countTypeArray[1] = 0L; 196 | } 197 | if (count == 2) { 198 | countTypeArray[2] = count; 199 | } else { 200 | countTypeArray[2] = 0L; 201 | } 202 | if (count >= 3) { 203 | countTypeArray[3] = count; 204 | } else { 205 | countTypeArray[3] = 0L; 206 | } 207 | wordMap.put(words, countTypeArray); 208 | } 209 | } 210 | 211 | public void aggregateWithoutCounts() { 212 | try { 213 | BufferedReader inputFileReader = new BufferedReader(new FileReader( 214 | this.inputFile)); 215 | 216 | SortedSet wordSet = new TreeSet( 217 | this.stringComparator); 218 | String inputLine; 219 | 220 | while ((inputLine = inputFileReader.readLine()) != null) { 221 | wordSet.add(inputLine); 222 | } 223 | inputFileReader.close(); 224 | BufferedWriter outputFileWriter = new BufferedWriter( 225 | new FileWriter(this.outputFile)); 226 | for (String line : wordSet) { 227 | outputFileWriter.write(line 
+ "\n"); 228 | } 229 | outputFileWriter.close(); 230 | } catch (IOException e) { 231 | // TODO Auto-generated catch block 232 | e.printStackTrace(); 233 | } 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /src/de/typology/splitter/DataSetSplitter.java: -------------------------------------------------------------------------------- 1 | package de.typology.splitter; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.FileReader; 7 | import java.io.FileWriter; 8 | import java.io.IOException; 9 | import java.util.HashMap; 10 | import java.util.Map.Entry; 11 | 12 | import org.apache.logging.log4j.LogManager; 13 | import org.apache.logging.log4j.Logger; 14 | 15 | import de.typology.utils.Config; 16 | 17 | /** 18 | * This class splits and samples a given input file into trainings and test data 19 | * 20 | * The threasholds can be configured in config.txt the relevant fields are 21 | * 22 | * splitDataRatio 23 | * 24 | * smpleRate 25 | * 26 | * nGramLength 27 | * 28 | * @author Rene Pickhardt, Martin Koerner 29 | * 30 | */ 31 | public class DataSetSplitter { 32 | 33 | /** 34 | * @param args 35 | */ 36 | public static void main(String[] args) { 37 | String outputDirectory = Config.get().outputDirectory 38 | + Config.get().inputDataSet; 39 | 40 | DataSetSplitter dss = new DataSetSplitter(new File(outputDirectory), 41 | "normalized.txt"); 42 | dss.split("training.txt", "learning.txt", "testing.txt", 5); 43 | dss.splitIntoSequences(new File(outputDirectory + "/training.txt"), 44 | Config.get().modelLength, Config.get().numberOfQueries); 45 | 46 | } 47 | 48 | private File directory; 49 | 50 | private String inputName; 51 | 52 | Logger logger = LogManager.getLogger(this.getClass().getName()); 53 | 54 | public DataSetSplitter(File directory, String inputName) { 55 | 56 | this.directory = directory; 57 | this.inputName = inputName; 58 | } 59 | 60 | /** 61 | * Takes a given input file and provides a 3 way split. The file can be 62 | * sampled via the sampleRatio. A high sample ratio means that a large 63 | * portion of the file is being thrown away 64 | * 65 | * There the splitDataRatio specifies the percentage of the file that is 66 | * used as training data. The rest will be used as test and learing data. 
67 | * 68 | * The last parameter splitTestRatio is usually set to 50 and means that the 69 | * test data is also divided again into learning and testing data. 70 | * 71 | * 72 | * @param inputFile 73 | * potentially large text file that needs to be split 74 | * @param trainingFileName 75 | * filename where training data are to be stored 76 | * @param learningFileName 77 | * filename where learning data are to be stored 78 | * @param testingFileName 79 | * filename where test data are to be stored 80 | */ 81 | public void split(String trainingFileName, String learningFileName, 82 | String testingFileName, int sequenceLength) { 83 | this.logger.info("splitting into training, testing and learning file: " 84 | + this.directory + "/" + this.inputName); 85 | try { 86 | BufferedReader reader = new BufferedReader(new FileReader( 87 | this.directory.getAbsolutePath() + "/" + this.inputName)); 88 | BufferedWriter trainingDataWriter = new BufferedWriter( 89 | new FileWriter(this.directory.getAbsolutePath() + "/" 90 | + trainingFileName)); 91 | BufferedWriter learningDataWriter = new BufferedWriter( 92 | new FileWriter(this.directory.getAbsolutePath() + "/" 93 | + learningFileName)); 94 | BufferedWriter testingDataWriter = new BufferedWriter( 95 | new FileWriter(this.directory.getAbsolutePath() + "/" 96 | + testingFileName)); 97 | int rand; 98 | String line; 99 | while ((line = reader.readLine()) != null) { 100 | rand = (int) (Math.random() * 100); 101 | if (rand >= Config.get().sampleRate) { 102 | // keep data 103 | rand = (int) (Math.random() * 100); 104 | if (rand >= Config.get().splitDataRatio) { 105 | // store data in testing or learning file 106 | rand = (int) (Math.random() * 100); 107 | if (rand >= Config.get().splitTestRatio) { 108 | if (Config.get().addSentenceTags) { 109 | // TODO make this flexible 110 | line = "<s> " + line + " </s>"; 111 | } 112 | learningDataWriter.write(line + "\n"); 113 | } else { 114 | if (Config.get().addSentenceTags) { 115 | // TODO make this flexible 116 | line = "<s> " + line + " </s>"; 117 | } 118 | testingDataWriter.write(line + "\n"); 119 | } 120 | } else { 121 | // store data in training file 122 | trainingDataWriter.write(line + "\n"); 123 | } 124 | } 125 | } 126 | reader.close(); 127 | trainingDataWriter.close(); 128 | learningDataWriter.close(); 129 | testingDataWriter.close(); 130 | 131 | this.logger.info("splitting done"); 132 | 133 | } catch (IOException e) { 134 | // TODO Auto-generated catch block 135 | e.printStackTrace(); 136 | } 137 | } 138 | 139 | public void splitIntoSequences(File inputFile, int maxSequenceLength, 140 | int numberOfSequences) { 141 | this.logger.debug("maxSequenceLength: " + maxSequenceLength); 142 | String[] fileNameSplit = inputFile.getName().split("\\."); 143 | 144 | HashMap<Integer, BufferedWriter> testSequenceFileWriters = new HashMap<Integer, BufferedWriter>(); 145 | for (int i = 1; i <= maxSequenceLength; i++) { 146 | try { 147 | testSequenceFileWriters.put(i, 148 | new BufferedWriter(new FileWriter(new File( 149 | this.directory.getAbsolutePath() + "/" 150 | + fileNameSplit[0] + "-samples-" + i 151 | + "."
+ fileNameSplit[1])))); 152 | } catch (IOException e) { 153 | // TODO Auto-generated catch block 154 | e.printStackTrace(); 155 | } 156 | } 157 | 158 | // get total count from stats file 159 | long sequenceCount = 0L; 160 | try { 161 | BufferedReader reader = new BufferedReader( 162 | new FileReader(inputFile)); 163 | String line; 164 | // count sequences 165 | while ((line = reader.readLine()) != null) { 166 | String[] lineSplit = line.split("\\s"); 167 | if (lineSplit.length < maxSequenceLength) { 168 | continue; 169 | } else { 170 | int sequenceStart = 0; 171 | while (lineSplit.length - sequenceStart >= maxSequenceLength) { 172 | sequenceCount++; 173 | sequenceStart++; 174 | } 175 | } 176 | } 177 | reader.close(); 178 | } catch (IOException e) { 179 | // TODO Auto-generated catch block 180 | e.printStackTrace(); 181 | } 182 | this.logger.debug("sequenceCount: " + sequenceCount); 183 | double sequenceProbability = (double) numberOfSequences / sequenceCount; 184 | long skipDistance = sequenceCount / numberOfSequences; 185 | this.logger.debug("skipDistance: " + skipDistance); 186 | 187 | try { 188 | BufferedReader reader = new BufferedReader( 189 | new FileReader(inputFile)); 190 | this.logger.info("splitting " + inputFile.getName() 191 | + " into sequences"); 192 | String line; 193 | while ((line = reader.readLine()) != null) { 194 | String[] originalLineSplit = line.split("\\s"); 195 | int linePointer = 0; 196 | while (originalLineSplit.length - linePointer >= maxSequenceLength) { 197 | // build current Sequence 198 | String currentSequence = ""; 199 | for (int i = 0; i < maxSequenceLength; i++) { 200 | currentSequence += originalLineSplit[linePointer + i] 201 | + " "; 202 | } 203 | currentSequence = currentSequence.replaceFirst(" $", ""); 204 | if (Math.random() <= sequenceProbability) { 205 | String[] currentSequenceSplit = currentSequence 206 | .split("\\s"); 207 | for (int i = 1; i <= maxSequenceLength; i++) { 208 | // build result sequence 209 | String resultSequence = ""; 210 | for (int j = 0; j < i; j++) { 211 | resultSequence += currentSequenceSplit[j] + " "; 212 | } 213 | resultSequence = resultSequence.replaceFirst(" $", 214 | ""); 215 | testSequenceFileWriters.get(i).write( 216 | resultSequence + "\n"); 217 | } 218 | } 219 | linePointer++; 220 | } 221 | } 222 | 223 | reader.close(); 224 | for (Entry testSequenceWritersEntry : testSequenceFileWriters 225 | .entrySet()) { 226 | testSequenceWritersEntry.getValue().close(); 227 | } 228 | } catch (IOException e) { 229 | // TODO Auto-generated catch block 230 | e.printStackTrace(); 231 | } 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /src/de/typology/splitter/LineCounterTask.java: -------------------------------------------------------------------------------- 1 | package de.typology.splitter; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.FileWriter; 7 | import java.io.IOException; 8 | import java.io.InputStream; 9 | import java.io.InputStreamReader; 10 | 11 | import org.apache.commons.io.FileUtils; 12 | import org.apache.logging.log4j.LogManager; 13 | import org.apache.logging.log4j.Logger; 14 | 15 | public class LineCounterTask implements Runnable { 16 | protected InputStream inputStream; 17 | protected File outputDirectory; 18 | protected String patternLabel; 19 | protected String delimiter; 20 | protected boolean setCountToOne; 21 | protected boolean additionalCounts; 22 | 23 | Logger logger = 
LogManager.getLogger(this.getClass().getName()); 24 | 25 | public LineCounterTask(InputStream inputStream, File outputDirectory, 26 | String patternLabel, String delimiter, boolean setCountToOne, 27 | boolean additionalCounts) { 28 | this.inputStream = inputStream; 29 | this.outputDirectory = outputDirectory; 30 | this.patternLabel = patternLabel; 31 | this.delimiter = delimiter; 32 | this.setCountToOne = setCountToOne; 33 | this.additionalCounts = additionalCounts; 34 | } 35 | 36 | @Override 37 | public void run() { 38 | File outputDirectory = new File(this.outputDirectory.getAbsolutePath() 39 | + "/" + this.patternLabel); 40 | if (outputDirectory.exists()) { 41 | try { 42 | FileUtils.deleteDirectory(outputDirectory); 43 | } catch (IOException e) { 44 | // TODO Auto-generated catch block 45 | e.printStackTrace(); 46 | } 47 | } 48 | outputDirectory.mkdir(); 49 | this.logger.info("count lines for: " 50 | + outputDirectory.getAbsolutePath()); 51 | 52 | BufferedReader inputStreamReader = new BufferedReader( 53 | new InputStreamReader(this.inputStream)); 54 | long onePlusLineCount = 0L; 55 | long oneLineCount = 0L; 56 | long twoLineCount = 0L; 57 | long threePlusLineCount = 0L; 58 | String line; 59 | try { 60 | if (this.setCountToOne) { 61 | while ((line = inputStreamReader.readLine()) != null) { 62 | onePlusLineCount++; 63 | } 64 | } else { 65 | while ((line = inputStreamReader.readLine()) != null) { 66 | long currentCount = Long.parseLong(line 67 | .split(this.delimiter)[1]); 68 | onePlusLineCount += currentCount; 69 | if (currentCount == 1L) { 70 | oneLineCount += currentCount; 71 | } 72 | if (currentCount == 2L) { 73 | twoLineCount += currentCount; 74 | } 75 | if (currentCount >= 3L) { 76 | threePlusLineCount += currentCount; 77 | } 78 | } 79 | } 80 | inputStreamReader.close(); 81 | 82 | BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter( 83 | outputDirectory.getAbsolutePath() + "/" + "all")); 84 | if (this.additionalCounts) { 85 | bufferedWriter.write(onePlusLineCount + this.delimiter 86 | + oneLineCount + this.delimiter + twoLineCount 87 | + this.delimiter + threePlusLineCount + "\n"); 88 | } else { 89 | bufferedWriter.write(onePlusLineCount + "\n"); 90 | } 91 | bufferedWriter.close(); 92 | 93 | } catch (IOException e) { 94 | // TODO Auto-generated catch block 95 | e.printStackTrace(); 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/de/typology/splitter/SequenceModifier.java: -------------------------------------------------------------------------------- 1 | package de.typology.splitter; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.FileReader; 7 | import java.io.IOException; 8 | import java.io.OutputStream; 9 | import java.io.OutputStreamWriter; 10 | 11 | /** 12 | * A class for modifying the sequences in InputDirectory based on the given 13 | * Pattern. 
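 * For example, applying the pattern 101 to a counted trigram "a b c" keeps
 * the first and third word and emits the skipped pair "a c" together with
 * the trigram's count (or a count of 1 if setCountToOne is set).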
The modified sequences are returned as an OutputStream. 14 | * 15 | * @author Martin Koerner 16 | * 17 | */ 18 | public class SequenceModifier implements Runnable { 19 | private File inputDirectory; 20 | private OutputStream outputStream; 21 | private String delimiter; 22 | private boolean[] pattern; 23 | private boolean modifyCount; 24 | private boolean setCountToOne; 25 | 26 | public SequenceModifier(File inputDirectory, OutputStream outputStream, 27 | String delimiter, boolean[] pattern, boolean modifyCount, 28 | boolean setCountToOne) { 29 | this.inputDirectory = inputDirectory; 30 | this.outputStream = outputStream; 31 | this.delimiter = delimiter; 32 | this.pattern = pattern; 33 | this.modifyCount = modifyCount; 34 | this.setCountToOne = setCountToOne; 35 | } 36 | 37 | @Override 38 | public void run() { 39 | BufferedWriter outputStreamWriter = new BufferedWriter( 40 | new OutputStreamWriter(this.outputStream)); 41 | try { 42 | for (File inputFile : this.inputDirectory.listFiles()) { 43 | BufferedReader inputFileReader = new BufferedReader( 44 | new FileReader(inputFile)); 45 | String line; 46 | while ((line = inputFileReader.readLine()) != null) { 47 | String[] lineSplit = line.split(this.delimiter); 48 | if (this.modifyCount) { 49 | String[] words = lineSplit[0].split("\\s"); 50 | String modifiedWords = ""; 51 | try { 52 | for (int i = 0; i < this.pattern.length; i++) { 53 | if (this.pattern[i]) { 54 | modifiedWords += words[i] + " "; 55 | } 56 | } 57 | } catch (Exception e) { 58 | e.printStackTrace(); 59 | } 60 | modifiedWords = modifiedWords.replaceFirst(" $", ""); 61 | // TODO: better solution? 62 | if (words[0].equals("<s>")) { 63 | // for kneser-ney smoothing: every sequence that 64 | // starts 65 | // with <s> counts as a new sequence 66 | if (this.inputDirectory.getName().equals("1")) { 67 | continue; 68 | } 69 | if (!this.pattern[0]) { 70 | // set </s> in _1 to zero 71 | if (this.inputDirectory.getName().equals("11") 72 | && words[1].equals("</s>")) { 73 | outputStreamWriter.write("</s>" 74 | + this.delimiter + "0\n"); 75 | } else { 76 | outputStreamWriter.write(modifiedWords 77 | + this.delimiter 78 | + line.split(this.delimiter)[1] 79 | + "\n"); 80 | } 81 | } 82 | // if pattern[0]==true: leave out sequence 83 | } else { 84 | if (this.setCountToOne) { 85 | outputStreamWriter.write(modifiedWords 86 | + this.delimiter + "1\n"); 87 | } else { 88 | outputStreamWriter.write(modifiedWords 89 | + this.delimiter + lineSplit[1] + "\n"); 90 | } 91 | } 92 | } else { 93 | outputStreamWriter.write(line + "\n"); 94 | } 95 | 96 | } 97 | inputFileReader.close(); 98 | } 99 | outputStreamWriter.close(); 100 | } catch (IOException e) { 101 | // TODO Auto-generated catch block 102 | e.printStackTrace(); 103 | } 104 | 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/de/typology/splitter/Sequencer.java: -------------------------------------------------------------------------------- 1 | package de.typology.splitter; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.io.InputStreamReader; 9 | import java.util.HashMap; 10 | 11 | import org.apache.logging.log4j.LogManager; 12 | import org.apache.logging.log4j.Logger; 13 | 14 | import de.typology.indexes.WordIndex; 15 | 16 | /** 17 | * A class for splitting a text file (via inputStream) into sequences that are 18 | * stored in different files based on the indexFile in outputDirectory.
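 * <p>
 * A minimal sketch of the sequencing step (illustrative; the paths, the tab
 * delimiter and the word index are assumptions, not fixed by this class):
 * <pre>{@code
 * Sequencer sequencer = new Sequencer(new FileInputStream("training.txt"),
 *         new File("absolute/11-split"), wordIndex,
 *         new boolean[] { true, true }, "", "", "\t", false, 0);
 * sequencer.splitIntoFiles(); // "a b c" -> "a b\t1" and "b c\t1"
 * }</pre>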
19 | * 20 | * @author Martin Koerner 21 | * 22 | */ 23 | public class Sequencer { 24 | protected InputStream inputStream; 25 | protected File outputDirectory; 26 | protected WordIndex wordIndex; 27 | protected boolean[] pattern; 28 | protected String addBeforeSentence; 29 | protected String addAfterSentence; 30 | protected String delimiter; 31 | protected boolean completeLine; 32 | private int startSortAtColumn; 33 | 34 | Logger logger = LogManager.getLogger(this.getClass().getName()); 35 | 36 | public Sequencer(InputStream inputStream, File outputDirectory, 37 | WordIndex wordIndex, boolean[] pattern, String addBeforeSentence, 38 | String addAfterSentence, String delimiter, boolean completeLine, 39 | int startSortAtColumn) { 40 | this.inputStream = inputStream; 41 | this.outputDirectory = outputDirectory; 42 | this.wordIndex = wordIndex; 43 | this.pattern = pattern; 44 | this.addBeforeSentence = addBeforeSentence; 45 | this.addAfterSentence = addAfterSentence; 46 | this.delimiter = delimiter; 47 | this.completeLine = completeLine; 48 | this.startSortAtColumn = startSortAtColumn; 49 | 50 | } 51 | 52 | public void splitIntoFiles() { 53 | HashMap writers = this.wordIndex 54 | .openWriters(this.outputDirectory); 55 | // TODO: bufferSize calculation 56 | BufferedReader bufferedReader = new BufferedReader( 57 | new InputStreamReader(this.inputStream), 100 * 8 * 1024); 58 | // BufferedReader bufferedReader = new BufferedReader( 59 | // new InputStreamReader(this.inputStream), 10 * 8 * 1024); 60 | String line; 61 | try { 62 | while ((line = bufferedReader.readLine()) != null) { 63 | line = this.addBeforeSentence + line + this.addAfterSentence; 64 | if (this.completeLine) { 65 | String[] lineSplit = line.split("\\s"); 66 | writers.get( 67 | this.wordIndex 68 | .rank(lineSplit[this.startSortAtColumn])) 69 | .write(line + "\n"); 70 | } else { 71 | String[] lineSplit = line.split("\\s"); 72 | int linePointer = 0; 73 | while (lineSplit.length - linePointer >= this.pattern.length) { 74 | String sequence = ""; 75 | for (int i = 0; i < this.pattern.length; i++) { 76 | if (this.pattern[i]) { 77 | sequence += lineSplit[linePointer + i] + " "; 78 | } 79 | } 80 | sequence = sequence.replaceFirst(" $", ""); 81 | sequence += this.delimiter + "1\n"; 82 | 83 | // write sequence 84 | 85 | writers.get( 86 | this.wordIndex.rank(sequence.split(" ")[this.startSortAtColumn])) 87 | .write(sequence); 88 | 89 | linePointer++; 90 | } 91 | } 92 | } 93 | bufferedReader.close(); 94 | } catch (IOException e) { 95 | // TODO Auto-generated catch block 96 | e.printStackTrace(); 97 | } 98 | 99 | this.wordIndex.closeWriters(writers); 100 | } 101 | 102 | public boolean[] getPattern() { 103 | return this.pattern; 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/de/typology/splitter/SmoothingSplitter.java: -------------------------------------------------------------------------------- 1 | package de.typology.splitter; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.OutputStream; 6 | import java.io.PipedInputStream; 7 | import java.io.PipedOutputStream; 8 | import java.util.ArrayList; 9 | import java.util.Comparator; 10 | import java.util.HashSet; 11 | import java.util.Map.Entry; 12 | import java.util.SortedMap; 13 | import java.util.TreeMap; 14 | import java.util.concurrent.ExecutorService; 15 | import java.util.concurrent.Executors; 16 | import java.util.concurrent.TimeUnit; 17 | 18 | import org.apache.logging.log4j.LogManager; 19 | import 
org.apache.logging.log4j.Logger; 20 | 21 | import de.typology.indexes.WordIndex; 22 | import de.typology.patterns.PatternTransformer; 23 | 24 | public class SmoothingSplitter { 25 | private File absoluteDirectory; 26 | private File continuationDirectory; 27 | private File indexFile; 28 | private String delimiter; 29 | protected boolean deleteTempFiles; 30 | 31 | Logger logger = LogManager.getLogger(this.getClass().getName()); 32 | private ExecutorService executorService; 33 | 34 | private Comparator patternComparator = new Comparator() { 35 | @Override 36 | public int compare(boolean[] pattern1, boolean[] pattern2) { 37 | return PatternTransformer.getStringPattern(pattern2).compareTo( 38 | PatternTransformer.getStringPattern(pattern1)); 39 | } 40 | }; 41 | 42 | public SmoothingSplitter(File absoluteDirectory, 43 | File continuationDirectory, File indexFile, String delimiter, 44 | boolean deleteTempFiles) { 45 | this.absoluteDirectory = absoluteDirectory; 46 | this.continuationDirectory = continuationDirectory; 47 | continuationDirectory.mkdir(); 48 | this.indexFile = indexFile; 49 | this.delimiter = delimiter; 50 | this.deleteTempFiles = deleteTempFiles; 51 | } 52 | 53 | public void split(ArrayList patterns, int cores) { 54 | // read Index 55 | this.logger 56 | .info("read word index: " + this.indexFile.getAbsolutePath()); 57 | WordIndex wordIndex = new WordIndex(this.indexFile); 58 | // initialize executerService 59 | // int cores = Runtime.getRuntime().availableProcessors(); 60 | 61 | SortedMap continuationMap = this 62 | .filterContinuationMap(this.getContinuationMap(patterns)); 63 | 64 | HashSet finishedPatterns = new HashSet(); 65 | 66 | while (finishedPatterns.size() < continuationMap.size()) { 67 | ArrayList currentPatterns = new ArrayList(); 68 | this.executorService = Executors.newFixedThreadPool(cores); 69 | 70 | for (Entry entry : continuationMap.entrySet()) { 71 | // list for storing patterns that are currently computed 72 | 73 | if (!finishedPatterns.contains(entry.getKey())) { 74 | if (!PatternTransformer.getStringPattern(entry.getValue()) 75 | .contains("0")) { 76 | // read absolute files 77 | currentPatterns.add(entry.getKey()); 78 | this.logger.info("build continuation for " 79 | + PatternTransformer.getStringPattern(entry 80 | .getKey()) 81 | + " from absolute " 82 | + PatternTransformer.getStringPattern(entry 83 | .getValue())); 84 | 85 | String inputPatternLabel = PatternTransformer 86 | .getStringPattern(entry.getValue()); 87 | boolean[] outputPattern = PatternTransformer 88 | .getBooleanPattern(PatternTransformer 89 | .getStringPattern(entry.getKey()) 90 | .replaceAll("0", "")); 91 | String outputPatternLabel = PatternTransformer 92 | .getStringPattern(entry.getKey()).replaceAll( 93 | "0", "_"); 94 | 95 | File currentAbsoluteInputDirectory = new File( 96 | this.absoluteDirectory.getAbsolutePath() + "/" 97 | + inputPatternLabel); 98 | 99 | this.logger.debug("inputPattern: " 100 | + PatternTransformer.getStringPattern(entry 101 | .getValue())); 102 | this.logger.debug("inputPatternLabel: " 103 | + inputPatternLabel); 104 | this.logger.debug("outputPattern: " 105 | + PatternTransformer 106 | .getStringPattern(outputPattern)); 107 | this.logger.debug("newPatternLabel: " 108 | + outputPatternLabel); 109 | this.logger.debug("patternForModifier: " 110 | + PatternTransformer.getStringPattern(entry 111 | .getKey())); 112 | 113 | this.splitType(currentAbsoluteInputDirectory, 114 | this.continuationDirectory, outputPattern, 115 | outputPatternLabel, entry.getKey(), wordIndex, 
116 | true, true); 117 | } else { 118 | if (finishedPatterns.contains(entry.getValue())) { 119 | // read continuation files 120 | currentPatterns.add(entry.getKey()); 121 | this.logger.info("build continuation for " 122 | + PatternTransformer.getStringPattern(entry 123 | .getKey()) 124 | + " from continuation " 125 | + PatternTransformer.getStringPattern(entry 126 | .getValue())); 127 | 128 | String inputPatternLabel = PatternTransformer 129 | .getStringPattern(entry.getValue()) 130 | .replaceAll("0", "_"); 131 | boolean[] outputPattern = PatternTransformer 132 | .getBooleanPattern(PatternTransformer 133 | .getStringPattern(entry.getKey()) 134 | .replaceAll("0", "")); 135 | String outputPatternLabel = PatternTransformer 136 | .getStringPattern(entry.getKey()) 137 | .replaceAll("0", "_"); 138 | 139 | File currentContinuationInputDirectory = new File( 140 | this.continuationDirectory 141 | .getAbsolutePath() 142 | + "/" 143 | + inputPatternLabel); 144 | 145 | // build patternForModifier 146 | boolean[] patternForModifier = new boolean[Integer 147 | .bitCount(PatternTransformer 148 | .getIntPattern(entry.getValue()))]; 149 | System.out.println(outputPatternLabel + "<--" 150 | + inputPatternLabel + " " 151 | + patternForModifier.length); 152 | int patternPointer = 0; 153 | for (int i = 0; i < entry.getValue().length; i++) { 154 | if (entry.getKey()[i] && entry.getValue()[i]) { 155 | patternForModifier[patternPointer] = true; 156 | patternPointer++; 157 | } else { 158 | if (!entry.getKey()[i] 159 | && entry.getValue()[i]) { 160 | patternForModifier[patternPointer] = false; 161 | patternPointer++; 162 | } 163 | } 164 | } 165 | 166 | this.logger.debug("inputPattern: " 167 | + PatternTransformer.getStringPattern(entry 168 | .getValue())); 169 | this.logger.debug("inputPatternLabel: " 170 | + inputPatternLabel); 171 | this.logger.debug("outputPattern: " 172 | + PatternTransformer 173 | .getStringPattern(outputPattern)); 174 | this.logger.debug("newPatternLabel: " 175 | + outputPatternLabel); 176 | this.logger 177 | .debug("patternForModifier: " 178 | + PatternTransformer 179 | .getStringPattern(patternForModifier)); 180 | 181 | this.splitType(currentContinuationInputDirectory, 182 | this.continuationDirectory, outputPattern, 183 | outputPatternLabel, patternForModifier, 184 | wordIndex, false, true); 185 | 186 | } 187 | } 188 | } 189 | } 190 | this.executorService.shutdown(); 191 | this.logger.info("end of this round of calculation"); 192 | try { 193 | this.executorService.awaitTermination(Long.MAX_VALUE, 194 | TimeUnit.SECONDS); 195 | } catch (InterruptedException e) { 196 | // TODO Auto-generated catch block 197 | e.printStackTrace(); 198 | } 199 | // add currently computed patterns to finishedPatterns 200 | for (boolean[] currentPattern : currentPatterns) { 201 | finishedPatterns.add(currentPattern); 202 | } 203 | } 204 | 205 | } 206 | 207 | private void splitType(File currentInputDirectory, File outputDirectory, 208 | boolean[] newPattern, String newPatternLabel, 209 | boolean[] patternForModifier, WordIndex wordIndex, 210 | boolean setCountToOne, boolean additionalCounts) { 211 | PipedInputStream pipedInputStream = new PipedInputStream(100 * 8 * 1024); 212 | 213 | if (Integer.bitCount(PatternTransformer.getIntPattern(newPattern)) == 0) { 214 | LineCounterTask lineCountTask = new LineCounterTask( 215 | pipedInputStream, outputDirectory, newPatternLabel, 216 | this.delimiter, setCountToOne, additionalCounts); 217 | this.executorService.execute(lineCountTask); 218 | } else { 219 | // don't add 
<s> tags here 220 | SplitterTask splitterTask = new SplitterTask(pipedInputStream, 221 | outputDirectory, wordIndex, newPattern, newPatternLabel, 222 | this.delimiter, 0, this.deleteTempFiles, "", "", true, 223 | false, additionalCounts); 224 | this.executorService.execute(splitterTask); 225 | } 226 | 227 | try { 228 | OutputStream pipedOutputStream = new PipedOutputStream( 229 | pipedInputStream); 230 | SequenceModifier sequenceModifier = new SequenceModifier( 231 | currentInputDirectory, pipedOutputStream, this.delimiter, 232 | patternForModifier, true, setCountToOne); 233 | this.executorService.execute(sequenceModifier); 234 | 235 | } catch (IOException e) { 236 | // TODO Auto-generated catch block 237 | e.printStackTrace(); 238 | } 239 | 240 | } 241 | 242 | private SortedMap<boolean[], boolean[]> filterContinuationMap( 243 | SortedMap<boolean[], boolean[]> continuationMap) { 244 | SortedMap<boolean[], boolean[]> newContinuationMap = new TreeMap<boolean[], boolean[]>( 245 | this.patternComparator); 246 | for (Entry<boolean[], boolean[]> entry : continuationMap.entrySet()) { 247 | if (PatternTransformer.getStringPattern(entry.getKey()).equals( 248 | PatternTransformer.getStringPattern(entry.getValue()))) { 249 | continue; 250 | } 251 | boolean[] currentPattern = entry.getKey(); 252 | if (currentPattern.length > 2) { 253 | if (!currentPattern[0] && !currentPattern[1]) { 254 | continue; 255 | } 256 | } 257 | newContinuationMap.put(entry.getKey(), entry.getValue()); 258 | 259 | } 260 | return newContinuationMap; 261 | } 262 | 263 | private SortedMap<boolean[], boolean[]> getContinuationMap( 264 | ArrayList<boolean[]> patterns) { 265 | SortedMap<boolean[], boolean[]> continuationMap = new TreeMap<boolean[], boolean[]>( 266 | this.patternComparator); 267 | 268 | for (boolean[] inputPattern : patterns) { 269 | this.addPatterns(continuationMap, inputPattern, inputPattern, 0); 270 | } 271 | return continuationMap; 272 | } 273 | 274 | private void addPatterns(SortedMap<boolean[], boolean[]> continuationMap, 275 | boolean[] pattern, boolean[] oldPattern, int position) { 276 | if (position < pattern.length) { 277 | boolean[] newPattern = pattern.clone(); 278 | newPattern[position] = false; 279 | continuationMap.put(newPattern, pattern); 280 | continuationMap.put(pattern, oldPattern); 281 | this.addPatterns(continuationMap, newPattern, pattern, position + 1); 282 | this.addPatterns(continuationMap, pattern, oldPattern, position + 1); 283 | } 284 | } 285 | 286 | } 287 | -------------------------------------------------------------------------------- /src/de/typology/splitter/SplitterTask.java: -------------------------------------------------------------------------------- 1 | package de.typology.splitter; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | 7 | import org.apache.commons.io.FileUtils; 8 | import org.apache.logging.log4j.LogManager; 9 | import org.apache.logging.log4j.Logger; 10 | 11 | import de.typology.indexes.WordIndex; 12 | 13 | /** 14 | * A class for running Sequencer and Aggregator for a given pattern.
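 * <p>
 * The task first lets a Sequencer write one raw sequence per line into the
 * temporary directory "patternLabel-split", then runs an Aggregator over
 * every split file to sort the sequences and merge their counts into the
 * final directory "patternLabel", and finally deletes the temporary
 * directory if deleteTempFiles is set.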
15 | * 16 | * @author Martin Koerner 17 | * 18 | */ 19 | public class SplitterTask implements Runnable { 20 | private InputStream inputStream; 21 | private File outputDirectory; 22 | private WordIndex wordIndex; 23 | private boolean[] pattern; 24 | private String patternLabel; 25 | private String delimiter; 26 | private int startSortAtColumn; 27 | private boolean deleteTempFiles; 28 | 29 | private String addBeforeSentence; 30 | private String addAfterSentence; 31 | private boolean sequenceModifyCounts; 32 | private boolean aggregateCompleteLine; 33 | private boolean additionalCounts; 34 | 35 | Logger logger = LogManager.getLogger(this.getClass().getName()); 36 | 37 | public SplitterTask(InputStream inputStream, File outputDirectory, 38 | WordIndex wordIndex, boolean[] pattern, String patternLabel, 39 | String delimiter, int startSortAtColumn, boolean deleteTempFiles, 40 | String addBeforeSentence, String addAfterSentence, 41 | boolean sequenceModifyCounts, boolean aggregateCompleteLine, 42 | boolean additionalCounts) { 43 | this.inputStream = inputStream; 44 | this.outputDirectory = outputDirectory; 45 | this.wordIndex = wordIndex; 46 | this.pattern = pattern; 47 | this.patternLabel = patternLabel; 48 | this.delimiter = delimiter; 49 | this.startSortAtColumn = startSortAtColumn; 50 | this.deleteTempFiles = deleteTempFiles; 51 | this.addBeforeSentence = addBeforeSentence; 52 | this.addAfterSentence = addAfterSentence; 53 | this.sequenceModifyCounts = sequenceModifyCounts; 54 | this.aggregateCompleteLine = aggregateCompleteLine; 55 | this.additionalCounts = additionalCounts; 56 | } 57 | 58 | @Override 59 | public void run() { 60 | File sequencerOutputDirectory = new File( 61 | this.outputDirectory.getAbsolutePath() + "/" 62 | + this.patternLabel + "-split"); 63 | if (sequencerOutputDirectory.exists()) { 64 | try { 65 | FileUtils.deleteDirectory(sequencerOutputDirectory); 66 | } catch (IOException e) { 67 | // TODO Auto-generated catch block 68 | e.printStackTrace(); 69 | } 70 | } 71 | sequencerOutputDirectory.mkdir(); 72 | this.logger.info("start building: " 73 | + sequencerOutputDirectory.getAbsolutePath()); 74 | 75 | // initialize sequencer 76 | Sequencer sequencer = new Sequencer(this.inputStream, 77 | sequencerOutputDirectory, this.wordIndex, this.pattern, 78 | this.addBeforeSentence, this.addAfterSentence, this.delimiter, 79 | this.sequenceModifyCounts, this.startSortAtColumn); 80 | sequencer.splitIntoFiles(); 81 | 82 | File aggregatedOutputDirectory = new File( 83 | this.outputDirectory.getAbsolutePath() + "/" 84 | + this.patternLabel); 85 | if (aggregatedOutputDirectory.exists()) { 86 | try { 87 | FileUtils.deleteDirectory(aggregatedOutputDirectory); 88 | } catch (IOException e) { 89 | // TODO Auto-generated catch block 90 | e.printStackTrace(); 91 | } 92 | } 93 | aggregatedOutputDirectory.mkdir(); 94 | this.logger.info("aggregate into: " + aggregatedOutputDirectory); 95 | 96 | for (File splitFile : sequencerOutputDirectory.listFiles()) { 97 | Aggregator aggregator = new Aggregator(splitFile, new File( 98 | aggregatedOutputDirectory.getAbsolutePath() + "/" 99 | + splitFile.getName()), this.delimiter, 100 | this.startSortAtColumn, this.additionalCounts); 101 | if (this.aggregateCompleteLine) { 102 | aggregator.aggregateWithoutCounts(); 103 | } else { 104 | aggregator.aggregateCounts(); 105 | } 106 | } 107 | 108 | // delete sequencerOutputDirectory 109 | if (this.deleteTempFiles) { 110 | try { 111 | FileUtils.deleteDirectory(sequencerOutputDirectory); 112 | } catch (IOException e) { 113 
--------------------------------------------------------------------------------
/src/de/typology/tester/SequenceExtractorTask.java:
--------------------------------------------------------------------------------
1 | package de.typology.tester;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.BufferedWriter;
5 | import java.io.File;
6 | import java.io.FileReader;
7 | import java.io.FileWriter;
8 | import java.io.IOException;
9 | import java.util.ArrayList;
10 | import java.util.HashSet;
11 |
12 | import org.apache.commons.io.FileUtils;
13 | import org.apache.logging.log4j.LogManager;
14 | import org.apache.logging.log4j.Logger;
15 |
16 | /**
17 |  * This class takes an ArrayList of sequences and a directory of Files as an
18 |  * input and writes all occurrences of the sequences into new files in the
19 |  * outputDirectory
20 |  *
21 |  * @author Martin Koerner
22 |  *
23 |  */
24 | public class SequenceExtractorTask implements Runnable {
25 |
26 |     Logger logger = LogManager.getLogger(this.getClass().getName());
27 |
28 |     private ArrayList<String> originalSequences;
29 |     private boolean[] pattern;
30 |     private File inputDirectory;
31 |     private File outputDirectory;
32 |     private String delimiter;
33 |
34 |     public SequenceExtractorTask(ArrayList<String> originalSequences,
35 |             boolean[] pattern, File inputDirectory, File outputDirectory,
36 |             String delimiter) {
37 |         this.originalSequences = originalSequences;
38 |         this.pattern = pattern;
39 |
40 |         this.inputDirectory = inputDirectory;
41 |         this.outputDirectory = outputDirectory;
42 |         if (this.outputDirectory.exists()) {
43 |             try {
44 |                 FileUtils.deleteDirectory(this.outputDirectory);
45 |             } catch (IOException e) {
46 |                 // clearing the old output directory failed; report and continue
47 |                 e.printStackTrace();
48 |             }
49 |         }
50 |         this.outputDirectory.mkdirs();
51 |         this.delimiter = delimiter;
52 |
53 |     }
54 |
55 |     @Override
56 |     public void run() {
57 |         HashSet<String> newSequences = this.getNewSequences();
58 |
59 |         for (File inputFile : this.inputDirectory.listFiles()) {
60 |             File outputFile = new File(this.outputDirectory.getAbsolutePath()
61 |                     + "/" + inputFile.getName());
62 |             if (inputFile.getName().equals("all")) {
63 |                 try {
64 |                     FileUtils.copyFile(inputFile, outputFile);
65 |                 } catch (IOException e) {
66 |                     // copying the "all" file failed; report and continue
67 |                     e.printStackTrace();
68 |                 }
69 |             } else {
70 |                 try {
71 |                     BufferedReader inputFileReader = new BufferedReader(
72 |                             new FileReader(inputFile));
73 |                     BufferedWriter outputFileWriter = new BufferedWriter(
74 |                             new FileWriter(outputFile));
75 |                     String line;
76 |
77 |                     while ((line = inputFileReader.readLine()) != null) {
78 |                         if (newSequences
79 |                                 .contains(line.split(this.delimiter)[0])) {
80 |
81 |                             outputFileWriter.write(line + "\n");
82 |                         }
83 |                     }
84 |                     inputFileReader.close();
85 |                     outputFileWriter.close();
86 |                 } catch (IOException e) {
87 |                     // reading or writing a count file failed; report and continue
88 |                     e.printStackTrace();
89 |                 }
90 |             }
91 |
92 |         }
93 |
94 |     }
95 |
96 |     private HashSet<String> getNewSequences() {
97 |         HashSet<String> newSequences = new HashSet<String>();
98 |
99 |         for (String originalLine : this.originalSequences) {
100 |             // modify sequences for continuation
101 |             if (!this.pattern[0] || !this.pattern[this.pattern.length - 1]) {
102 |                 for (boolean element : this.pattern) {
103 |                     if (element) {
104 |                         break;
105 |                     } else {
106 |                         originalLine = " " + originalLine;
107 |                     }
108 |                 }
109 |                 for (int i = this.pattern.length - 1; i >= 0; i--) {
110 |                     if (this.pattern[i]) {
111 |
break; 112 | } else { 113 | originalLine = originalLine + " "; 114 | } 115 | } 116 | } 117 | String[] originalLineSplit = originalLine.split("\\s"); 118 | int linePointer = 0; 119 | while (originalLineSplit.length - linePointer >= this.pattern.length) { 120 | 121 | // build current Sequence 122 | String currentSequence = ""; 123 | for (int i = 0; i < this.pattern.length; i++) { 124 | currentSequence += originalLineSplit[linePointer + i] + " "; 125 | } 126 | currentSequence = currentSequence.replaceFirst(" $", ""); 127 | 128 | String[] currentSequenceSplit = currentSequence.split("\\s"); 129 | String newSequence = ""; 130 | for (int i = 0; i < this.pattern.length; i++) { 131 | if (this.pattern[i]) { 132 | newSequence += currentSequenceSplit[i] + " "; 133 | } 134 | } 135 | newSequence = newSequence.replaceFirst(" $", ""); 136 | if (newSequence.length() > 0) { 137 | newSequences.add(newSequence); 138 | } 139 | 140 | linePointer++; 141 | } 142 | } 143 | return newSequences; 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/de/typology/tester/TestSequenceExtractor.java: -------------------------------------------------------------------------------- 1 | package de.typology.tester; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.concurrent.ExecutorService; 9 | import java.util.concurrent.Executors; 10 | import java.util.concurrent.TimeUnit; 11 | 12 | import de.typology.indexes.WordIndex; 13 | import de.typology.patterns.PatternBuilder; 14 | import de.typology.patterns.PatternTransformer; 15 | 16 | /** 17 | * This class extracts all sequences that are needed for computing the 18 | * Kneser-Ney smoothed values for a set of given test sequences. 
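 * Internally it spawns one SequenceExtractorTask per pattern; only lines whose
 * sequence column matches one of the (pattern-modified) test sequences are
 * copied, which keeps the extracted count files small.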
19 |  *
20 |  * @author Martin Koerner
21 |  *
22 |  */
23 | public class TestSequenceExtractor {
24 |     private File testSequenceFile;
25 |     private File absoluteDirectory;
26 |     private File continuationDirectory;
27 |     private File outputDirectory;
28 |
29 |     private String delimiter;
30 |     private WordIndex wordIndex;
31 |
32 |     public TestSequenceExtractor(File testSequenceFile, File absoluteDirectory,
33 |             File continuationDirectory, File outputDirectory, String delimiter,
34 |             WordIndex wordIndex) {
35 |         this.testSequenceFile = testSequenceFile;
36 |         this.absoluteDirectory = absoluteDirectory;
37 |         this.continuationDirectory = continuationDirectory;
38 |         this.outputDirectory = outputDirectory;
39 |         this.delimiter = delimiter;
40 |         this.wordIndex = wordIndex;
41 |
42 |     }
43 |
44 |     public void extractSequences(int maxModelLength, int cores) {
45 |
46 |         // read test sequences into an ArrayList
47 |         ArrayList<String> sequences = new ArrayList<String>();
48 |         try {
49 |             BufferedReader testSequenceReader = new BufferedReader(
50 |                     new FileReader(this.testSequenceFile));
51 |             String line;
52 |             while ((line = testSequenceReader.readLine()) != null) {
53 |                 sequences.add(line);
54 |             }
55 |             testSequenceReader.close();
56 |         } catch (IOException e) {
57 |             // reading the test sequences failed; report and continue
58 |             e.printStackTrace();
59 |         }
60 |
61 |         ArrayList<boolean[]> absolutePatterns = PatternBuilder
62 |                 .getGLMForSmoothingPatterns(maxModelLength);
63 |
64 |         // call SequenceExtractorTasks
65 |
66 |         // initialize executorService
67 |         // int cores = Runtime.getRuntime().availableProcessors();
68 |         ExecutorService executorService = Executors.newFixedThreadPool(cores);
69 |
70 |         for (boolean[] absolutePattern : absolutePatterns) {
71 |             // extract absolute sequences
72 |             String absoluteStringPattern = PatternTransformer
73 |                     .getStringPattern(absolutePattern);
74 |             File absoluteInputDirectory = new File(
75 |                     this.absoluteDirectory.getAbsolutePath() + "/"
76 |                             + absoluteStringPattern);
77 |             File absoluteOutputDirectory = new File(this.outputDirectory + "/"
78 |                     + this.absoluteDirectory.getName() + "/"
79 |                     + absoluteStringPattern);
80 |             SequenceExtractorTask absoluteSET = new SequenceExtractorTask(
81 |                     sequences, absolutePattern, absoluteInputDirectory,
82 |                     absoluteOutputDirectory, this.delimiter);
83 |             executorService.execute(absoluteSET);
84 |
85 |         }
86 |         executorService.shutdown();
87 |         try {
88 |             executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
89 |         } catch (InterruptedException e) {
90 |             // interrupted while waiting for the extractor tasks; report and continue
91 |             e.printStackTrace();
92 |         }
93 |
94 |     }
95 |
96 |     public void extractContinuationSequences(int maxModelLength, int cores) {
97 |
98 |         // read test sequences into an ArrayList
99 |         ArrayList<String> sequences = new ArrayList<String>();
100 |         try {
101 |             BufferedReader testSequenceReader = new BufferedReader(
102 |                     new FileReader(this.testSequenceFile));
103 |             String line;
104 |             while ((line = testSequenceReader.readLine()) != null) {
105 |                 sequences.add(line);
106 |             }
107 |             testSequenceReader.close();
108 |         } catch (IOException e) {
109 |             // reading the test sequences failed; report and continue
110 |             e.printStackTrace();
111 |         }
112 |         // call SequenceExtractorTasks
113 |
114 |         // initialize executorService
115 |         // int cores = Runtime.getRuntime().availableProcessors();
116 |         ExecutorService executorService = Executors.newFixedThreadPool(cores);
117 |
118 |         for (File continuationTypeDirectory : this.continuationDirectory
119 |                 .listFiles()) {
120 |             // extract continuation sequences
121 |             String continuationStringPattern = continuationTypeDirectory
122 |
                    .getName();
123 |             boolean[] continuationPattern = PatternTransformer
124 |                     .getBooleanPattern(continuationStringPattern.replaceAll(
125 |                             "_", "0"));
126 |             File continuationOutputDirectory = new File(this.outputDirectory
127 |                     + "/" + this.continuationDirectory.getName() + "/"
128 |                     + continuationStringPattern);
129 |             SequenceExtractorTask continuationSET = new SequenceExtractorTask(
130 |                     sequences, continuationPattern, continuationTypeDirectory,
131 |                     continuationOutputDirectory, this.delimiter);
132 |             executorService.execute(continuationSET);
133 |
134 |         }
135 |         executorService.shutdown();
136 |         try {
137 |             executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
138 |         } catch (InterruptedException e) {
139 |             // interrupted while waiting for the extractor tasks; report and continue
140 |             e.printStackTrace();
141 |         }
142 |
143 |     }
144 |     // public void extractContinuationSequences(int maxModelLength, int cores) {
145 |     // ArrayList<boolean[]> absolutePatterns = PatternBuilder
146 |     // .getLMPatterns(maxModelLength);
147 |     //
148 |     // // initialize executorService
149 |     // // int cores = Runtime.getRuntime().availableProcessors();
150 |     // ExecutorService executorService = Executors.newFixedThreadPool(cores);
151 |     // for (boolean[] absolutePattern : absolutePatterns) {
152 |     // File originalSequencesDirectory = new File(
153 |     // this.outputDirectory.getAbsolutePath()
154 |     // + "/"
155 |     // + this.absoluteDirectory.getName()
156 |     // + "/"
157 |     // + PatternTransformer
158 |     // .getStringPattern(absolutePattern));
159 |     // File outputDirectory = new File(
160 |     // this.outputDirectory.getAbsolutePath() + "/continuation");
161 |     // ContinuationExtractorTask cet = new ContinuationExtractorTask(
162 |     // originalSequencesDirectory, absolutePattern,
163 |     // this.absoluteDirectory, outputDirectory, this.wordIndex,
164 |     // this.delimiter);
165 |     // executorService.execute(cet);
166 |     // }
167 |     //
168 |     // executorService.shutdown();
169 |     // try {
170 |     // executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
171 |     // } catch (InterruptedException e) {
172 |     // // TODO Auto-generated catch block
173 |     // e.printStackTrace();
174 |     // }
175 |     //
176 |     // }
177 | }
178 |
--------------------------------------------------------------------------------
/src/de/typology/utils/Config.java:
--------------------------------------------------------------------------------
1 | package de.typology.utils;
2 |
3 | import java.io.BufferedInputStream;
4 | import java.io.FileInputStream;
5 | import java.io.IOException;
6 | import java.lang.reflect.Field;
7 | import java.util.Properties;
8 |
9 | /**
10 |  * This is an interface class to the Config file for this project. For each
11 |  * class field one java property must be defined in config.txt. The fields will
12 |  * be automatically filled!
13 |  *
14 |  * Allowed types are String, int, long, boolean, String[], int[] and long[];
15 |  * arrays are defined by semicolon-separated Strings like "array=a;b;c" and
16 |  * boolean fields are initialized with true or false.
17 |  *
18 |  * Lines starting with # will be ignored and can serve as comments.
19 |  *
20 |  * @author Jonas Kunze, Rene Pickhardt
21 |  *
22 |  */
23 | public class Config extends Properties {
24 |     // CONTROL PARAMETERS
25 |     public int numberOfCores;
26 |
27 |     public String languages;
28 |
29 |     public boolean splitData;
30 |     public boolean buildIndex;
31 |     public boolean buildGLM;
32 |     public boolean buildContinuationGLM;
33 |     public boolean extractContinuationGLM;
34 |     public boolean buildKneserNey;
35 |     public boolean buildModKneserNey;
36 |
37 |     public boolean conditionalProbabilityOnly;
38 |     public boolean backoffAbsolute;
39 |
40 |     public boolean kneserNeySimple;
41 |     public boolean kneserNeyComplex;
42 |
43 |     public boolean deleteTempFiles;
44 |
45 |     public boolean addSentenceTags;
46 |     public boolean addFakeStartTag;
47 |
48 |     public int decimalPlaces;
49 |     // DEBUGGING
50 |     public String inputDataSet;
51 |
52 |     // STEP 0 GLOBAL CONFIGS
53 |
54 |     public String outputDirectory;
55 |     public int maxCountDivider;
56 |     public int modelLength;
57 |
58 |     public int numberOfQueries;
59 |
60 |     // STEP 2 SAMPLING AND MAKE TRAINING DATA SPLIT
61 |     public int sampleRate; // \in [0, 100] 0 means no data from input will be
62 |     // used. 100 means all input data will be used
63 |     public int splitDataRatio; // \in [0, 100] 0 means no training data. 100
64 |     // means only training data
65 |     public int splitTestRatio; // \in [0, 100] 0 means all data is stored in
66 |     // test file. 100 means all data is stored in (smaller) learning file
67 |
68 |     private static final long serialVersionUID = -4439565094382127683L;
69 |
70 |     static Config instance = null;
71 |
72 |     public Config() {
73 |         String file = "config.txt";
74 |         try {
75 |             BufferedInputStream stream = new BufferedInputStream(
76 |                     new FileInputStream(file));
77 |             this.load(stream);
78 |             stream.close();
79 |         } catch (IOException e) {
80 |             e.printStackTrace();
81 |         }
82 |         try {
83 |             this.initialize();
84 |         } catch (IllegalArgumentException e) {
85 |             e.printStackTrace();
86 |         } catch (IllegalAccessException e) {
87 |             e.printStackTrace();
88 |         }
89 |     }
90 |
91 |     /**
92 |      * Fills all fields with the data defined in the config file.
93 |      *
94 |      * @throws IllegalArgumentException
95 |      * @throws IllegalAccessException
96 |      */
97 |     private void initialize() throws IllegalArgumentException,
98 |             IllegalAccessException {
99 |         Field[] fields = this.getClass().getFields();
100 |         for (Field f : fields) {
101 |             if (this.getProperty(f.getName()) == null) {
102 |                 System.err.println("Property '" + f.getName()
103 |                         + "' not defined in config file");
104 |             }
105 |             if (f.getType().equals(String.class)) {
106 |                 f.set(this, this.getProperty(f.getName()));
107 |             } else if (f.getType().equals(long.class)) {
108 |                 f.setLong(this, Long.valueOf(this.getProperty(f.getName())));
109 |             } else if (f.getType().equals(int.class)) {
110 |                 f.setInt(this, Integer.valueOf(this.getProperty(f.getName())));
111 |             } else if (f.getType().equals(boolean.class)) {
112 |                 f.setBoolean(this,
113 |                         Boolean.valueOf(this.getProperty(f.getName())));
114 |             } else if (f.getType().equals(String[].class)) {
115 |                 f.set(this, this.getProperty(f.getName()).split(";"));
116 |             } else if (f.getType().equals(int[].class)) {
117 |                 String[] tmp = this.getProperty(f.getName()).split(";");
118 |                 int[] ints = new int[tmp.length];
119 |                 for (int i = 0; i < tmp.length; i++) {
120 |                     ints[i] = Integer.parseInt(tmp[i]);
121 |                 }
122 |                 f.set(this, ints);
123 |             } else if (f.getType().equals(long[].class)) {
124 |                 String[] tmp = this.getProperty(f.getName()).split(";");
125 |                 long[] longs = new long[tmp.length];
126 |                 for (int i = 0; i < tmp.length; i++) {
127 |                     longs[i] = Long.parseLong(tmp[i]);
128 |                 }
129 |                 f.set(this, longs);
130 |             }
131 |         }
132 |     }
133 |
134 |     public static Config get() {
135 |         if (instance == null) {
136 |             instance = new Config();
137 |         }
138 |         return instance;
139 |     }
140 | }
141 |
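Since every public field of this class must have a matching property, a `config.txt` could look roughly like the fragment below. The field names are taken from the class above; all values are illustrative assumptions, not recommended settings.
```
# illustrative config.txt fragment; one property per public Config field
numberOfCores=2
languages=en
splitData=true
buildIndex=true
buildGLM=true
buildContinuationGLM=true
extractContinuationGLM=true
buildKneserNey=true
buildModKneserNey=false
conditionalProbabilityOnly=false
backoffAbsolute=false
kneserNeySimple=true
kneserNeyComplex=true
deleteTempFiles=true
addSentenceTags=true
addFakeStartTag=false
decimalPlaces=6
inputDataSet=testDataset
outputDirectory=/data/glm
maxCountDivider=10
modelLength=5
numberOfQueries=100000
sampleRate=100
splitDataRatio=80
splitTestRatio=50
```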
--------------------------------------------------------------------------------
/src/de/typology/utils/Counter.java:
--------------------------------------------------------------------------------
1 | package de.typology.utils;
2 |
3 | import java.io.BufferedInputStream;
4 | import java.io.BufferedReader;
5 | import java.io.File;
6 | import java.io.FileInputStream;
7 | import java.io.FileNotFoundException;
8 | import java.io.FileReader;
9 | import java.io.IOException;
10 | import java.io.InputStream;
11 |
12 | public class Counter {
13 |
14 |     public static long countLinesInDirectory(File directory) {
15 |         long totalCount = 0;
16 |         for (File file : directory.listFiles()) {
17 |             totalCount += countLines(file);
18 |         }
19 |         return totalCount;
20 |     }
21 |
22 |     // derived from:
23 |     // http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java
24 |     public static long countLines(File file) {
25 |         InputStream is;
26 |         try {
27 |             is = new BufferedInputStream(new FileInputStream(file));
28 |             try {
29 |                 try {
30 |                     byte[] c = new byte[1024];
31 |                     long count = 0;
32 |                     int readChars = 0;
33 |                     boolean empty = true;
34 |                     while ((readChars = is.read(c)) != -1) {
35 |                         empty = false;
36 |                         for (int i = 0; i < readChars; ++i) {
37 |                             if (c[i] == '\n') {
38 |                                 ++count;
39 |                             }
40 |                         }
41 |                     }
42 |                     return count == 0 && !empty ? 1 : count;
43 |                 } finally {
44 |                     is.close();
45 |                 }
46 |             } catch (IOException e) {
47 |                 e.printStackTrace();
48 |             }
49 |         } catch (FileNotFoundException e) {
50 |             e.printStackTrace();
51 |         }
52 |         return 0;
53 |     }
54 |
55 |     private static int columnNumberStartZero;
56 |     private static File directory;
57 |     private static long currentCountForDirectory;
58 |
59 |     public static long countColumnCountsInDirectory(int columnNumberStartZero,
60 |             File directory) {
61 |         if (columnNumberStartZero == Counter.columnNumberStartZero
62 |                 && directory.equals(Counter.directory)) {
63 |             return Counter.currentCountForDirectory;
64 |         } else {
65 |             long totalCount = 0;
66 |             for (File file : directory.listFiles()) {
67 |                 totalCount += countColumnCounts(columnNumberStartZero, file);
68 |             }
69 |             Counter.columnNumberStartZero = columnNumberStartZero;
70 |             Counter.currentCountForDirectory = totalCount;
71 |             Counter.directory = directory;
72 |             return totalCount;
73 |         }
74 |     }
75 |
76 |     public static long countColumnCounts(int columnNumberStartZero, File file) {
77 |         long totalCount = 0;
78 |         try {
79 |             BufferedReader br = new BufferedReader(new FileReader(file));
80 |             try {
81 |                 String line;
82 |                 String[] lineSplit;
83 |                 while ((line = br.readLine()) != null) {
84 |                     lineSplit = line.split("\t");
85 |                     totalCount += Long
86 |                             .parseLong(lineSplit[columnNumberStartZero]);
87 |                 }
88 |             } finally {
89 |                 br.close();
90 |             }
91 |         } catch (IOException e) {
92 |             // reading the count file failed; report and continue
93 |             e.printStackTrace();
94 |         }
95 |         return totalCount;
96 |     }
97 |
98 |     /**
99 |      * used for aggregating the counts in a directory
100 |      *
101 |      * @param directory
102 |      *            the directory whose files are aggregated
103 |      * @return
104 |      */
105 |     public static long aggregateCountsInDirectory(File directory) {
106 |         long totalCount = 0;
107 |         for (File file : directory.listFiles()) {
108 |             totalCount += aggregateCounts(file);
109 |         }
110 |         return totalCount;
111 |     }
112 |
113 |     /**
114 |      * used for aggregating the counts in a single file
115 |      *
116 |      * @param file
117 |      *            the file whose counts are summed up
118 |      * @return
119 |      */
120 |     public static long aggregateCounts(File file) {
121 |         long totalCount = 0;
122 |         try {
123 |             BufferedReader br = new BufferedReader(new FileReader(file));
124 |             try {
125 |                 String line;
126 |                 String[] lineSplit;
127 |                 while ((line = br.readLine()) != null) {
128 |                     // TODO remove this or make it pretty
129 |                     if (line.startsWith("")) {
130 |                         continue;
131 |                     }
132 |                     lineSplit = line.split("\t");
133 |                     totalCount += Long
134 |                             .parseLong(lineSplit[lineSplit.length - 1]);
135 |                 }
136 |             } finally {
137 |                 br.close();
138 |             }
139 |         } catch (IOException e) {
140 |             // reading the count file failed; report and continue
141 |             e.printStackTrace();
142 |         }
143 |         return totalCount;
144 |     }
145 |
146 |     /**
147 |      * used for calculating the count of counts in smoothing methods
148 |      *
149 |      * @param count
150 |      * @param directory
151 |      * @param skipSequence
152 |      */
153 |     public static long countCountsInDirectory(int count, File directory,
154 |             String skipSequence) {
155 |         long totalCount = 0;
156 |         for (File file : directory.listFiles()) {
157 |             if (!file.getName().contains("-split")) {
158 |                 totalCount += countCounts(count, file, skipSequence);
159 |             }
160 |         }
161 |         return totalCount;
162 |     }
163 |
164 |     /**
165 |      * used for calculating the count of counts in smoothing methods
166 |      *
167 |      * @param count
168 |      * @param file
169 |      * @param skipSequence
170 |      */
171 |     public static long countCounts(int count, File file, String skipSequence) {
172 |         long totalCount = 0;
173 |         try {
174 |             BufferedReader br = new BufferedReader(new FileReader(file));
175 |             try {
176 |                 String line;
177 |                 String[] lineSplit;
178 |                 while ((line = br.readLine()) != null) {
179 |                     if (line.startsWith("")) {
180 |                         continue;
181 |                     }
182 |                     // FIXME: put the delimiter to a global config file or at
183 |                     // least as a constant
184 |                     lineSplit = line.split("\t");
185 |                     long currentCount;
186 |                     if (lineSplit.length == 1) {
187 |                         currentCount = Long.parseLong(lineSplit[0]);
188 |                     } else {
189 |                         currentCount = Long.parseLong(lineSplit[1]);
190 |                     }
191 |                     if (count == currentCount && !lineSplit[0].equals("")) {
192 |                         totalCount += 1;
193 |                     }
194 |                 }
195 |             } finally {
196 |                 br.close();
197 |             }
198 |         } catch (IOException e) {
199 |             // reading the count file failed; report and continue
200 |             e.printStackTrace();
201 |         }
202 |         return totalCount;
203 |     }
204 | }
--------------------------------------------------------------------------------
/src/de/typology/utils/DecimalFormatter.java:
--------------------------------------------------------------------------------
1 | package de.typology.utils;
2 |
3 | import java.text.DecimalFormat;
4 | import java.text.DecimalFormatSymbols;
5 |
6 | public class DecimalFormatter {
7 |
8 |     DecimalFormat decimalFormat;
9 |
10 |     public DecimalFormatter(int decimalPlaces) {
11 |         String format = "###.";
12 |         for (int i = 0; i < decimalPlaces; i++) {
13 |             format += "#";
14 |         }
15 |         // set decimalFormat to override LOCALE values
16 |         this.decimalFormat = new DecimalFormat(format);
17 |         DecimalFormatSymbols symbols = new DecimalFormatSymbols();
18 |         symbols.setDecimalSeparator('.');
19 |         this.decimalFormat.setDecimalFormatSymbols(symbols);
20 |     }
21 |
22 |     public String getRoundedResult(double input) {
23 |         return this.decimalFormat.format(input);
24 |     }
25 | }
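A quick illustration of what this wrapper does: the constructor builds a `DecimalFormat` pattern like `###.###` with a fixed `.` separator, so the output is stable across locales. The snippet below is just an example of the resulting behaviour.
```
// illustrative use of DecimalFormatter
DecimalFormatter formatter = new DecimalFormatter(3);
formatter.getRoundedResult(0.123456); // -> "0.123"
formatter.getRoundedResult(2.0);      // -> "2" (the # pattern drops trailing zeros)
```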
--------------------------------------------------------------------------------
/src/de/typology/utils/SequenceFormatter.java:
--------------------------------------------------------------------------------
1 | package de.typology.utils;
2 |
3 | public class SequenceFormatter {
4 |
5 |     /**
6 |      * Removes word at position removeWordAtPosition. Words are separated with
7 |      * whitespaces. Returns the resulting string.
8 |      *
9 |      * @param inputString
10 |      * @param removeWordAtPosition
11 |      * @return
12 |      */
13 |     public static String removeWord(String inputString, int removeWordAtPosition) {
14 |         String[] words = inputString.split("\\s");
15 |         String result = "";
16 |         for (int i = 0; i < words.length; i++) {
17 |             if (i != removeWordAtPosition) {
18 |                 result += words[i] + " ";
19 |             }
20 |         }
21 |         result = result.replaceFirst(" $", "");
22 |         return result;
23 |     }
24 |
25 |     public static String removeWords(String inputString, boolean[] pattern) {
26 |         String[] words = inputString.split("\\s");
27 |
28 |         if (words.length == pattern.length) {
29 |             String resultString = "";
30 |             for (int i = 0; i < pattern.length; i++) {
31 |                 if (pattern[i]) {
32 |                     resultString += words[i] + " ";
33 |                 }
34 |             }
35 |             resultString = resultString.replaceFirst(" $", "");
36 |             return resultString;
37 |         } else {
38 |             return "";
39 |         }
40 |     }
41 |
42 | }
43 |
--------------------------------------------------------------------------------
/src/log4j2.xml:
--------------------------------------------------------------------------------
[log4j2 configuration file: the XML markup was lost when this dump was rendered; only empty numbered lines survived]
--------------------------------------------------------------------------------
/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
[log4j2 configuration file: the XML markup was lost when this dump was rendered; only empty numbered lines survived]
--------------------------------------------------------------------------------
/testDataset/testDataset.txt:
--------------------------------------------------------------------------------
1 | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.
2 |
3 | Lorem ipsum dolor sit amet, consetetur
4 | Lorem ipsum dolor sit amet,
5 | Lorem ipsum dolor sit
6 | Lorem ipsum dolor
7 | Lorem ipsum
8 | Lorem
9 |
10 | Lorem ipsum dolor sit amet, consetetur sadipscing elitr,
11 |
12 | §$ $ % & / ) ( ! = + * + ~ # ' _ ,.
> < < | ^ ° 13 | -------------------------------------------------------------------------------- /testDataset/training.txt: -------------------------------------------------------------------------------- 1 | Lorem ipsum dolor sit amet , consetetur sadipscing 2 | Lorem ipsum dolor sit amet , consetetur 3 | Lorem ipsum dolor sit amet , 4 | Lorem ipsum dolor sit amet 5 | Lorem ipsum dolor sit 6 | Lorem ipsum dolor 7 | Lorem ipsum 8 | Lorem 9 | et justo duo dolores et ea 10 | et justo duo dolores et ea 11 | et justo duo dolores et 12 | et justo duo dolores et 13 | et justo duo dolores 14 | et justo duo dolores 15 | justo duo dolores 16 | justo duo dolores 17 | duo dolores 18 | duo dolores 19 | dolores 20 | dolores -------------------------------------------------------------------------------- /tests/de/typology/indexes/WordIndexTest.java: -------------------------------------------------------------------------------- 1 | package de.typology.indexes; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | import static org.junit.Assert.assertTrue; 5 | 6 | import java.io.File; 7 | 8 | import org.junit.After; 9 | import org.junit.AfterClass; 10 | import org.junit.Before; 11 | import org.junit.BeforeClass; 12 | import org.junit.Test; 13 | 14 | public class WordIndexTest { 15 | File inputFile = new File("testDataset/training.txt"); 16 | File indexFile = new File("testDataset/index.txt"); 17 | 18 | @BeforeClass 19 | public static void setUpBeforeClass() throws Exception { 20 | } 21 | 22 | @AfterClass 23 | public static void tearDownAfterClass() throws Exception { 24 | } 25 | 26 | @Before 27 | public void setUp() throws Exception { 28 | if (this.indexFile.exists()) { 29 | this.indexFile.delete(); 30 | } 31 | WordIndexer wi = new WordIndexer(); 32 | wi.buildIndex(this.inputFile, this.indexFile, 10, " ", " "); 33 | } 34 | 35 | @After 36 | public void tearDown() throws Exception { 37 | if (this.indexFile.exists()) { 38 | this.indexFile.delete(); 39 | } 40 | } 41 | 42 | @Test 43 | public void rankTest() { 44 | WordIndex wi = new WordIndex(this.indexFile); 45 | assertEquals(8, wi.rank("et")); 46 | assertEquals(3, wi.rank("A")); 47 | assertEquals(4, wi.rank("Z")); 48 | assertEquals(11, wi.rank("tempora")); 49 | assertEquals(11, wi.rank("z")); 50 | 51 | for (String word : wi) { 52 | assertTrue(word.length() > 0); 53 | } 54 | } 55 | 56 | @Test 57 | public void iteratorTest() { 58 | WordIndex wi = new WordIndex(this.indexFile); 59 | 60 | for (String word : wi) { 61 | assertTrue(word.length() > 0); 62 | } 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /tests/de/typology/indexes/WordIndexerTest.java: -------------------------------------------------------------------------------- 1 | package de.typology.indexes; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.io.File; 6 | 7 | import org.junit.After; 8 | import org.junit.AfterClass; 9 | import org.junit.Before; 10 | import org.junit.BeforeClass; 11 | import org.junit.Test; 12 | 13 | public class WordIndexerTest { 14 | File inputFile = new File("testDataset/training.txt"); 15 | File indexFile = new File("testDataset/index.txt"); 16 | 17 | @BeforeClass 18 | public static void setUpBeforeClass() throws Exception { 19 | } 20 | 21 | @AfterClass 22 | public static void tearDownAfterClass() throws Exception { 23 | } 24 | 25 | @Before 26 | public void setUp() throws Exception { 27 | if (this.indexFile.exists()) { 28 | this.indexFile.delete(); 29 | } 30 | } 31 | 32 | @After 33 | 
public void tearDown() throws Exception {
34 |         if (this.indexFile.exists()) {
35 |             this.indexFile.delete();
36 |         }
37 |     }
38 |
39 |     @Test
40 |     public void buildIndexTest() {
41 |         WordIndexer wi = new WordIndexer();
42 |         long maxCountPerFile = wi.buildIndex(this.inputFile, this.indexFile,
43 |                 10, " ", " ");
44 |         assertEquals(13, maxCountPerFile);
45 |     }
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/tests/de/typology/smoother/KneserNeySmootherTest.java:
--------------------------------------------------------------------------------
1 | package de.typology.smoother;
2 |
3 | import static org.junit.Assert.assertEquals;
4 |
5 | import java.io.File;
6 | import java.util.ArrayList;
7 | import java.util.HashMap;
8 |
9 | import org.junit.AfterClass;
10 | import org.junit.Before;
11 | import org.junit.Test;
12 |
13 | import de.typology.indexes.WordIndexer;
14 | import de.typology.patterns.PatternBuilder;
15 | import de.typology.splitter.AbsoluteSplitter;
16 | import de.typology.splitter.SmoothingSplitter;
17 |
18 | public class KneserNeySmootherTest {
19 |
20 |     File extractedSequenceDirectory;
21 |
22 |     File absoluteDirectory;
23 |     File continuationDirectory;
24 |     File testSequenceFile;
25 |     File kneserNeyFile;
26 |
27 |     @AfterClass
28 |     public static void tearDownAfterClass() throws Exception {
29 |     }
30 |
31 |     @Before
32 |     public void setUp() throws Exception {
33 |         String inputDirectoryPath = "testDataset/";
34 |         File inputFile = new File(inputDirectoryPath + "training.txt");
35 |         File indexFile = new File(inputDirectoryPath + "index.txt");
36 |         WordIndexer wier = new WordIndexer();
37 |         wier.buildIndex(inputFile, indexFile, 10, " ", " ");
38 |         this.absoluteDirectory = new File(inputDirectoryPath + "absolute");
39 |         this.continuationDirectory = new File(inputDirectoryPath
40 |                 + "continuation");
41 |
42 |         AbsoluteSplitter as = new AbsoluteSplitter(inputFile, indexFile,
43 |                 this.absoluteDirectory, "\t", true, " ", " ");
44 |         as.split(PatternBuilder.getGLMForSmoothingPatterns(5), 2);
45 |
46 |         ArrayList<boolean[]> lmPatterns = PatternBuilder
47 |                 .getReverseLMPatterns(5);
48 |         SmoothingSplitter smoothingSplitter = new SmoothingSplitter(
49 |                 this.absoluteDirectory, this.continuationDirectory, indexFile,
50 |                 "\t", true);
51 |         smoothingSplitter.split(lmPatterns, 2);
52 |
53 |         this.testSequenceFile = new File(inputDirectoryPath
54 |                 + "test-sequences-5.txt");
55 |         this.extractedSequenceDirectory = new File(inputDirectoryPath);
56 |         this.absoluteDirectory = new File(inputDirectoryPath + "absolute");
57 |         // TestSequenceExtractor tse = new TestSequenceExtractor(
58 |         // this.testSequenceFile, this.absoluteDirectory,
59 |         // this.continuationDirectory, this.extractedSequenceDirectory,
60 |         // "\t", wi);
61 |         // tse.extractContinuationSequences(5, 2);
62 |         this.kneserNeyFile = new File(inputDirectoryPath + "kn-sequences-5.txt");
63 |     }
64 |
65 |     // @Test
66 |     // public void calculateDiscoutValuesTest() {
67 |     //
68 |     // KneserNeySmoother kns = new KneserNeySmoother(
69 |     // this.extractedSequenceDirectory, this.absoluteDirectory,
70 |     // this.continuationDirectory, "\t", 5);
71 |     // kns.smooth(this.testSequenceFile, this.kneserNeyFile, 5, false);
72 |     // double d = kns.discountTypeValueMap.get("1").get("D1+");
73 |     // assertEquals(0.529412, d, 0.00001);
74 |     // }
75 |
76 |     @Test
77 |     public void calculateLowerOrderResultSimpleTest() {
78 |
79 |         KneserNeySmoother kns = new KneserNeySmoother(
80 |                 this.extractedSequenceDirectory, this.absoluteDirectory,
81 |
                this.continuationDirectory, "\t");
82 |
83 |         HashMap<String, HashMap<String, Double>> absoluteTypeSequenceValueMap = null;
84 |         HashMap<String, HashMap<String, Double>> continuationTypeSequenceValueMap = null;
85 |         absoluteTypeSequenceValueMap = kns
86 |                 .readAbsoluteValuesIntoHashMap(kns.extractedAbsoluteDirectory);
87 |
88 |         continuationTypeSequenceValueMap = kns
89 |                 .readContinuationValuesIntoHashMap(kns.extractedContinuationDirectory);
90 |
91 |         kns.absoluteTypeSequenceValueMap = absoluteTypeSequenceValueMap;
92 |         kns.continuationTypeSequenceValueMap = continuationTypeSequenceValueMap;
93 |
94 |         kns.smooth(this.testSequenceFile, this.kneserNeyFile, 5, false, true,
95 |                 false);
96 |         System.out.println(kns.continuationTypeSequenceValueMap.get("__").get(
97 |                 ""));
98 |         assertEquals(0.625, kns.discountTypeValuesMap.get("_11").get("D1+"),
99 |                 0.00001);
100 |         assertEquals(0.0357,
101 |                 kns.calculateLowerOrderResult("dolor", 1, "1", false), 0.0001);
102 |         assertEquals(0.07143,
103 |                 kns.calculateLowerOrderResult("et", 1, "1", false), 0.0001);
104 |         assertEquals(0.39282,
105 |                 kns.calculateLowerOrderResult("", 1, "1", false), 0.0001);
106 |         assertEquals(0.00840,
107 |                 kns.calculateLowerOrderResult("", 1, "1", false), 0.0001);
108 |         assertEquals(0.2098,
109 |                 kns.calculateLowerOrderResult("sit amet", 2, "11", false),
110 |                 0.0001);
111 |         assertEquals(0.00525,
112 |                 kns.calculateLowerOrderResult("sit unknown", 2, "11", false),
113 |                 0.0001);
114 |         assertEquals(0.309885, kns.calculateLowerOrderResult("dolor sit amet",
115 |                 3, "111", false), 0.0001);
116 |         assertEquals(0.3595, kns.calculateLowerOrderResult(
117 |                 "ipsum dolor sit amet", 4, "1111", false), 0.0001);
118 |         assertEquals(0.77929, kns.calculateConditionalProbability(
119 |                 "Lorem ipsum dolor sit amet", 5, "11111", false), 0.0001);
120 |
121 |     }
122 |
123 |     @Test
124 |     public void calculateLowerOrderResultComplexTest() {
125 |
126 |         KneserNeySmoother kns = new KneserNeySmoother(
127 |                 this.extractedSequenceDirectory, this.absoluteDirectory,
128 |                 this.continuationDirectory, "\t");
129 |
130 |         HashMap<String, HashMap<String, Double>> absoluteTypeSequenceValueMap = null;
131 |         HashMap<String, HashMap<String, Double>> continuationTypeSequenceValueMap = null;
132 |
133 |         absoluteTypeSequenceValueMap = kns
134 |                 .readAbsoluteValuesIntoHashMap(kns.extractedAbsoluteDirectory);
135 |
136 |         continuationTypeSequenceValueMap = kns
137 |                 .readContinuationValuesIntoHashMap(kns.extractedContinuationDirectory);
138 |
139 |         kns.absoluteTypeSequenceValueMap = absoluteTypeSequenceValueMap;
140 |         kns.continuationTypeSequenceValueMap = continuationTypeSequenceValueMap;
141 |
142 |         kns.smooth(this.testSequenceFile, this.kneserNeyFile, 5, true, true,
143 |                 false);
144 |         try {
145 |             Thread.sleep(500);
146 |         } catch (InterruptedException ex) {
147 |             Thread.currentThread().interrupt();
148 |         }
149 |         System.out.println("----");
150 |         assertEquals(0.0084,
151 |                 kns.calculateConditionalProbability("notFound", 1, "1", false),
152 |                 0.0001);
153 |         assertEquals(0.0084,
154 |                 kns.calculateLowerOrderResult("notFound", 1, "1", false),
155 |                 0.0001);
156 |         kns.calculateProbability("Lorem ipsum dolor sit amet", 5, "11111",
157 |                 false);
158 |         assertEquals(0.625, kns.discountTypeValuesMap.get("_11").get("D1+"),
159 |                 0.00001);
160 |         assertEquals(0.0357,
161 |                 kns.calculateLowerOrderResult("dolor", 1, "1", false), 0.0001);
162 |         assertEquals(0.07143,
163 |                 kns.calculateLowerOrderResult("et", 1, "1", false), 0.0001);
164 |         assertEquals(0.08474,
165 |                 kns.calculateConditionalProbability("et", 1, "1", false),
166 |                 0.0001);
167 |         assertEquals(0.39282,
168 |                 kns.calculateLowerOrderResult("", 1, "1",
 false), 0.0001);
169 |         assertEquals(0.0084,
170 |                 kns.calculateLowerOrderResult("", 1, "1", false), 0.0001);
171 |         assertEquals(0.2321,
172 |                 kns.calculateLowerOrderResult("sit amet", 2, "11", false),
173 |                 0.0001);
174 |
175 |         assertEquals(0.0275,
176 |                 kns.calculateLowerOrderResult("sit unknown", 2, "11", false),
177 |                 0.0001);
178 |         assertEquals(0.3587, kns.calculateLowerOrderResult("dolor sit amet", 3,
179 |                 "111", false), 0.0001);
180 |         assertEquals(0.4173, kns.calculateLowerOrderResult(
181 |                 "ipsum dolor sit amet", 4, "1111", false), 0.0001);
182 |         assertEquals(0.09857, kns.calculateConditionalProbability(
183 |                 " At vero eos et", 5, "11111", false), 0.0001);
184 |         assertEquals(0.79221, kns.calculateConditionalProbability(
185 |                 "Lorem ipsum dolor sit amet", 5, "11111", false), 0.0001);
186 |
187 |         System.out.println(kns.calculateProbability(
188 |                 "Lorem ipsum dolor sit amet", 5, "11111", false));
189 |         // assertEquals(0.00875, kns.calculateProbability(
190 |         // "Lorem ipsum dolor sit amet", 5, "11111", false), 0.0001);
191 |     }
192 | }
193 |
--------------------------------------------------------------------------------
/tests/de/typology/splitter/AggregatorTest.java:
--------------------------------------------------------------------------------
1 | package de.typology.splitter;
2 |
3 | import static org.junit.Assert.assertEquals;
4 | import static org.junit.Assert.assertNull;
5 |
6 | import java.io.BufferedReader;
7 | import java.io.BufferedWriter;
8 | import java.io.File;
9 | import java.io.FileReader;
10 | import java.io.FileWriter;
11 | import java.io.IOException;
12 |
13 | import org.junit.After;
14 | import org.junit.AfterClass;
15 | import org.junit.Before;
16 | import org.junit.Test;
17 |
18 | public class AggregatorTest {
19 |     File inputFile = new File("testDataset/aggregator-in.txt");
20 |     File outputFile = new File("testDataset/aggregator-out.txt");
21 |
22 |     @AfterClass
23 |     public static void tearDownAfterClass() throws Exception {
24 |     }
25 |
26 |     @Before
27 |     public void setUp() throws Exception {
28 |         BufferedWriter br = new BufferedWriter(new FileWriter(this.inputFile));
29 |         br.write("b y b\t1\n");
30 |         br.write("c x a\t1\n");
31 |         br.write("b y a\t1\n");
32 |         br.write("a z a\t1\n");
33 |         br.write("c y b\t1\n");
34 |         br.write("c x a\t1\n");
35 |         br.close();
36 |     }
37 |
38 |     @After
39 |     public void tearDown() throws Exception {
40 |         this.inputFile.delete();
41 |     }
42 |
43 |     @Test
44 |     public void aggregatorCol0Test() {
45 |         Aggregator aggregator = new Aggregator(this.inputFile, this.outputFile,
46 |                 "\t", 0, false);
47 |         aggregator.aggregateCounts();
48 |         try {
49 |             BufferedReader br = new BufferedReader(new FileReader(
50 |                     this.outputFile));
51 |             assertEquals("a z a\t1", br.readLine());
52 |             assertEquals("b y a\t1", br.readLine());
53 |             assertEquals("b y b\t1", br.readLine());
54 |             assertEquals("c x a\t2", br.readLine());
55 |             assertEquals("c y b\t1", br.readLine());
56 |             assertNull(br.readLine());
57 |             br.close();
58 |         } catch (IOException e) {
59 |             // reading the aggregated output failed; report and continue
60 |             e.printStackTrace();
61 |         }
62 |         this.outputFile.delete();
63 |
64 |     }
65 |
66 |     @Test
67 |     public void aggregatorCol1Test() {
68 |         Aggregator aggregator = new Aggregator(this.inputFile, this.outputFile,
69 |                 "\t", 1, false);
70 |         aggregator.aggregateCounts();
71 |         try {
72 |             BufferedReader br = new BufferedReader(new FileReader(
73 |                     this.outputFile));
74 |             assertEquals("c x a\t2", br.readLine());
75 |             assertEquals("b y a\t1", br.readLine());
76 |             assertEquals("b y b\t1", br.readLine());
77 |             assertEquals("c y b\t1", br.readLine());
78 |             assertEquals("a z a\t1", br.readLine());
79 |             assertNull(br.readLine());
80 |             br.close();
81 |         } catch (IOException e) {
82 |             // reading the aggregated output failed; report and continue
83 |             e.printStackTrace();
84 |         }
85 |         this.outputFile.delete();
86 |     }
87 |
88 |     @Test
89 |     public void aggregatorCol2Test() {
90 |         Aggregator aggregator = new Aggregator(this.inputFile, this.outputFile,
91 |                 "\t", 2, false);
92 |         aggregator.aggregateCounts();
93 |         try {
94 |             BufferedReader br = new BufferedReader(new FileReader(
95 |                     this.outputFile));
96 |             assertEquals("a z a\t1", br.readLine());
97 |             assertEquals("b y a\t1", br.readLine());
98 |             assertEquals("c x a\t2", br.readLine());
99 |             assertEquals("b y b\t1", br.readLine());
100 |             assertEquals("c y b\t1", br.readLine());
101 |             assertNull(br.readLine());
102 |             br.close();
103 |         } catch (IOException e) {
104 |             // reading the aggregated output failed; report and continue
105 |             e.printStackTrace();
106 |         }
107 |         this.outputFile.delete();
108 |     }
109 | }
110 |
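Taken together, these three tests pin down the contract of `Aggregator.aggregateCounts()`: duplicate sequences are merged by summing their counts, and the output is sorted starting at the column passed to the constructor. Schematically, for column 0:
```
// input (unsorted, "c x a" twice):     output of aggregateCounts() with column 0:
//   b y b  1                             a z a  1
//   c x a  1                             b y a  1
//   b y a  1                             b y b  1
//   a z a  1                             c x a  2
//   c y b  1                             c y b  1
//   c x a  1
```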
--------------------------------------------------------------------------------
/tests/de/typology/splitter/SequenceModifierTest.java:
--------------------------------------------------------------------------------
1 | package de.typology.splitter;
2 |
3 | import static org.junit.Assert.assertEquals;
4 | import static org.junit.Assert.assertNull;
5 |
6 | import java.io.BufferedReader;
7 | import java.io.BufferedWriter;
8 | import java.io.File;
9 | import java.io.FileWriter;
10 | import java.io.IOException;
11 | import java.io.InputStreamReader;
12 | import java.io.OutputStream;
13 | import java.io.PipedInputStream;
14 | import java.io.PipedOutputStream;
15 |
16 | import org.apache.commons.io.FileUtils;
17 | import org.junit.After;
18 | import org.junit.Before;
19 | import org.junit.Test;
20 |
21 | public class SequenceModifierTest {
22 |     File inputDirectory = new File("testDataset/sequenceModifier");
23 |     OutputStream outputStream;
24 |     private PipedInputStream pipedInputStream;
25 |     private OutputStream pipedOutputStream;
26 |
27 |     @Before
28 |     public void setUp() throws Exception {
29 |         if (this.inputDirectory.exists()) {
30 |             FileUtils.deleteDirectory(this.inputDirectory);
31 |         }
32 |         this.inputDirectory.mkdir();
33 |         BufferedWriter br1 = new BufferedWriter(new FileWriter(
34 |                 this.inputDirectory.getAbsolutePath() + "/1"));
35 |         br1.write("a b c\t13\n");
36 |         br1.write("d e f\t14\n");
37 |         br1.write("g h i\t15\n");
38 |         br1.close();
39 |         BufferedWriter br2 = new BufferedWriter(new FileWriter(
40 |                 this.inputDirectory.getAbsolutePath() + "/2"));
41 |         br2.write("j k l\t16\n");
42 |         br2.write("m n o\t17\n");
43 |         br2.write("ä ö ü\t18\n");
44 |         br2.write("p q r\t19\n");
45 |         br2.close();
46 |         this.pipedInputStream = new PipedInputStream(10 * 8 * 1024);
47 |         this.pipedOutputStream = new PipedOutputStream(this.pipedInputStream);
48 |     }
49 |
50 |     @After
51 |     public void tearDown() throws Exception {
52 |         if (this.inputDirectory.exists()) {
53 |             FileUtils.deleteDirectory(this.inputDirectory);
54 |         }
55 |     }
56 |
57 |     @Test
58 |     public void sequenceModifier101Test() {
59 |         boolean[] pattern = { true, false, true };
60 |
61 |         SequenceModifier sequenceModifier = new SequenceModifier(
62 |                 this.inputDirectory, this.pipedOutputStream, "\t", pattern,
63 |                 true, true);
64 |         sequenceModifier.run();
65 |         BufferedReader bufferedReader = new BufferedReader(
66 |                 new InputStreamReader(this.pipedInputStream));
67 |
68 |         try {
69 |             assertEquals("a c\t1", bufferedReader.readLine());
70 |
assertEquals("d f\t1", bufferedReader.readLine()); 71 | assertEquals("g i\t1", bufferedReader.readLine()); 72 | assertEquals("j l\t1", bufferedReader.readLine()); 73 | assertEquals("m o\t1", bufferedReader.readLine()); 74 | assertEquals("ä ü\t1", bufferedReader.readLine()); 75 | assertEquals("p r\t1", bufferedReader.readLine()); 76 | assertNull(bufferedReader.readLine()); 77 | bufferedReader.close(); 78 | } catch (IOException e) { 79 | // TODO Auto-generated catch block 80 | e.printStackTrace(); 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /tests/de/typology/splitter/SequencerTest.java: -------------------------------------------------------------------------------- 1 | package de.typology.splitter; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | import static org.junit.Assert.assertNull; 5 | 6 | import java.io.BufferedReader; 7 | import java.io.File; 8 | import java.io.FileInputStream; 9 | import java.io.FileReader; 10 | import java.io.IOException; 11 | import java.io.InputStream; 12 | 13 | import org.apache.commons.io.FileUtils; 14 | import org.junit.After; 15 | import org.junit.AfterClass; 16 | import org.junit.Before; 17 | import org.junit.BeforeClass; 18 | import org.junit.Test; 19 | 20 | import de.typology.indexes.WordIndex; 21 | import de.typology.indexes.WordIndexer; 22 | 23 | public class SequencerTest { 24 | File inputFile = new File("testDataset/training.txt"); 25 | File indexFile = new File("testDataset/index.txt"); 26 | File sequencerOutputDirectory = new File("testDataset/sequencer/"); 27 | 28 | @BeforeClass 29 | public static void setUpBeforeClass() throws Exception { 30 | } 31 | 32 | @AfterClass 33 | public static void tearDownAfterClass() throws Exception { 34 | } 35 | 36 | @Before 37 | public void setUp() throws Exception { 38 | WordIndexer wordIndexer = new WordIndexer(); 39 | wordIndexer.buildIndex(this.inputFile, this.indexFile, 10, " ", 40 | " "); 41 | if (this.sequencerOutputDirectory.exists()) { 42 | FileUtils.deleteDirectory(this.sequencerOutputDirectory); 43 | } 44 | this.sequencerOutputDirectory.mkdir(); 45 | } 46 | 47 | @After 48 | public void tearDown() throws Exception { 49 | if (this.sequencerOutputDirectory.exists()) { 50 | FileUtils.deleteDirectory(this.sequencerOutputDirectory); 51 | } 52 | if (this.indexFile.exists()) { 53 | this.indexFile.delete(); 54 | } 55 | } 56 | 57 | @Test 58 | public void squencing1Test() { 59 | WordIndex wordIndex = new WordIndex(this.indexFile); 60 | boolean[] pattern = { true }; 61 | 62 | try { 63 | InputStream inputStream = new FileInputStream(this.inputFile); 64 | Sequencer sequencer = new Sequencer(inputStream, 65 | this.sequencerOutputDirectory, wordIndex, pattern, 66 | " ", " ", "\t", false, 0); 67 | 68 | sequencer.splitIntoFiles(); 69 | 70 | // test file contents 71 | BufferedReader br8 = new BufferedReader(new FileReader( 72 | this.sequencerOutputDirectory.getAbsolutePath() + "/8")); 73 | for (int i = 0; i < 10; i++) { 74 | assertEquals("et\t1", br8.readLine()); 75 | } 76 | assertNull(br8.readLine()); 77 | br8.close(); 78 | 79 | BufferedReader br2 = new BufferedReader(new FileReader( 80 | this.sequencerOutputDirectory.getAbsolutePath() + "/3")); 81 | for (int i = 0; i < 20; i++) { 82 | assertEquals("\t1", br2.readLine()); 83 | } 84 | assertNull(br2.readLine()); 85 | br2.close(); 86 | } catch (IOException e) { 87 | // TODO Auto-generated catch block 88 | e.printStackTrace(); 89 | } 90 | } 91 | 92 | @Test 93 | public void squencing1101Test() { 94 | 
        WordIndex wordIndex = new WordIndex(this.indexFile);
95 |         boolean[] pattern = { true, true, false, true };
96 |
97 |         try {
98 |             InputStream inputStream = new FileInputStream(this.inputFile);
99 |             Sequencer sequencer = new Sequencer(inputStream,
100 |                     this.sequencerOutputDirectory, wordIndex, pattern,
101 |                     " ", " ", "\t", false, 0);
102 |             sequencer.splitIntoFiles();
103 |
104 |             // test file contents
105 |             BufferedReader br0 = new BufferedReader(new FileReader(
106 |                     this.sequencerOutputDirectory.getAbsolutePath() + "/8"));
107 |             for (int i = 0; i < 6; i++) {
108 |                 assertEquals("et justo dolores\t1", br0.readLine());
109 |             }
110 |             assertNull(br0.readLine());
111 |             br0.close();
112 |
113 |             BufferedReader br10 = new BufferedReader(new FileReader(
114 |                     this.sequencerOutputDirectory.getAbsolutePath() + "/3"));
115 |             for (int i = 0; i < 6; i++) {
116 |                 assertEquals(" Lorem dolor\t1", br10.readLine());
117 |             }
118 |             assertEquals(" Lorem \t1", br10.readLine());
119 |             br10.close();
120 |         } catch (IOException e) {
121 |             // reading the sequencer output failed; report and continue
122 |             e.printStackTrace();
123 |         }
124 |     }
125 | }
126 |
--------------------------------------------------------------------------------
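To close with the smallest utility in the toolkit, here is what `SequenceFormatter` (see src/de/typology/utils/SequenceFormatter.java above) does with the repository's own test data. This snippet is purely illustrative; the input strings are taken from testDataset/training.txt.
```
// removeWord drops the word at the given zero-based position
SequenceFormatter.removeWord("Lorem ipsum dolor", 1);      // -> "Lorem dolor"

// removeWords keeps only the words whose pattern entry is true
SequenceFormatter.removeWords("Lorem ipsum dolor",
        new boolean[] { true, false, true });              // -> "Lorem dolor"

// if the word count does not match the pattern length, it returns ""
SequenceFormatter.removeWords("Lorem ipsum",
        new boolean[] { true, false, true });              // -> ""
```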