├── .gitignore
├── A Generalized Language Model as the Combination of Skipped n-grams and Modified Kneser-Ney Smoothing.pdf
├── README.md
├── bachelor-thesis-martin-koerner.pdf
├── calcentropy.py
├── config.sample.txt
├── lib
    ├── README
    ├── apache
    │   ├── commons-compress-1.5
    │   │   ├── LICENSE.txt
    │   │   ├── NOTICE.txt
    │   │   ├── README.txt
    │   │   ├── RELEASE-NOTES.txt
    │   │   ├── commons-compress-1.5-javadoc.jar
    │   │   ├── commons-compress-1.5-sources.jar
    │   │   └── commons-compress-1.5.jar
    │   ├── commons-io-2.4
    │   │   ├── LICENSE.txt
    │   │   ├── NOTICE.txt
    │   │   ├── RELEASE-NOTES.txt
    │   │   ├── commons-io-2.4-javadoc.jar
    │   │   ├── commons-io-2.4-sources.jar
    │   │   └── commons-io-2.4.jar
    │   └── log4j-2.0-beta9
    │   │   ├── LICENSE.txt
    │   │   ├── NOTICE.txt
    │   │   ├── RELEASE-NOTES.txt
    │   │   ├── log4j-api-2.0-beta9-javadoc.jar
    │   │   ├── log4j-api-2.0-beta9-sources.jar
    │   │   ├── log4j-api-2.0-beta9.jar
    │   │   ├── log4j-core-2.0-beta9-javadoc.jar
    │   │   ├── log4j-core-2.0-beta9-sources.jar
    │   │   └── log4j-core-2.0-beta9.jar
    ├── lucene-analyzers-common-4.0.0.jar
    ├── lucene-core-4.0.0.jar
    ├── lucene-queryparser-4.0.0.jar
    ├── mongo-2.9.1.jar
    └── mysql
    │   └── mysql-connector-java-5.1.24-bin.jar
├── metriken.sh
├── mvn.sh
├── pom.xml
├── scripts
    ├── README.md
    ├── combineLM.sh
    ├── combineTypo.sh
    ├── combinefiles.sh
    ├── createPlot.sh
    ├── index-glm.sh
    ├── index-kneser-ney.sh
    ├── index-lm.sh
    ├── index-typo.sh
    ├── preparequery.sh
    ├── runpreparequery.sh
    ├── stats.txt
    ├── table.sh
    └── test.sh
├── src
    ├── de
    │   └── typology
    │   │   ├── executables
    │   │       ├── KneserNeyBuilder.java
    │   │       └── MultiKneserNeyBuilder.java
    │   │   ├── indexes
    │   │       ├── WordIndex.java
    │   │       └── WordIndexer.java
    │   │   ├── patterns
    │   │       ├── PatternBuilder.java
    │   │       └── PatternTransformer.java
    │   │   ├── smoother
    │   │       ├── KneserNeySmoother.java
    │   │       └── ModifiedKneserNeySmoother.java
    │   │   ├── splitter
    │   │       ├── AbsoluteSplitter.java
    │   │       ├── Aggregator.java
    │   │       ├── DataSetSplitter.java
    │   │       ├── LineCounterTask.java
    │   │       ├── SequenceModifier.java
    │   │       ├── Sequencer.java
    │   │       ├── SmoothingSplitter.java
    │   │       └── SplitterTask.java
    │   │   ├── tester
    │   │       ├── SequenceExtractorTask.java
    │   │       └── TestSequenceExtractor.java
    │   │   └── utils
    │   │       ├── Config.java
    │   │       ├── Counter.java
    │   │       ├── DecimalFormatter.java
    │   │       └── SequenceFormatter.java
    ├── log4j2.xml
    └── main
    │   └── resources
    │       └── log4j2.xml
├── testDataset
    ├── testDataset.txt
    └── training.txt
└── tests
    └── de
        └── typology
            ├── indexes
                ├── WordIndexTest.java
                └── WordIndexerTest.java
            ├── smoother
                └── KneserNeySmootherTest.java
            └── splitter
                ├── AggregatorTest.java
                ├── SequenceModifierTest.java
                └── SequencerTest.java


/.gitignore:
--------------------------------------------------------------------------------
 1 | /.classpath
 2 | /.project
 3 | /.settings
 4 | /bin
 5 | /config.txt
 6 | /testDataset
 7 | /logs
 8 | /target
 9 | 
10 | .ser
11 | mod*
12 | kneser*
13 | normalized*
14 | training.txt
15 | learning.txt
16 | test*
17 | absolute
18 | continuation
19 | parsed.txt


--------------------------------------------------------------------------------
/A Generalized Language Model as the Combination of Skipped n-grams and Modified Kneser-Ney Smoothing.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/A Generalized Language Model as the Combination of Skipped n-grams and Modified Kneser-Ney Smoothing.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | #Generalized Language Model Toolkit
 2 | 
 3 | The software can be used to compute a Generalized Language Model which is yet another mean to compute a [Language Model](http://en.wikipedia.org/wiki/Language_model). As shown [in this publication](http://arxiv.org/pdf/1404.3377v1.pdf) Generalized Language models can outperform Modified Kneser Ney Smoothing by 10 to 25 % in Terms of perplexity. 
 4 | 
 5 | ## Getting started
 6 | ```
 7 | git clone git@github.com:renepickhardt/generalized-language-modeling-toolkit.git
 8 | sudo chmod a+x mvn.sh
 9 | ```
10 | You will need to install maven in order to build the project.
11 | ```
12 | sudo apt-get install maven2
13 | ```
14 | 
15 | You need to copy config.sample.txt to config.txt and read the instructions in config.sample.txt.
16 | ```
17 | cp config.sample.txt config.txt
18 | emacs config.txt
19 | ```
20 | 
21 | After you set all your directories in config.txt you can run the project
22 | ```
23 | ./mvn.sh
24 | ```
25 | 
26 | ## Disk and Main memory requirements
27 | Since Generalized language models can become very large the software is written to use the hard disk. In this sense you can theoretically run the programm with very little memory. Still we recommend 16 GB of main memory for the large english wikipedia data sets. 
28 | 
29 | We tried to avoid frequent disc hits. Still the programm will execute much faster if you store your data on a Solid State disk. 
30 | 
31 | ## Download the test data sets
32 | you need to have a file called `normalized.txt` which serves as your input. This file should contain one sentence per line. You will learn language models based on this file. 
33 | 
34 | Please refere to http://glm.rene-pickhardt.de/data in order to download preprocessed and formatted data sets. 
35 | 
36 | If you whish to parse the data yourself (e.g. because you want to use a newer wikipedia dump) refer to https://github.com/mkrnr/lexer-parser
37 | 
38 | ## Processing pipeline of the GLM toolkit: 
39 | 
40 | you have to start with a file called `normalized.txt` which has to be stored in your data directory (according to `config.txt`). `mvn.sh` will compile the program and start the flow of the following steps (which can be configured by switching the fields ind `config.txt` from `true` to `false`)
41 | 
42 | * splitting `normalized.txt` to `training.txt` and `testing.txt` according to the datasplit parameters in `config.txt`
43 | * building a wordindex `index.txt` this index is used to split the language models into files of equal size
44 | * creating absolute counts and continuation counts in the directories `absolute` and `continuation`
45 | ** the various models are stored in folders like `11111` meaning a regular 5 gram or `11011` meaning a skipped 5 gram at the third position
46 | * creation of testing samples from `testing.txt`: `testing-samples-4.txt` for example contains about 100k sequences of 4 words to be tested
47 | * calculating the D and N values for Modified Kneser Ney Smoothing and making them persistent in the two *.ser files (for speeding up various tests)
48 | * running the experiments by creating files like `mod-kneser-ney-complex-backoffToCont-3.txt`: depending on your configuration the files could be named with a `simple` instead of `complex` (complex meaning GLM, simple meaning LM). Exchanging the `3` you can have different model lenghts. These files contain the testing samples with the log of their probabilities.
49 | * you have to manually calculate the entropy by running the python script as an argument you might want to pass `mod*.txt`: in this way you can calculate the entropy for all files and experiments.
50 | 
51 | ## Citing the paper
52 | If this software or data is of any help to your research please be so fair and cite the [original publication](http://arxiv.org/pdf/1404.3377v1.pdf) which is also in the home directory of [this git repository](https://github.com/renepickhardt/generalized-language-modeling-toolkit/raw/master/A Generalized Language Model as the Combination of Skipped n-grams and Modified Kneser-Ney Smoothing.pdf).
53 | You might want to use the following bibtex entry:
54 | ```
55 | @inproceedings{Pickhardt:2014:GLM, 
56 |    author = {Pickhardt, Rene and Gottron, Thomas and Körner, Martin and  Wagner, Paul Georg and  Speicher, Till and  Staab, Steffen}, 
57 |    title = {A Generalized Language Model as the Combination of Skipped n-grams and Modified Kneser Ney Smoothing}, 
58 |    year = {2014}, 
59 |    booktitle = {ACL'14: Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics}, 
60 |  } 
61 | ```
62 | 
63 | ## History
64 | The Generalized Language models envolved from Paul Georg Wagner's and Till Speicher's Young Scientists project called [Typology](http://www.typology.de) which I advised in 2012.
65 | The Typology project played around and evaluated an idea I had (inspired by [the PhD thesis of Adam Schenker](http://scholarcommons.usf.edu/cgi/viewcontent.cgi?article=2466&context=etd)) of presenting text as a graph in which the edges would encode relationships (nowerdays known as skipped bi-grams). The Graph was used to produce an answer to the next word prediction problem applied to word suggestions in keyboards of modern smartphones.
66 | From the convincing results I developed the theory of Generalized Language models. 
67 | Most of the Code was written by my student assistent [Martin Körner](http://mkoerner.de/) who also created his [bachlor thesis](https://github.com/renepickhardt/generalized-language-modeling-toolkit/raw/master/bachelor-thesis-martin-koerner.pdf) about the implementation of a preliminary vesion of the Generalized Language Models. This thesis is a nice reference if you want to get an understanding of modified kneser ney smoothing for standard language models. In terms of notation and building of generalized language models it is outdated.
68 | 
69 | ## Questions, Feedback, Bugs
70 | If you have questions feel free to contact me via the issue tracker. on [my blog](http://www.rene-pickhardt.de) or in the paper you could find my mail address.
71 | 


--------------------------------------------------------------------------------
/bachelor-thesis-martin-koerner.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/bachelor-thesis-martin-koerner.pdf


--------------------------------------------------------------------------------
/calcentropy.py:
--------------------------------------------------------------------------------
 1 | """                                                                                                                                                    
 2 | author: rene pickhardt                                                                                                                                 
 3 |                                                                                                                                                        
 4 | give a list of files as arguments. The files contain test sequences                                                                                    
 5 | with probabilities according to a trained language model                                                                                               
 6 |                                                                                                                                                        
 7 | this code ist GPLv3                                                                                                                                    
 8 | """
 9 | from math import log
10 | import sys
11 | 
12 | def calc(arg):
13 |     f=open(arg,"r")
14 |     res = 0
15 |     zero = 0;
16 |     wc = 0;
17 |     for l in f:
18 | 	fl = float(l.split("\t")[1])
19 | 	if (fl==0):
20 |             zero = zero + 1
21 |             continue
22 | 	res=res + log(fl,2)
23 | 	wc = wc + len(l.split(" "))
24 |     print arg + "\t entropy: " +  str((res*-1)/wc) + "\tsequences with zeros: " + str(zero)
25 | 
26 | 
27 | 
28 | for arg in sys.argv:
29 |     if (arg==sys.argv[0]):
30 | 	continue
31 |     calc(arg)
32 | 


--------------------------------------------------------------------------------
/config.sample.txt:
--------------------------------------------------------------------------------
  1 | ##################################################################################################
  2 | #
  3 | #		Wellcome to the Generalized Language Model toolkit config file.
  4 | #
  5 | # !!!!!!!!!! copy config.sample.txt to config.txt !!!!!!!!
  6 | #
  7 | # this is the configuration file for the generalized language modelleing toolkit 
  8 | # you can configure everything such as model length and the place of your training data.
  9 | #
 10 | # also this software can be used to run in several stages if your data set is large and your machine
 11 | # got killed for some reason you don't have to redo the calculation. 
 12 | #
 13 | # if you have questions please send a mail to rene@rene-pickhardt.de
 14 | ##################################################################################################
 15 | 
 16 | ##################################################################################################
 17 | ### basic settings which will be most certainly be needed to change
 18 | ##################################################################################################
 19 | 
 20 | #directory from which we will start to work
 21 | outputDirectory = /media/mssd/datasets/glm/out/
 22 | 
 23 | #length of the model to be trained
 24 | modelLength = 5
 25 | 
 26 | #amount of threads that should be concurrently assigned to the program
 27 | numberOfCores = 4
 28 | 
 29 | #name of the input data set (this is supposed to be a subfolder of outputDirectory) in this folder the trainingfile should be named normalized.txt and should contain one sentence per line.
 30 | inputDataSet = wiki
 31 | 
 32 | #can be used for multiple languages
 33 | languages = en
 34 | 
 35 | ##################################################################################################
 36 | ### stages of the entire calculaten in the order they are being processed
 37 | ### usefull for big data sets. here if something goes wrong you don't have to start over again
 38 | ### set the following values to false for the stages of processing you wish to skip
 39 | ##################################################################################################
 40 | 
 41 | ### first the data sets are split to training and test data
 42 | splitData = true
 43 | 
 44 | ### state if the index of words should be build. The index is used to create subfiles for counting and aggregating sequences
 45 | buildIndex = true
 46 | 
 47 | ### if the absolute values for skipped sequences should be build
 48 | buildGLM = true
 49 | 
 50 | ### states if also all the continuation values should be build.
 51 | buildContinuationGLM = true
 52 | 
 53 | ### the absolute counts and continuation counts from the entire LM which are needed for the testing-samples
 54 | ### will be extracted and stored in testing-samples/ pay attantion. If your testing-samples are too large
 55 | ### you might run out of memory when running the experiment since all the data needed will be stored into main
 56 | ### memory
 57 | extractContinuationGLM = true
 58 | 
 59 | ### set this to true if you want to build a standard kneser ney (generalized) language model
 60 | buildKneserNey = true
 61 | 
 62 | ### set this to true if you want to build a modified kneser ney (generalized) language model
 63 | buildModKneserNey = true
 64 | 
 65 | # was not used for paper since there is currently an acompaning python script for the task
 66 | calculateEntropy = false
 67 | 
 68 | ### calculate a standard language model
 69 | kneserNeySimple = true
 70 | 
 71 | ### calculate a generalized language model
 72 | kneserNeyComplex = true
 73 | 
 74 | ### use absolute discounting for interpolated probabilities (this should be set to false for the standard (modified) kneser ney implementation)
 75 | backoffAbsolute = false
 76 | 
 77 | ### don't use any smoothing but just calculate conditional probabilities.
 78 | conditionalProbabilityOnly = false
 79 | 
 80 | ##################################################################################################
 81 | ### misc
 82 | ##################################################################################################
 83 | 
 84 | ### should be used to save space
 85 | deleteTempFiles = true
 86 | 
 87 | ### is useful for modified kneser ney smoothing
 88 | addSentenceTags = true
 89 | addFakeStartTag = true
 90 | 
 91 | ### number of decimal places that will be used for calculation of smoothing algorithms
 92 | decimalPlaces = 30
 93 | 
 94 | ##################################################################################################
 95 | ### configuration of training data 
 96 | ##################################################################################################
 97 | 
 98 | ### number of test queries which will be sampled from the test query set
 99 | numberOfQueries = 100000
100 | 
101 | ### used for splitting files in which the skipped ngrams are stored and for index building
102 | maxCountDivider = 1000
103 | 
104 | ##################################################################################################
105 | ### the following numbers are for creation of training, learning and testing data splits.
106 | ##################################################################################################
107 | 
108 | # 20 means that only 20% of the input data will be thrown away
109 | sampleRate = 0
110 | 
111 | # 90 means that 90% of data will be training data
112 | splitDataRatio = 2
113 | 
114 | splitTestRatio = 100
115 | 
116 | 


--------------------------------------------------------------------------------
/lib/README:
--------------------------------------------------------------------------------
1 | origin of bzip2.jar:
2 | http://www.kohsuke.org/bzip2/
3 | 
4 | origin of wikimlj
5 | https://code.google.com/p/wikixmlj/downloads/detail?name=wikixmlj-r43.jar
6 | 


--------------------------------------------------------------------------------
/lib/apache/commons-compress-1.5/LICENSE.txt:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/lib/apache/commons-compress-1.5/NOTICE.txt:
--------------------------------------------------------------------------------
1 | Apache Commons Compress
2 | Copyright 2002-2013 The Apache Software Foundation
3 | 
4 | This product includes software developed at
5 | The Apache Software Foundation (http://www.apache.org/).
6 | 


--------------------------------------------------------------------------------
/lib/apache/commons-compress-1.5/README.txt:
--------------------------------------------------------------------------------
 1 | Apache Commons Compress was derived from various sources, including:
 2 | 
 3 | Original BZip2 classes contributed by Keiron Liddle
 4 | <keiron@aftexsw.com>, Aftex Software to the Apache Ant project
 5 | They are based on a port of Julian Seward's libbzip2.
 6 | 
 7 | Original Tar classes from contributors of the Apache Ant project
 8 | 
 9 | Original Zip classes from contributors of the Apache Ant project
10 | 
11 | Original CPIO classes contributed by Markus Kuss and the jRPM project
12 | (jrpm.sourceforge.net)
13 | 


--------------------------------------------------------------------------------
/lib/apache/commons-compress-1.5/RELEASE-NOTES.txt:
--------------------------------------------------------------------------------
  1 |               Apache Commons Compress 1.5 RELEASE NOTES
  2 | 
  3 | Apache Commons Compress software defines an API for working with compression and archive formats.
  4 | These include: bzip2, gzip, pack200 and ar, cpio, jar, tar, zip, dump.
  5 | 
  6 | Release 1.5
  7 | 
  8 | Changes in this version include:
  9 | 
 10 | New features:
 11 | 
 12 | o CompressorStreamFactory has an option to create decompressing
 13 |   streams that decompress the full input for formats that support
 14 |   multiple concatenated streams.
 15 |   Issue: COMPRESS-220.
 16 | 
 17 | Fixed Bugs:
 18 | 
 19 | o Typo in CompressorStreamFactory Javadoc
 20 |   Issue: COMPRESS-218.
 21 |   Thanks to Gili.
 22 | o ArchiveStreamFactory's tar stream detection created false positives
 23 |   for AIFF files.
 24 |   Issue: COMPRESS-191.
 25 |   Thanks to Jukka Zitting.
 26 | o XZ for Java didn't provide an OSGi bundle.  Compress' dependency on
 27 |   it has now been marked optional so Compress itself can still be used
 28 |   in an OSGi context.
 29 |   Issue: COMPRESS-199.
 30 |   Thanks to Jukka Zitting.
 31 | o When specifying the encoding explicitly TarArchiveOutputStream would
 32 |   write unreadable names in GNU mode or even cause errors in POSIX
 33 |   mode for file names longer than 66 characters.
 34 |   Issue: COMPRESS-200.
 35 |   Thanks to Christian Schlichtherle.
 36 | o Writing TAR PAX headers failed if the generated entry name ended
 37 |   with a "/".
 38 |   Issue: COMPRESS-203.
 39 | o ZipArchiveInputStream sometimes failed to provide input to the
 40 |   Inflater when it needed it, leading to reads returning 0.
 41 |   Issue: COMPRESS-189.
 42 |   Thanks to Daniel Lowe.
 43 | o TarArchiveInputStream ignored the encoding for GNU long name
 44 |   entries.
 45 |   Issue: COMPRESS-212.
 46 | o TarArchiveInputStream could leave the second EOF record inside the
 47 |   stream it had just finished reading.
 48 |   Issue: COMPRESS-206.
 49 |   Thanks to Peter De Maeyer.
 50 | o DumpArchiveInputStream no longer implicitly closes the original
 51 |   input stream when it reaches the end of the archive.
 52 | o ZipArchiveInputStream now consumes the remainder of the archive when
 53 |   getNextZipEntry returns null.
 54 | o Unit tests could fail if the source tree was checked out to a
 55 |   directory tree containign spaces.
 56 |   Issue: COMPRESS-205.
 57 |   Thanks to Daniel Lowe.
 58 | o Fixed a potential ArrayIndexOutOfBoundsException when reading STORED
 59 |   entries from ZipArchiveInputStream.
 60 |   Issue: COMPRESS-219.
 61 | o CompressorStreamFactory can now be used without XZ for Java being
 62 |   available.
 63 |   Issue: COMPRESS-221.
 64 | 
 65 | Changes:
 66 | 
 67 | o Improved exception message if a zip archive cannot be read because
 68 |   of an unsupported compression method.
 69 |   Issue: COMPRESS-188.
 70 |   Thanks to Harald Kuhn.
 71 | o ArchiveStreamFactory has a setting for file name encoding that sets
 72 |   up encoding for ZIP and TAR streams.
 73 |   Issue: COMPRESS-192.
 74 |   Thanks to Jukka Zitting.
 75 | o TarArchiveEntry now has a method to verify its checksum.
 76 |   Issue: COMPRESS-191.
 77 |   Thanks to Jukka Zitting.
 78 | o Split/spanned ZIP archives are now properly detected by
 79 |   ArchiveStreamFactory but will cause an
 80 |   UnsupportedZipFeatureException when read.
 81 | o ZipArchiveInputStream now reads archives that start with a "PK00"
 82 |   signature.  Archives with this signatures are created when the
 83 |   archiver was willing to split the archive but in the end only needed
 84 |   a single segment - so didn't split anything.
 85 |   Issue: COMPRESS-208.
 86 | o TarArchiveEntry has a new constructor that allows setting linkFlag
 87 |   and preserveLeadingSlashes at the same time.
 88 |   Issue: COMPRESS-201.
 89 | o ChangeSetPerformer has a new perform overload that uses a ZipFile
 90 |   instance as input.
 91 |   Issue: COMPRESS-159.
 92 | o Garbage collection pressure has been reduced by reusing temporary
 93 |   byte arrays in classes.
 94 |   Issue: COMPRESS-172.
 95 |   Thanks to Thomas Mair.
 96 | o Can now handle zip extra field 0x5455 - Extended Timestamp.
 97 |   Issue: COMPRESS-210.
 98 |   Thanks to Julius Davies.
 99 | o handle zip extra field 0x7875 - Info Zip New Unix Extra Field.
100 |   Issue: COMPRESS-211.
101 |   Thanks to Julius Davies.
102 | o ZipShort, ZipLong, ZipEightByteInteger should implement Serializable
103 |   Issue: COMPRESS-213.
104 |   Thanks to Julius Davies.
105 | o better support for unix symlinks in ZipFile entries.
106 |   Issue: COMPRESS-214.
107 |   Thanks to Julius Davies.
108 | o ZipFile's initialization has been improved for non-Zip64 archives.
109 |   Issue: COMPRESS-215.
110 |   Thanks to Robin Power.
111 | o Updated XZ for Java dependency to 1.2 as this version provides
112 |   proper OSGi manifest attributes.
113 | 
114 | For complete information on Commons Compress, including instructions on how to submit bug reports,
115 | patches, or suggestions for improvement, see the Apache Commons Compress website:
116 | 
117 | http://commons.apache.org/compress/
118 | 


--------------------------------------------------------------------------------
/lib/apache/commons-compress-1.5/commons-compress-1.5-javadoc.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/commons-compress-1.5/commons-compress-1.5-javadoc.jar


--------------------------------------------------------------------------------
/lib/apache/commons-compress-1.5/commons-compress-1.5-sources.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/commons-compress-1.5/commons-compress-1.5-sources.jar


--------------------------------------------------------------------------------
/lib/apache/commons-compress-1.5/commons-compress-1.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/commons-compress-1.5/commons-compress-1.5.jar


--------------------------------------------------------------------------------
/lib/apache/commons-io-2.4/NOTICE.txt:
--------------------------------------------------------------------------------
1 | Apache Commons IO
2 | Copyright 2002-2012 The Apache Software Foundation
3 | 
4 | This product includes software developed by
5 | The Apache Software Foundation (http://www.apache.org/).
6 | 
7 | 


--------------------------------------------------------------------------------
/lib/apache/commons-io-2.4/RELEASE-NOTES.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/commons-io-2.4/RELEASE-NOTES.txt


--------------------------------------------------------------------------------
/lib/apache/commons-io-2.4/commons-io-2.4-javadoc.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/commons-io-2.4/commons-io-2.4-javadoc.jar


--------------------------------------------------------------------------------
/lib/apache/commons-io-2.4/commons-io-2.4-sources.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/commons-io-2.4/commons-io-2.4-sources.jar


--------------------------------------------------------------------------------
/lib/apache/commons-io-2.4/commons-io-2.4.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/commons-io-2.4/commons-io-2.4.jar


--------------------------------------------------------------------------------
/lib/apache/log4j-2.0-beta9/LICENSE.txt:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright 1999-2005 The Apache Software Foundation
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/lib/apache/log4j-2.0-beta9/NOTICE.txt:
--------------------------------------------------------------------------------
 1 | Apache Log4j
 2 | Copyright 1999-2012 Apache Software Foundation
 3 | 
 4 | This product includes software developed at
 5 | The Apache Software Foundation (http://www.apache.org/).
 6 | 
 7 | ResolverUtil.java
 8 | Copyright 2005-2006 Tim Fennell
 9 | 
10 | Dumbster SMTP test server
11 | Copyright 2004 Jason Paul Kitchen
12 | 


--------------------------------------------------------------------------------
/lib/apache/log4j-2.0-beta9/RELEASE-NOTES.txt:
--------------------------------------------------------------------------------
  1 | 
  2 |               Apache Log4j 2.0-beta9 RELEASE NOTES
  3 | 
  4 | The Apache Log4j 2 team is pleased to announce the Log4j 2.0-beta9 release!
  5 | 
  6 | Apache log4j is a well known framework for logging application behavior. Log4j 2 is an upgrade to
  7 | Log4j that provides significant improvements over its predecessor, Log4j 1.x, and provides
  8 | many of the improvements available in Logback while fixing some inherent problems in Logback's
  9 | architecture.
 10 | 
 11 | This is the eleventh release of Log4j 2 and is being made available to encourage use and feedback from the community.
 12 | 
 13 | Bug fixes and enhancements
 14 | 
 15 | Changes in this version include:
 16 | 
 17 | New features:
 18 | o LOG4J2-399:  Allow the default file rollover strategy to define the compression level. 
 19 | o LOG4J2-338:  Add TLSAppender. Also added missing license headers to several files. Thanks to Tibor Benke. 
 20 | o LOG4J2-253:  Added FAQ page to the site. 
 21 | o LOG4J2-362:  Add a diagram to the site (FAQ page) that explains when to use which jar. 
 22 | o LOG4J2-374:  Add more options to PatternLayout to display more detailed information about a Throwable. Thanks to Tibor Benke. 
 23 | o LOG4J2-383:  [Pattern Layout] Customize level names by length. 
 24 | o LOG4J2-384:  [Pattern Layout] Customize level names to lower-case. 
 25 | o LOG4J2-364:  Add WebLookup to retrieve information from the ServletContext. Thanks to David Nault. 
 26 | o LOG4J2-360:  Allow Plugins to have aliases. 
 27 | o LOG4J2-356:  Create a JSON Layout. 
 28 | o LOG4J2-341:  Enable XInclude for XML configurations. 
 29 | o LOG4J2-313:  Add JNDILookup plugin. Thanks to Woonsan Ko. 
 30 | o LOG4J2-305:  Ease porting from 1.x Logger.getRootLogger(): add LogManager.getRootLogger(). 
 31 | 
 32 | Fixed Bugs:
 33 | o LOG4J2-226:  Fix table of contents generation in pdf. 
 34 | o LOG4J2-395:  Allow classpath scheme when specifying configuration file location as a system property. Thanks to Abhinav Shah. 
 35 | o LOG4J2-393:  Initialize PluginManager once during configuration. Move advertisement setup into BaseConfiguration. 
 36 | o LOG4J2-391:  FlumePersistentManager now handles LockConflictExceptions in Berkeley Db. Thanks to Kamal Bahadur. 
 37 | o LOG4J2-380:  Use rollover date when substituting ${date} in the filePattern. 
 38 | o LOG4J2-322:  Centralized reflective use of Reflection#getCallerClass and properly handled its instability in various versions
 39 |         of Java. 
 40 | o LOG4J2-293:  Reset the Configuration if the ClassLoaderContextSelector creates a LoggerContext without a configuration
 41 |         location and then is later provided one. 
 42 | o LOG4J2-293:  Changed the ConfigurationFactory to recognize and properly use the classpath: URI scheme in addition to the
 43 |         classloader: URI scheme. Thanks to Abhinav Shah. 
 44 | o LOG4J2-359:  Changed the Servlet 3.0 auto-initializer so that it does nothing in a Servlet 2.5 or older application. This
 45 |         ensures behavioral consistency across containers. Thanks to Abhinav Shah. 
 46 | o LOG4J2-310:  Fixed issue where SMTPAppender did not send mails with error or fatal level without prior info event. Thanks to Olivier Lemasle. 
 47 | o LOG4J2-368:  Add PatternLayout constructor to Log4j 1.2 bridge for Velocity. 
 48 | o LOG4J2-333:  Match artifact ids with Maven module names. Thanks to Hervé Boutemy. 
 49 | o LOG4J2-367:  JMS appenders send two messages for one append. Thanks to David Parry. 
 50 | o LOG4J2-319:  Double stack trace logging when using %throwable in %style and %highlight. 
 51 | o LOG4J2-358:  NoSQLAppender using MongoDB provider ignores username and password attributes 
 52 | o LOG4J2-343:  Removed unnecessary generics from Appender interface and implementing classes. Thanks to Henning Schmiedehausen. 
 53 | o LOG4J2-351:  [OSGi] wrong Fragment-Host in manifest files. Thanks to Roland Weiglhofer. 
 54 | o LOG4J2-336:  AsyncLogger errors after multiple calls to LoggerContext.reconfigure(). Thanks to Andre Bogus. 
 55 | o LOG4J2-347:  Give the AsyncAppender thread a more descriptive name for easier debugging/profiling. Thanks to David Phillips. 
 56 | o LOG4J2-332:  Modified documentation to refer to SLF4J Binding instead of SLF4J Bridge. Thanks to Hervé Boutemy. 
 57 | o LOG4J2-342:  Ignore xml:base attributes. 
 58 | o LOG4J2-309:  Insure jars and distributions only have a single License and Notice file. 
 59 | o LOG4J2-320:  JPAAppender stops logging because META-INF/log4j-provider.properties is left open. 
 60 | o LOG4J2-335:  FlumePersistentManager's writer thread had high CPU usage. 
 61 | o LOG4J2-331:  Removed erroneous check for affected MongoDB records, which always returns zero on inserts. 
 62 | o LOG4J2-330:  Added a BSON Transformer so that MongoDB can persist Log4j events. 
 63 | o LOG4J2-329:  StatusLogger now only creates StatusData objects if they are the appropriate logging level. 
 64 | o LOG4J2-328:  FlumePersistentManager was calling Berkeley DB's count method too frequently. 
 65 | o LOG4J2-280:  Additional fix to make AsyncAppender threads daemon threads and improve their thread name. 
 66 | o LOG4J2-165:  The slf4j-ext jar is now an optional dependency of the SLF4J bridge. 
 67 | o LOG4J2-166:  RoutingAppender's default Route can now be an appender reference. 
 68 | o LOG4J2-299:  Add getThrowable method to ThrowableProxy. 
 69 | o LOG4J2-216:  ThrowableProxy no longer extends Throwable. 
 70 | o LOG4J2-311:  Synchronized flush() and close() methods in the XxxFileManager and OutputStreamManager classes. 
 71 | o LOG4J2-304:  Fixed Async Loggers memory leak. 
 72 | o LOG4J2-291:  Fixed JDBC, JPA, and NoSQL appenders so that the failover appender properly fails over on error. 
 73 | o LOG4J2-397:  Logger.info(Message) Javadoc is incorrect. Thanks to Yonatan Graber. 
 74 | 
 75 | Changes:
 76 | o LOG4J2-317:  Renamed FastFileAppender and FastRollingFileAppender to RandomAccessFileAppender
 77 |         and RollingRandomAccessFileAppender. Configurations using the Fast(Rolling)File element
 78 |         no longer work and should be modified to use the (Rolling)RandomAccessFile element. 
 79 | o Changed the "suppressExceptions" configuration attribute for all Appenders to "ignoreExceptions" to avoid
 80 |         confusion with Java 7 suppressed exceptions. Also renamed the Appender#isExceptionSuppressed() method to
 81 |         Appender#ignoreExceptions() to avoid the same confusion. All Appenders by default internally log and then ignore
 82 |         exceptions encountered while logging. Setting "ignoreExceptions" to "false" on an Appender causes it to allow
 83 |         exceptions to propagate to the caller. You must set "ignoreExceptions" to "false" for Appenders you are wrapping
 84 |         in the Failover Appender. 
 85 | o Changed the (relatively new) PatternLayout configuration attribute "suppressExceptions" to
 86 |         "alwaysWriteExceptions" to more correctly indicate what it does. As such, the meaning of this attribute has
 87 |         reversed (previous "true"s should become "false"s, and vice versa). Since this was an undocumented attribute up
 88 |         until now, it's unlikely this change will affect any users. 
 89 | o LOG4J2-355:  Add support for multiple SD-ELEMENTs in a RFC 5424 syslog message. Thanks to Tibor Benke. 
 90 | o Cleaned up tests and cleared up documentation for the JPA appender following the resolution of EclipseLink
 91 |         issue #412454. 
 92 | o LOG4J2-318:  Allow shutdown hook to be disabled in the configuration. 
 93 | o LOG4J2-312:  XML layout improvements (compact vs. pretty, namespace, namespace prefix, root element). 
 94 | o LOG4J2-388:  Update Java Mail dependency to 1.5.0 from 1.4.7. 
 95 | o LOG4J2-325:  Update JDBC tests to use H2 database 1.3.173 from 1.3.172. 
 96 | o LOG4J2-366:  Update commons-logging to 1.1.3 from 1.1.1. 
 97 | o LOG4J2-390:  Update HSQLDB dependency to 2.3.0 from 2.2.9. 
 98 | o LOG4J2-308:  Clarified which library versions were used in Async Loggers performance test. 
 99 | o LOG4J2-307:  Updated Async Loggers' LMAX Disruptor library from 3.0.1 to 3.2.0. 
100 | o LOG4J2-306:  Update JSON Jackson library to 2.2.2 from 2.2.1. 
101 | o LOG4J2-387:  Update Jackson dependency to 1.9.13 from 1.9.11. 
102 | o Improved site by adding quick jump-off page and menu for Javadoc links for all components. 
103 | 
104 | 
105 | Apache Log4j 2.0-beta9 requires a minimum of Java 6 to build and run. Basic compatibility with
106 | Log4j 1.x is provided through the log4j-1.2-api component, however it does not implement some of the
107 | very implementation specific classes and methods. The package names and Maven groupId have been changed to
108 | org.apache.logging.log4j to avoid any conflicts with log4j 1.x.
109 | 
110 | For complete information on Apache Log4j 2, including instructions on how to submit bug reports,
111 | patches, or suggestions for improvement, see the Apache Apache Log4j 2 website:
112 | 
113 | http://logging.apache.org/log4j/2.x/
114 | 
115 | 
116 | 


--------------------------------------------------------------------------------
/lib/apache/log4j-2.0-beta9/log4j-api-2.0-beta9-javadoc.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/log4j-2.0-beta9/log4j-api-2.0-beta9-javadoc.jar


--------------------------------------------------------------------------------
/lib/apache/log4j-2.0-beta9/log4j-api-2.0-beta9-sources.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/log4j-2.0-beta9/log4j-api-2.0-beta9-sources.jar


--------------------------------------------------------------------------------
/lib/apache/log4j-2.0-beta9/log4j-api-2.0-beta9.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/log4j-2.0-beta9/log4j-api-2.0-beta9.jar


--------------------------------------------------------------------------------
/lib/apache/log4j-2.0-beta9/log4j-core-2.0-beta9-javadoc.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/log4j-2.0-beta9/log4j-core-2.0-beta9-javadoc.jar


--------------------------------------------------------------------------------
/lib/apache/log4j-2.0-beta9/log4j-core-2.0-beta9-sources.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/log4j-2.0-beta9/log4j-core-2.0-beta9-sources.jar


--------------------------------------------------------------------------------
/lib/apache/log4j-2.0-beta9/log4j-core-2.0-beta9.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/apache/log4j-2.0-beta9/log4j-core-2.0-beta9.jar


--------------------------------------------------------------------------------
/lib/lucene-analyzers-common-4.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/lucene-analyzers-common-4.0.0.jar


--------------------------------------------------------------------------------
/lib/lucene-core-4.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/lucene-core-4.0.0.jar


--------------------------------------------------------------------------------
/lib/lucene-queryparser-4.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/lucene-queryparser-4.0.0.jar


--------------------------------------------------------------------------------
/lib/mongo-2.9.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/mongo-2.9.1.jar


--------------------------------------------------------------------------------
/lib/mysql/mysql-connector-java-5.1.24-bin.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renepickhardt/generalized-language-modeling-toolkit/803cc43f2f870c97c3ae74d0f63e7c74c8da3280/lib/mysql/mysql-connector-java-5.1.24-bin.jar


--------------------------------------------------------------------------------
/metriken.sh:
--------------------------------------------------------------------------------
  1 |  grep "KSS: 19" typo-4-7095.log | wc -l
  2 |  grep "KSS: 18" typo-4-7095.log | wc -l
  3 |  grep "KSS: 17" typo-4-7095.log | wc -l
  4 |  grep "KSS: 16" typo-4-7095.log | wc -l
  5 |  grep "KSS: 15" typo-4-7095.log | wc -l
  6 |  grep "KSS: 14" typo-4-7095.log | wc -l
  7 |  grep "KSS: 13" typo-4-7095.log | wc -l
  8 |  grep "KSS: 12" typo-4-7095.log | wc -l
  9 |  grep "KSS: 11" typo-4-7095.log | wc -l
 10 |  grep "KSS: 10" typo-4-7095.log | wc -l
 11 |  grep "KSS: 9" typo-4-7095.log | wc -l
 12 |  grep "KSS: 8" typo-4-7095.log | wc -l
 13 |  grep "KSS: 7" typo-4-7095.log | wc -l
 14 |  grep "KSS: 6" typo-4-7095.log | wc -l
 15 |  grep "KSS: 5" typo-4-7095.log | wc -l
 16 |  grep "KSS: 4" typo-4-7095.log | wc -l
 17 |  grep "KSS: 3" typo-4-7095.log | wc -l
 18 |  grep "KSS: 2" typo-4-7095.log | wc -l
 19 |  grep "KSS: 1" typo-4-7095.log | wc -l
 20 | 
 21 | 
 22 | #grep "KSS" typo-4-7095.log | grep "PREFIXLENGHT: 3"
 23 | 
 24 | echo "no matches:"
 25 | grep "NOTHING" typo-4-7095.log | wc -l
 26 | echo "matches:"
 27 | grep "HIT" typo-4-7095.log | wc -l
 28 | 
 29 | echo "matches on rank 1 any prefix"
 30 | grep "HIT" typo-4-7095.log | grep "RANK: 1" | wc -l
 31 | 
 32 | echo "matches on rank 2 any prefix"
 33 | grep "HIT" typo-4-7095.log | grep "RANK: 2" | wc -l
 34 | 
 35 | echo "matches on rank 3 any prefix"
 36 | grep "HIT" typo-4-7095.log | grep "RANK: 3" | wc -l
 37 | 
 38 | echo "matches on rank 4 any prefix"
 39 | grep "HIT" typo-4-7095.log | grep "RANK: 4" | wc -l
 40 | 
 41 | echo "matches on rank 5 any prefix"
 42 | grep "HIT" typo-4-7095.log | grep "RANK: 5" | wc -l
 43 | 
 44 | 
 45 | echo "total number of hits with prefix 0:"
 46 | HIT=`grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 0" | wc -l`
 47 | echo $HIT
 48 | echo "total number of NO HITS with prefix 0:"
 49 | NO=`grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 0" | wc -l`
 50 | echo $NO
 51 | SUM=$((HIT + NO))
 52 | HIT=$((HIT * 1000))
 53 | frac=$((HIT / SUM))
 54 | echo $frac "Promille hits"
 55 | 
 56 | echo "matches with on rank 1 and prefix 0:"
 57 | grep "HIT" typo-4-7095.log | grep "RANK: 1" | grep "PREFIXLENGHT: 0" | wc -l
 58 | 
 59 | echo "matches with on rank 2 and prefix 0:"
 60 | grep "HIT" typo-4-7095.log | grep "RANK: 2" | grep "PREFIXLENGHT: 0" | wc -l
 61 | 
 62 | echo "matches with on rank 3 and prefix 0:"
 63 | grep "HIT" typo-4-7095.log | grep "RANK: 3" | grep "PREFIXLENGHT: 0" | wc -l
 64 | 
 65 | echo "matches with on rank 4 and prefix 0:"
 66 | grep "HIT" typo-4-7095.log | grep "RANK: 4" | grep "PREFIXLENGHT: 0" | wc -l
 67 | 
 68 | echo "matches with on rank 5 and prefix 0:"
 69 | grep "HIT" typo-4-7095.log | grep "RANK: 5" | grep "PREFIXLENGHT: 0" | wc -l
 70 | 
 71 | 
 72 | 
 73 | 
 74 | echo "total number of hits with prefix 1:"
 75 | grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 1" | wc -l
 76 | echo "total number of NO HITS with prefix 1:"
 77 | grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 1" | wc -l
 78 | 
 79 | 
 80 | echo "matches with on rank 1 and prefix 1:"
 81 | grep "HIT" typo-4-7095.log | grep "RANK: 1" | grep "PREFIXLENGHT: 1" | wc -l
 82 | 
 83 | echo "matches with on rank 2 and prefix 1:"
 84 | grep "HIT" typo-4-7095.log | grep "RANK: 2" | grep "PREFIXLENGHT: 1" | wc -l
 85 | 
 86 | echo "matches with on rank 3 and prefix 1:"
 87 | grep "HIT" typo-4-7095.log | grep "RANK: 3" | grep "PREFIXLENGHT: 1" | wc -l
 88 | 
 89 | echo "matches with on rank 4 and prefix 1:"
 90 | grep "HIT" typo-4-7095.log | grep "RANK: 4" | grep "PREFIXLENGHT: 1" | wc -l
 91 | 
 92 | echo "matches with on rank 5 and prefix 1:"
 93 | grep "HIT" typo-4-7095.log | grep "RANK: 5" | grep "PREFIXLENGHT: 1" | wc -l
 94 | 
 95 | 
 96 | 
 97 | 
 98 | 
 99 | echo "total number of hits with prefix 2:"
100 | grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 2" | wc -l
101 | echo "total number of NO HITS with prefix 2:"
102 | grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 2" | wc -l
103 | 
104 | 
105 | echo "matches with on rank 1 and prefix 2:"
106 | grep "HIT" typo-4-7095.log | grep "RANK: 1" | grep "PREFIXLENGHT: 2" | wc -l
107 | 
108 | echo "matches with on rank 2 and prefix 2:"
109 | grep "HIT" typo-4-7095.log | grep "RANK: 2" | grep "PREFIXLENGHT: 2" | wc -l
110 | 
111 | echo "matches with on rank 3 and prefix 2:"
112 | grep "HIT" typo-4-7095.log | grep "RANK: 3" | grep "PREFIXLENGHT: 2" | wc -l
113 | 
114 | echo "matches with on rank 4 and prefix 2:"
115 | grep "HIT" typo-4-7095.log | grep "RANK: 4" | grep "PREFIXLENGHT: 2" | wc -l
116 | 
117 | echo "matches with on rank 5 and prefix 2:"
118 | grep "HIT" typo-4-7095.log | grep "RANK: 5" | grep "PREFIXLENGHT: 2" | wc -l
119 | 
120 | 
121 | 
122 | 
123 | echo "total number of hits with prefix 3:"
124 | grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 3" | wc -l
125 | echo "total number of NO HITS with prefix 3:"
126 | grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 3" | wc -l
127 | 
128 | 
129 | echo "matches with on rank 1 and prefix 3:"
130 | grep "HIT" typo-4-7095.log | grep "RANK: 1" | grep "PREFIXLENGHT: 3" | wc -l
131 | 
132 | echo "matches with on rank 2 and prefix 3:"
133 | grep "HIT" typo-4-7095.log | grep "RANK: 2" | grep "PREFIXLENGHT: 3" | wc -l
134 | 
135 | echo "matches with on rank 3 and prefix 3:"
136 | grep "HIT" typo-4-7095.log | grep "RANK: 3" | grep "PREFIXLENGHT: 3" | wc -l
137 | 
138 | echo "matches with on rank 4 and prefix 3:"
139 | grep "HIT" typo-4-7095.log | grep "RANK: 4" | grep "PREFIXLENGHT: 3" | wc -l
140 | 
141 | echo "matches with on rank 5 and prefix 3:"
142 | grep "HIT" typo-4-7095.log | grep "RANK: 5" | grep "PREFIXLENGHT: 3" | wc -l
143 | 
144 | 
145 | 
146 | echo "total number of hits with prefix 0:"
147 | HIT=`grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 0" | wc -l`
148 | echo $HIT
149 | echo "total number of NO HITS with prefix 0:"
150 | NO=`grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 0" | wc -l`
151 | echo $NO
152 | SUM=$((HIT + NO))
153 | HIT=$((HIT * 1000))
154 | frac=$((HIT / SUM))
155 | echo $frac "Promille hits"
156 | 
157 | echo "total number of hits with prefix 1:"
158 | HIT=`grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 1" | wc -l`
159 | echo $HIT
160 | echo "total number of NO HITS with prefix 1:"
161 | NO=`grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 1" | wc -l`
162 | echo $NO
163 | SUM=$((HIT + NO))
164 | HIT=$((HIT * 1000))
165 | frac=$((HIT / SUM))
166 | echo $frac "Promille hits"
167 | 
168 | echo "total number of hits with prefix 2:"
169 | HIT=`grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 2" | wc -l`
170 | echo $HIT
171 | echo "total number of NO HITS with prefix 2:"
172 | NO=`grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 2" | wc -l`
173 | echo $NO
174 | SUM=$((HIT + NO))
175 | HIT=$((HIT * 1000))
176 | frac=$((HIT / SUM))
177 | echo $frac "Promille hits"
178 | 
179 | echo "total number of hits with prefix 3:"
180 | HIT=`grep "HIT" typo-4-7095.log | grep "PREFIXLENGHT: 3" | wc -l`
181 | echo $HIT
182 | echo "total number of NO HITS with prefix 3:"
183 | NO=`grep "NOTHING" typo-4-7095.log | grep "PREFIXLENGTH: 3" | wc -l`
184 | echo $NO
185 | SUM=$((HIT + NO))
186 | HIT=$((HIT * 1000))
187 | frac=$((HIT / SUM))
188 | echo $frac "Promille hits"
189 | 


--------------------------------------------------------------------------------
/mvn.sh:
--------------------------------------------------------------------------------
1 | ulimit -v 20000000
2 | mvn clean
3 | mvn compile
4 | nice mvn exec:java -Dexec.mainClass="de.typology.executables.KneserNeyBuilder" -Dfile.encoding=UTF-8
5 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 2 |   <properties>
 3 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 4 |   </properties>
 5 |   <modelVersion>4.0.0</modelVersion>
 6 |   <groupId>de.typology</groupId>
 7 |   <artifactId>typology</artifactId>
 8 |   <version>0.0.1-SNAPSHOT</version>
 9 |   <build>
10 |     <!-- http://stackoverflow.com/questions/224373/handling-unconventional-source-directory-for-a-web-project-in-maven -->
11 |     <sourceDirectory>src</sourceDirectory>
12 |     <plugins>
13 |       <plugin>
14 |         <artifactId>maven-compiler-plugin</artifactId>
15 |         <version>2.3.2</version>
16 |         <configuration>
17 |           <source>1.6</source>
18 |           <target>1.6</target>
19 |           <encoding>UTF-8</encoding>
20 |         </configuration>
21 |       </plugin>
22 |       <plugin>
23 |         <groupId>org.codehaus.mojo</groupId>
24 |         <artifactId>exec-maven-plugin</artifactId>
25 |         <version>1.2.1</version>
26 |       </plugin>
27 |       <plugin>
28 |         <artifactId>maven-surefire-plugin</artifactId>
29 |         <version>2.16</version>
30 |         <configuration>
31 |           <argLine>-Xmx6024m</argLine>
32 |         </configuration>
33 |       </plugin>
34 |       <plugin>
35 | 	<artifactId>maven-assembly-plugin</artifactId>
36 | 	<configuration>
37 | 	  <archive>
38 | 	    <manifest>
39 | 	      <mainClass>de.typology.executables.KneserNeyBuilder</mainClass>
40 | 	    </manifest>
41 | 	  </archive>
42 | 	  <descriptorRefs>
43 | 	    <descriptorRef>jar-with-dependencies</descriptorRef>
44 | 	  </descriptorRefs>
45 | 	</configuration>
46 |       </plugin>
47 |     </plugins>
48 |   </build>
49 |   <dependencies>
50 |     <dependency>
51 |       <groupId>mysql</groupId>
52 |       <artifactId>mysql-connector-java</artifactId>
53 |       <version>5.1.22</version>
54 |     </dependency>
55 |     <dependency>
56 |       <groupId>org.apache.commons</groupId>
57 |       <artifactId>commons-io</artifactId>
58 |       <version>1.3.2</version>
59 |     </dependency>
60 |     <dependency>
61 |       <groupId>org.apache.logging.log4j</groupId>
62 |       <artifactId>log4j-api</artifactId>
63 |       <version>2.0-beta9</version>
64 |     </dependency>
65 |     <dependency>
66 |       <groupId>org.apache.logging.log4j</groupId>
67 |       <artifactId>log4j-core</artifactId>
68 |       <version>2.0-beta9</version>
69 |     </dependency>
70 |     <dependency>
71 |       <groupId>org.apache.commons</groupId>
72 |       <artifactId>commons-lang3</artifactId>
73 |       <version>3.2</version>
74 |     </dependency>
75 |     <dependency>
76 |       <groupId>junit</groupId>
77 |       <artifactId>junit</artifactId>
78 |       <version>4.11</version>
79 |       <scope>test</scope>
80 |     </dependency>
81 |   </dependencies>
82 | </project>
83 | 


--------------------------------------------------------------------------------
/scripts/README.md:
--------------------------------------------------------------------------------
 1 | #Scripts
 2 | ##Content
 3 | Contains scripts to perform to following tasks:
 4 | + Insert the different language models into a MySQL and build indices
 5 | + Aggregate the evaluation data
 6 | + Plot the aggregated evaluation data
 7 | 
 8 | ##Structure
 9 | To be continued...
10 | 
11 | ##Configuration
12 | To be continued...
13 | 


--------------------------------------------------------------------------------
/scripts/combineLM.sh:
--------------------------------------------------------------------------------
 1 | # combine ngrams
 2 | cd $1
 3 | for n in 2 3 4 5 
 4 | do
 5 | cd ${n}/
 6 | for l in A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 7 | do
 8 | cat ${l}* > ../${l}.${n}n
 9 | rm ${l}*
10 | mv ../${l}.${n}n .
11 | done
12 | 
13 | 
14 | for l in a b c d e f g h i j k l m n p q r s t u v w x y z
15 | do
16 | cat ${l}* > ../${l}.${n}n
17 | rm ${l}*
18 | mv ../${l}.${n}n .
19 | done
20 | 
21 | 
22 | mv other.${n}n ../
23 | cat o* > ../o.${n}n
24 | rm o*
25 | mv ../o.${n}n .
26 | mv ../other.${n}n .
27 | cd ../
28 | done
29 | 


--------------------------------------------------------------------------------
/scripts/combineTypo.sh:
--------------------------------------------------------------------------------
 1 | # combine typology
 2 | cd $1
 3 | for n in 1 2 3 4
 4 | do
 5 | cd ${n}/
 6 | for l in A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 7 | do
 8 | cat ${l}* > ../${l}.${n}es
 9 | rm ${l}*
10 | mv ../${l}.${n}es .
11 | done
12 | 
13 | 
14 | for l in a b c d e f g h i j k l m n p q r s t u v w x y z
15 | do
16 | cat ${l}* > ../${l}.${n}es
17 | rm ${l}*
18 | mv ../${l}.${n}es .
19 | done
20 | 
21 | 
22 | mv other.${n}n ../
23 | cat o* > ../o.${n}es
24 | rm o*
25 | mv ../o.${n}es .
26 | mv ../other.${n}es .
27 | cd ../
28 | done
29 | 


--------------------------------------------------------------------------------
/scripts/combinefiles.sh:
--------------------------------------------------------------------------------
 1 | # combine ngrams
 2 | cd $1
 3 | for n in 2 3 4 5 
 4 | do
 5 | cd ${n}/
 6 | for l in A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
 7 | do
 8 | cat ${l}* > ../${l}.${n}n
 9 | rm ${l}*
10 | mv ../${l}.${n}n .
11 | done
12 | 
13 | 
14 | for l in a b c d e f g h i j k l m n p q r s t u v w x y z
15 | do
16 | cat ${l}* > ../${l}.${n}n
17 | rm ${l}*
18 | mv ../${l}.${n}n .
19 | done
20 | 
21 | 
22 | mv other.${n}n ../
23 | cat o* > ../o.${n}n
24 | rm o*
25 | mv ../o.${n}n .
26 | mv ../other.${n}n .
27 | cd ../
28 | done
29 | 
30 | 
31 | 
32 | # combine typology
33 | cd $1
34 | for n in 1 2 3 4
35 | do
36 | cd ${n}/
37 | for l in A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
38 | do
39 | cat ${l}* > ../${l}.${n}es
40 | rm ${l}*
41 | mv ../${l}.${n}es .
42 | done
43 | 
44 | 
45 | for l in a b c d e f g h i j k l m n p q r s t u v w x y z
46 | do
47 | cat ${l}* > ../${l}.${n}es
48 | rm ${l}*
49 | mv ../${l}.${n}es .
50 | done
51 | 
52 | 
53 | mv other.${n}n ../
54 | cat o* > ../o.${n}es
55 | rm o*
56 | mv ../o.${n}es .
57 | mv ../other.${n}es .
58 | cd ../
59 | done
60 | 


--------------------------------------------------------------------------------
/scripts/index-glm.sh:
--------------------------------------------------------------------------------
 1 | dbUser="importer"
 2 | modelLength=5
 3 | topK=5
 4 | 
 5 | echo $1
 6 | inputPath=${1/\/glm*/}
 7 | dbLang=${inputPath##*/}
 8 | inputPath=${inputPath%/*}
 9 | dbType=${inputPath##*/}
10 | echo $inputPath
11 | echo $dbLang
12 | echo $dbType
13 | 
14 | dbName=$dbType"_"$dbLang
15 | echo $dbName
16 | 
17 | 
18 | #dbPath="/mnt/vdb/typoeval/mysql/${dbName}/" #server
19 | dbPath=/var/lib/mysql/${dbName}/ #local machine
20 | 
21 | mysql -u ${dbUser} -e "drop database ${dbName};"
22 | mysql -u ${dbUser} -e "create database ${dbName};"
23 | 
24 | twoPowerModelLength=2**$modelLength
25 | 
26 | for (( sequence=1 ; sequence < $twoPowerModelLength ; sequence++ )); do
27 |   if [[ $sequence%2 -eq 0 ]]; then
28 |     continue
29 |   fi
30 | 
31 |   echo $sequence
32 |   sequenceBinary=`echo "obase=2;$sequence" | bc`
33 |   echo $sequenceBinary
34 | 
35 |   path=$1$sequenceBinary"/*"
36 |   for file in $path
37 |     do
38 |     xpath=${file%/*}
39 |     xbase=${file##*/}
40 |     xfext=${xbase##*.}
41 |     xpref=${xbase%.*}
42 |     tableName=$xfext"_"$xpref
43 |     echo "tableName: "$tableName
44 |     echo "xpath: "$xpath
45 |     echo "xbase: "$xbase
46 |     echo "xfext: "$xfext
47 |     echo "xpref: "$xpref
48 | 
49 |     indexQuery="create table "$tableName" ("
50 |     indexSuffix="("
51 |     importQuery="load data local infile '"$file"' into table "$tableName" fields terminated by '\t' enclosed by '' lines terminated by '\n' ("
52 |     sequenceLengthMinusOne=`expr ${#sequenceBinary} - 1`
53 |     for (( sequencePointer=0; sequencePointer<$sequenceLengthMinusOne; sequencePointer++ )); do
54 |       currentBit=${sequenceBinary:$sequencePointer:1}
55 |       if [ $currentBit -eq 1 ]; then
56 |         indexQuery=$indexQuery"source"$sequencePointer" varchar(60), "
57 |         indexSuffix=$indexSuffix"source"$sequencePointer", "
58 |         importQuery=$importQuery"source"$sequencePointer", "
59 |       fi
60 |     done;
61 |     indexQuery=$indexQuery"target varchar(60), score float) engine=myisam character set utf8 collate utf8_bin;"
62 |     if [ $sequence -ne 1 ]; then
63 |       indexQuery=$indexQuery" create index "$tableName"_0_ix on "$tableName$indexSuffix"score desc);"
64 |     fi
65 | 
66 |     for (( i=1 ; i <= $topK ; i++ )); do
67 |       if [ $sequence -eq 1 ]; then
68 |         indexQuery=$indexQuery" create index "$tableName"_"$i"_ix on "$tableName" (target("$i"), score desc);"
69 |       else
70 |         indexQuery=$indexQuery" create index "$tableName"_"$i"_ix on "$tableName" "$indexSuffix"target("$i"));"
71 |       fi
72 |     done;
73 | 
74 |     importQuery=$importQuery"target, score);"
75 | 
76 |     #create tables and indices
77 |     mysql -u ${dbUser} $dbName --local-infile=1 -e "$indexQuery"
78 | 
79 |     #disable indices
80 |     myisamchk --keys-used=0 -rq ${dbPath}${tableName}
81 |     mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;"
82 | 
83 |     #import data
84 |     mysql -u ${dbUser} $dbName --local-infile=1 -e "$importQuery"
85 | 
86 |     #compress table / really necessary?
87 |     myisampack ${dbPath}${tableName}
88 | 
89 |     #enable index
90 |     myisamchk -rq ${dbPath}${tableName} --tmpdir="/mnt/vdb/tmp" --sort_buffer=3G #--sort-index --sort-records=1
91 | 
92 |     #and flush index again
93 |     mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;"
94 |   done;
95 | done;
96 | 
97 | 


--------------------------------------------------------------------------------
/scripts/index-kneser-ney.sh:
--------------------------------------------------------------------------------
  1 | dbUser="importer"
  2 | modelLength=5
  3 | topK=5
  4 | 
  5 | # call with e.g.:
  6 | # ./index-kneser-ney.sh out/wiki/test/ kneser-ney
  7 | # or
  8 | # ./index-kneser-ney.sh out/wiki/test/ mod-kneser-ney
  9 | 
 10 | echo $1
 11 | echo $2
 12 | inputPath=${1%/}
 13 | dbLang=${inputPath##*/}
 14 | inputPath=${inputPath%/*}
 15 | dbDataSet=${inputPath##*/}
 16 | dbType=${2//-/_}
 17 | echo "inputpath: "$inputPath
 18 | echo "lang: "$dbLang
 19 | echo "dbDataSet: "$dbDataSet
 20 | echo "dbType: "$dbType
 21 | 
 22 | 
 23 | 
 24 | buildResultTable () {
 25 |   xpath=${file%/*}
 26 |   xbase=${file##*/}
 27 |   xfext=${xbase##*.}
 28 |   xpref=${xbase%.*}
 29 |   tableName=$xfext"_"$xpref
 30 |   echo "tableName: "$tableName
 31 |   echo "xpath: "$xpath
 32 |   echo "xbase: "$xbase
 33 |   echo "xfext: "$xfext
 34 |   echo "xpref: "$xpref
 35 | 
 36 |   indexQuery="create table "$tableName" ("
 37 |   indexSuffix="("
 38 |   importQuery="load data local infile '"$file"' into table "$tableName" fields terminated by '\t' enclosed by '' lines terminated by '\n' ("
 39 |   sequenceLengthMinusOne=`expr ${#sequenceBinary} - 1`
 40 |   for (( sequencePointer=0; sequencePointer<$sequenceLengthMinusOne; sequencePointer++ )); do
 41 |     currentBit=${sequenceBinary:$sequencePointer:1}
 42 |     if [ $currentBit -eq 1 ]; then
 43 |       indexQuery=$indexQuery"source"$sequencePointer" varchar(60), "
 44 |       indexSuffix=$indexSuffix"source"$sequencePointer", "
 45 |       importQuery=$importQuery"source"$sequencePointer", "
 46 |     fi
 47 |   done
 48 |   indexQuery=$indexQuery"target varchar(60), score float) engine=myisam character set utf8 collate utf8_bin;"
 49 |   if [ $sequence -ne 1 ]; then
 50 |     indexQuery=$indexQuery" create index "$tableName"_0_ix on "$tableName$indexSuffix"score desc);"
 51 |   fi
 52 | 
 53 |   for (( i=1 ; i <= $topK ; i++ )); do
 54 |     if [ $sequence -eq 1 ]; then
 55 |       indexQuery=$indexQuery" create index "$tableName"_"$i"_ix on "$tableName" (target("$i"), score desc);"
 56 |     else
 57 |       indexQuery=$indexQuery" create index "$tableName"_"$i"_ix on "$tableName" "$indexSuffix"target("$i"));"
 58 |     fi
 59 |   done
 60 |   importQuery=$importQuery"target, score);"
 61 | }
 62 | 
 63 | buildDiscountTable () {
 64 |   xpath=${file%/*}
 65 |   xbase=${file##*/}
 66 |   xfext=${xbase##*.}
 67 |   xpref=${xbase%.*}
 68 |   tableName=$xfext"_"$xpref
 69 |   echo "tableName: "$tableName
 70 |   echo "xpath: "$xpath
 71 |   echo "xbase: "$xbase
 72 |   echo "xfext: "$xfext
 73 |   echo "xpref: "$xpref
 74 | 
 75 |   indexQuery="create table "$tableName" ("
 76 |   indexSuffix="("
 77 |   importQuery="load data local infile '"$file"' into table "$tableName" fields terminated by '\t' enclosed by '' lines terminated by '\n' ("
 78 |   sequenceLengthMinusOne=`expr ${#sequenceBinary} - 1`
 79 |   for (( sequencePointer=0; sequencePointer<=$sequenceLengthMinusOne; sequencePointer++ )); do
 80 |     currentBit=${sequenceBinary:$sequencePointer:1}
 81 |     if [ $currentBit -eq 1 ]; then
 82 |       indexQuery=$indexQuery"source"$sequencePointer" varchar(60), "
 83 |       indexSuffix=$indexSuffix"source"$sequencePointer", "
 84 |       importQuery=$importQuery"source"$sequencePointer", "
 85 |     fi
 86 |   done
 87 |   indexQuery=$indexQuery"score float) engine=myisam character set utf8 collate utf8_bin;"
 88 |   importQuery=$importQuery"score);"
 89 | 
 90 |   # remove ", " from indexSuffix
 91 |   indexSuffix=${indexSuffix%?}
 92 |   indexSuffix=${indexSuffix%?}
 93 |   indexSuffix=$indexSuffix")"
 94 |   indexQuery=$indexQuery" create index "$tableName" on "$tableName$indexSuffix";"
 95 |   echo $indexQuery
 96 | }
 97 | 
 98 | buildIndices () {
 99 | 
100 |   #create tables and indices
101 |   mysql -u ${dbUser} $dbName --local-infile=1 -e "$indexQuery"
102 | 
103 |   #disable indices
104 |   myisamchk --keys-used=0 -rq ${dbPath}${tableName}
105 |   mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;"
106 | 
107 |   #import data
108 |   mysql -u ${dbUser} $dbName --local-infile=1 -e "$importQuery"
109 | 
110 |   #compress table / really necessary?
111 |   myisampack ${dbPath}${tableName}
112 | 
113 |   #enable index
114 |   myisamchk -rq ${dbPath}${tableName} --tmpdir="/mnt/vdb/tmp" --sort_buffer=3G #--sort-index --sort-records=1
115 | 
116 |   #and flush index again
117 |   mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;"
118 | }
119 | 
120 | buildDatabase () {
121 | echo "dbName: "$dbName
122 | #dbPath="/mnt/vdb/typoeval/mysql/${dbName}/" #server
123 | dbPath=/var/lib/mysql/${dbName}/ #local machine
124 | echo "create database ${dbName};"
125 | mysql -u ${dbUser} -e "drop database \`${dbName}\`;"
126 | mysql -u ${dbUser} -e "create database \`${dbName}\`;"
127 | }
128 | 
129 | #dbName=$dbDataSet"_"$dbLang"_"$dbType"_high"
130 | #buildDatabase
131 | #twoPowerModelLength=2**$modelLength
132 | #
133 | #for (( sequence=1 ; sequence < $twoPowerModelLength ; sequence++ )); do
134 | #  if [[ $sequence%2 -eq 0 ]]; then
135 | #    continue
136 | #  fi
137 | #
138 | #  echo $sequence
139 | #  sequenceBinary=`echo "obase=2;$sequence" | bc`
140 | #  echo $sequenceBinary
141 | #
142 | #  path=$1"/kneser-ney-high/"$sequenceBinary"/*"
143 | #  for file in $path; do
144 | #    buildResultTable
145 | #    buildIndices
146 | #  done
147 | #done
148 | #
149 | #dbName=$dbDataSet"_"$dbLang"_"$dbType"_low"
150 | #buildDatabase
151 | #
152 | #modelLengthMinusOne=`expr $modelLength - 1`
153 | #twoPowerModelLengthMinusOne=2**$modelLengthMinusOne
154 | #for (( sequence=1 ; sequence < $twoPowerModelLengthMinusOne ; sequence++ )); do
155 | #  if [[ $sequence%2 -eq 0 ]]; then
156 | #    continue
157 | #  fi
158 | #
159 | #  echo $sequence
160 | #  sequenceBinary=`echo "obase=2;$sequence" | bc`
161 | #  echo $sequenceBinary
162 | #
163 | #  path=$1"/kneser-ney-low/"$sequenceBinary"/*"
164 | #  for file in $path; do
165 | #    buildResultTable
166 | #    buildIndices
167 | #  done
168 | #done
169 | 
170 | dbName=$dbDataSet"_"$dbLang"_"$dbType"_high_discount"
171 | buildDatabase
172 | 
173 | modelLengthMinusOne=`expr $modelLength - 1`
174 | twoPowerModelLengthMinusOne=2**$modelLengthMinusOne
175 | for (( sequence=1 ; sequence < $twoPowerModelLengthMinusOne ; sequence++ )); do
176 |   if [[ $sequence%2 -eq 0 ]]; then
177 |     continue
178 |   fi
179 | 
180 |   echo $sequence
181 |   sequenceBinary=`echo "obase=2;$sequence" | bc`
182 |   echo $sequenceBinary
183 | 
184 |   path=$1"/kneser-ney-high-discount/"$sequenceBinary"/*"
185 |   for file in $path; do
186 |     buildDiscountTable
187 |     buildIndices
188 |   done
189 | done
190 | 
191 | dbName=$dbDataSet"_"$dbLang"_"$dbType"_low_discount"
192 | buildDatabase
193 | modelLengthMinusTwo=`expr $modelLength - 2`
194 | twoPowerModelLengthMinusTwo=2**$modelLengthMinusTwo
195 | for (( sequence=1 ; sequence < $twoPowerModelLengthMinusTwo ; sequence++ )); do
196 |   if [[ $sequence%2 -eq 0 ]]; then
197 |     continue
198 |   fi
199 | 
200 |   echo $sequence
201 |   sequenceBinary=`echo "obase=2;$sequence" | bc`
202 |   echo $sequenceBinary
203 | 
204 |   path=$1"/kneser-ney-low-discount/"$sequenceBinary"/*"
205 |   for file in $path; do
206 |     buildDiscountTable
207 |     buildIndices
208 |   done
209 | done
210 | 


--------------------------------------------------------------------------------
/scripts/index-lm.sh:
--------------------------------------------------------------------------------
  1 | dbUser="rene"
  2 | echo $1
  3 | inputPath=${1/\/ngrams*/}
  4 | dbLang=${inputPath##*/}
  5 | inputPath=${inputPath%/*}
  6 | dbType=${inputPath##*/}
  7 | echo $inputPath
  8 | echo $dbLang
  9 | echo $dbType
 10 | 
 11 | dbName=$dbType"_"$dbLang"_ngram"
 12 | echo $dbName
 13 | #exit 1
 14 | 
 15 | 
 16 | dbPath="/mnt/vdb/typoeval/mysql/${dbName}/" #server
 17 | #dbPath=/var/lib/mysql/${dbName}/ #local machine
 18 | 
 19 | mysql -u ${dbUser} -e "drop database ${dbName};"
 20 | mysql -u ${dbUser} -e "create database ${dbName};"
 21 | 
 22 | for (( i = 1 ;  i <= 5;  i++  ))
 23 |   do
 24 |   path=$1$i"gs/*"
 25 |   for file in $path
 26 |     do
 27 |     xpath=${file%/*}
 28 |     xbase=${file##*/}
 29 |     xfext=${xbase##*.}
 30 |     xpref=${xbase%.*}
 31 |     tablename=$xfext"_"$xpref
 32 |     echo "tablename: "$tablename
 33 |     echo "xpath: "$xpath
 34 |     echo "xbase: "$xbase
 35 |     echo "xfext: "$xfext
 36 |     echo "xpref: "$xpref
 37 | 
 38 |    #create tables and indices
 39 |    if [ $i -eq 1 ];
 40 |       then
 41 |       mysql -u ${dbUser} $dbName --local-infile=1 -e "create table ${tablename} (target varchar(60),score float) engine=myisam character set utf8 collate utf8_bin;
 42 |       create index ${tablename}_ix on ${tablename} (target(1), score desc);
 43 |       create index ${tablename}_2_ix on ${tablename} (target(2), score desc);
 44 |       create index ${tablename}_3_ix on ${tablename} (target(3), score desc);
 45 |       create index ${tablename}_4_ix on ${tablename} (target(4), score desc);
 46 |       create index ${tablename}_5_ix on ${tablename} (target(5), score desc);"
 47 |     fi
 48 | 
 49 |     if [ $i -eq 2 ];
 50 |       then
 51 |       mysql -u ${dbUser} $dbName --local-infile=1 -e "create table ${tablename} (source1 varchar(60),target varchar(60),score float) engine=myisam character set utf8 collate utf8_bin;
 52 |       create index ${tablename}_ix on ${tablename} (source1(60), score desc);
 53 |       create index ${tablename}_2_ix on ${tablename} (source1(60), target(2));
 54 |       create index ${tablename}_3_ix on ${tablename} (source1(60), target(3));
 55 |       create index ${tablename}_4_ix on ${tablename} (source1(60), target(4));
 56 |       create index ${tablename}_5_ix on ${tablename} (source1(60), target(5));"
 57 |     fi
 58 | 
 59 |     if [ $i -eq 3 ];
 60 |       then
 61 |       mysql -u ${dbUser} $dbName --local-infile=1 -e "create table ${tablename} (source1 varchar(60),source2 varchar(60),target varchar(60),score float) engine=myisam character set utf8 collate utf8_bin;
 62 |       create index ${tablename}_ix on ${tablename} (source1(60), source2(60), score desc);
 63 |       create index ${tablename}_2_ix on ${tablename} (source1(60), source2(60), target(2));
 64 |       create index ${tablename}_3_ix on ${tablename} (source1(60), source2(60), target(3));
 65 |       create index ${tablename}_4_ix on ${tablename} (source1(60), source2(60), target(4));
 66 |       create index ${tablename}_5_ix on ${tablename} (source1(60), source2(60), target(5));"
 67 |     fi
 68 | 
 69 |     if [ $i -eq 4 ];
 70 |       then
 71 |       mysql -u ${dbUser} $dbName --local-infile=1 -e "create table ${tablename} (source1 varchar(60),source2 varchar(60), source3 varchar(60), target varchar(60),score float) engine=myisam character set utf8 collate utf8_bin;
 72 |       create index ${tablename}_ix on ${tablename} (source1(60), source2(60), source3(60), score desc);
 73 |       create index ${tablename}_2_ix on ${tablename} (source1(60), source2(60), source3(60), target(2));
 74 |       create index ${tablename}_3_ix on ${tablename} (source1(60), source2(60), source3(60), target(3));
 75 |       create index ${tablename}_4_ix on ${tablename} (source1(60), source2(60), source3(60), target(4));
 76 |       create index ${tablename}_5_ix on ${tablename} (source1(60), source2(60), source3(60), target(5));"
 77 |     fi
 78 | 
 79 |     if [ $i -eq 5 ];
 80 |       then
 81 |       mysql -u ${dbUser} $dbName --local-infile=1 -e "create table ${tablename} (source1 varchar(60),source2 varchar(60), source3 varchar(60), source4 varchar(60), target varchar(60),score float) engine=myisam character set utf8 collate utf8_bin;
 82 |       create index ${tablename}_ix on ${tablename} (source1(60), source2(60), source3(60),  source4(60), score desc);
 83 |       create index ${tablename}_2_ix on ${tablename} (source1(60), source2(60), source3(60), source4(60), target(2));
 84 |       create index ${tablename}_3_ix on ${tablename} (source1(60), source2(60), source3(60), source4(60), target(3));
 85 |       create index ${tablename}_4_ix on ${tablename} (source1(60), source2(60), source3(60), source4(60), target(4));
 86 |       create index ${tablename}_5_ix on ${tablename} (source1(60), source2(60), source3(60), source4(60), target(5));"
 87 |     fi
 88 | 
 89 |     #disable indices
 90 |     myisamchk --keys-used=0 -rq ${dbPath}${tablename}
 91 |     mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;"
 92 | 
 93 |     #import data
 94 |     if [ $i -eq 1 ];
 95 |       then
 96 |       mysql -u ${dbUser} $dbName --local-infile=1 -e "load data local infile '$file' into table ${tablename} fields terminated by '\t' enclosed by '' lines terminated by '\n' (target, score);"
 97 |     fi
 98 | 
 99 |     if [ $i -eq 2 ];
100 |       then
101 |       mysql -u ${dbUser} $dbName --local-infile=1 -e "load data local infile '$file' into table ${tablename} fields terminated by '\t' enclosed by '' lines terminated by '\n' (source1, target, score);"
102 |     fi
103 | 
104 |     if [ $i -eq 3 ];
105 |       then
106 |       mysql -u ${dbUser} $dbName --local-infile=1 -e "load data local infile '$file' into table ${tablename} fields terminated by '\t' enclosed by '' lines terminated by '\n' (source1, source2, target, score);"
107 |     fi
108 | 
109 |     if [ $i -eq 4 ];
110 |       then
111 |       mysql -u ${dbUser} $dbName --local-infile=1 -e "load data local infile '$file' into table ${tablename} fields terminated by '\t' enclosed by '' lines terminated by '\n' (source1, source2, source3, target, score);"
112 |     fi
113 | 
114 |     if [ $i -eq 5 ];
115 |       then
116 |       mysql -u ${dbUser} $dbName --local-infile=1 -e "load data local infile '$file' into table ${tablename} fields terminated by '\t' enclosed by '' lines terminated by '\n' (source1, source2, source3, source4, target, score);"
117 |     fi
118 | 
119 |     #compress table / really necessary?
120 |     myisampack ${dbPath}${tablename}
121 | 
122 |     #enable index
123 |     myisamchk -rq ${dbPath}${tablename} --tmpdir="/mnt/vdb/tmp" --sort_buffer=3G #--sort-index --sort-records=1
124 | 
125 |     #and flush index again
126 |     mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;"
127 |   done;
128 | done;
129 | 


--------------------------------------------------------------------------------
/scripts/index-typo.sh:
--------------------------------------------------------------------------------
 1 | dbUser="rene"
 2 | echo $1
 3 | inputPath=${1/\/typos*/}
 4 | dbLang=${inputPath##*/}
 5 | inputPath=${inputPath%/*}
 6 | dbType=${inputPath##*/}
 7 | echo $inputPath
 8 | echo $dbLang
 9 | echo $dbType
10 | 
11 | dbName=$dbType"_"$dbLang"_typo"
12 | echo $dbName
13 | #exit 1
14 | dbPath="/mnt/vdb/typoeval/mysql/${dbName}/" #server
15 | #dbPath=/var/lib/mysql/${dbName}/ #local machine
16 | 
17 | mysql -u ${dbUser} -e "drop database ${dbName};"
18 | mysql -u ${dbUser} -e "create database ${dbName};"
19 | 
20 | for (( i = 0; i < 5; i++ ))
21 | do
22 | path=$1$i"es/*"
23 | for file in $path
24 |     do
25 |     xpath=${file%/*}
26 |     xbase=${file##*/}
27 |     xfext=${xbase##*.}
28 |     xpref=${xbase%.*}
29 |     tablename=$xfext"_"$xpref;
30 |     echo "tablename: "$tablename;
31 |     echo "xpath: "$xpath;
32 |     echo "xbase: "$xbase;
33 |     echo "xfext: "$xfext;
34 |     echo "xpref: "$xpref;
35 | 
36 |     #create tables and indices
37 |     if [ $i -eq 0 ];
38 |     then
39 |       mysql -u ${dbUser} $dbName --local-infile=1 -e "create table ${tablename} (target varchar(60),score float) engine=myisam character set utf8 collate utf8_bin;
40 |       create index ${tablename}_ix on ${tablename} (target(1), score desc);
41 |       create index ${tablename}_2_ix on ${tablename} (target(2), score desc);
42 |       create index ${tablename}_3_ix on ${tablename} (target(3), score desc);
43 |       create index ${tablename}_4_ix on ${tablename} (target(4), score desc);
44 |       create index ${tablename}_5_ix on ${tablename} (target(5), score desc);"
45 |     else
46 |       mysql -u ${dbUser} $dbName --local-infile=1 -e "create table ${tablename} (source varchar(60),target varchar(60),score float) engine=myisam character set utf8 collate utf8_bin;
47 |       create index ${tablename}_ix on ${tablename} (source(60), score desc);
48 |       create index ${tablename}_2_ix on ${tablename} (source(60), target(2), score desc);
49 |       create index ${tablename}_3_ix on ${tablename} (source(60), target(3), score desc);
50 |       create index ${tablename}_4_ix on ${tablename} (source(60), target(4), score desc);
51 |       create index ${tablename}_5_ix on ${tablename} (source(60), target(5), score desc);"
52 |     fi
53 | 
54 | 
55 |     #disable indices
56 |     myisamchk --keys-used=0 -rq ${dbPath}${tablename}
57 | 
58 |     mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;"
59 | 
60 |     #import data
61 |     if [ $i -eq 0 ];
62 |     then
63 |       mysql -u ${dbUser} $dbName --local-infile=1 -e "load data local infile '$file' into table ${tablename} fields terminated by '\t' enclosed by '' lines terminated by '\n' (@dummy, target, score);"
64 |     else
65 |       mysql -u ${dbUser} $dbName --local-infile=1 -e "load data local infile '$file' into table ${tablename} fields terminated by '\t' enclosed by '' lines terminated by '\n' (source, target, score);"
66 |     fi
67 | 
68 |     #compress table / really necessary?
69 |     myisampack ${dbPath}${tablename}
70 | 
71 |     #enable index
72 |     myisamchk -rq ${dbPath}${tablename} --tmpdir="/mnt/vdb/tmp" --sort_buffer=3G #--sort-index --sort-records=1
73 | 
74 |     #and flush index again.
75 |     mysql -u ${dbUser} $dbName --local-infile=1 -e "flush tables;"
76 | 
77 |   done
78 | done
79 | 


--------------------------------------------------------------------------------
/scripts/preparequery.sh:
--------------------------------------------------------------------------------
 1 | fSource=$1
 2 | #fSource="Die"
 3 | fEdge="4es${fSource:0:1}"
 4 | tSource=$2
 5 | #tSource="deutschen"
 6 | tEdge="3es${tSource:0:1}"
 7 | sSource=$3
 8 | #sSource="Bauern"
 9 | sEdge="2es${sSource:0:1}"
10 | oSource=$4
11 | #oSource="haben"
12 | oEdge="1es${oSource:0:1}"
13 | prefix="${5}%"
14 | #prefix="d%"
15 | 
16 | query="select ${fEdge}.source, ${tEdge}.source, ${sEdge}.source, ${oEdge}.source, ${fEdge}.target, (IFNULL(${fEdge}.score, 0) + IFNULL(${tEdge}.score, 0) + IFNULL(${sEdge}.score, 0) + IFNULL(${oEdge}.score, 0)) as count from ${fEdge} right outer join ${tEdge} on ${tEdge}.target=${fEdge}.target right outer join ${sEdge} on ${sEdge}.target=${fEdge}.target right outer join ${oEdge} on ${oEdge}.target like ${fEdge}.target where ${fEdge}.source like \"${fSource}\" AND ${fEdge}.target like \"${prefix}\" and ${tEdge}.source like \"${tSource}\" AND ${tEdge}.target like \"${prefix}\" and ${sEdge}.source like \"${sSource}\" AND ${sEdge}.target like \"${prefix}\" and ${oEdge}.source like \"${oSource}\" AND ${oEdge}.target like \"${prefix}\" order by count desc limit 5;"
17 | 
18 | 
19 | #query="select source, target, score from ${fEdge} where ${fEdge}.source like \"${fSource}\" AND ${fEdge}.target like \"${prefix}\" order by score desc limit 5; 
20 | #select source, target, score from ${tEdge} where ${tEdge}.source like \"${tSource}\" AND ${tEdge}.target like \"${prefix}\" order by score desc limit 5; 
21 | #select source, target, score from ${sEdge} where ${sEdge}.source like \"${sSource}\" AND ${sEdge}.target like \"${prefix}\" order by score desc limit 5; 
22 | #select source, target, score from ${oEdge} where ${oEdge}.source like \"${oSource}\" AND ${oEdge}.target like \"${prefix}\" order by score desc limit 5;"
23 | 
24 | echo $query;
25 | 
26 | #mysql -u importer typology --local-infile=1 -e "${query}"
27 | 


--------------------------------------------------------------------------------
/scripts/runpreparequery.sh:
--------------------------------------------------------------------------------
 1 | fSource=$1
 2 | #fSource="Die"
 3 | fEdge="4es${fSource:0:1}"
 4 | tSource=$2
 5 | #tSource="deutschen"
 6 | tEdge="3es${tSource:0:1}"
 7 | sSource=$3
 8 | #sSource="Bauern"
 9 | sEdge="2es${sSource:0:1}"
10 | oSource=$4
11 | #oSource="haben"
12 | oEdge="1es${oSource:0:1}"
13 | prefix="${5}%"
14 | #prefix="d%"
15 | 
16 | query="select ${fEdge}.source, ${tEdge}.source, ${sEdge}.source, ${oEdge}.source, ${fEdge}.target, (IFNULL(${fEdge}.score, 0) + IFNULL(${tEdge}.score, 0) + IFNULL(${sEdge}.score, 0) + IFNULL(${oEdge}.score, 0)) as count from ${fEdge} right outer join ${tEdge} on ${tEdge}.target=${fEdge}.target right outer join ${sEdge} on ${sEdge}.target=${fEdge}.target right outer join ${oEdge} on ${oEdge}.target like ${fEdge}.target where ${fEdge}.source like \"${fSource}\" AND ${fEdge}.target like \"${prefix}\" and ${tEdge}.source like \"${tSource}\" AND ${tEdge}.target like \"${prefix}\" and ${sEdge}.source like \"${sSource}\" AND ${sEdge}.target like \"${prefix}\" and ${oEdge}.source like \"${oSource}\" AND ${oEdge}.target like \"${prefix}\" order by count desc limit 5;"
17 | 
18 | mysql -u importer typology --local-infile=1 -e "${query}"
19 | 


--------------------------------------------------------------------------------
/scripts/stats.txt:
--------------------------------------------------------------------------------
 1 | google-de total: 19959412114
 2 | google-en total: 122004194540
 3 | google-es total: 23061292567
 4 | google-fr total: 19925969585
 5 | wiki-de total: 423089409
 6 | wiki-en total: 745287250
 7 | wiki-es total: 283966401
 8 | wiki-fr total: 253536262
 9 | wiki-it total: 146107024
10 | enron-en total: 82566655
11 | dgttm-de total: 24426630
12 | dgttm-en total: 29172197
13 | dgttm-es total: 27878291
14 | dgttm-fr total: 27666297
15 | dgttm-it total: 27600138
16 | google-de unique: 3685172
17 | google-en unique: 7379221
18 | google-es unique: 2233403
19 | google-fr unique: 1978551
20 | wiki-de unique: 9823118
21 | wiki-en unique: 11702124
22 | wiki-es unique: 4178681
23 | wiki-fr unique: 4058752
24 | wiki-it unique: 3085208
25 | enron-en unique: 588279
26 | dgttm-de unique: 664546
27 | dgttm-en unique: 458622
28 | dgttm-es unique: 446314
29 | dgttm-fr unique: 459495
30 | dgttm-it unique: 471740
31 | 


--------------------------------------------------------------------------------
/scripts/table.sh:
--------------------------------------------------------------------------------
  1 | #AUTOR: Martin Koerner
  2 | #Purpose to put results into a LaTeX table
  3 | 
  4 | #run with: pfl=x k=x modelParameter=x ./table.sh
  5 | 
  6 | #e.g.:
  7 | #pfl=1 k=5 weight=no modelParameter=5 ./table.sh
  8 | 
  9 | 
 10 | #storage directory for res.*.log files
 11 | #LOGDIR="/var/lib/datasets/results/"
 12 | LOGDIR="/home/martin/results/"
 13 | #storage directory for return files
 14 | #RETURNDIR="/var/lib/datasets/plots/"
 15 | RETURNDIR="/home/martin/plots/"
 16 | STATS="stats.txt"
 17 | SECNKSS=1
 18 | 
 19 | if [[ ${#pfl} == 0 || ${#k} == 0 || ${#weight} == 0 || ${#modelParameter} == 0 ]]
 20 | 	then echo "set values for k, pfl, modelParameter and weight"
 21 | 	exit
 22 | fi
 23 | 
 24 | #temp:
 25 | #res.trainedOn-wiki-de-testedOn-wiki-de-lm-pic-modelParameter2-sam0-split95-joinlength10-nQ100000
 26 | 
 27 | 
 28 | 
 29 | 
 30 | #general declarations
 31 | PF1="res.trainedOn-"
 32 | PF2="-testedOn-"
 33 | PF3="-modelParameter"
 34 | PF4="-sam0-split95-joinlength10-nQ100000.log"
 35 | LANGS=(de en es fr it)
 36 | 
 37 | LM="-lm-"
 38 | TYPO="-typolgy-"
 39 | FILENAME="table-k$k-pfl$pfl-modelParameter$modelParameter.txt"
 40 | RETURN=$RETURNDIR$FILENAME
 41 | 
 42 | #reset result file
 43 | echo -n "" | tee "$RETURN"
 44 | 
 45 | echo "\begin{table*}[bth]" | tee -a "$RETURN"
 46 | echo "\begin{center}" | tee -a "$RETURN"
 47 | echo "\begin{tabular}{lllll}" | tee -a "$RETURN"
 48 | echo "Corpus & total words & unique words & MRR pfl=$pfl & Top $k Precision pfl=$pfl & NKSS@$SECNKSS & NKSS@$k \\\\" | tee -a "$RETURN"
 49 | echo "\hline" | tee -a "$RETURN"
 50 | 
 51 | CALC () {
 52 | #echo -n "( CNTTYPO / $CNTLM  - 1 ) * 100"
 53 | 
 54 | RESULT=`echo "($CNTTYPO/$CNTLM-1)*100" | bc -l`
 55 | echo -n " $RESULT" | awk '{ printf "%.1f", $0 }' | tee -a "$RETURN"
 56 | #echo -n " $RESULT" | tee -a "$RETURN"
 57 | }
 58 | 
 59 | PRINTLN () {
 60 | echo -n "$CORPUS & " | tee -a "$RETURN"
 61 | TOTALWORDS=`grep "$TYP1$LANG total" $STATS`
 62 | TOTALWORDS=${TOTALWORDS[0]//$TYP1$LANG total: /}
 63 | echo -n "$TOTALWORDS & " | tee -a "$RETURN"
 64 | 
 65 | UNIQUEWORDS=`grep "$TYP1$LANG unique" $STATS`
 66 | UNIQUEWORDS=${UNIQUEWORDS[0]//$TYP1$LANG unique: /}
 67 | echo -n "$UNIQUEWORDS & " | tee -a "$RETURN"
 68 | 
 69 | CNTTYPO=`grep "MRR with pfl=$pfl" $FILETYPO`
 70 | CNTTYPO=${CNTTYPO[0]//MRR with pfl=$pfl: /}
 71 | echo $CNTTYPO
 72 | CNTLM=`grep "MRR with pfl=$pfl" $FILELM`
 73 | CNTLM=${CNTLM[0]//MRR with pfl=$pfl: /}
 74 | echo $CNTLM
 75 | CALC
 76 | echo -n " & " | tee -a "$RETURN"
 77 | CNTTYPO=`grep "Precision at k=$k with pfl=$pfl" $FILETYPO`
 78 | CNTTYPO=${CNTTYPO[0]//Precision at k=$k with pfl=$pfl: /}
 79 | echo $CNTTYPO
 80 | CNTLM=`grep "Precision at k=$k with pfl=$pfl" $FILELM`
 81 | CNTLM=${CNTLM[0]//Precision at k=$k with pfl=$pfl: /}
 82 | echo $CNTLM
 83 | CALC
 84 | echo -n " & " | tee -a "$RETURN"
 85 | 
 86 | 
 87 | CNTTYPO=`grep "NKSS at k=$SECNKSS" $FILETYPO`
 88 | CNTTYPO=${CNTTYPO[0]//NKSS at k=$SECNKSS: /}
 89 | echo $CNTTYPO
 90 | CNTLM=`grep "NKSS at k=$SECNKSS" $FILELM`
 91 | CNTLM=${CNTLM[0]//NKSS at k=$SECNKSS: /}
 92 | echo $CNTLM
 93 | CALC
 94 | echo -n " & " | tee -a "$RETURN"
 95 | 
 96 | CNTTYPO=`grep "NKSS at k=$k" $FILETYPO`
 97 | CNTTYPO=${CNTTYPO[0]//NKSS at k=$k: /}
 98 | echo $CNTTYPO
 99 | CNTLM=`grep "NKSS at k=$k" $FILELM`
100 | CNTLM=${CNTLM[0]//NKSS at k=$k: /}
101 | echo $CNTLM
102 | CALC
103 | 
104 | echo " \\\\" | tee -a "$RETURN"
105 | }
106 | 
107 | #google
108 | for LANG in ${LANGS[@]} 
109 | do
110 | if [[ $LANG != "it" ]]
111 | then
112 | TYP1="google-"
113 | TYP2="wiki-"
114 | CORPUS="$TYP1$TYP2$LANG"
115 | FILETYPO=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$TYPO$weight$PF3$modelParameter$PF4
116 | FILELM=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$LM$weight$PF3$modelParameter$PF4
117 | echo $FILETYPO
118 | echo $FILELM
119 | PRINTLN
120 | fi
121 | 
122 | #wiki
123 | 
124 | TYP1="wiki-"
125 | TYP2="wiki-"
126 | CORPUS="$TYP1$TYP2$LANG"
127 | FILETYPO=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$TYPO$weight$PF3$modelParameter$PF4
128 | FILELM=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$LM$weight$PF3$modelParameter$PF4
129 | echo $FILETYPO
130 | echo $FILELM
131 | PRINTLN
132 | 
133 | #dgttm
134 | TYP1="dgttm-"
135 | TYP2="dgttm-"
136 | CORPUS="$TYP1$TYP2$LANG"
137 | FILETYPO=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$TYPO$weight$PF3$modelParameter$PF4
138 | FILELM=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$LM$weight$PF3$modelParameter$PF4
139 | echo $FILETYPO
140 | echo $FILELM
141 | PRINTLN
142 | 
143 | 
144 | #enron
145 | if [[ $LANG == "en" ]]
146 | then
147 | TYP1="enron-"
148 | TYP2="enron-"
149 | CORPUS="$TYP1$TYP2$LANG"
150 | FILETYPO=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$TYPO$weight$PF3$modelParameter$PF4
151 | FILELM=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$LM$weight$PF3$modelParameter$PF4
152 | echo $FILETYPO
153 | echo $FILELM
154 | PRINTLN
155 | fi
156 | if [[ $LANG == "en" ]]
157 | then
158 | TYP1="enron-"
159 | TYP2="wiki-"
160 | CORPUS="$TYP1$TYP2$LANG"
161 | FILETYPO=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$TYPO$weight$PF3$modelParameter$PF4
162 | FILELM=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$LM$weight$PF3$modelParameter$PF4
163 | echo $FILETYPO
164 | echo $FILELM
165 | PRINTLN
166 | fi
167 | if [[ $LANG == "en" ]]
168 | then
169 | TYP1="google-"
170 | TYP2="enron-"
171 | CORPUS="$TYP1$TYP2$LANG"
172 | FILETYPO=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$TYPO$weight$PF3$modelParameter$PF4
173 | FILELM=$LOGDIR$PF1$TYP1$LANG$PF2$TYP2$LANG$LM$weight$PF3$modelParameter$PF4
174 | echo $FILETYPO
175 | echo $FILELM
176 | PRINTLN
177 | fi
178 | 
179 | done
180 | 
181 | echo "\end{tabular}" | tee -a "$RETURN"
182 | echo "\label{tab:corporaStats}" | tee -a "$RETURN"
183 | echo "\caption{Statistics of our evaluation corpora}" | tee -a "$RETURN"
184 | echo "\end{center}" | tee -a "$RETURN"
185 | echo "\end{table*}" | tee -a "$RETURN"
186 | 


--------------------------------------------------------------------------------
/scripts/test.sh:
--------------------------------------------------------------------------------
 1 | dbUser="importer"
 2 | testName="hybridtypology"
 3 | 
 4 | #dbPath="/mnt/vdb/typoeval/mysql/${testName}/" #server
 5 | dbPath=/var/lib/mysql/${testName}/ #local machine
 6 | 
 7 | #mysql -u ${dbUser} -e "create database ${testName};"
 8 | 
 9 | 
10 | for (( i = 2 ;  i <= 5;  i++  ))
11 | do
12 | path="$1$i/*"
13 | for file in $path
14 | do
15 | 
16 | 
17 | done;
18 | done;
19 | 


--------------------------------------------------------------------------------
/src/de/typology/executables/KneserNeyBuilder.java:
--------------------------------------------------------------------------------
  1 | package de.typology.executables;
  2 | 
  3 | import java.io.File;
  4 | import java.util.ArrayList;
  5 | import java.util.HashMap;
  6 | 
  7 | import org.apache.logging.log4j.LogManager;
  8 | import org.apache.logging.log4j.Logger;
  9 | 
 10 | import de.typology.indexes.WordIndex;
 11 | import de.typology.indexes.WordIndexer;
 12 | import de.typology.patterns.PatternBuilder;
 13 | import de.typology.smoother.KneserNeySmoother;
 14 | import de.typology.smoother.ModifiedKneserNeySmoother;
 15 | import de.typology.splitter.AbsoluteSplitter;
 16 | import de.typology.splitter.DataSetSplitter;
 17 | import de.typology.splitter.SmoothingSplitter;
 18 | import de.typology.tester.TestSequenceExtractor;
 19 | import de.typology.utils.Config;
 20 | 
 21 | public class KneserNeyBuilder {
 22 | 
 23 | 	static Logger logger = LogManager.getLogger(KneserNeyBuilder.class
 24 | 			.getName());
 25 | 
 26 | 	public static void main(String[] args) {
 27 | 
 28 | 		// TODO: parameters as arguments
 29 | 		File inputDirectory = new File(Config.get().outputDirectory
 30 | 				+ Config.get().inputDataSet);
 31 | 		File inputFile = new File(inputDirectory.getAbsolutePath()
 32 | 				+ "/training.txt");
 33 | 		File indexFile = new File(inputDirectory.getAbsolutePath()
 34 | 				+ "/index.txt");
 35 | 		File absoluteDirectory = new File(inputDirectory.getAbsolutePath()
 36 | 				+ "/absolute");
 37 | 		File continuationDirectory = new File(inputDirectory.getAbsolutePath()
 38 | 				+ "/continuation");
 39 | 		if (Config.get().splitData) {
 40 | 			DataSetSplitter dss = new DataSetSplitter(inputDirectory,
 41 | 					"normalized.txt");
 42 | 			dss.split("training.txt", "learning.txt", "testing.txt",
 43 | 					Config.get().modelLength);
 44 | 			dss.splitIntoSequences(new File(inputDirectory.getAbsolutePath()
 45 | 					+ "/testing.txt"), Config.get().modelLength,
 46 | 					Config.get().numberOfQueries);
 47 | 		}
 48 | 		if (Config.get().buildIndex) {
 49 | 			logger.info("build word index: " + indexFile.getAbsolutePath());
 50 | 			WordIndexer wordIndexer = new WordIndexer();
 51 | 			wordIndexer.buildIndex(inputFile, indexFile,
 52 | 					Config.get().maxCountDivider, "<fs> <s> ", " </s>");
 53 | 		}
 54 | 		if (Config.get().buildGLM) {
 55 | 			ArrayList<boolean[]> glmForSmoothingPatterns = PatternBuilder
 56 | 					.getReverseGLMForSmoothingPatterns(Config.get().modelLength);
 57 | 			AbsoluteSplitter absolteSplitter = new AbsoluteSplitter(inputFile,
 58 | 					indexFile, absoluteDirectory, "\t",
 59 | 					Config.get().deleteTempFiles, "<fs> <s> ", " </s>");
 60 | 			logger.info("split into GLM sequences: "
 61 | 					+ inputFile.getAbsolutePath());
 62 | 			absolteSplitter.split(glmForSmoothingPatterns,
 63 | 					Config.get().numberOfCores);
 64 | 		}
 65 | 		if (Config.get().buildContinuationGLM) {
 66 | 			ArrayList<boolean[]> lmPatterns = PatternBuilder
 67 | 					.getReverseLMPatterns(Config.get().modelLength);
 68 | 			SmoothingSplitter smoothingSplitter = new SmoothingSplitter(
 69 | 					absoluteDirectory, continuationDirectory, indexFile, "\t",
 70 | 					Config.get().deleteTempFiles);
 71 | 			logger.info("split into continuation sequences: "
 72 | 					+ inputFile.getAbsolutePath());
 73 | 			smoothingSplitter.split(lmPatterns, Config.get().numberOfCores);
 74 | 		}
 75 | 
 76 | 		File testExtractOutputDirectory = new File(
 77 | 				inputDirectory.getAbsolutePath() + "/testing-samples");
 78 | 		if (Config.get().extractContinuationGLM) {
 79 | 			File testSequences = new File(inputDirectory.getAbsolutePath()
 80 | 					+ "/testing-samples-" + Config.get().modelLength + ".txt");
 81 | 			testExtractOutputDirectory.mkdir();
 82 | 
 83 | 			TestSequenceExtractor tse = new TestSequenceExtractor(
 84 | 					testSequences, absoluteDirectory, continuationDirectory,
 85 | 					testExtractOutputDirectory, "\t", new WordIndex(indexFile));
 86 | 			tse.extractSequences(Config.get().modelLength,
 87 | 					Config.get().numberOfCores);
 88 | 			tse.extractContinuationSequences(Config.get().modelLength,
 89 | 					Config.get().numberOfCores);
 90 | 
 91 | 		}
 92 | 
 93 | 		HashMap<String, HashMap<String, Long>> absoluteTypeSequenceValueMap = null;
 94 | 		HashMap<String, HashMap<String, Long[]>> continuationTypeSequenceValueMap = null;
 95 | 		if (Config.get().buildKneserNey) {
 96 | 			KneserNeySmoother kns = new KneserNeySmoother(
 97 | 					testExtractOutputDirectory, absoluteDirectory,
 98 | 					continuationDirectory, "\t");
 99 | 
100 | 			// read absolute and continuation values into HashMaps
101 | 			logger.info("read absolute and continuation values into HashMaps for kneser ney");
102 | 			absoluteTypeSequenceValueMap = kns
103 | 					.readAbsoluteValuesIntoHashMap(kns.extractedAbsoluteDirectory);
104 | 
105 | 			continuationTypeSequenceValueMap = kns
106 | 					.readContinuationValuesIntoHashMap(kns.extractedContinuationDirectory);
107 | 			kns.absoluteTypeSequenceValueMap = absoluteTypeSequenceValueMap;
108 | 			kns.continuationTypeSequenceValueMap = continuationTypeSequenceValueMap;
109 | 
110 | 			for (int i = Config.get().modelLength; i >= 1; i--) {
111 | 				File inputSequenceFile = new File(
112 | 						inputDirectory.getAbsolutePath() + "/testing-samples-"
113 | 								+ i + ".txt");
114 | 				File resultFile;
115 | 				// smooth simple
116 | 				if (Config.get().kneserNeySimple) {
117 | 					resultFile = new File(inputDirectory.getAbsolutePath()
118 | 							+ "/kneser-ney-simple-backoffToCont-" + i + ".txt");
119 | 					kns.smooth(inputSequenceFile, resultFile, i, false,
120 | 							Config.get().conditionalProbabilityOnly);
121 | 				}
122 | 				// smooth complex
123 | 				if (Config.get().kneserNeyComplex) {
124 | 					resultFile = new File(inputDirectory.getAbsolutePath()
125 | 							+ "/kneser-ney-complex-backoffToCont-" + i + ".txt");
126 | 					kns.smooth(inputSequenceFile, resultFile, i, true,
127 | 							Config.get().conditionalProbabilityOnly);
128 | 				}
129 | 			}
130 | 		}
131 | 		if (Config.get().buildModKneserNey) {
132 | 			ModifiedKneserNeySmoother mkns = new ModifiedKneserNeySmoother(
133 | 					testExtractOutputDirectory, absoluteDirectory,
134 | 					continuationDirectory, "\t", Config.get().decimalPlaces);
135 | 
136 | 			if (absoluteTypeSequenceValueMap == null) {
137 | 				// read absolute and continuation values into HashMaps
138 | 
139 | 				logger.info("read absolute and continuation values into HashMaps for mod kneser ney");
140 | 				absoluteTypeSequenceValueMap = mkns
141 | 						.readAbsoluteValuesIntoHashMap(mkns.extractedAbsoluteDirectory);
142 | 
143 | 				continuationTypeSequenceValueMap = mkns
144 | 						.readContinuationValuesIntoHashMap(mkns.extractedContinuationDirectory);
145 | 			}
146 | 
147 | 			mkns.absoluteTypeSequenceValueMap = absoluteTypeSequenceValueMap;
148 | 			mkns.continuationTypeSequenceValueMap = continuationTypeSequenceValueMap;
149 | 
150 | 			for (int i = Config.get().modelLength; i >= 1; i--) {
151 | 				File inputSequenceFile = new File(
152 | 						inputDirectory.getAbsolutePath() + "/testing-samples-"
153 | 								+ i + ".txt");
154 | 				File resultFile;
155 | 				// smooth simple
156 | 				if (Config.get().kneserNeySimple) {
157 | 					resultFile = new File(inputDirectory.getAbsolutePath()
158 | 							+ "/mod-kneser-ney-simple-backoffToCont-" + i
159 | 							+ ".txt");
160 | 					mkns.smooth(inputSequenceFile, resultFile, i, false,
161 | 							Config.get().conditionalProbabilityOnly);
162 | 				}
163 | 				// smooth complex
164 | 				if (Config.get().kneserNeyComplex) {
165 | 					resultFile = new File(inputDirectory.getAbsolutePath()
166 | 							+ "/mod-kneser-ney-complex-backoffToCont-" + i
167 | 							+ ".txt");
168 | 					mkns.smooth(inputSequenceFile, resultFile, i, true,
169 | 							Config.get().conditionalProbabilityOnly);
170 | 				}
171 | 			}
172 | 		}
173 | 		logger.info("done");
174 | 	}
175 | }
176 | 


--------------------------------------------------------------------------------
/src/de/typology/executables/MultiKneserNeyBuilder.java:
--------------------------------------------------------------------------------
 1 | package de.typology.executables;
 2 | 
 3 | import de.typology.utils.Config;
 4 | 
 5 | public class MultiKneserNeyBuilder {
 6 | 
 7 | 	public static void main(String[] args) {
 8 | 		String[] languages = Config.get().languages.split(",");
 9 | 		String inputDataSet = Config.get().inputDataSet;
10 | 		for (String language : languages) {
11 | 			Config.get().inputDataSet = inputDataSet + "/" + language;
12 | 			KneserNeyBuilder.main(args);
13 | 		}
14 | 
15 | 	}
16 | }
17 | 


--------------------------------------------------------------------------------
/src/de/typology/indexes/WordIndex.java:
--------------------------------------------------------------------------------
  1 | package de.typology.indexes;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.BufferedWriter;
  5 | import java.io.File;
  6 | import java.io.FileReader;
  7 | import java.io.FileWriter;
  8 | import java.io.IOException;
  9 | import java.util.Arrays;
 10 | import java.util.HashMap;
 11 | import java.util.Iterator;
 12 | import java.util.Map.Entry;
 13 | 
 14 | import org.apache.commons.io.FileUtils;
 15 | 
 16 | /**
 17 |  * A class that is based on the text file produced by WordIndexer.
 18 |  * 
 19 |  * @author Martin Koerner
 20 |  * 
 21 |  */
 22 | public class WordIndex implements Iterable<String> {
 23 | 	protected String[] index;
 24 | 
 25 | 	public WordIndex(File indexFile) {
 26 | 		// count total number of lines in the index file
 27 | 		int lineCount = 0;
 28 | 		try {
 29 | 			BufferedReader br = new BufferedReader(new FileReader(indexFile));
 30 | 			while (br.readLine() != null) {
 31 | 				lineCount++;
 32 | 			}
 33 | 			br.close();
 34 | 		} catch (IOException e) {
 35 | 			e.printStackTrace();
 36 | 		}
 37 | 
 38 | 		this.index = new String[lineCount];
 39 | 		int currentLineCount = 0;
 40 | 
 41 | 		// read the index file
 42 | 		try {
 43 | 			BufferedReader br = new BufferedReader(new FileReader(indexFile));
 44 | 			String line;
 45 | 			String[] lineSplit;
 46 | 			while ((line = br.readLine()) != null) {
 47 | 				lineSplit = line.split("\t");
 48 | 				this.index[currentLineCount] = lineSplit[0];
 49 | 				currentLineCount++;
 50 | 			}
 51 | 			br.close();
 52 | 		} catch (IOException e) {
 53 | 			e.printStackTrace();
 54 | 		}
 55 | 	}
 56 | 
 57 | 	public int getLength() {
 58 | 		return this.index.length;
 59 | 	}
 60 | 
 61 | 	/**
 62 | 	 * returns the file in which word should be stored based on this.index
 63 | 	 * 
 64 | 	 * @param word
 65 | 	 * @return
 66 | 	 */
 67 | 	public int rank(String word) {
 68 | 		int lo = 0;
 69 | 		int hi = this.index.length - 1;
 70 | 		while (lo <= hi) {
 71 | 			int mid = lo + (hi - lo) / 2;
 72 | 			if (word.compareTo(this.index[mid]) < 0) {
 73 | 				hi = mid - 1;
 74 | 			} else if (word.compareTo(this.index[mid]) > 0) {
 75 | 				lo = mid + 1;
 76 | 			} else {
 77 | 				return mid;
 78 | 			}
 79 | 		}
 80 | 		// the following return statement is not the standard return result for
 81 | 		// binary search
 82 | 		return (lo + hi) / 2;
 83 | 	}
 84 | 
 85 | 	@Override
 86 | 	public Iterator<String> iterator() {
 87 | 		return Arrays.asList(this.index).iterator();
 88 | 	}
 89 | 
 90 | 	public HashMap<Integer, BufferedWriter> openWriters(File outputDirectory) {
 91 | 		HashMap<Integer, BufferedWriter> writers = new HashMap<Integer, BufferedWriter>();
 92 | 
 93 | 		File currentOutputDirectory = new File(
 94 | 				outputDirectory.getAbsolutePath());
 95 | 		if (currentOutputDirectory.exists()) {
 96 | 			try {
 97 | 				FileUtils.deleteDirectory(currentOutputDirectory);
 98 | 			} catch (IOException e) {
 99 | 				// TODO Auto-generated catch block
100 | 				e.printStackTrace();
101 | 			}
102 | 		}
103 | 		currentOutputDirectory.mkdir();
104 | 
105 | 		// calculate buffer size for writers
106 | 		// TODO: bufferSize calculation
107 | 		for (int fileCount = 0; fileCount < this.index.length; fileCount++) {
108 | 			try {
109 | 				writers.put(fileCount, new BufferedWriter(new FileWriter(
110 | 						currentOutputDirectory.getAbsolutePath() + "/"
111 | 								+ fileCount), 10 * 8 * 1024));
112 | 			} catch (IOException e) {
113 | 				// TODO Auto-generated catch block
114 | 				e.printStackTrace();
115 | 			}
116 | 		}
117 | 		return writers;
118 | 	}
119 | 
120 | 	public void closeWriters(HashMap<Integer, BufferedWriter> writers) {
121 | 		for (Entry<Integer, BufferedWriter> entry : writers.entrySet()) {
122 | 			try {
123 | 				entry.getValue().close();
124 | 			} catch (IOException e) {
125 | 				// TODO Auto-generated catch block
126 | 				e.printStackTrace();
127 | 			}
128 | 		}
129 | 	}
130 | }
131 | 


--------------------------------------------------------------------------------
/src/de/typology/indexes/WordIndexer.java:
--------------------------------------------------------------------------------
  1 | package de.typology.indexes;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.BufferedWriter;
  5 | import java.io.File;
  6 | import java.io.FileNotFoundException;
  7 | import java.io.FileReader;
  8 | import java.io.FileWriter;
  9 | import java.io.IOException;
 10 | import java.util.Comparator;
 11 | import java.util.Iterator;
 12 | import java.util.Map;
 13 | import java.util.Map.Entry;
 14 | import java.util.TreeMap;
 15 | 
 16 | /**
 17 |  * A class for building a text file containing a index representation for a
 18 |  * given text file based on the alphabetical distribution of its words.
 19 |  * 
 20 |  * @author Martin Koerner
 21 |  * 
 22 |  */
 23 | public class WordIndexer {
 24 | 
 25 | 	private TreeMap<String, Long> buildMap(File InputFile,
 26 | 			String addBeforeSentence, String addAfterSentence) {
 27 | 		BufferedReader reader;
 28 | 		try {
 29 | 			reader = new BufferedReader(new FileReader(InputFile));
 30 | 		} catch (FileNotFoundException e1) {
 31 | 			// TODO Auto-generated catch block
 32 | 			e1.printStackTrace();
 33 | 			return null;
 34 | 		}
 35 | 
 36 | 		// a comparator for wordMap
 37 | 		Comparator<String> StringComparator = new Comparator<String>() {
 38 | 			@Override
 39 | 			public int compare(String s1, String s2) {
 40 | 				return s1.compareTo(s2);
 41 | 			}
 42 | 		};
 43 | 
 44 | 		TreeMap<String, Long> wordMap = new TreeMap<String, Long>(
 45 | 				StringComparator);
 46 | 		String line;
 47 | 		// long lineCount=0L;
 48 | 		try {
 49 | 			while ((line = reader.readLine()) != null) {
 50 | 				line = addBeforeSentence + line + addAfterSentence;
 51 | 				String[] words = line.split("\\s+");
 52 | 				for (String word : words) {
 53 | 					if (wordMap.containsKey(word)) {
 54 | 						wordMap.put(word, wordMap.get(word) + 1);
 55 | 					} else {
 56 | 						wordMap.put(word, 1L);
 57 | 					}
 58 | 				}
 59 | 			}
 60 | 			reader.close();
 61 | 
 62 | 		} catch (IOException e) {
 63 | 			e.printStackTrace();
 64 | 		}
 65 | 		return wordMap;
 66 | 	}
 67 | 
 68 | 	/**
 69 | 	 * 
 70 | 	 * @param inputFile
 71 | 	 * @param maxCountDivider
 72 | 	 * @return Long: maxCountPerFile
 73 | 	 */
 74 | 	public long buildIndex(File inputFile, File indexOutputFile,
 75 | 			int maxCountDivider, String addBeforeSentence,
 76 | 			String addAfterSentence) {
 77 | 
 78 | 		// build WordMap
 79 | 		TreeMap<String, Long> wordMap = this.buildMap(inputFile,
 80 | 				addBeforeSentence, addAfterSentence);
 81 | 
 82 | 		// summarize all word counts
 83 | 		Long totalCount = 0L;
 84 | 		for (Entry<String, Long> word : wordMap.entrySet()) {
 85 | 			totalCount += word.getValue();
 86 | 		}
 87 | 
 88 | 		// calculate max count per file
 89 | 		Long maxCountPerFile = totalCount / maxCountDivider;
 90 | 		// System.out.println("maxCountPerFile: " + maxCountPerFile);
 91 | 		if (maxCountPerFile < 1L) {
 92 | 			maxCountPerFile = 1L;
 93 | 		}
 94 | 
 95 | 		// build index
 96 | 		BufferedWriter indexWriter;
 97 | 		try {
 98 | 			indexWriter = new BufferedWriter(new FileWriter(indexOutputFile));
 99 | 			Long currentFileCount = 0L;
100 | 			int fileCount = 0;
101 | 			Iterator<Map.Entry<String, Long>> wordMapIterator = wordMap
102 | 					.entrySet().iterator();
103 | 			Entry<String, Long> word;
104 | 
105 | 			while (wordMapIterator.hasNext()) {
106 | 				// get next word
107 | 				word = wordMapIterator.next();
108 | 				if (fileCount == 0
109 | 						|| currentFileCount + word.getValue() > maxCountPerFile) {
110 | 					indexWriter.write(word.getKey() + "\t" + fileCount + "\n");
111 | 					currentFileCount = word.getValue();
112 | 					fileCount++;
113 | 				} else {
114 | 					currentFileCount += word.getValue();
115 | 				}
116 | 			}
117 | 			indexWriter.close();
118 | 		} catch (IOException e) {
119 | 			// make sure that no corrupted index file is stored
120 | 			if (indexOutputFile.exists()) {
121 | 				indexOutputFile.delete();
122 | 			}
123 | 			e.printStackTrace();
124 | 			// TODO Auto-generated catch block
125 | 			e.printStackTrace();
126 | 		}
127 | 		return maxCountPerFile;
128 | 	}
129 | 
130 | }
131 | 


--------------------------------------------------------------------------------
/src/de/typology/patterns/PatternBuilder.java:
--------------------------------------------------------------------------------
  1 | package de.typology.patterns;
  2 | 
  3 | import java.util.ArrayList;
  4 | 
  5 | public class PatternBuilder {
  6 | 
  7 | 	public static ArrayList<boolean[]> getGLMPatterns(int maxModelLength) {
  8 | 		ArrayList<boolean[]> patterns = new ArrayList<boolean[]>();
  9 | 		for (int intPattern = 1; intPattern < Math.pow(2, maxModelLength); intPattern++) {
 10 | 			// leave out even sequences since they don't contain a
 11 | 			// target
 12 | 			if (intPattern % 2 == 0) {
 13 | 				continue;
 14 | 			}
 15 | 			patterns.add(PatternTransformer.getBooleanPattern(intPattern));
 16 | 		}
 17 | 		return patterns;
 18 | 	}
 19 | 
 20 | 	public static ArrayList<boolean[]> getReverseGLMPatterns(int maxModelLength) {
 21 | 		ArrayList<boolean[]> patterns = new ArrayList<boolean[]>();
 22 | 		for (int intPattern = (int) (Math.pow(2, maxModelLength) - 1); intPattern > 0; intPattern--) {
 23 | 			// leave out even sequences since they don't contain a
 24 | 			// target
 25 | 			if (intPattern % 2 == 0) {
 26 | 				continue;
 27 | 			}
 28 | 			patterns.add(PatternTransformer.getBooleanPattern(intPattern));
 29 | 		}
 30 | 		return patterns;
 31 | 	}
 32 | 
 33 | 	/**
 34 | 	 * Also returns sequences that are greater than maxModelLength but are
 35 | 	 * needed to calculate kneser ney smoothed values
 36 | 	 * 
 37 | 	 * @param maxModelLength
 38 | 	 * @return
 39 | 	 */
 40 | 	public static ArrayList<boolean[]> getGLMForSmoothingPatterns(
 41 | 			int maxModelLength) {
 42 | 		ArrayList<boolean[]> patterns = new ArrayList<boolean[]>();
 43 | 		for (int intPattern = 1; intPattern < Math.pow(2, maxModelLength); intPattern++) {
 44 | 			// // leave out even sequences since they don't contain a
 45 | 			// // target
 46 | 			// if (intPattern % 2 == 0) {
 47 | 			// continue;
 48 | 			// }
 49 | 			patterns.add(PatternTransformer.getBooleanPattern(intPattern));
 50 | 		}
 51 | 		return patterns;
 52 | 	}
 53 | 
 54 | 	public static ArrayList<boolean[]> getReverseGLMForSmoothingPatterns(
 55 | 			int maxModelLength) {
 56 | 		ArrayList<boolean[]> patterns = new ArrayList<boolean[]>();
 57 | 		for (int intPattern = (int) Math.pow(2, maxModelLength) - 1; intPattern > 0; intPattern--) {
 58 | 			// // leave out even sequences since they don't contain a
 59 | 			// // target
 60 | 			// if (intPattern % 2 == 0) {
 61 | 			// continue;
 62 | 			// }
 63 | 			patterns.add(PatternTransformer.getBooleanPattern(intPattern));
 64 | 		}
 65 | 		return patterns;
 66 | 	}
 67 | 
 68 | 	public static ArrayList<boolean[]> getLMPatterns(int maxModelLength) {
 69 | 		ArrayList<boolean[]> patterns = new ArrayList<boolean[]>();
 70 | 		for (int intPattern = 1; intPattern < Math.pow(2, maxModelLength); intPattern++) {
 71 | 			String stringPattern = Integer.toBinaryString(intPattern);
 72 | 			if (Integer.bitCount(intPattern) == stringPattern.length()) {
 73 | 				patterns.add(PatternTransformer.getBooleanPattern(intPattern));
 74 | 			}
 75 | 		}
 76 | 		return patterns;
 77 | 	}
 78 | 
 79 | 	public static ArrayList<boolean[]> getReverseLMPatterns(int maxModelLength) {
 80 | 		ArrayList<boolean[]> patterns = new ArrayList<boolean[]>();
 81 | 		for (int intPattern = (int) (Math.pow(2, maxModelLength) - 1); intPattern > 0; intPattern--) {
 82 | 			String stringPattern = Integer.toBinaryString(intPattern);
 83 | 			if (Integer.bitCount(intPattern) == stringPattern.length()) {
 84 | 				patterns.add(PatternTransformer.getBooleanPattern(intPattern));
 85 | 			}
 86 | 		}
 87 | 		return patterns;
 88 | 	}
 89 | 
 90 | 	public static ArrayList<boolean[]> getTypologyPatterns(int maxModelLength) {
 91 | 		ArrayList<boolean[]> patterns = new ArrayList<boolean[]>();
 92 | 		for (int intPattern = 1; intPattern < Math.pow(2, maxModelLength); intPattern++) {
 93 | 			String stringPattern = Integer.toBinaryString(intPattern);
 94 | 			if (Integer.bitCount(intPattern) <= 2
 95 | 					&& stringPattern.startsWith("1")
 96 | 					&& stringPattern.endsWith("1")) {
 97 | 				patterns.add(PatternTransformer.getBooleanPattern(intPattern));
 98 | 			}
 99 | 		}
100 | 		return patterns;
101 | 	}
102 | 
103 | 	public static ArrayList<boolean[]> getReverseTypologyPatterns(
104 | 			int maxModelLength) {
105 | 		ArrayList<boolean[]> patterns = new ArrayList<boolean[]>();
106 | 		for (int intPattern = (int) (Math.pow(2, maxModelLength) - 1); intPattern > 0; intPattern--) {
107 | 			String stringPattern = Integer.toBinaryString(intPattern);
108 | 			if (Integer.bitCount(intPattern) <= 2
109 | 					&& stringPattern.startsWith("1")
110 | 					&& stringPattern.endsWith("1")) {
111 | 				patterns.add(PatternTransformer.getBooleanPattern(intPattern));
112 | 			}
113 | 		}
114 | 		return patterns;
115 | 	}
116 | }
117 | 


--------------------------------------------------------------------------------
/src/de/typology/patterns/PatternTransformer.java:
--------------------------------------------------------------------------------
 1 | package de.typology.patterns;
 2 | 
 3 | import java.util.Arrays;
 4 | 
 5 | /**
 6 |  * A class for transforming the used boolean pattern into different formats like
 7 |  * a binary string representation.
 8 |  * 
 9 |  * @author Martin Koerner
10 |  * 
11 |  */
12 | public class PatternTransformer {
13 | 
14 | 	public static void main(String[] args) {
15 | 		boolean[] bool1 = { true, false, true, true };
16 | 		System.out.println(getStringPattern(bool1));
17 | 		int i = 8;
18 | 		boolean[] bool2 = PatternTransformer.getBooleanPattern(Integer
19 | 				.toBinaryString(i));
20 | 		System.out.println(Arrays.toString(bool2));
21 | 	}
22 | 
23 | 	public static String getStringPattern(boolean[] booleanPattern) {
24 | 		String stringPattern = new String();
25 | 		for (boolean bool : booleanPattern) {
26 | 			if (bool) {
27 | 				stringPattern += 1;
28 | 			} else {
29 | 				stringPattern += 0;
30 | 			}
31 | 		}
32 | 		return stringPattern;
33 | 	}
34 | 
35 | 	public static boolean[] getBooleanPattern(int intPattern) {
36 | 		return PatternTransformer.getBooleanPattern(Integer
37 | 				.toBinaryString(intPattern));
38 | 	}
39 | 
40 | 	public static boolean[] getBooleanPattern(String stringPattern) {
41 | 		boolean[] booleanPattern = new boolean[stringPattern.length()];
42 | 		for (int i = 0; i < stringPattern.length(); i++) {
43 | 			if (stringPattern.charAt(i) == '1') {
44 | 				booleanPattern[i] = true;
45 | 			} else {
46 | 				booleanPattern[i] = false;
47 | 			}
48 | 		}
49 | 		return booleanPattern;
50 | 	}
51 | 
52 | 	public static boolean[] getBooleanPatternWithOnes(int length) {
53 | 		boolean[] booleanPattern = new boolean[length];
54 | 		for (int i = 0; i < booleanPattern.length; i++) {
55 | 			booleanPattern[i] = true;
56 | 		}
57 | 		return booleanPattern;
58 | 	}
59 | 
60 | 	public static int getIntPattern(boolean[] booleanPattern) {
61 | 		String stringPattern = PatternTransformer
62 | 				.getStringPattern(booleanPattern);
63 | 		if (stringPattern.length() == 0) {
64 | 			return 0;
65 | 		} else {
66 | 			return Integer.parseInt(stringPattern, 2);
67 | 		}
68 | 	}
69 | 
70 | }
71 | 


--------------------------------------------------------------------------------
/src/de/typology/smoother/ModifiedKneserNeySmoother.java:
--------------------------------------------------------------------------------
  1 | package de.typology.smoother;
  2 | 
  3 | import java.io.File;
  4 | import java.util.HashMap;
  5 | 
  6 | import de.typology.patterns.PatternTransformer;
  7 | import de.typology.utils.Counter;
  8 | 
  9 | public class ModifiedKneserNeySmoother extends KneserNeySmoother {
 10 | 
 11 | 	public ModifiedKneserNeySmoother(File extractedSequenceDirectory,
 12 | 			File absoluteDirectory, File continuationDirectory,
 13 | 			String delimiter, int decimalPlaces) {
 14 | 		super(extractedSequenceDirectory, absoluteDirectory,
 15 | 				continuationDirectory, delimiter);
 16 | 
 17 | 		this.discountTypesValuesMapFile = new File(this.absoluteDirectory
 18 | 				.getParentFile().getAbsolutePath()
 19 | 				+ "/discount-values-mod-kneser-ney.ser");
 20 | 
 21 | 	}
 22 | 
 23 | 	private double d1;
 24 | 	private double d2;
 25 | 	private double d3plus;
 26 | 
 27 | 	/**
 28 | 	 * @param args
 29 | 	 */
 30 | 
 31 | 	@Override
 32 | 	protected HashMap<String, HashMap<String, Double>> calculateDiscountValues(
 33 | 			HashMap<String, HashMap<String, Double>> discountTypeValuesMap,
 34 | 			File inputDirectory) {
 35 | 		for (File absoluteTypeDirectory : inputDirectory.listFiles()) {
 36 | 			if (absoluteTypeDirectory.getName().contains("split")) {
 37 | 				continue;
 38 | 			}
 39 | 			HashMap<String, Double> discountValuesMap = new HashMap<String, Double>();
 40 | 			long n1 = Counter.countCountsInDirectory(1, absoluteTypeDirectory,
 41 | 					"<fs>");
 42 | 			long n2 = Counter.countCountsInDirectory(2, absoluteTypeDirectory,
 43 | 					"<fs>");
 44 | 			long n3 = Counter.countCountsInDirectory(3, absoluteTypeDirectory,
 45 | 					"<fs>");
 46 | 			long n4 = Counter.countCountsInDirectory(4, absoluteTypeDirectory,
 47 | 					"<fs>");
 48 | 			this.logger.info("n1 for " + absoluteTypeDirectory.getName() + ":"
 49 | 					+ n1);
 50 | 			this.logger.info("n2 for " + absoluteTypeDirectory.getName() + ":"
 51 | 					+ n2);
 52 | 			this.logger.info("n3 for " + absoluteTypeDirectory.getName() + ":"
 53 | 					+ n3);
 54 | 			this.logger.info("n4 for " + absoluteTypeDirectory.getName() + ":"
 55 | 					+ n4);
 56 | 			double y = n1 / ((double) n1 + 2 * n2);
 57 | 			this.d1 = 1 - 2 * y * ((double) n2 / (double) n1);
 58 | 			this.d2 = 2 - 3 * y * ((double) n3 / (double) n2);
 59 | 			this.d3plus = 3 - 4 * y * ((double) n4 / (double) n3);
 60 | 			// this.d1plus = 0.5;
 61 | 			this.logger.info("D1 for " + absoluteTypeDirectory.getName() + ":"
 62 | 					+ this.d1);
 63 | 			this.logger.info("D2 for " + absoluteTypeDirectory.getName() + ":"
 64 | 					+ this.d2);
 65 | 			this.logger.info("D3+ for " + absoluteTypeDirectory.getName() + ":"
 66 | 					+ this.d3plus);
 67 | 			discountValuesMap.put("D1", this.d1);
 68 | 			discountValuesMap.put("D2", this.d2);
 69 | 			discountValuesMap.put("D3+", this.d3plus);
 70 | 
 71 | 			discountTypeValuesMap.put(absoluteTypeDirectory.getName(),
 72 | 					discountValuesMap);
 73 | 		}
 74 | 		return discountTypeValuesMap;
 75 | 
 76 | 	}
 77 | 
 78 | 	/**
 79 | 	 * 
 80 | 	 * @param sequenceStringPattern
 81 | 	 * @param sequenceCount
 82 | 	 * @return
 83 | 	 */
 84 | 	@Override
 85 | 	protected double getDiscountValue(String sequenceStringPattern,
 86 | 			long sequenceCount) {
 87 | 		String stringPatternForBitcount = sequenceStringPattern.replaceAll("_",
 88 | 				"0");
 89 | 		if (Integer.bitCount(PatternTransformer
 90 | 				.getIntPattern(PatternTransformer
 91 | 						.getBooleanPattern(stringPatternForBitcount))) > 1) {
 92 | 			// not lowest order
 93 | 			if (sequenceCount == 1) {
 94 | 				return this.discountTypeValuesMap.get(sequenceStringPattern)
 95 | 						.get("D1");
 96 | 			}
 97 | 			if (sequenceCount == 2) {
 98 | 				return this.discountTypeValuesMap.get(sequenceStringPattern)
 99 | 						.get("D2");
100 | 			}
101 | 			if (sequenceCount >= 3) {
102 | 				return this.discountTypeValuesMap.get(sequenceStringPattern)
103 | 						.get("D3+");
104 | 			}
105 | 			// count < 1
106 | 			return 0;
107 | 		} else {
108 | 			// lowest order
109 | 			return 0;
110 | 		}
111 | 	}
112 | 
113 | 	@Override
114 | 	protected double calculateWeightNumerator(String continuationPattern,
115 | 			String sequence, int sequenceLength, String sequenceStringPattern) {
116 | 		// [0]=1+
117 | 		// [1]=1
118 | 		// [2]=2
119 | 		// [3]=3+
120 | 		return this.getDiscountValue(continuationPattern, 1)
121 | 				* this.calculateContinuationLast(sequence, sequenceLength,
122 | 						sequenceStringPattern, 1)
123 | 				+ this.getDiscountValue(continuationPattern, 2)
124 | 				* this.calculateContinuationLast(sequence, sequenceLength,
125 | 						sequenceStringPattern, 2)
126 | 				+ this.getDiscountValue(continuationPattern, 3)
127 | 				* this.calculateContinuationLast(sequence, sequenceLength,
128 | 						sequenceStringPattern, 3);
129 | 	}
130 | }
131 | 


--------------------------------------------------------------------------------
/src/de/typology/splitter/AbsoluteSplitter.java:
--------------------------------------------------------------------------------
  1 | package de.typology.splitter;
  2 | 
  3 | import java.io.File;
  4 | import java.io.FileInputStream;
  5 | import java.io.FileNotFoundException;
  6 | import java.io.IOException;
  7 | import java.io.InputStream;
  8 | import java.util.ArrayList;
  9 | import java.util.concurrent.ExecutorService;
 10 | import java.util.concurrent.Executors;
 11 | import java.util.concurrent.TimeUnit;
 12 | 
 13 | import org.apache.commons.io.FileUtils;
 14 | import org.apache.logging.log4j.LogManager;
 15 | import org.apache.logging.log4j.Logger;
 16 | 
 17 | import de.typology.indexes.WordIndex;
 18 | import de.typology.patterns.PatternTransformer;
 19 | 
 20 | /**
 21 |  * Split
 22 |  * 
 23 |  * @author Martin Koerner
 24 |  * 
 25 |  */
 26 | public class AbsoluteSplitter {
 27 | 	private File inputFile;
 28 | 	private File indexFile;
 29 | 	private File outputDirectory;
 30 | 	private String delimiter;
 31 | 	protected boolean deleteTempFiles;
 32 | 	protected String addBeforeSentence;
 33 | 	protected String addAfterSentence;
 34 | 
 35 | 	Logger logger = LogManager.getLogger(this.getClass().getName());
 36 | 
 37 | 	public AbsoluteSplitter(File inputFile, File indexFile,
 38 | 			File outputDirectory, String delimiter, boolean deleteTempFiles,
 39 | 			String addBeforeSentence, String addAfterSentence) {
 40 | 		this.inputFile = inputFile;
 41 | 		this.indexFile = indexFile;
 42 | 		this.outputDirectory = outputDirectory;
 43 | 		this.delimiter = delimiter;
 44 | 		this.deleteTempFiles = deleteTempFiles;
 45 | 		this.addBeforeSentence = addBeforeSentence;
 46 | 		this.addAfterSentence = addAfterSentence;
 47 | 		// delete old directory
 48 | 		if (outputDirectory.exists()) {
 49 | 			try {
 50 | 				FileUtils.deleteDirectory(outputDirectory);
 51 | 			} catch (IOException e) {
 52 | 				// TODO Auto-generated catch block
 53 | 				e.printStackTrace();
 54 | 			}
 55 | 		}
 56 | 		outputDirectory.mkdir();
 57 | 	}
 58 | 
 59 | 	public void split(ArrayList<boolean[]> patterns, int cores) {
 60 | 
 61 | 		this.logger
 62 | 				.info("read word index: " + this.indexFile.getAbsolutePath());
 63 | 		WordIndex wordIndex = new WordIndex(this.indexFile);
 64 | 
 65 | 		// initialize executerService
 66 | 		// int cores = Runtime.getRuntime().availableProcessors();
 67 | 		ExecutorService executorService = Executors.newFixedThreadPool(cores);
 68 | 		for (boolean[] pattern : patterns) {
 69 | 			this.logger.debug("execute SplitterTask for: "
 70 | 					+ PatternTransformer.getStringPattern(pattern)
 71 | 					+ " sequences");
 72 | 
 73 | 			try {
 74 | 				InputStream inputFileInputStream = new FileInputStream(
 75 | 						this.inputFile);
 76 | 				SplitterTask splitterTask = new SplitterTask(
 77 | 						inputFileInputStream, this.outputDirectory, wordIndex,
 78 | 						pattern, PatternTransformer.getStringPattern(pattern),
 79 | 						this.delimiter, 0, this.deleteTempFiles,
 80 | 						this.addBeforeSentence, this.addAfterSentence, false,
 81 | 						false, false);
 82 | 				executorService.execute(splitterTask);
 83 | 			} catch (FileNotFoundException e) {
 84 | 				// TODO Auto-generated catch block
 85 | 				e.printStackTrace();
 86 | 				this.logger.error("inputFile not found: "
 87 | 						+ this.inputFile.getAbsolutePath());
 88 | 				return;
 89 | 			}
 90 | 		}
 91 | 		executorService.shutdown();
 92 | 		try {
 93 | 			executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
 94 | 		} catch (InterruptedException e) {
 95 | 			// TODO Auto-generated catch block
 96 | 			e.printStackTrace();
 97 | 		}
 98 | 	}
 99 | }
100 | 


--------------------------------------------------------------------------------
/src/de/typology/splitter/Aggregator.java:
--------------------------------------------------------------------------------
  1 | package de.typology.splitter;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.BufferedWriter;
  5 | import java.io.File;
  6 | import java.io.FileReader;
  7 | import java.io.FileWriter;
  8 | import java.io.IOException;
  9 | import java.util.Comparator;
 10 | import java.util.Map.Entry;
 11 | import java.util.SortedMap;
 12 | import java.util.SortedSet;
 13 | import java.util.TreeMap;
 14 | import java.util.TreeSet;
 15 | 
 16 | import org.apache.logging.log4j.LogManager;
 17 | import org.apache.logging.log4j.Logger;
 18 | 
 19 | /**
 20 |  * A class for aggregating sequences by counting their occurrences. Expects an
 21 |  * inputStream with a size that is 30% of the allocated main memory.
 22 |  * 
 23 |  * @author Martin Koerner
 24 |  * 
 25 |  */
 26 | public class Aggregator {
 27 | 	File inputFile;
 28 | 	File outputFile;
 29 | 	String delimiter;
 30 | 	int startSortAtColumn;
 31 | 	boolean additionalCounts;
 32 | 
 33 | 	Logger logger = LogManager.getLogger(this.getClass().getName());
 34 | 
 35 | 	// this comparator is based on the value of startSortAtColumn
 36 | 	private Comparator<String> stringComparator = new Comparator<String>() {
 37 | 		@Override
 38 | 		public int compare(String string1, String string2) {
 39 | 			if (Aggregator.this.startSortAtColumn == 0) {
 40 | 				return string1.compareTo(string2);
 41 | 			} else {
 42 | 				String[] string1Split = string1.split("\\s");
 43 | 				String[] string2Split = string2.split("\\s");
 44 | 				String newString1 = "";
 45 | 				String newString2 = "";
 46 | 				for (int i = Aggregator.this.startSortAtColumn; i < string1Split.length; i++) {
 47 | 					newString1 += string1Split[i] + " ";
 48 | 					newString2 += string2Split[i] + " ";
 49 | 				}
 50 | 				newString1 = newString1.replaceFirst(" $", "");
 51 | 				newString2 = newString2.replaceFirst(" $", "");
 52 | 				int result = newString1.compareTo(newString2);
 53 | 				if (result != 0) {
 54 | 					// not equal
 55 | 					return result;
 56 | 				} else {
 57 | 					int i = 0;
 58 | 					while (i < Aggregator.this.startSortAtColumn) {
 59 | 						String newNewString1 = newString1;
 60 | 						String newNewString2 = newString2;
 61 | 						for (int j = i; j >= 0; j--) {
 62 | 							newNewString1 = string1Split[j] + " "
 63 | 									+ newNewString1;
 64 | 							newNewString2 = string2Split[j] + " "
 65 | 									+ newNewString2;
 66 | 						}
 67 | 						result = newNewString1.compareTo(newNewString2);
 68 | 						if (result != 0) {
 69 | 							// not equal
 70 | 							return result;
 71 | 						}
 72 | 						// equal
 73 | 						i++;
 74 | 					}
 75 | 					// final result: equal
 76 | 					return 0;
 77 | 				}
 78 | 			}
 79 | 		}
 80 | 	};
 81 | 
 82 | 	/**
 83 | 	 * @param inputStream
 84 | 	 * @param outputStream
 85 | 	 * @param delimiter
 86 | 	 * @param startSortAtColumn
 87 | 	 *            : First column is zero
 88 | 	 */
 89 | 	public Aggregator(File inputFile, File outputFile, String delimiter,
 90 | 			int startSortAtColumn, boolean additionalCounts) {
 91 | 		this.inputFile = inputFile;
 92 | 		this.outputFile = outputFile;
 93 | 		this.delimiter = delimiter;
 94 | 		this.startSortAtColumn = startSortAtColumn;
 95 | 		this.additionalCounts = additionalCounts;
 96 | 
 97 | 	}
 98 | 
 99 | 	public void aggregateCounts() {
100 | 		try {
101 | 			BufferedReader inputFileReader = new BufferedReader(new FileReader(
102 | 					this.inputFile));
103 | 
104 | 			SortedMap<String, Long[]> wordMapAdditionalCounts = new TreeMap<String, Long[]>(
105 | 					this.stringComparator);
106 | 			SortedMap<String, Long> wordMapNoAdditionalCounts = new TreeMap<String, Long>(
107 | 					this.stringComparator);
108 | 			String inputLine;
109 | 
110 | 			while ((inputLine = inputFileReader.readLine()) != null) {
111 | 				String[] inputLineSplit = inputLine.split(this.delimiter);
112 | 				String words = inputLineSplit[0];
113 | 				long count = Long.parseLong(inputLineSplit[1]);
114 | 				if (words.length() == 0) {
115 | 					// TODO: understand the following comment
116 | 					// logger.error("empty row in " + this.inputFile + ": \""
117 | 					// + inputLine + "\"");
118 | 					// logger.error("exiting JVM");
119 | 					// System.exit(1);
120 | 					continue;
121 | 				}
122 | 
123 | 				if (this.additionalCounts) {
124 | 					this.addCountWithAdditional(wordMapAdditionalCounts, words,
125 | 							count);
126 | 				} else {
127 | 					this.addCountWithNoAdditional(wordMapNoAdditionalCounts,
128 | 							words, count);
129 | 				}
130 | 			}
131 | 
132 | 			inputFileReader.close();
133 | 			BufferedWriter outputFileWriter = new BufferedWriter(
134 | 					new FileWriter(this.outputFile));
135 | 			if (this.additionalCounts) {
136 | 				for (Entry<String, Long[]> entry : wordMapAdditionalCounts
137 | 						.entrySet()) {
138 | 					String words = entry.getKey();
139 | 					// [0]=1+
140 | 					// [1]=1
141 | 					// [2]=2
142 | 					// [3]=3+
143 | 					outputFileWriter.write(words + this.delimiter
144 | 							+ entry.getValue()[0] + this.delimiter
145 | 							+ entry.getValue()[1] + this.delimiter
146 | 							+ entry.getValue()[2] + this.delimiter
147 | 							+ entry.getValue()[3] + "\n");
148 | 				}
149 | 			} else {
150 | 				for (Entry<String, Long> entry : wordMapNoAdditionalCounts
151 | 						.entrySet()) {
152 | 					String words = entry.getKey();
153 | 					outputFileWriter.write(words + this.delimiter
154 | 							+ entry.getValue() + "\n");
155 | 				}
156 | 			}
157 | 			outputFileWriter.close();
158 | 		} catch (IOException e) {
159 | 			// TODO Auto-generated catch block
160 | 			e.printStackTrace();
161 | 		}
162 | 	}
163 | 
164 | 	private void addCountWithNoAdditional(
165 | 			SortedMap<String, Long> wordMapNoAdditionalCounts, String words,
166 | 			long count) {
167 | 		if (wordMapNoAdditionalCounts.containsKey(words)) {
168 | 			wordMapNoAdditionalCounts.put(words,
169 | 					wordMapNoAdditionalCounts.get(words) + count);
170 | 		} else {
171 | 			wordMapNoAdditionalCounts.put(words, count);
172 | 		}
173 | 	}
174 | 
175 | 	private void addCountWithAdditional(SortedMap<String, Long[]> wordMap,
176 | 			String words, long count) {
177 | 		if (wordMap.containsKey(words)) {
178 | 			Long[] countTypeArray = wordMap.get(words);
179 | 			countTypeArray[0] = countTypeArray[0] + count;
180 | 			if (count == 1) {
181 | 				countTypeArray[1] = countTypeArray[1] + count;
182 | 			}
183 | 			if (count == 2) {
184 | 				countTypeArray[2] = countTypeArray[2] + count;
185 | 			}
186 | 			if (count >= 3) {
187 | 				countTypeArray[3] = countTypeArray[3] + count;
188 | 			}
189 | 		} else {
190 | 			Long[] countTypeArray = new Long[4];
191 | 			countTypeArray[0] = count;
192 | 			if (count == 1) {
193 | 				countTypeArray[1] = count;
194 | 			} else {
195 | 				countTypeArray[1] = 0L;
196 | 			}
197 | 			if (count == 2) {
198 | 				countTypeArray[2] = count;
199 | 			} else {
200 | 				countTypeArray[2] = 0L;
201 | 			}
202 | 			if (count >= 3) {
203 | 				countTypeArray[3] = count;
204 | 			} else {
205 | 				countTypeArray[3] = 0L;
206 | 			}
207 | 			wordMap.put(words, countTypeArray);
208 | 		}
209 | 	}
210 | 
211 | 	public void aggregateWithoutCounts() {
212 | 		try {
213 | 			BufferedReader inputFileReader = new BufferedReader(new FileReader(
214 | 					this.inputFile));
215 | 
216 | 			SortedSet<String> wordSet = new TreeSet<String>(
217 | 					this.stringComparator);
218 | 			String inputLine;
219 | 
220 | 			while ((inputLine = inputFileReader.readLine()) != null) {
221 | 				wordSet.add(inputLine);
222 | 			}
223 | 			inputFileReader.close();
224 | 			BufferedWriter outputFileWriter = new BufferedWriter(
225 | 					new FileWriter(this.outputFile));
226 | 			for (String line : wordSet) {
227 | 				outputFileWriter.write(line + "\n");
228 | 			}
229 | 			outputFileWriter.close();
230 | 		} catch (IOException e) {
231 | 			// TODO Auto-generated catch block
232 | 			e.printStackTrace();
233 | 		}
234 | 	}
235 | }
236 | 


--------------------------------------------------------------------------------
/src/de/typology/splitter/DataSetSplitter.java:
--------------------------------------------------------------------------------
  1 | package de.typology.splitter;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.BufferedWriter;
  5 | import java.io.File;
  6 | import java.io.FileReader;
  7 | import java.io.FileWriter;
  8 | import java.io.IOException;
  9 | import java.util.HashMap;
 10 | import java.util.Map.Entry;
 11 | 
 12 | import org.apache.logging.log4j.LogManager;
 13 | import org.apache.logging.log4j.Logger;
 14 | 
 15 | import de.typology.utils.Config;
 16 | 
 17 | /**
 18 |  * This class splits and samples a given input file into trainings and test data
 19 |  * 
 20 |  * The threasholds can be configured in config.txt the relevant fields are
 21 |  * 
 22 |  * splitDataRatio
 23 |  * 
 24 |  * smpleRate
 25 |  * 
 26 |  * nGramLength
 27 |  * 
 28 |  * @author Rene Pickhardt, Martin Koerner
 29 |  * 
 30 |  */
 31 | public class DataSetSplitter {
 32 | 
 33 | 	/**
 34 | 	 * @param args
 35 | 	 */
 36 | 	public static void main(String[] args) {
 37 | 		String outputDirectory = Config.get().outputDirectory
 38 | 				+ Config.get().inputDataSet;
 39 | 
 40 | 		DataSetSplitter dss = new DataSetSplitter(new File(outputDirectory),
 41 | 				"normalized.txt");
 42 | 		dss.split("training.txt", "learning.txt", "testing.txt", 5);
 43 | 		dss.splitIntoSequences(new File(outputDirectory + "/training.txt"),
 44 | 				Config.get().modelLength, Config.get().numberOfQueries);
 45 | 
 46 | 	}
 47 | 
 48 | 	private File directory;
 49 | 
 50 | 	private String inputName;
 51 | 
 52 | 	Logger logger = LogManager.getLogger(this.getClass().getName());
 53 | 
 54 | 	public DataSetSplitter(File directory, String inputName) {
 55 | 
 56 | 		this.directory = directory;
 57 | 		this.inputName = inputName;
 58 | 	}
 59 | 
 60 | 	/**
 61 | 	 * Takes a given input file and provides a 3 way split. The file can be
 62 | 	 * sampled via the sampleRatio. A high sample ratio means that a large
 63 | 	 * portion of the file is being thrown away
 64 | 	 * 
 65 | 	 * There the splitDataRatio specifies the percentage of the file that is
 66 | 	 * used as training data. The rest will be used as test and learing data.
 67 | 	 * 
 68 | 	 * The last parameter splitTestRatio is usually set to 50 and means that the
 69 | 	 * test data is also devided again into learning and testing data
 70 | 	 * 
 71 | 	 * 
 72 | 	 * @param inputFile
 73 | 	 *            potentially large text file that needs to be split
 74 | 	 * @param trainingFileName
 75 | 	 *            filename where training data are to be stored
 76 | 	 * @param learningFileName
 77 | 	 *            filename where learning data are to be stored
 78 | 	 * @param testingFileName
 79 | 	 *            filename where test data are to be stored
 80 | 	 */
 81 | 	public void split(String trainingFileName, String learningFileName,
 82 | 			String testingFileName, int sequenceLength) {
 83 | 		this.logger.info("splitting into training, testing and learning file: "
 84 | 				+ this.directory + "/" + this.inputName);
 85 | 		try {
 86 | 			BufferedReader reader = new BufferedReader(new FileReader(
 87 | 					this.directory.getAbsolutePath() + "/" + this.inputName));
 88 | 			BufferedWriter trainingDataWriter = new BufferedWriter(
 89 | 					new FileWriter(this.directory.getAbsolutePath() + "/"
 90 | 							+ trainingFileName));
 91 | 			BufferedWriter learningDataWriter = new BufferedWriter(
 92 | 					new FileWriter(this.directory.getAbsolutePath() + "/"
 93 | 							+ learningFileName));
 94 | 			BufferedWriter testingDataWriter = new BufferedWriter(
 95 | 					new FileWriter(this.directory.getAbsolutePath() + "/"
 96 | 							+ testingFileName));
 97 | 			int rand;
 98 | 			String line;
 99 | 			while ((line = reader.readLine()) != null) {
100 | 				rand = (int) (Math.random() * 100);
101 | 				if (rand >= Config.get().sampleRate) {
102 | 					// keep data
103 | 					rand = (int) (Math.random() * 100);
104 | 					if (rand >= Config.get().splitDataRatio) {
105 | 						// store data in testing or learning file
106 | 						rand = (int) (Math.random() * 100);
107 | 						if (rand >= Config.get().splitTestRatio) {
108 | 							if (Config.get().addSentenceTags) {
109 | 								// TODO make this flexible
110 | 								line = "<s> " + line + " </s>";
111 | 							}
112 | 							learningDataWriter.write(line + "\n");
113 | 						} else {
114 | 							if (Config.get().addSentenceTags) {
115 | 								// TODO make this flexible
116 | 								line = "<s> " + line + " </s>";
117 | 							}
118 | 							testingDataWriter.write(line + "\n");
119 | 						}
120 | 					} else {
121 | 						// store data in training file
122 | 						trainingDataWriter.write(line + "\n");
123 | 					}
124 | 				}
125 | 			}
126 | 			reader.close();
127 | 			trainingDataWriter.close();
128 | 			learningDataWriter.close();
129 | 			testingDataWriter.close();
130 | 
131 | 			this.logger.info("splitting done");
132 | 
133 | 		} catch (IOException e) {
134 | 			// TODO Auto-generated catch block
135 | 			e.printStackTrace();
136 | 		}
137 | 	}
138 | 
139 | 	public void splitIntoSequences(File inputFile, int maxSequenceLength,
140 | 			int numberOfSequences) {
141 | 		System.out.println(maxSequenceLength);
142 | 		String[] fileNameSplit = inputFile.getName().split("\\.");
143 | 
144 | 		HashMap<Integer, BufferedWriter> testSequenceFileWriters = new HashMap<Integer, BufferedWriter>();
145 | 		for (int i = 1; i <= maxSequenceLength; i++) {
146 | 			try {
147 | 				testSequenceFileWriters.put(i,
148 | 						new BufferedWriter(new FileWriter(new File(
149 | 								this.directory.getAbsolutePath() + "/"
150 | 										+ fileNameSplit[0] + "-samples-" + i
151 | 										+ "." + fileNameSplit[1]))));
152 | 			} catch (IOException e) {
153 | 				// TODO Auto-generated catch block
154 | 				e.printStackTrace();
155 | 			}
156 | 		}
157 | 
158 | 		// get total count from stats file
159 | 		long sequenceCount = 0L;
160 | 		try {
161 | 			BufferedReader reader = new BufferedReader(
162 | 					new FileReader(inputFile));
163 | 			String line;
164 | 			// count sequences
165 | 			while ((line = reader.readLine()) != null) {
166 | 				String[] lineSplit = line.split("\\s");
167 | 				if (lineSplit.length < maxSequenceLength) {
168 | 					continue;
169 | 				} else {
170 | 					int sequenceStart = 0;
171 | 					while (lineSplit.length - sequenceStart >= maxSequenceLength) {
172 | 						sequenceCount++;
173 | 						sequenceStart++;
174 | 					}
175 | 				}
176 | 			}
177 | 			reader.close();
178 | 		} catch (IOException e) {
179 | 			// TODO Auto-generated catch block
180 | 			e.printStackTrace();
181 | 		}
182 | 		this.logger.debug("sequenceCount: " + sequenceCount);
183 | 		double sequenceProbability = (double) numberOfSequences / sequenceCount;
184 | 		long skipDistance = sequenceCount / numberOfSequences;
185 | 		this.logger.debug("skipDistance: " + skipDistance);
186 | 
187 | 		try {
188 | 			BufferedReader reader = new BufferedReader(
189 | 					new FileReader(inputFile));
190 | 			this.logger.info("splitting " + inputFile.getName()
191 | 					+ " into sequences");
192 | 			String line;
193 | 			while ((line = reader.readLine()) != null) {
194 | 				String[] originalLineSplit = line.split("\\s");
195 | 				int linePointer = 0;
196 | 				while (originalLineSplit.length - linePointer >= maxSequenceLength) {
197 | 					// build current Sequence
198 | 					String currentSequence = "";
199 | 					for (int i = 0; i < maxSequenceLength; i++) {
200 | 						currentSequence += originalLineSplit[linePointer + i]
201 | 								+ " ";
202 | 					}
203 | 					currentSequence = currentSequence.replaceFirst(" $", "");
204 | 					if (Math.random() <= sequenceProbability) {
205 | 						String[] currentSequenceSplit = currentSequence
206 | 								.split("\\s");
207 | 						for (int i = 1; i <= maxSequenceLength; i++) {
208 | 							// build result sequence
209 | 							String resultSequence = "";
210 | 							for (int j = 0; j < i; j++) {
211 | 								resultSequence += currentSequenceSplit[j] + " ";
212 | 							}
213 | 							resultSequence = resultSequence.replaceFirst(" $",
214 | 									"");
215 | 							testSequenceFileWriters.get(i).write(
216 | 									resultSequence + "\n");
217 | 						}
218 | 					}
219 | 					linePointer++;
220 | 				}
221 | 			}
222 | 
223 | 			reader.close();
224 | 			for (Entry<Integer, BufferedWriter> testSequenceWritersEntry : testSequenceFileWriters
225 | 					.entrySet()) {
226 | 				testSequenceWritersEntry.getValue().close();
227 | 			}
228 | 		} catch (IOException e) {
229 | 			// TODO Auto-generated catch block
230 | 			e.printStackTrace();
231 | 		}
232 | 	}
233 | }
234 | 


--------------------------------------------------------------------------------
/src/de/typology/splitter/LineCounterTask.java:
--------------------------------------------------------------------------------
 1 | package de.typology.splitter;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.BufferedWriter;
 5 | import java.io.File;
 6 | import java.io.FileWriter;
 7 | import java.io.IOException;
 8 | import java.io.InputStream;
 9 | import java.io.InputStreamReader;
10 | 
11 | import org.apache.commons.io.FileUtils;
12 | import org.apache.logging.log4j.LogManager;
13 | import org.apache.logging.log4j.Logger;
14 | 
15 | public class LineCounterTask implements Runnable {
16 | 	protected InputStream inputStream;
17 | 	protected File outputDirectory;
18 | 	protected String patternLabel;
19 | 	protected String delimiter;
20 | 	protected boolean setCountToOne;
21 | 	protected boolean additionalCounts;
22 | 
23 | 	Logger logger = LogManager.getLogger(this.getClass().getName());
24 | 
25 | 	public LineCounterTask(InputStream inputStream, File outputDirectory,
26 | 			String patternLabel, String delimiter, boolean setCountToOne,
27 | 			boolean additionalCounts) {
28 | 		this.inputStream = inputStream;
29 | 		this.outputDirectory = outputDirectory;
30 | 		this.patternLabel = patternLabel;
31 | 		this.delimiter = delimiter;
32 | 		this.setCountToOne = setCountToOne;
33 | 		this.additionalCounts = additionalCounts;
34 | 	}
35 | 
36 | 	@Override
37 | 	public void run() {
38 | 		File outputDirectory = new File(this.outputDirectory.getAbsolutePath()
39 | 				+ "/" + this.patternLabel);
40 | 		if (outputDirectory.exists()) {
41 | 			try {
42 | 				FileUtils.deleteDirectory(outputDirectory);
43 | 			} catch (IOException e) {
44 | 				// TODO Auto-generated catch block
45 | 				e.printStackTrace();
46 | 			}
47 | 		}
48 | 		outputDirectory.mkdir();
49 | 		this.logger.info("count lines for: "
50 | 				+ outputDirectory.getAbsolutePath());
51 | 
52 | 		BufferedReader inputStreamReader = new BufferedReader(
53 | 				new InputStreamReader(this.inputStream));
54 | 		long onePlusLineCount = 0L;
55 | 		long oneLineCount = 0L;
56 | 		long twoLineCount = 0L;
57 | 		long threePlusLineCount = 0L;
58 | 		String line;
59 | 		try {
60 | 			if (this.setCountToOne) {
61 | 				while ((line = inputStreamReader.readLine()) != null) {
62 | 					onePlusLineCount++;
63 | 				}
64 | 			} else {
65 | 				while ((line = inputStreamReader.readLine()) != null) {
66 | 					long currentCount = Long.parseLong(line
67 | 							.split(this.delimiter)[1]);
68 | 					onePlusLineCount += currentCount;
69 | 					if (currentCount == 1L) {
70 | 						oneLineCount += currentCount;
71 | 					}
72 | 					if (currentCount == 2L) {
73 | 						twoLineCount += currentCount;
74 | 					}
75 | 					if (currentCount >= 3L) {
76 | 						threePlusLineCount += currentCount;
77 | 					}
78 | 				}
79 | 			}
80 | 			inputStreamReader.close();
81 | 
82 | 			BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(
83 | 					outputDirectory.getAbsolutePath() + "/" + "all"));
84 | 			if (this.additionalCounts) {
85 | 				bufferedWriter.write(onePlusLineCount + this.delimiter
86 | 						+ oneLineCount + this.delimiter + twoLineCount
87 | 						+ this.delimiter + threePlusLineCount + "\n");
88 | 			} else {
89 | 				bufferedWriter.write(onePlusLineCount + "\n");
90 | 			}
91 | 			bufferedWriter.close();
92 | 
93 | 		} catch (IOException e) {
94 | 			// TODO Auto-generated catch block
95 | 			e.printStackTrace();
96 | 		}
97 | 	}
98 | }
99 | 


--------------------------------------------------------------------------------
/src/de/typology/splitter/SequenceModifier.java:
--------------------------------------------------------------------------------
  1 | package de.typology.splitter;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.BufferedWriter;
  5 | import java.io.File;
  6 | import java.io.FileReader;
  7 | import java.io.IOException;
  8 | import java.io.OutputStream;
  9 | import java.io.OutputStreamWriter;
 10 | 
 11 | /**
 12 |  * A class for modifying the sequences in InputDirectory based on the given
 13 |  * Pattern. The modified sequences are returned as outputStream
 14 |  * 
 15 |  * @author Martin Koerner
 16 |  * 
 17 |  */
 18 | public class SequenceModifier implements Runnable {
 19 | 	private File inputDirectory;
 20 | 	private OutputStream outputStream;
 21 | 	private String delimiter;
 22 | 	private boolean[] pattern;
 23 | 	private boolean modifyCount;
 24 | 	private boolean setCountToOne;
 25 | 
 26 | 	public SequenceModifier(File inputDirectory, OutputStream outputStream,
 27 | 			String delimiter, boolean[] pattern, boolean modifyCount,
 28 | 			boolean setCountToOne) {
 29 | 		this.inputDirectory = inputDirectory;
 30 | 		this.outputStream = outputStream;
 31 | 		this.delimiter = delimiter;
 32 | 		this.pattern = pattern;
 33 | 		this.modifyCount = modifyCount;
 34 | 		this.setCountToOne = setCountToOne;
 35 | 	}
 36 | 
 37 | 	@Override
 38 | 	public void run() {
 39 | 		BufferedWriter outputStreamWriter = new BufferedWriter(
 40 | 				new OutputStreamWriter(this.outputStream));
 41 | 		try {
 42 | 			for (File inputFile : this.inputDirectory.listFiles()) {
 43 | 				BufferedReader inputFileReader = new BufferedReader(
 44 | 						new FileReader(inputFile));
 45 | 				String line;
 46 | 				while ((line = inputFileReader.readLine()) != null) {
 47 | 					String[] lineSplit = line.split(this.delimiter);
 48 | 					if (this.modifyCount) {
 49 | 						String[] words = lineSplit[0].split("\\s");
 50 | 						String modifiedWords = "";
 51 | 						try {
 52 | 							for (int i = 0; i < this.pattern.length; i++) {
 53 | 								if (this.pattern[i]) {
 54 | 									modifiedWords += words[i] + " ";
 55 | 								}
 56 | 							}
 57 | 						} catch (Exception e) {
 58 | 							e.printStackTrace();
 59 | 						}
 60 | 						modifiedWords = modifiedWords.replaceFirst(" $", "");
 61 | 						// TODO: better solution?
 62 | 						if (words[0].equals("<fs>")) {
 63 | 							// for kneser-ney smoothing: every sequence that
 64 | 							// starts
 65 | 							// with <fs> counts as a new sequence
 66 | 							if (this.inputDirectory.getName().equals("1")) {
 67 | 								continue;
 68 | 							}
 69 | 							if (!this.pattern[0]) {
 70 | 								// set <s> in _1 to zero
 71 | 								if (this.inputDirectory.getName().equals("11")
 72 | 										&& words[1].equals("<s>")) {
 73 | 									outputStreamWriter.write("<s>"
 74 | 											+ this.delimiter + "0\n");
 75 | 								} else {
 76 | 									outputStreamWriter.write(modifiedWords
 77 | 											+ this.delimiter
 78 | 											+ line.split(this.delimiter)[1]
 79 | 											+ "\n");
 80 | 								}
 81 | 							}
 82 | 							// if pattern[0]==true: leave out sequence
 83 | 						} else {
 84 | 							if (this.setCountToOne) {
 85 | 								outputStreamWriter.write(modifiedWords
 86 | 										+ this.delimiter + "1\n");
 87 | 							} else {
 88 | 								outputStreamWriter.write(modifiedWords
 89 | 										+ this.delimiter + lineSplit[1] + "\n");
 90 | 							}
 91 | 						}
 92 | 					} else {
 93 | 						outputStreamWriter.write(line + "\n");
 94 | 					}
 95 | 
 96 | 				}
 97 | 				inputFileReader.close();
 98 | 			}
 99 | 			outputStreamWriter.close();
100 | 		} catch (IOException e) {
101 | 			// TODO Auto-generated catch block
102 | 			e.printStackTrace();
103 | 		}
104 | 
105 | 	}
106 | }
107 | 


--------------------------------------------------------------------------------
/src/de/typology/splitter/Sequencer.java:
--------------------------------------------------------------------------------
  1 | package de.typology.splitter;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.BufferedWriter;
  5 | import java.io.File;
  6 | import java.io.IOException;
  7 | import java.io.InputStream;
  8 | import java.io.InputStreamReader;
  9 | import java.util.HashMap;
 10 | 
 11 | import org.apache.logging.log4j.LogManager;
 12 | import org.apache.logging.log4j.Logger;
 13 | 
 14 | import de.typology.indexes.WordIndex;
 15 | 
 16 | /**
 17 |  * A class for splitting a text file (via inputStream) into sequences that are
 18 |  * stored in different files based on the indexFile in outputDirectory.
 19 |  * 
 20 |  * @author Martin Koerner
 21 |  * 
 22 |  */
 23 | public class Sequencer {
 24 | 	protected InputStream inputStream;
 25 | 	protected File outputDirectory;
 26 | 	protected WordIndex wordIndex;
 27 | 	protected boolean[] pattern;
 28 | 	protected String addBeforeSentence;
 29 | 	protected String addAfterSentence;
 30 | 	protected String delimiter;
 31 | 	protected boolean completeLine;
 32 | 	private int startSortAtColumn;
 33 | 
 34 | 	Logger logger = LogManager.getLogger(this.getClass().getName());
 35 | 
 36 | 	public Sequencer(InputStream inputStream, File outputDirectory,
 37 | 			WordIndex wordIndex, boolean[] pattern, String addBeforeSentence,
 38 | 			String addAfterSentence, String delimiter, boolean completeLine,
 39 | 			int startSortAtColumn) {
 40 | 		this.inputStream = inputStream;
 41 | 		this.outputDirectory = outputDirectory;
 42 | 		this.wordIndex = wordIndex;
 43 | 		this.pattern = pattern;
 44 | 		this.addBeforeSentence = addBeforeSentence;
 45 | 		this.addAfterSentence = addAfterSentence;
 46 | 		this.delimiter = delimiter;
 47 | 		this.completeLine = completeLine;
 48 | 		this.startSortAtColumn = startSortAtColumn;
 49 | 
 50 | 	}
 51 | 
 52 | 	public void splitIntoFiles() {
 53 | 		HashMap<Integer, BufferedWriter> writers = this.wordIndex
 54 | 				.openWriters(this.outputDirectory);
 55 | 		// TODO: bufferSize calculation
 56 | 		BufferedReader bufferedReader = new BufferedReader(
 57 | 				new InputStreamReader(this.inputStream), 100 * 8 * 1024);
 58 | 		// BufferedReader bufferedReader = new BufferedReader(
 59 | 		// new InputStreamReader(this.inputStream), 10 * 8 * 1024);
 60 | 		String line;
 61 | 		try {
 62 | 			while ((line = bufferedReader.readLine()) != null) {
 63 | 				line = this.addBeforeSentence + line + this.addAfterSentence;
 64 | 				if (this.completeLine) {
 65 | 					String[] lineSplit = line.split("\\s");
 66 | 					writers.get(
 67 | 							this.wordIndex
 68 | 									.rank(lineSplit[this.startSortAtColumn]))
 69 | 							.write(line + "\n");
 70 | 				} else {
 71 | 					String[] lineSplit = line.split("\\s");
 72 | 					int linePointer = 0;
 73 | 					while (lineSplit.length - linePointer >= this.pattern.length) {
 74 | 						String sequence = "";
 75 | 						for (int i = 0; i < this.pattern.length; i++) {
 76 | 							if (this.pattern[i]) {
 77 | 								sequence += lineSplit[linePointer + i] + " ";
 78 | 							}
 79 | 						}
 80 | 						sequence = sequence.replaceFirst(" $", "");
 81 | 						sequence += this.delimiter + "1\n";
 82 | 
 83 | 						// write sequence
 84 | 
 85 | 						writers.get(
 86 | 								this.wordIndex.rank(sequence.split(" ")[this.startSortAtColumn]))
 87 | 								.write(sequence);
 88 | 
 89 | 						linePointer++;
 90 | 					}
 91 | 				}
 92 | 			}
 93 | 			bufferedReader.close();
 94 | 		} catch (IOException e) {
 95 | 			// TODO Auto-generated catch block
 96 | 			e.printStackTrace();
 97 | 		}
 98 | 
 99 | 		this.wordIndex.closeWriters(writers);
100 | 	}
101 | 
102 | 	public boolean[] getPattern() {
103 | 		return this.pattern;
104 | 	}
105 | }
106 | 


--------------------------------------------------------------------------------
/src/de/typology/splitter/SmoothingSplitter.java:
--------------------------------------------------------------------------------
  1 | package de.typology.splitter;
  2 | 
  3 | import java.io.File;
  4 | import java.io.IOException;
  5 | import java.io.OutputStream;
  6 | import java.io.PipedInputStream;
  7 | import java.io.PipedOutputStream;
  8 | import java.util.ArrayList;
  9 | import java.util.Comparator;
 10 | import java.util.HashSet;
 11 | import java.util.Map.Entry;
 12 | import java.util.SortedMap;
 13 | import java.util.TreeMap;
 14 | import java.util.concurrent.ExecutorService;
 15 | import java.util.concurrent.Executors;
 16 | import java.util.concurrent.TimeUnit;
 17 | 
 18 | import org.apache.logging.log4j.LogManager;
 19 | import org.apache.logging.log4j.Logger;
 20 | 
 21 | import de.typology.indexes.WordIndex;
 22 | import de.typology.patterns.PatternTransformer;
 23 | 
 24 | public class SmoothingSplitter {
 25 | 	private File absoluteDirectory;
 26 | 	private File continuationDirectory;
 27 | 	private File indexFile;
 28 | 	private String delimiter;
 29 | 	protected boolean deleteTempFiles;
 30 | 
 31 | 	Logger logger = LogManager.getLogger(this.getClass().getName());
 32 | 	private ExecutorService executorService;
 33 | 
 34 | 	private Comparator<boolean[]> patternComparator = new Comparator<boolean[]>() {
 35 | 		@Override
 36 | 		public int compare(boolean[] pattern1, boolean[] pattern2) {
 37 | 			return PatternTransformer.getStringPattern(pattern2).compareTo(
 38 | 					PatternTransformer.getStringPattern(pattern1));
 39 | 		}
 40 | 	};
 41 | 
 42 | 	public SmoothingSplitter(File absoluteDirectory,
 43 | 			File continuationDirectory, File indexFile, String delimiter,
 44 | 			boolean deleteTempFiles) {
 45 | 		this.absoluteDirectory = absoluteDirectory;
 46 | 		this.continuationDirectory = continuationDirectory;
 47 | 		continuationDirectory.mkdir();
 48 | 		this.indexFile = indexFile;
 49 | 		this.delimiter = delimiter;
 50 | 		this.deleteTempFiles = deleteTempFiles;
 51 | 	}
 52 | 
 53 | 	public void split(ArrayList<boolean[]> patterns, int cores) {
 54 | 		// read Index
 55 | 		this.logger
 56 | 				.info("read word index: " + this.indexFile.getAbsolutePath());
 57 | 		WordIndex wordIndex = new WordIndex(this.indexFile);
 58 | 		// initialize executerService
 59 | 		// int cores = Runtime.getRuntime().availableProcessors();
 60 | 
 61 | 		SortedMap<boolean[], boolean[]> continuationMap = this
 62 | 				.filterContinuationMap(this.getContinuationMap(patterns));
 63 | 
 64 | 		HashSet<boolean[]> finishedPatterns = new HashSet<boolean[]>();
 65 | 
 66 | 		while (finishedPatterns.size() < continuationMap.size()) {
 67 | 			ArrayList<boolean[]> currentPatterns = new ArrayList<boolean[]>();
 68 | 			this.executorService = Executors.newFixedThreadPool(cores);
 69 | 
 70 | 			for (Entry<boolean[], boolean[]> entry : continuationMap.entrySet()) {
 71 | 				// list for storing patterns that are currently computed
 72 | 
 73 | 				if (!finishedPatterns.contains(entry.getKey())) {
 74 | 					if (!PatternTransformer.getStringPattern(entry.getValue())
 75 | 							.contains("0")) {
 76 | 						// read absolute files
 77 | 						currentPatterns.add(entry.getKey());
 78 | 						this.logger.info("build continuation for "
 79 | 								+ PatternTransformer.getStringPattern(entry
 80 | 										.getKey())
 81 | 								+ " from absolute "
 82 | 								+ PatternTransformer.getStringPattern(entry
 83 | 										.getValue()));
 84 | 
 85 | 						String inputPatternLabel = PatternTransformer
 86 | 								.getStringPattern(entry.getValue());
 87 | 						boolean[] outputPattern = PatternTransformer
 88 | 								.getBooleanPattern(PatternTransformer
 89 | 										.getStringPattern(entry.getKey())
 90 | 										.replaceAll("0", ""));
 91 | 						String outputPatternLabel = PatternTransformer
 92 | 								.getStringPattern(entry.getKey()).replaceAll(
 93 | 										"0", "_");
 94 | 
 95 | 						File currentAbsoluteInputDirectory = new File(
 96 | 								this.absoluteDirectory.getAbsolutePath() + "/"
 97 | 										+ inputPatternLabel);
 98 | 
 99 | 						this.logger.debug("inputPattern: "
100 | 								+ PatternTransformer.getStringPattern(entry
101 | 										.getValue()));
102 | 						this.logger.debug("inputPatternLabel: "
103 | 								+ inputPatternLabel);
104 | 						this.logger.debug("outputPattern: "
105 | 								+ PatternTransformer
106 | 										.getStringPattern(outputPattern));
107 | 						this.logger.debug("newPatternLabel: "
108 | 								+ outputPatternLabel);
109 | 						this.logger.debug("patternForModifier: "
110 | 								+ PatternTransformer.getStringPattern(entry
111 | 										.getKey()));
112 | 
113 | 						this.splitType(currentAbsoluteInputDirectory,
114 | 								this.continuationDirectory, outputPattern,
115 | 								outputPatternLabel, entry.getKey(), wordIndex,
116 | 								true, true);
117 | 					} else {
118 | 						if (finishedPatterns.contains(entry.getValue())) {
119 | 							// read continuation files
120 | 							currentPatterns.add(entry.getKey());
121 | 							this.logger.info("build continuation for "
122 | 									+ PatternTransformer.getStringPattern(entry
123 | 											.getKey())
124 | 									+ " from continuation "
125 | 									+ PatternTransformer.getStringPattern(entry
126 | 											.getValue()));
127 | 
128 | 							String inputPatternLabel = PatternTransformer
129 | 									.getStringPattern(entry.getValue())
130 | 									.replaceAll("0", "_");
131 | 							boolean[] outputPattern = PatternTransformer
132 | 									.getBooleanPattern(PatternTransformer
133 | 											.getStringPattern(entry.getKey())
134 | 											.replaceAll("0", ""));
135 | 							String outputPatternLabel = PatternTransformer
136 | 									.getStringPattern(entry.getKey())
137 | 									.replaceAll("0", "_");
138 | 
139 | 							File currentContinuationInputDirectory = new File(
140 | 									this.continuationDirectory
141 | 											.getAbsolutePath()
142 | 											+ "/"
143 | 											+ inputPatternLabel);
144 | 
145 | 							// build patternForModifier
146 | 							boolean[] patternForModifier = new boolean[Integer
147 | 									.bitCount(PatternTransformer
148 | 											.getIntPattern(entry.getValue()))];
149 | 							System.out.println(outputPatternLabel + "<--"
150 | 									+ inputPatternLabel + " "
151 | 									+ patternForModifier.length);
152 | 							int patternPointer = 0;
153 | 							for (int i = 0; i < entry.getValue().length; i++) {
154 | 								if (entry.getKey()[i] && entry.getValue()[i]) {
155 | 									patternForModifier[patternPointer] = true;
156 | 									patternPointer++;
157 | 								} else {
158 | 									if (!entry.getKey()[i]
159 | 											&& entry.getValue()[i]) {
160 | 										patternForModifier[patternPointer] = false;
161 | 										patternPointer++;
162 | 									}
163 | 								}
164 | 							}
165 | 
166 | 							this.logger.debug("inputPattern: "
167 | 									+ PatternTransformer.getStringPattern(entry
168 | 											.getValue()));
169 | 							this.logger.debug("inputPatternLabel: "
170 | 									+ inputPatternLabel);
171 | 							this.logger.debug("outputPattern: "
172 | 									+ PatternTransformer
173 | 											.getStringPattern(outputPattern));
174 | 							this.logger.debug("newPatternLabel: "
175 | 									+ outputPatternLabel);
176 | 							this.logger
177 | 									.debug("patternForModifier: "
178 | 											+ PatternTransformer
179 | 													.getStringPattern(patternForModifier));
180 | 
181 | 							this.splitType(currentContinuationInputDirectory,
182 | 									this.continuationDirectory, outputPattern,
183 | 									outputPatternLabel, patternForModifier,
184 | 									wordIndex, false, true);
185 | 
186 | 						}
187 | 					}
188 | 				}
189 | 			}
190 | 			this.executorService.shutdown();
191 | 			this.logger.info("end of this round of calculation");
192 | 			try {
193 | 				this.executorService.awaitTermination(Long.MAX_VALUE,
194 | 						TimeUnit.SECONDS);
195 | 			} catch (InterruptedException e) {
196 | 				// TODO Auto-generated catch block
197 | 				e.printStackTrace();
198 | 			}
199 | 			// add currently computed patterns to finishedPatterns
200 | 			for (boolean[] currentPattern : currentPatterns) {
201 | 				finishedPatterns.add(currentPattern);
202 | 			}
203 | 		}
204 | 
205 | 	}
206 | 
207 | 	private void splitType(File currentInputDirectory, File outputDirectory,
208 | 			boolean[] newPattern, String newPatternLabel,
209 | 			boolean[] patternForModifier, WordIndex wordIndex,
210 | 			boolean setCountToOne, boolean additionalCounts) {
211 | 		PipedInputStream pipedInputStream = new PipedInputStream(100 * 8 * 1024);
212 | 
213 | 		if (Integer.bitCount(PatternTransformer.getIntPattern(newPattern)) == 0) {
214 | 			LineCounterTask lineCountTask = new LineCounterTask(
215 | 					pipedInputStream, outputDirectory, newPatternLabel,
216 | 					this.delimiter, setCountToOne, additionalCounts);
217 | 			this.executorService.execute(lineCountTask);
218 | 		} else {
219 | 			// don't add tags here
220 | 			SplitterTask splitterTask = new SplitterTask(pipedInputStream,
221 | 					outputDirectory, wordIndex, newPattern, newPatternLabel,
222 | 					this.delimiter, 0, this.deleteTempFiles, "", "", true,
223 | 					false, additionalCounts);
224 | 			this.executorService.execute(splitterTask);
225 | 		}
226 | 
227 | 		try {
228 | 			OutputStream pipedOutputStream = new PipedOutputStream(
229 | 					pipedInputStream);
230 | 			SequenceModifier sequenceModifier = new SequenceModifier(
231 | 					currentInputDirectory, pipedOutputStream, this.delimiter,
232 | 					patternForModifier, true, setCountToOne);
233 | 			this.executorService.execute(sequenceModifier);
234 | 
235 | 		} catch (IOException e) {
236 | 			// TODO Auto-generated catch block
237 | 			e.printStackTrace();
238 | 		}
239 | 
240 | 	}
241 | 
242 | 	private SortedMap<boolean[], boolean[]> filterContinuationMap(
243 | 			SortedMap<boolean[], boolean[]> continuationMap) {
244 | 		SortedMap<boolean[], boolean[]> newContinuationMap = new TreeMap<boolean[], boolean[]>(
245 | 				this.patternComparator);
246 | 		for (Entry<boolean[], boolean[]> entry : continuationMap.entrySet()) {
247 | 			if (PatternTransformer.getStringPattern(entry.getKey()).equals(
248 | 					PatternTransformer.getStringPattern(entry.getValue()))) {
249 | 				continue;
250 | 			}
251 | 			boolean[] currentPattern = entry.getKey();
252 | 			if (currentPattern.length > 2) {
253 | 				if (!currentPattern[0] && !currentPattern[1]) {
254 | 					continue;
255 | 				}
256 | 			}
257 | 			newContinuationMap.put(entry.getKey(), entry.getValue());
258 | 
259 | 		}
260 | 		return newContinuationMap;
261 | 	}
262 | 
263 | 	private SortedMap<boolean[], boolean[]> getContinuationMap(
264 | 			ArrayList<boolean[]> patterns) {
265 | 		SortedMap<boolean[], boolean[]> continuationMap = new TreeMap<boolean[], boolean[]>(
266 | 				this.patternComparator);
267 | 
268 | 		for (boolean[] inputPattern : patterns) {
269 | 			this.addPatterns(continuationMap, inputPattern, inputPattern, 0);
270 | 		}
271 | 		return continuationMap;
272 | 	}
273 | 
274 | 	private void addPatterns(SortedMap<boolean[], boolean[]> continuationMap,
275 | 			boolean[] pattern, boolean[] oldPattern, int position) {
276 | 		if (position < pattern.length) {
277 | 			boolean[] newPattern = pattern.clone();
278 | 			newPattern[position] = false;
279 | 			continuationMap.put(newPattern, pattern);
280 | 			continuationMap.put(pattern, oldPattern);
281 | 			this.addPatterns(continuationMap, newPattern, pattern, position + 1);
282 | 			this.addPatterns(continuationMap, pattern, oldPattern, position + 1);
283 | 		}
284 | 	}
285 | 
286 | }
287 | 


--------------------------------------------------------------------------------
/src/de/typology/splitter/SplitterTask.java:
--------------------------------------------------------------------------------
  1 | package de.typology.splitter;
  2 | 
  3 | import java.io.File;
  4 | import java.io.IOException;
  5 | import java.io.InputStream;
  6 | 
  7 | import org.apache.commons.io.FileUtils;
  8 | import org.apache.logging.log4j.LogManager;
  9 | import org.apache.logging.log4j.Logger;
 10 | 
 11 | import de.typology.indexes.WordIndex;
 12 | 
 13 | /**
 14 |  * A class for running Sequencer and Aggregator for a given pattern.
 15 |  * 
 16 |  * @author Martin Koerner
 17 |  * 
 18 |  */
 19 | public class SplitterTask implements Runnable {
 20 | 	private InputStream inputStream;
 21 | 	private File outputDirectory;
 22 | 	private WordIndex wordIndex;
 23 | 	private boolean[] pattern;
 24 | 	private String patternLabel;
 25 | 	private String delimiter;
 26 | 	private int startSortAtColumn;
 27 | 	private boolean deleteTempFiles;
 28 | 
 29 | 	private String addBeforeSentence;
 30 | 	private String addAfterSentence;
 31 | 	private boolean sequenceModifyCounts;
 32 | 	private boolean aggregateCompleteLine;
 33 | 	private boolean additionalCounts;
 34 | 
 35 | 	Logger logger = LogManager.getLogger(this.getClass().getName());
 36 | 
 37 | 	public SplitterTask(InputStream inputStream, File outputDirectory,
 38 | 			WordIndex wordIndex, boolean[] pattern, String patternLabel,
 39 | 			String delimiter, int startSortAtColumn, boolean deleteTempFiles,
 40 | 			String addBeforeSentence, String addAfterSentence,
 41 | 			boolean sequenceModifyCounts, boolean aggregateCompleteLine,
 42 | 			boolean additionalCounts) {
 43 | 		this.inputStream = inputStream;
 44 | 		this.outputDirectory = outputDirectory;
 45 | 		this.wordIndex = wordIndex;
 46 | 		this.pattern = pattern;
 47 | 		this.patternLabel = patternLabel;
 48 | 		this.delimiter = delimiter;
 49 | 		this.startSortAtColumn = startSortAtColumn;
 50 | 		this.deleteTempFiles = deleteTempFiles;
 51 | 		this.addBeforeSentence = addBeforeSentence;
 52 | 		this.addAfterSentence = addAfterSentence;
 53 | 		this.sequenceModifyCounts = sequenceModifyCounts;
 54 | 		this.aggregateCompleteLine = aggregateCompleteLine;
 55 | 		this.additionalCounts = additionalCounts;
 56 | 	}
 57 | 
 58 | 	@Override
 59 | 	public void run() {
 60 | 		File sequencerOutputDirectory = new File(
 61 | 				this.outputDirectory.getAbsolutePath() + "/"
 62 | 						+ this.patternLabel + "-split");
 63 | 		if (sequencerOutputDirectory.exists()) {
 64 | 			try {
 65 | 				FileUtils.deleteDirectory(sequencerOutputDirectory);
 66 | 			} catch (IOException e) {
 67 | 				// TODO Auto-generated catch block
 68 | 				e.printStackTrace();
 69 | 			}
 70 | 		}
 71 | 		sequencerOutputDirectory.mkdir();
 72 | 		this.logger.info("start building: "
 73 | 				+ sequencerOutputDirectory.getAbsolutePath());
 74 | 
 75 | 		// initialize sequencer
 76 | 		Sequencer sequencer = new Sequencer(this.inputStream,
 77 | 				sequencerOutputDirectory, this.wordIndex, this.pattern,
 78 | 				this.addBeforeSentence, this.addAfterSentence, this.delimiter,
 79 | 				this.sequenceModifyCounts, this.startSortAtColumn);
 80 | 		sequencer.splitIntoFiles();
 81 | 
 82 | 		File aggregatedOutputDirectory = new File(
 83 | 				this.outputDirectory.getAbsolutePath() + "/"
 84 | 						+ this.patternLabel);
 85 | 		if (aggregatedOutputDirectory.exists()) {
 86 | 			try {
 87 | 				FileUtils.deleteDirectory(aggregatedOutputDirectory);
 88 | 			} catch (IOException e) {
 89 | 				// TODO Auto-generated catch block
 90 | 				e.printStackTrace();
 91 | 			}
 92 | 		}
 93 | 		aggregatedOutputDirectory.mkdir();
 94 | 		this.logger.info("aggregate into: " + aggregatedOutputDirectory);
 95 | 
 96 | 		for (File splitFile : sequencerOutputDirectory.listFiles()) {
 97 | 			Aggregator aggregator = new Aggregator(splitFile, new File(
 98 | 					aggregatedOutputDirectory.getAbsolutePath() + "/"
 99 | 							+ splitFile.getName()), this.delimiter,
100 | 					this.startSortAtColumn, this.additionalCounts);
101 | 			if (this.aggregateCompleteLine) {
102 | 				aggregator.aggregateWithoutCounts();
103 | 			} else {
104 | 				aggregator.aggregateCounts();
105 | 			}
106 | 		}
107 | 
108 | 		// delete sequencerOutputDirectory
109 | 		if (this.deleteTempFiles) {
110 | 			try {
111 | 				FileUtils.deleteDirectory(sequencerOutputDirectory);
112 | 			} catch (IOException e) {
113 | 				// TODO Auto-generated catch block
114 | 				e.printStackTrace();
115 | 			}
116 | 		}
117 | 	}
118 | }
119 | 


--------------------------------------------------------------------------------
/src/de/typology/tester/SequenceExtractorTask.java:
--------------------------------------------------------------------------------
  1 | package de.typology.tester;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.BufferedWriter;
  5 | import java.io.File;
  6 | import java.io.FileReader;
  7 | import java.io.FileWriter;
  8 | import java.io.IOException;
  9 | import java.util.ArrayList;
 10 | import java.util.HashSet;
 11 | 
 12 | import org.apache.commons.io.FileUtils;
 13 | import org.apache.logging.log4j.LogManager;
 14 | import org.apache.logging.log4j.Logger;
 15 | 
 16 | /**
 17 |  * This class takes an ArrayList of sequences and a directory of Files as an
 18 |  * input and writes all occurrences of the sequences into new files in the
 19 |  * outputDirectory
 20 |  * 
 21 |  * @author Martin Koerner
 22 |  * 
 23 |  */
 24 | public class SequenceExtractorTask implements Runnable {
 25 | 
 26 | 	Logger logger = LogManager.getLogger(this.getClass().getName());
 27 | 
 28 | 	private ArrayList<String> originalSequences;
 29 | 	private boolean[] pattern;
 30 | 	private File inputDirectory;
 31 | 	private File outputDirectory;
 32 | 	private String delimiter;
 33 | 
 34 | 	public SequenceExtractorTask(ArrayList<String> originalSequences,
 35 | 			boolean[] pattern, File inputDirectory, File outputDirectory,
 36 | 			String delimiter) {
 37 | 		this.originalSequences = originalSequences;
 38 | 		this.pattern = pattern;
 39 | 
 40 | 		this.inputDirectory = inputDirectory;
 41 | 		this.outputDirectory = outputDirectory;
 42 | 		if (this.outputDirectory.exists()) {
 43 | 			try {
 44 | 				FileUtils.deleteDirectory(this.outputDirectory);
 45 | 			} catch (IOException e) {
 46 | 				// TODO Auto-generated catch block
 47 | 				e.printStackTrace();
 48 | 			}
 49 | 		}
 50 | 		this.outputDirectory.mkdirs();
 51 | 		this.delimiter = delimiter;
 52 | 
 53 | 	}
 54 | 
 55 | 	@Override
 56 | 	public void run() {
 57 | 		HashSet<String> newSequences = this.getNewSequences();
 58 | 
 59 | 		for (File inputFile : this.inputDirectory.listFiles()) {
 60 | 			File outputFile = new File(this.outputDirectory.getAbsolutePath()
 61 | 					+ "/" + inputFile.getName());
 62 | 			if (inputFile.getName().equals("all")) {
 63 | 				try {
 64 | 					FileUtils.copyFile(inputFile, outputFile);
 65 | 				} catch (IOException e) {
 66 | 					// TODO Auto-generated catch block
 67 | 					e.printStackTrace();
 68 | 				}
 69 | 			} else {
 70 | 				try {
 71 | 					BufferedReader inputFileReader = new BufferedReader(
 72 | 							new FileReader(inputFile));
 73 | 					BufferedWriter outputFileWriter = new BufferedWriter(
 74 | 							new FileWriter(outputFile));
 75 | 					String line;
 76 | 
 77 | 					while ((line = inputFileReader.readLine()) != null) {
 78 | 						if (newSequences
 79 | 								.contains(line.split(this.delimiter)[0])) {
 80 | 
 81 | 							outputFileWriter.write(line + "\n");
 82 | 						}
 83 | 					}
 84 | 					inputFileReader.close();
 85 | 					outputFileWriter.close();
 86 | 				} catch (IOException e) {
 87 | 					// TODO Auto-generated catch block
 88 | 					e.printStackTrace();
 89 | 				}
 90 | 			}
 91 | 
 92 | 		}
 93 | 
 94 | 	}
 95 | 
 96 | 	private HashSet<String> getNewSequences() {
 97 | 		HashSet<String> newSequences = new HashSet<String>();
 98 | 
 99 | 		for (String originalLine : this.originalSequences) {
100 | 			// modify sequences for continuation
101 | 			if (!this.pattern[0] || !this.pattern[this.pattern.length - 1]) {
102 | 				for (boolean element : this.pattern) {
103 | 					if (element) {
104 | 						break;
105 | 					} else {
106 | 						originalLine = "<dummy> " + originalLine;
107 | 					}
108 | 				}
109 | 				for (int i = this.pattern.length - 1; i >= 0; i--) {
110 | 					if (this.pattern[i]) {
111 | 						break;
112 | 					} else {
113 | 						originalLine = originalLine + " <dummy>";
114 | 					}
115 | 				}
116 | 			}
117 | 			String[] originalLineSplit = originalLine.split("\\s");
118 | 			int linePointer = 0;
119 | 			while (originalLineSplit.length - linePointer >= this.pattern.length) {
120 | 
121 | 				// build current Sequence
122 | 				String currentSequence = "";
123 | 				for (int i = 0; i < this.pattern.length; i++) {
124 | 					currentSequence += originalLineSplit[linePointer + i] + " ";
125 | 				}
126 | 				currentSequence = currentSequence.replaceFirst(" $", "");
127 | 
128 | 				String[] currentSequenceSplit = currentSequence.split("\\s");
129 | 				String newSequence = "";
130 | 				for (int i = 0; i < this.pattern.length; i++) {
131 | 					if (this.pattern[i]) {
132 | 						newSequence += currentSequenceSplit[i] + " ";
133 | 					}
134 | 				}
135 | 				newSequence = newSequence.replaceFirst(" $", "");
136 | 				if (newSequence.length() > 0) {
137 | 					newSequences.add(newSequence);
138 | 				}
139 | 
140 | 				linePointer++;
141 | 			}
142 | 		}
143 | 		return newSequences;
144 | 	}
145 | }
146 | 


--------------------------------------------------------------------------------
/src/de/typology/tester/TestSequenceExtractor.java:
--------------------------------------------------------------------------------
  1 | package de.typology.tester;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileReader;
  6 | import java.io.IOException;
  7 | import java.util.ArrayList;
  8 | import java.util.concurrent.ExecutorService;
  9 | import java.util.concurrent.Executors;
 10 | import java.util.concurrent.TimeUnit;
 11 | 
 12 | import de.typology.indexes.WordIndex;
 13 | import de.typology.patterns.PatternBuilder;
 14 | import de.typology.patterns.PatternTransformer;
 15 | 
 16 | /**
 17 |  * This class extracts all sequences that are needed for computing the
 18 |  * Kneser-Ney smoothed values for a set of given test sequences.
 19 |  * 
 20 |  * @author Martin Koerner
 21 |  * 
 22 |  */
 23 | public class TestSequenceExtractor {
 24 | 	private File testSequenceFile;
 25 | 	private File absoluteDirectory;
 26 | 	private File continuationDirectory;
 27 | 	private File outputDirectory;
 28 | 
 29 | 	private String delimiter;
 30 | 	private WordIndex wordIndex;
 31 | 
 32 | 	public TestSequenceExtractor(File testSequenceFile, File absoluteDirectory,
 33 | 			File continuationDirectory, File outputDirectory, String delimiter,
 34 | 			WordIndex wordIndex) {
 35 | 		this.testSequenceFile = testSequenceFile;
 36 | 		this.absoluteDirectory = absoluteDirectory;
 37 | 		this.continuationDirectory = continuationDirectory;
 38 | 		this.outputDirectory = outputDirectory;
 39 | 		this.delimiter = delimiter;
 40 | 		this.wordIndex = wordIndex;
 41 | 
 42 | 	}
 43 | 
 44 | 	public void extractSequences(int maxModelLength, int cores) {
 45 | 
 46 | 		// read test sequences into HashSet
 47 | 		ArrayList<String> sequences = new ArrayList<String>();
 48 | 		try {
 49 | 			BufferedReader testSequenceReader = new BufferedReader(
 50 | 					new FileReader(this.testSequenceFile));
 51 | 			String line;
 52 | 			while ((line = testSequenceReader.readLine()) != null) {
 53 | 				sequences.add(line);
 54 | 			}
 55 | 			testSequenceReader.close();
 56 | 		} catch (IOException e) {
 57 | 			// TODO Auto-generated catch block
 58 | 			e.printStackTrace();
 59 | 		}
 60 | 
 61 | 		ArrayList<boolean[]> absolutePatterns = PatternBuilder
 62 | 				.getGLMForSmoothingPatterns(maxModelLength);
 63 | 
 64 | 		// call SequenceExtractorTasks
 65 | 
 66 | 		// initialize executerService
 67 | 		// int cores = Runtime.getRuntime().availableProcessors();
 68 | 		ExecutorService executorService = Executors.newFixedThreadPool(cores);
 69 | 
 70 | 		for (boolean[] absolutePattern : absolutePatterns) {
 71 | 			// extract absolute sequences
 72 | 			String absoluteStringPattern = PatternTransformer
 73 | 					.getStringPattern(absolutePattern);
 74 | 			File absoluteInputDirectory = new File(
 75 | 					this.absoluteDirectory.getAbsolutePath() + "/"
 76 | 							+ absoluteStringPattern);
 77 | 			File absoluteOutputDirectory = new File(this.outputDirectory + "/"
 78 | 					+ this.absoluteDirectory.getName() + "/"
 79 | 					+ absoluteStringPattern);
 80 | 			SequenceExtractorTask absoluteSET = new SequenceExtractorTask(
 81 | 					sequences, absolutePattern, absoluteInputDirectory,
 82 | 					absoluteOutputDirectory, this.delimiter);
 83 | 			executorService.execute(absoluteSET);
 84 | 
 85 | 		}
 86 | 		executorService.shutdown();
 87 | 		try {
 88 | 			executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
 89 | 		} catch (InterruptedException e) {
 90 | 			// TODO Auto-generated catch block
 91 | 			e.printStackTrace();
 92 | 		}
 93 | 
 94 | 	}
 95 | 
 96 | 	public void extractContinuationSequences(int maxModelLength, int cores) {
 97 | 
 98 | 		// read test sequences into HashSet
 99 | 		ArrayList<String> sequences = new ArrayList<String>();
100 | 		try {
101 | 			BufferedReader testSequenceReader = new BufferedReader(
102 | 					new FileReader(this.testSequenceFile));
103 | 			String line;
104 | 			while ((line = testSequenceReader.readLine()) != null) {
105 | 				sequences.add(line);
106 | 			}
107 | 			testSequenceReader.close();
108 | 		} catch (IOException e) {
109 | 			// TODO Auto-generated catch block
110 | 			e.printStackTrace();
111 | 		}
112 | 		// call SequenceExtractorTasks
113 | 
114 | 		// initialize executerService
115 | 		// int cores = Runtime.getRuntime().availableProcessors();
116 | 		ExecutorService executorService = Executors.newFixedThreadPool(cores);
117 | 
118 | 		for (File continuationTypeDirectory : this.continuationDirectory
119 | 				.listFiles()) {
120 | 			// extract absolute sequences
121 | 			String continuationStringPattern = continuationTypeDirectory
122 | 					.getName();
123 | 			boolean[] continuationPattern = PatternTransformer
124 | 					.getBooleanPattern(continuationStringPattern.replaceAll(
125 | 							"_", "0"));
126 | 			File continuationOutputDirectory = new File(this.outputDirectory
127 | 					+ "/" + this.continuationDirectory.getName() + "/"
128 | 					+ continuationStringPattern);
129 | 			SequenceExtractorTask continuationSET = new SequenceExtractorTask(
130 | 					sequences, continuationPattern, continuationTypeDirectory,
131 | 					continuationOutputDirectory, this.delimiter);
132 | 			executorService.execute(continuationSET);
133 | 
134 | 		}
135 | 		executorService.shutdown();
136 | 		try {
137 | 			executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
138 | 		} catch (InterruptedException e) {
139 | 			// TODO Auto-generated catch block
140 | 			e.printStackTrace();
141 | 		}
142 | 
143 | 	}
144 | 	// public void extractContinuationSequences(int maxModelLength, int cores) {
145 | 	// ArrayList<boolean[]> absolutePatterns = PatternBuilder
146 | 	// .getLMPatterns(maxModelLength);
147 | 	//
148 | 	// // initialize executerService
149 | 	// // int cores = Runtime.getRuntime().availableProcessors();
150 | 	// ExecutorService executorService = Executors.newFixedThreadPool(cores);
151 | 	// for (boolean[] absolutePattern : absolutePatterns) {
152 | 	// File originalSequencesDirectory = new File(
153 | 	// this.outputDirectory.getAbsolutePath()
154 | 	// + "/"
155 | 	// + this.absoluteDirectory.getName()
156 | 	// + "/"
157 | 	// + PatternTransformer
158 | 	// .getStringPattern(absolutePattern));
159 | 	// File outputDirectory = new File(
160 | 	// this.outputDirectory.getAbsolutePath() + "/continuation");
161 | 	// ContinuationExtractorTask cet = new ContinuationExtractorTask(
162 | 	// originalSequencesDirectory, absolutePattern,
163 | 	// this.absoluteDirectory, outputDirectory, this.wordIndex,
164 | 	// this.delimiter);
165 | 	// executorService.execute(cet);
166 | 	// }
167 | 	//
168 | 	// executorService.shutdown();
169 | 	// try {
170 | 	// executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
171 | 	// } catch (InterruptedException e) {
172 | 	// // TODO Auto-generated catch block
173 | 	// e.printStackTrace();
174 | 	// }
175 | 	//
176 | 	// }
177 | }
178 | 


--------------------------------------------------------------------------------
/src/de/typology/utils/Config.java:
--------------------------------------------------------------------------------
  1 | package de.typology.utils;
  2 | 
  3 | import java.io.BufferedInputStream;
  4 | import java.io.FileInputStream;
  5 | import java.io.IOException;
  6 | import java.lang.reflect.Field;
  7 | import java.util.Properties;
  8 | 
  9 | /**
 10 |  * This is an interface class to the Config file for this project. For each
 11 |  * class field one java property must be defined in config.txt. The fields will
 12 |  * be automatically filled!
 13 |  * 
 14 |  * Allowed Types are String, int, boolean, String[] and long[] where arrays are
 15 |  * defined by semicolon-separated Strings like "array=a;b;c" boolen fields are
 16 |  * initialized with true or false
 17 |  * 
 18 |  * lines starting with # will be ignored and can serve as comments
 19 |  * 
 20 |  * @author Jonas Kunze, Rene Pickhardt
 21 |  * 
 22 |  */
 23 | public class Config extends Properties {
 24 | 	// CONTROLL PARAMETERS
 25 | 	public int numberOfCores;
 26 | 
 27 | 	public String languages;
 28 | 
 29 | 	public boolean splitData;
 30 | 	public boolean buildIndex;
 31 | 	public boolean buildGLM;
 32 | 	public boolean buildContinuationGLM;
 33 | 	public boolean extractContinuationGLM;
 34 | 	public boolean buildKneserNey;
 35 | 	public boolean buildModKneserNey;
 36 | 
 37 | 	public boolean conditionalProbabilityOnly;
 38 | 	public boolean backoffAbsolute;
 39 | 
 40 | 	public boolean kneserNeySimple;
 41 | 	public boolean kneserNeyComplex;
 42 | 
 43 | 	public boolean deleteTempFiles;
 44 | 
 45 | 	public boolean addSentenceTags;
 46 | 	public boolean addFakeStartTag;
 47 | 
 48 | 	public int decimalPlaces;
 49 | 	// DEBUGGING
 50 | 	public String inputDataSet;
 51 | 
 52 | 	// STEP 0 GLOBAL CONFIGS
 53 | 
 54 | 	public String outputDirectory;
 55 | 	public int maxCountDivider;
 56 | 	public int modelLength;
 57 | 
 58 | 	public int numberOfQueries;
 59 | 
 60 | 	// STEP 2 SAMPLING AND MAKE TRAININGS DATA SPLIT
 61 | 	public int sampleRate; // \in [0, 100] 0 means no data from input will be
 62 | 	// used. 100 means all input data will be used
 63 | 	public int splitDataRatio; // \in [0, 100] 0 means no training data. 100
 64 | 	// means only training data
 65 | 	public int splitTestRatio; // \in [0, 100] 0 means all data is stored in
 66 | 	// test file. 100 means all data is stored in (smaller) learning file
 67 | 
 68 | 	private static final long serialVersionUID = -4439565094382127683L;
 69 | 
 70 | 	static Config instance = null;
 71 | 
 72 | 	public Config() {
 73 | 		String file = "config.txt";
 74 | 		try {
 75 | 			BufferedInputStream stream = new BufferedInputStream(
 76 | 					new FileInputStream(file));
 77 | 			this.load(stream);
 78 | 			stream.close();
 79 | 		} catch (IOException e) {
 80 | 			e.printStackTrace();
 81 | 		}
 82 | 		try {
 83 | 			this.initialize();
 84 | 		} catch (IllegalArgumentException e) {
 85 | 			e.printStackTrace();
 86 | 		} catch (IllegalAccessException e) {
 87 | 			e.printStackTrace();
 88 | 		}
 89 | 	}
 90 | 
 91 | 	/**
 92 | 	 * Fills all fields with the data defined in the config file.
 93 | 	 * 
 94 | 	 * @throws IllegalArgumentException
 95 | 	 * @throws IllegalAccessException
 96 | 	 */
 97 | 	private void initialize() throws IllegalArgumentException,
 98 | 			IllegalAccessException {
 99 | 		Field[] fields = this.getClass().getFields();
100 | 		for (Field f : fields) {
101 | 			if (this.getProperty(f.getName()) == null) {
102 | 				System.err.print("Property '" + f.getName()
103 | 						+ "' not defined in config file");
104 | 			}
105 | 			if (f.getType().equals(String.class)) {
106 | 				f.set(this, this.getProperty(f.getName()));
107 | 			} else if (f.getType().equals(long.class)) {
108 | 				f.setLong(this, Long.valueOf(this.getProperty(f.getName())));
109 | 			} else if (f.getType().equals(int.class)) {
110 | 				f.setInt(this, Integer.valueOf(this.getProperty(f.getName())));
111 | 			} else if (f.getType().equals(boolean.class)) {
112 | 				f.setBoolean(this,
113 | 						Boolean.valueOf(this.getProperty(f.getName())));
114 | 			} else if (f.getType().equals(String[].class)) {
115 | 				f.set(this, this.getProperty(f.getName()).split(";"));
116 | 			} else if (f.getType().equals(int[].class)) {
117 | 				String[] tmp = this.getProperty(f.getName()).split(";");
118 | 				int[] ints = new int[tmp.length];
119 | 				for (int i = 0; i < tmp.length; i++) {
120 | 					ints[i] = Integer.parseInt(tmp[i]);
121 | 				}
122 | 				f.set(this, ints);
123 | 			} else if (f.getType().equals(long[].class)) {
124 | 				String[] tmp = this.getProperty(f.getName()).split(";");
125 | 				long[] longs = new long[tmp.length];
126 | 				for (int i = 0; i < tmp.length; i++) {
127 | 					longs[i] = Long.parseLong(tmp[i]);
128 | 				}
129 | 				f.set(this, longs);
130 | 			}
131 | 		}
132 | 	}
133 | 
134 | 	public static Config get() {
135 | 		if (instance == null) {
136 | 			instance = new Config();
137 | 		}
138 | 		return instance;
139 | 	}
140 | }
141 | 


--------------------------------------------------------------------------------
/src/de/typology/utils/Counter.java:
--------------------------------------------------------------------------------
  1 | package de.typology.utils;
  2 | 
  3 | import java.io.BufferedInputStream;
  4 | import java.io.BufferedReader;
  5 | import java.io.File;
  6 | import java.io.FileInputStream;
  7 | import java.io.FileNotFoundException;
  8 | import java.io.FileReader;
  9 | import java.io.IOException;
 10 | import java.io.InputStream;
 11 | 
 12 | public class Counter {
 13 | 
 14 | 	public static long countLinesInDirectory(File directory) {
 15 | 		long totalCount = 0;
 16 | 		for (File file : directory.listFiles()) {
 17 | 			totalCount += countLines(file);
 18 | 		}
 19 | 		return totalCount;
 20 | 	}
 21 | 
 22 | 	// derived from:
 23 | 	// http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java
 24 | 	public static long countLines(File file) {
 25 | 		InputStream is;
 26 | 		try {
 27 | 			is = new BufferedInputStream(new FileInputStream(file));
 28 | 			try {
 29 | 				try {
 30 | 					byte[] c = new byte[1024];
 31 | 					long count = 0;
 32 | 					int readChars = 0;
 33 | 					boolean empty = true;
 34 | 					while ((readChars = is.read(c)) != -1) {
 35 | 						empty = false;
 36 | 						for (int i = 0; i < readChars; ++i) {
 37 | 							if (c[i] == '\n') {
 38 | 								++count;
 39 | 							}
 40 | 						}
 41 | 					}
 42 | 					return count == 0 && !empty ? 1 : count;
 43 | 				} finally {
 44 | 					is.close();
 45 | 				}
 46 | 			} catch (IOException e) {
 47 | 				e.printStackTrace();
 48 | 			}
 49 | 		} catch (FileNotFoundException e) {
 50 | 			e.printStackTrace();
 51 | 		}
 52 | 		return 0;
 53 | 	}
 54 | 
 55 | 	private static int columnNumberStartZero;
 56 | 	private static File directory;
 57 | 	private static long currentCountForDirectory;
 58 | 
 59 | 	public static long countColumnCountsInDirectory(int columnNumberStartZero,
 60 | 			File directory) {
 61 | 		if (columnNumberStartZero == Counter.columnNumberStartZero
 62 | 				&& directory.equals(Counter.directory)) {
 63 | 			return Counter.currentCountForDirectory;
 64 | 		} else {
 65 | 			long totalCount = 0;
 66 | 			for (File file : directory.listFiles()) {
 67 | 				totalCount += countColumnCounts(columnNumberStartZero, file);
 68 | 			}
 69 | 			Counter.columnNumberStartZero = columnNumberStartZero;
 70 | 			Counter.currentCountForDirectory = totalCount;
 71 | 			Counter.directory = directory;
 72 | 			return totalCount;
 73 | 		}
 74 | 	}
 75 | 
 76 | 	public static long countColumnCounts(int columnNumberStartZero, File file) {
 77 | 		long totalCount = 0;
 78 | 		try {
 79 | 			BufferedReader br = new BufferedReader(new FileReader(file));
 80 | 			try {
 81 | 				String line;
 82 | 				String[] lineSplit;
 83 | 				while ((line = br.readLine()) != null) {
 84 | 					lineSplit = line.split("\t");
 85 | 					totalCount += Long
 86 | 							.parseLong(lineSplit[columnNumberStartZero]);
 87 | 				}
 88 | 			} finally {
 89 | 				br.close();
 90 | 			}
 91 | 		} catch (IOException e) {
 92 | 			// TODO Auto-generated catch block
 93 | 			e.printStackTrace();
 94 | 		}
 95 | 		return totalCount;
 96 | 	}
 97 | 
 98 | 	/**
 99 | 	 * used for aggregating the counts in a directory
100 | 	 * 
101 | 	 * @param count
102 | 	 * @param directory
103 | 	 * @return
104 | 	 */
105 | 	public static long aggregateCountsInDirectory(File directory) {
106 | 		long totalCount = 0;
107 | 		for (File file : directory.listFiles()) {
108 | 			totalCount += aggregateCounts(file);
109 | 		}
110 | 		return totalCount;
111 | 	}
112 | 
113 | 	/**
114 | 	 * used for calculating the count of counts in smoothing methods
115 | 	 * 
116 | 	 * @param count
117 | 	 * @param directoryName
118 | 	 * @return
119 | 	 */
120 | 	public static long aggregateCounts(File file) {
121 | 		long totalCount = 0;
122 | 		try {
123 | 			BufferedReader br = new BufferedReader(new FileReader(file));
124 | 			try {
125 | 				String line;
126 | 				String[] lineSplit;
127 | 				while ((line = br.readLine()) != null) {
128 | 					// TODO remove this or make it pretty
129 | 					if (line.startsWith("<fs>")) {
130 | 						continue;
131 | 					}
132 | 					lineSplit = line.split("\t");
133 | 					totalCount += Long
134 | 							.parseLong(lineSplit[lineSplit.length - 1]);
135 | 				}
136 | 			} finally {
137 | 				br.close();
138 | 			}
139 | 		} catch (IOException e) {
140 | 			// TODO Auto-generated catch block
141 | 			e.printStackTrace();
142 | 		}
143 | 		return totalCount;
144 | 	}
145 | 
146 | 	/**
147 | 	 * used for calculating the count of counts in smoothing methods
148 | 	 * 
149 | 	 * @param count
150 | 	 * @param directory
151 | 	 * @return
152 | 	 */
153 | 	public static long countCountsInDirectory(int count, File directory,
154 | 			String skipSequence) {
155 | 		long totalCount = 0;
156 | 		for (File file : directory.listFiles()) {
157 | 			if (!file.getName().contains("-split")) {
158 | 				totalCount += countCounts(count, file, skipSequence);
159 | 			}
160 | 		}
161 | 		return totalCount;
162 | 	}
163 | 
164 | 	/**
165 | 	 * used for calculating the count of counts in smoothing methods
166 | 	 * 
167 | 	 * @param count
168 | 	 * @param directoryName
169 | 	 * @return
170 | 	 */
171 | 	public static long countCounts(int count, File file, String skipSequence) {
172 | 		long totalCount = 0;
173 | 		try {
174 | 			BufferedReader br = new BufferedReader(new FileReader(file));
175 | 			try {
176 | 				String line;
177 | 				String[] lineSplit;
178 | 				while ((line = br.readLine()) != null) {
179 | 					if (line.startsWith("<fs>")) {
180 | 						continue;
181 | 					}
182 | 					// FIXME: put the delimiter to a global config file or at
183 | 					// least as a constant
184 | 					lineSplit = line.split("\t");
185 | 					long currentCount;
186 | 					if (lineSplit.length == 1) {
187 | 						currentCount = Long.parseLong(lineSplit[0]);
188 | 					} else {
189 | 						currentCount = Long.parseLong(lineSplit[1]);
190 | 					}
191 | 					if (count == currentCount && !lineSplit[0].equals("<fs>")) {
192 | 						totalCount += 1;
193 | 					}
194 | 				}
195 | 			} finally {
196 | 				br.close();
197 | 			}
198 | 		} catch (IOException e) {
199 | 			// TODO Auto-generated catch block
200 | 			e.printStackTrace();
201 | 		}
202 | 		return totalCount;
203 | 	}
204 | }


--------------------------------------------------------------------------------
/src/de/typology/utils/DecimalFormatter.java:
--------------------------------------------------------------------------------
 1 | package de.typology.utils;
 2 | 
 3 | import java.text.DecimalFormat;
 4 | import java.text.DecimalFormatSymbols;
 5 | 
 6 | public class DecimalFormatter {
 7 | 
 8 | 	DecimalFormat decimalFormat;
 9 | 
10 | 	public DecimalFormatter(int decimalPlaces) {
11 | 		String format = "###.";
12 | 		for (int i = 0; i < decimalPlaces; i++) {
13 | 			format += "#";
14 | 		}
15 | 		// set decimalFormat to override LOCALE values
16 | 		this.decimalFormat = new DecimalFormat(format);
17 | 		DecimalFormatSymbols symbols = new DecimalFormatSymbols();
18 | 		symbols.setDecimalSeparator('.');
19 | 		this.decimalFormat.setDecimalFormatSymbols(symbols);
20 | 	}
21 | 
22 | 	public String getRoundedResult(double input) {
23 | 		return this.decimalFormat.format(input);
24 | 	}
25 | }
26 | 


--------------------------------------------------------------------------------
/src/de/typology/utils/SequenceFormatter.java:
--------------------------------------------------------------------------------
 1 | package de.typology.utils;
 2 | 
 3 | public class SequenceFormatter {
 4 | 
 5 | 	/**
 6 | 	 * Removes word at position removeWordAtPosition. Words are separated with
 7 | 	 * whitespaces. Returns the resulting string.
 8 | 	 * 
 9 | 	 * @param inputString
10 | 	 * @param removeWordAtPosition
11 | 	 * @return
12 | 	 */
13 | 	public static String removeWord(String inputString, int removeWordAtPosition) {
14 | 		String[] words = inputString.split("\\s");
15 | 		String result = "";
16 | 		for (int i = 0; i < words.length; i++) {
17 | 			if (i != removeWordAtPosition) {
18 | 				result += words[i] + " ";
19 | 			}
20 | 		}
21 | 		result = result.replaceFirst(" $", "");
22 | 		return result;
23 | 	}
24 | 
25 | 	public static String removeWords(String inputString, boolean[] pattern) {
26 | 		String[] words = inputString.split("\\s");
27 | 
28 | 		if (words.length == pattern.length) {
29 | 			String resultString = "";
30 | 			for (int i = 0; i < pattern.length; i++) {
31 | 				if (pattern[i]) {
32 | 					resultString += words[i] + " ";
33 | 				}
34 | 			}
35 | 			resultString = resultString.replaceFirst(" $", "");
36 | 			return resultString;
37 | 		} else {
38 | 			return "";
39 | 		}
40 | 	}
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/src/log4j2.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <configuration status="WARN">
 3 |   <appenders>
 4 |     <Console name="Console" target="SYSTEM_OUT">
 5 |       <PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
 6 |     </Console>
 7 | 
 8 |     <File name="MyFile" fileName="logs/app.log">
 9 |         <PatternLayout pattern="%d{yyyy-mm-dd HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
10 |     </File>             
11 |   </appenders>
12 | 
13 |   <loggers>     
14 |     <root level="trace">
15 |       <appender-ref ref="Console" level="trace"/>
16 |        <appender-ref ref="MyFile" level="trace"/>
17 |     </root>    
18 |   </loggers>
19 | </configuration>
20 | 


--------------------------------------------------------------------------------
/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <configuration status="WARN">
 3 |   <appenders>
 4 |     <Console name="Console" target="SYSTEM_OUT">
 5 |       <PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
 6 |     </Console>
 7 | 
 8 |     <File name="MyFile" fileName="logs/app.log">
 9 |         <PatternLayout pattern="%d{yyyy-mm-dd HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
10 |     </File>             
11 |   </appenders>
12 | 
13 |   <loggers>     
14 |     <root level="info">
15 |       <appender-ref ref="Console" level="trace"/>
16 |        <appender-ref ref="MyFile" level="trace"/>
17 |     </root>    
18 |   </loggers>
19 | </configuration>
20 | 


--------------------------------------------------------------------------------
/testDataset/testDataset.txt:
--------------------------------------------------------------------------------
 1 | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.
 2 | 
 3 | Lorem ipsum dolor sit amet, consetetur 
 4 | Lorem ipsum dolor sit amet,
 5 | Lorem ipsum dolor sit
 6 | Lorem ipsum dolor
 7 | Lorem ipsum
 8 | Lorem
 9 | 
10 | Lorem		ipsum	dolor    sit amet, 	consetetur sadipscing elitr, 
11 | 
12 | §$ $ % & / ) ( ! = + * + ~ # ' _ ,. > < < | ^ ° 
13 | 


--------------------------------------------------------------------------------
/testDataset/training.txt:
--------------------------------------------------------------------------------
 1 | Lorem ipsum dolor sit amet , consetetur sadipscing
 2 | Lorem ipsum dolor sit amet , consetetur
 3 | Lorem ipsum dolor sit amet ,
 4 | Lorem ipsum dolor sit amet
 5 | Lorem ipsum dolor sit
 6 | Lorem ipsum dolor
 7 | Lorem ipsum
 8 | Lorem
 9 | et justo duo dolores et ea
10 | et justo duo dolores et ea
11 | et justo duo dolores et
12 | et justo duo dolores et
13 | et justo duo dolores
14 | et justo duo dolores
15 | justo duo dolores
16 | justo duo dolores
17 | duo dolores
18 | duo dolores
19 | dolores
20 | dolores


--------------------------------------------------------------------------------
/tests/de/typology/indexes/WordIndexTest.java:
--------------------------------------------------------------------------------
 1 | package de.typology.indexes;
 2 | 
 3 | import static org.junit.Assert.assertEquals;
 4 | import static org.junit.Assert.assertTrue;
 5 | 
 6 | import java.io.File;
 7 | 
 8 | import org.junit.After;
 9 | import org.junit.AfterClass;
10 | import org.junit.Before;
11 | import org.junit.BeforeClass;
12 | import org.junit.Test;
13 | 
14 | public class WordIndexTest {
15 | 	File inputFile = new File("testDataset/training.txt");
16 | 	File indexFile = new File("testDataset/index.txt");
17 | 
18 | 	@BeforeClass
19 | 	public static void setUpBeforeClass() throws Exception {
20 | 	}
21 | 
22 | 	@AfterClass
23 | 	public static void tearDownAfterClass() throws Exception {
24 | 	}
25 | 
26 | 	@Before
27 | 	public void setUp() throws Exception {
28 | 		if (this.indexFile.exists()) {
29 | 			this.indexFile.delete();
30 | 		}
31 | 		WordIndexer wi = new WordIndexer();
32 | 		wi.buildIndex(this.inputFile, this.indexFile, 10, "<fs> <s> ", " </s>");
33 | 	}
34 | 
35 | 	@After
36 | 	public void tearDown() throws Exception {
37 | 		if (this.indexFile.exists()) {
38 | 			this.indexFile.delete();
39 | 		}
40 | 	}
41 | 
42 | 	@Test
43 | 	public void rankTest() {
44 | 		WordIndex wi = new WordIndex(this.indexFile);
45 | 		assertEquals(8, wi.rank("et"));
46 | 		assertEquals(3, wi.rank("A"));
47 | 		assertEquals(4, wi.rank("Z"));
48 | 		assertEquals(11, wi.rank("tempora"));
49 | 		assertEquals(11, wi.rank("z"));
50 | 
51 | 		for (String word : wi) {
52 | 			assertTrue(word.length() > 0);
53 | 		}
54 | 	}
55 | 
56 | 	@Test
57 | 	public void iteratorTest() {
58 | 		WordIndex wi = new WordIndex(this.indexFile);
59 | 
60 | 		for (String word : wi) {
61 | 			assertTrue(word.length() > 0);
62 | 		}
63 | 	}
64 | 
65 | }
66 | 


--------------------------------------------------------------------------------
/tests/de/typology/indexes/WordIndexerTest.java:
--------------------------------------------------------------------------------
 1 | package de.typology.indexes;
 2 | 
 3 | import static org.junit.Assert.assertEquals;
 4 | 
 5 | import java.io.File;
 6 | 
 7 | import org.junit.After;
 8 | import org.junit.AfterClass;
 9 | import org.junit.Before;
10 | import org.junit.BeforeClass;
11 | import org.junit.Test;
12 | 
13 | public class WordIndexerTest {
14 | 	File inputFile = new File("testDataset/training.txt");
15 | 	File indexFile = new File("testDataset/index.txt");
16 | 
17 | 	@BeforeClass
18 | 	public static void setUpBeforeClass() throws Exception {
19 | 	}
20 | 
21 | 	@AfterClass
22 | 	public static void tearDownAfterClass() throws Exception {
23 | 	}
24 | 
25 | 	@Before
26 | 	public void setUp() throws Exception {
27 | 		if (this.indexFile.exists()) {
28 | 			this.indexFile.delete();
29 | 		}
30 | 	}
31 | 
32 | 	@After
33 | 	public void tearDown() throws Exception {
34 | 		if (this.indexFile.exists()) {
35 | 			this.indexFile.delete();
36 | 		}
37 | 	}
38 | 
39 | 	@Test
40 | 	public void buildIndexTest() {
41 | 		WordIndexer wi = new WordIndexer();
42 | 		long maxCountPerFile = wi.buildIndex(this.inputFile, this.indexFile,
43 | 				10, "<fs> <s> ", " </s>");
44 | 		assertEquals(13, maxCountPerFile);
45 | 	}
46 | 
47 | }
48 | 


--------------------------------------------------------------------------------
/tests/de/typology/smoother/KneserNeySmootherTest.java:
--------------------------------------------------------------------------------
  1 | package de.typology.smoother;
  2 | 
  3 | import static org.junit.Assert.assertEquals;
  4 | 
  5 | import java.io.File;
  6 | import java.util.ArrayList;
  7 | import java.util.HashMap;
  8 | 
  9 | import org.junit.AfterClass;
 10 | import org.junit.Before;
 11 | import org.junit.Test;
 12 | 
 13 | import de.typology.indexes.WordIndexer;
 14 | import de.typology.patterns.PatternBuilder;
 15 | import de.typology.splitter.AbsoluteSplitter;
 16 | import de.typology.splitter.SmoothingSplitter;
 17 | 
 18 | public class KneserNeySmootherTest {
 19 | 
 20 | 	File extractedSequenceDirectory;
 21 | 
 22 | 	File absoluteDirectory;
 23 | 	File continuationDirectory;
 24 | 	File testSequenceFile;
 25 | 	File kneserNeyFile;
 26 | 
 27 | 	@AfterClass
 28 | 	public static void tearDownAfterClass() throws Exception {
 29 | 	}
 30 | 
 31 | 	@Before
 32 | 	public void setUp() throws Exception {
 33 | 		String inputDirectoryPath = "testDataset/";
 34 | 		File inputFile = new File(inputDirectoryPath + "training.txt");
 35 | 		File indexFile = new File(inputDirectoryPath + "index.txt");
 36 | 		WordIndexer wier = new WordIndexer();
 37 | 		wier.buildIndex(inputFile, indexFile, 10, "<fs> <s> ", " </s>");
 38 | 		this.absoluteDirectory = new File(inputDirectoryPath + "absolute");
 39 | 		this.continuationDirectory = new File(inputDirectoryPath
 40 | 				+ "continuation");
 41 | 
 42 | 		AbsoluteSplitter as = new AbsoluteSplitter(inputFile, indexFile,
 43 | 				this.absoluteDirectory, "\t", true, "<fs> <s> ", " </s>");
 44 | 		as.split(PatternBuilder.getGLMForSmoothingPatterns(5), 2);
 45 | 
 46 | 		ArrayList<boolean[]> lmPatterns = PatternBuilder
 47 | 				.getReverseLMPatterns(5);
 48 | 		SmoothingSplitter smoothingSplitter = new SmoothingSplitter(
 49 | 				this.absoluteDirectory, this.continuationDirectory, indexFile,
 50 | 				"\t", true);
 51 | 		smoothingSplitter.split(lmPatterns, 2);
 52 | 
 53 | 		this.testSequenceFile = new File(inputDirectoryPath
 54 | 				+ "test-sequences-5.txt");
 55 | 		this.extractedSequenceDirectory = new File(inputDirectoryPath);
 56 | 		this.absoluteDirectory = new File(inputDirectoryPath + "absolute");
 57 | 		// TestSequenceExtractor tse = new TestSequenceExtractor(
 58 | 		// this.testSequenceFile, this.absoluteDirectory,
 59 | 		// this.continuationDirectory, this.extractedSequenceDirectory,
 60 | 		// "\t", wi);
 61 | 		// tse.extractContinuationSequences(5, 2);
 62 | 		this.kneserNeyFile = new File(inputDirectoryPath + "kn-sequences-5.txt");
 63 | 	}
 64 | 
 65 | 	// @Test
 66 | 	// public void calculateDiscoutValuesTest() {
 67 | 	//
 68 | 	// KneserNeySmoother kns = new KneserNeySmoother(
 69 | 	// this.extractedSequenceDirectory, this.absoluteDirectory,
 70 | 	// this.continuationDirectory, "\t", 5);
 71 | 	// kns.smooth(this.testSequenceFile, this.kneserNeyFile, 5, false);
 72 | 	// double d = kns.discountTypeValueMap.get("1").get("D1+");
 73 | 	// assertEquals(0.529412, d, 0.00001);
 74 | 	// }
 75 | 
 76 | 	@Test
 77 | 	public void calculateLowerOrderResultSimpleTest() {
 78 | 
 79 | 		KneserNeySmoother kns = new KneserNeySmoother(
 80 | 				this.extractedSequenceDirectory, this.absoluteDirectory,
 81 | 				this.continuationDirectory, "\t");
 82 | 
 83 | 		HashMap<String, HashMap<String, Long>> absoluteTypeSequenceValueMap = null;
 84 | 		HashMap<String, HashMap<String, Long[]>> continuationTypeSequenceValueMap = null;
 85 | 		absoluteTypeSequenceValueMap = kns
 86 | 				.readAbsoluteValuesIntoHashMap(kns.extractedAbsoluteDirectory);
 87 | 
 88 | 		continuationTypeSequenceValueMap = kns
 89 | 				.readContinuationValuesIntoHashMap(kns.extractedContinuationDirectory);
 90 | 
 91 | 		kns.absoluteTypeSequenceValueMap = absoluteTypeSequenceValueMap;
 92 | 		kns.continuationTypeSequenceValueMap = continuationTypeSequenceValueMap;
 93 | 
 94 | 		kns.smooth(this.testSequenceFile, this.kneserNeyFile, 5, false, true,
 95 | 				false);
 96 | 		System.out.println(kns.continuationTypeSequenceValueMap.get("__").get(
 97 | 				""));
 98 | 		assertEquals(0.625, kns.discountTypeValuesMap.get("_11").get("D1+"),
 99 | 				0.00001);
100 | 		assertEquals(0.0357,
101 | 				kns.calculateLowerOrderResult("dolor", 1, "1", false), 0.0001);
102 | 		assertEquals(0.07143,
103 | 				kns.calculateLowerOrderResult("et", 1, "1", false), 0.0001);
104 | 		assertEquals(0.39282,
105 | 				kns.calculateLowerOrderResult("</s>", 1, "1", false), 0.0001);
106 | 		assertEquals(0.00840,
107 | 				kns.calculateLowerOrderResult("<s>", 1, "1", false), 0.0001);
108 | 		assertEquals(0.2098,
109 | 				kns.calculateLowerOrderResult("sit amet", 2, "11", false),
110 | 				0.0001);
111 | 		assertEquals(0.00525,
112 | 				kns.calculateLowerOrderResult("sit unknown", 2, "11", false),
113 | 				0.0001);
114 | 		assertEquals(0.309885, kns.calculateLowerOrderResult("dolor sit amet",
115 | 				3, "111", false), 0.0001);
116 | 		assertEquals(0.3595, kns.calculateLowerOrderResult(
117 | 				"ipsum dolor sit amet", 4, "1111", false), 0.0001);
118 | 		assertEquals(0.77929, kns.calculateConditionalProbability(
119 | 				"Lorem ipsum dolor sit amet", 5, "11111", false), 0.0001);
120 | 
121 | 	}
122 | 
123 | 	@Test
124 | 	public void calculateLowerOrderResultComplexTest() {
125 | 
126 | 		KneserNeySmoother kns = new KneserNeySmoother(
127 | 				this.extractedSequenceDirectory, this.absoluteDirectory,
128 | 				this.continuationDirectory, "\t");
129 | 
130 | 		HashMap<String, HashMap<String, Long>> absoluteTypeSequenceValueMap = null;
131 | 		HashMap<String, HashMap<String, Long[]>> continuationTypeSequenceValueMap = null;
132 | 
133 | 		absoluteTypeSequenceValueMap = kns
134 | 				.readAbsoluteValuesIntoHashMap(kns.extractedAbsoluteDirectory);
135 | 
136 | 		continuationTypeSequenceValueMap = kns
137 | 				.readContinuationValuesIntoHashMap(kns.extractedContinuationDirectory);
138 | 
139 | 		kns.absoluteTypeSequenceValueMap = absoluteTypeSequenceValueMap;
140 | 		kns.continuationTypeSequenceValueMap = continuationTypeSequenceValueMap;
141 | 
142 | 		kns.smooth(this.testSequenceFile, this.kneserNeyFile, 5, true, true,
143 | 				false);
144 | 		try {
145 | 			Thread.sleep(500);
146 | 		} catch (InterruptedException ex) {
147 | 			Thread.currentThread().interrupt();
148 | 		}
149 | 		System.out.println("----");
150 | 		assertEquals(0.0084,
151 | 				kns.calculateConditionalProbability("notFound", 1, "1", false),
152 | 				0.0001);
153 | 		assertEquals(0.0084,
154 | 				kns.calculateLowerOrderResult("notFound", 1, "1", false),
155 | 				0.0001);
156 | 		kns.calculateProbability("Lorem ipsum dolor sit amet", 5, "11111",
157 | 				false);
158 | 		assertEquals(0.625, kns.discountTypeValuesMap.get("_11").get("D1+"),
159 | 				0.00001);
160 | 		assertEquals(0.0357,
161 | 				kns.calculateLowerOrderResult("dolor", 1, "1", false), 0.0001);
162 | 		assertEquals(0.07143,
163 | 				kns.calculateLowerOrderResult("et", 1, "1", false), 0.0001);
164 | 		assertEquals(0.08474,
165 | 				kns.calculateConditionalProbability("et", 1, "1", false),
166 | 				0.0001);
167 | 		assertEquals(0.39282,
168 | 				kns.calculateLowerOrderResult("</s>", 1, "1", false), 0.0001);
169 | 		assertEquals(0.0084,
170 | 				kns.calculateLowerOrderResult("<s>", 1, "1", false), 0.0001);
171 | 		assertEquals(0.2321,
172 | 				kns.calculateLowerOrderResult("sit amet", 2, "11", false),
173 | 				0.0001);
174 | 
175 | 		assertEquals(0.0275,
176 | 				kns.calculateLowerOrderResult("sit unknown", 2, "11", false),
177 | 				0.0001);
178 | 		assertEquals(0.3587, kns.calculateLowerOrderResult("dolor sit amet", 3,
179 | 				"111", false), 0.0001);
180 | 		assertEquals(0.4173, kns.calculateLowerOrderResult(
181 | 				"ipsum dolor sit amet", 4, "1111", false), 0.0001);
182 | 		assertEquals(0.09857, kns.calculateConditionalProbability(
183 | 				"<s> At vero eos et", 5, "11111", false), 0.0001);
184 | 		assertEquals(0.79221, kns.calculateConditionalProbability(
185 | 				"Lorem ipsum dolor sit amet", 5, "11111", false), 0.0001);
186 | 
187 | 		System.out.println(kns.calculateProbability(
188 | 				"Lorem ipsum dolor sit amet", 5, "11111", false));
189 | 		// assertEquals(0.00875, kns.calculateProbability(
190 | 		// "Lorem ipsum dolor sit amet", 5, "11111", false), 0.0001);
191 | 	}
192 | }
193 | 


--------------------------------------------------------------------------------
/tests/de/typology/splitter/AggregatorTest.java:
--------------------------------------------------------------------------------
  1 | package de.typology.splitter;
  2 | 
  3 | import static org.junit.Assert.assertEquals;
  4 | import static org.junit.Assert.assertNull;
  5 | 
  6 | import java.io.BufferedReader;
  7 | import java.io.BufferedWriter;
  8 | import java.io.File;
  9 | import java.io.FileReader;
 10 | import java.io.FileWriter;
 11 | import java.io.IOException;
 12 | 
 13 | import org.junit.After;
 14 | import org.junit.AfterClass;
 15 | import org.junit.Before;
 16 | import org.junit.Test;
 17 | 
 18 | public class AggregatorTest {
 19 | 	File inputFile = new File("testDataset/aggregator-in.txt");
 20 | 	File outputFile = new File("testDataset/aggregator-out.txt");
 21 | 
 22 | 	@AfterClass
 23 | 	public static void tearDownAfterClass() throws Exception {
 24 | 	}
 25 | 
 26 | 	@Before
 27 | 	public void setUp() throws Exception {
 28 | 		BufferedWriter br = new BufferedWriter(new FileWriter(this.inputFile));
 29 | 		br.write("b y b\t1\n");
 30 | 		br.write("c x a\t1\n");
 31 | 		br.write("b y a\t1\n");
 32 | 		br.write("a z a\t1\n");
 33 | 		br.write("c y b\t1\n");
 34 | 		br.write("c x a\t1\n");
 35 | 		br.close();
 36 | 	}
 37 | 
 38 | 	@After
 39 | 	public void tearDown() throws Exception {
 40 | 		this.inputFile.delete();
 41 | 	}
 42 | 
 43 | 	@Test
 44 | 	public void aggregatorCol0Test() {
 45 | 		Aggregator aggregator = new Aggregator(this.inputFile, this.outputFile,
 46 | 				"\t", 0, false);
 47 | 		aggregator.aggregateCounts();
 48 | 		try {
 49 | 			BufferedReader br = new BufferedReader(new FileReader(
 50 | 					this.outputFile));
 51 | 			assertEquals("a z a\t1", br.readLine());
 52 | 			assertEquals("b y a\t1", br.readLine());
 53 | 			assertEquals("b y b\t1", br.readLine());
 54 | 			assertEquals("c x a\t2", br.readLine());
 55 | 			assertEquals("c y b\t1", br.readLine());
 56 | 			assertNull(br.readLine());
 57 | 			br.close();
 58 | 		} catch (IOException e) {
 59 | 			// TODO Auto-generated catch block
 60 | 			e.printStackTrace();
 61 | 		}
 62 | 		this.outputFile.delete();
 63 | 
 64 | 	}
 65 | 
 66 | 	@Test
 67 | 	public void aggregatorCol1Test() {
 68 | 		Aggregator aggregator = new Aggregator(this.inputFile, this.outputFile,
 69 | 				"\t", 1, false);
 70 | 		aggregator.aggregateCounts();
 71 | 		try {
 72 | 			BufferedReader br = new BufferedReader(new FileReader(
 73 | 					this.outputFile));
 74 | 			assertEquals("c x a\t2", br.readLine());
 75 | 			assertEquals("b y a\t1", br.readLine());
 76 | 			assertEquals("b y b\t1", br.readLine());
 77 | 			assertEquals("c y b\t1", br.readLine());
 78 | 			assertEquals("a z a\t1", br.readLine());
 79 | 			assertNull(br.readLine());
 80 | 			br.close();
 81 | 		} catch (IOException e) {
 82 | 			// TODO Auto-generated catch block
 83 | 			e.printStackTrace();
 84 | 		}
 85 | 		this.outputFile.delete();
 86 | 	}
 87 | 
 88 | 	@Test
 89 | 	public void aggregatorCol2Test() {
 90 | 		Aggregator aggregator = new Aggregator(this.inputFile, this.outputFile,
 91 | 				"\t", 2, false);
 92 | 		aggregator.aggregateCounts();
 93 | 		try {
 94 | 			BufferedReader br = new BufferedReader(new FileReader(
 95 | 					this.outputFile));
 96 | 			assertEquals("a z a\t1", br.readLine());
 97 | 			assertEquals("b y a\t1", br.readLine());
 98 | 			assertEquals("c x a\t2", br.readLine());
 99 | 			assertEquals("b y b\t1", br.readLine());
100 | 			assertEquals("c y b\t1", br.readLine());
101 | 			assertNull(br.readLine());
102 | 			br.close();
103 | 		} catch (IOException e) {
104 | 			// TODO Auto-generated catch block
105 | 			e.printStackTrace();
106 | 		}
107 | 		this.outputFile.delete();
108 | 	}
109 | }
110 | 


--------------------------------------------------------------------------------
/tests/de/typology/splitter/SequenceModifierTest.java:
--------------------------------------------------------------------------------
 1 | package de.typology.splitter;
 2 | 
 3 | import static org.junit.Assert.assertEquals;
 4 | import static org.junit.Assert.assertNull;
 5 | 
 6 | import java.io.BufferedReader;
 7 | import java.io.BufferedWriter;
 8 | import java.io.File;
 9 | import java.io.FileWriter;
10 | import java.io.IOException;
11 | import java.io.InputStreamReader;
12 | import java.io.OutputStream;
13 | import java.io.PipedInputStream;
14 | import java.io.PipedOutputStream;
15 | 
16 | import org.apache.commons.io.FileUtils;
17 | import org.junit.After;
18 | import org.junit.Before;
19 | import org.junit.Test;
20 | 
21 | public class SequenceModifierTest {
22 | 	File inputDirectory = new File("testDataset/sequenceModifier");
23 | 	OutputStream outputStream;
24 | 	private PipedInputStream pipedInputStream;
25 | 	private OutputStream pipedOutputStream;
26 | 
27 | 	@Before
28 | 	public void setUp() throws Exception {
29 | 		if (this.inputDirectory.exists()) {
30 | 			FileUtils.deleteDirectory(this.inputDirectory);
31 | 		}
32 | 		this.inputDirectory.mkdir();
33 | 		BufferedWriter br1 = new BufferedWriter(new FileWriter(
34 | 				this.inputDirectory.getAbsolutePath() + "/1"));
35 | 		br1.write("a b c\t13\n");
36 | 		br1.write("d e f\t14\n");
37 | 		br1.write("g h i\t15\n");
38 | 		br1.close();
39 | 		BufferedWriter br2 = new BufferedWriter(new FileWriter(
40 | 				this.inputDirectory.getAbsolutePath() + "/2"));
41 | 		br2.write("j k l\t16\n");
42 | 		br2.write("m n o\t17\n");
43 | 		br2.write("ä ö ü\t18\n");
44 | 		br2.write("p q r\t19\n");
45 | 		br2.close();
46 | 		this.pipedInputStream = new PipedInputStream(10 * 8 * 1024);
47 | 		this.pipedOutputStream = new PipedOutputStream(this.pipedInputStream);
48 | 	}
49 | 
50 | 	@After
51 | 	public void tearDown() throws Exception {
52 | 		if (this.inputDirectory.exists()) {
53 | 			FileUtils.deleteDirectory(this.inputDirectory);
54 | 		}
55 | 	}
56 | 
57 | 	@Test
58 | 	public void sequenceModifier101Test() {
59 | 		boolean[] pattern = { true, false, true };
60 | 
61 | 		SequenceModifier sequenceModifier = new SequenceModifier(
62 | 				this.inputDirectory, this.pipedOutputStream, "\t", pattern,
63 | 				true, true);
64 | 		sequenceModifier.run();
65 | 		BufferedReader bufferedReader = new BufferedReader(
66 | 				new InputStreamReader(this.pipedInputStream));
67 | 
68 | 		try {
69 | 			assertEquals("a c\t1", bufferedReader.readLine());
70 | 			assertEquals("d f\t1", bufferedReader.readLine());
71 | 			assertEquals("g i\t1", bufferedReader.readLine());
72 | 			assertEquals("j l\t1", bufferedReader.readLine());
73 | 			assertEquals("m o\t1", bufferedReader.readLine());
74 | 			assertEquals("ä ü\t1", bufferedReader.readLine());
75 | 			assertEquals("p r\t1", bufferedReader.readLine());
76 | 			assertNull(bufferedReader.readLine());
77 | 			bufferedReader.close();
78 | 		} catch (IOException e) {
79 | 			// TODO Auto-generated catch block
80 | 			e.printStackTrace();
81 | 		}
82 | 	}
83 | }
84 | 


--------------------------------------------------------------------------------
/tests/de/typology/splitter/SequencerTest.java:
--------------------------------------------------------------------------------
  1 | package de.typology.splitter;
  2 | 
  3 | import static org.junit.Assert.assertEquals;
  4 | import static org.junit.Assert.assertNull;
  5 | 
  6 | import java.io.BufferedReader;
  7 | import java.io.File;
  8 | import java.io.FileInputStream;
  9 | import java.io.FileReader;
 10 | import java.io.IOException;
 11 | import java.io.InputStream;
 12 | 
 13 | import org.apache.commons.io.FileUtils;
 14 | import org.junit.After;
 15 | import org.junit.AfterClass;
 16 | import org.junit.Before;
 17 | import org.junit.BeforeClass;
 18 | import org.junit.Test;
 19 | 
 20 | import de.typology.indexes.WordIndex;
 21 | import de.typology.indexes.WordIndexer;
 22 | 
 23 | public class SequencerTest {
 24 | 	File inputFile = new File("testDataset/training.txt");
 25 | 	File indexFile = new File("testDataset/index.txt");
 26 | 	File sequencerOutputDirectory = new File("testDataset/sequencer/");
 27 | 
 28 | 	@BeforeClass
 29 | 	public static void setUpBeforeClass() throws Exception {
 30 | 	}
 31 | 
 32 | 	@AfterClass
 33 | 	public static void tearDownAfterClass() throws Exception {
 34 | 	}
 35 | 
 36 | 	@Before
 37 | 	public void setUp() throws Exception {
 38 | 		WordIndexer wordIndexer = new WordIndexer();
 39 | 		wordIndexer.buildIndex(this.inputFile, this.indexFile, 10, "<fs> <s> ",
 40 | 				" </s>");
 41 | 		if (this.sequencerOutputDirectory.exists()) {
 42 | 			FileUtils.deleteDirectory(this.sequencerOutputDirectory);
 43 | 		}
 44 | 		this.sequencerOutputDirectory.mkdir();
 45 | 	}
 46 | 
 47 | 	@After
 48 | 	public void tearDown() throws Exception {
 49 | 		if (this.sequencerOutputDirectory.exists()) {
 50 | 			FileUtils.deleteDirectory(this.sequencerOutputDirectory);
 51 | 		}
 52 | 		if (this.indexFile.exists()) {
 53 | 			this.indexFile.delete();
 54 | 		}
 55 | 	}
 56 | 
 57 | 	@Test
 58 | 	public void squencing1Test() {
 59 | 		WordIndex wordIndex = new WordIndex(this.indexFile);
 60 | 		boolean[] pattern = { true };
 61 | 
 62 | 		try {
 63 | 			InputStream inputStream = new FileInputStream(this.inputFile);
 64 | 			Sequencer sequencer = new Sequencer(inputStream,
 65 | 					this.sequencerOutputDirectory, wordIndex, pattern,
 66 | 					"<fs> <s> ", " </s>", "\t", false, 0);
 67 | 
 68 | 			sequencer.splitIntoFiles();
 69 | 
 70 | 			// test file contents
 71 | 			BufferedReader br8 = new BufferedReader(new FileReader(
 72 | 					this.sequencerOutputDirectory.getAbsolutePath() + "/8"));
 73 | 			for (int i = 0; i < 10; i++) {
 74 | 				assertEquals("et\t1", br8.readLine());
 75 | 			}
 76 | 			assertNull(br8.readLine());
 77 | 			br8.close();
 78 | 
 79 | 			BufferedReader br2 = new BufferedReader(new FileReader(
 80 | 					this.sequencerOutputDirectory.getAbsolutePath() + "/3"));
 81 | 			for (int i = 0; i < 20; i++) {
 82 | 				assertEquals("<s>\t1", br2.readLine());
 83 | 			}
 84 | 			assertNull(br2.readLine());
 85 | 			br2.close();
 86 | 		} catch (IOException e) {
 87 | 			// TODO Auto-generated catch block
 88 | 			e.printStackTrace();
 89 | 		}
 90 | 	}
 91 | 
 92 | 	@Test
 93 | 	public void squencing1101Test() {
 94 | 		WordIndex wordIndex = new WordIndex(this.indexFile);
 95 | 		boolean[] pattern = { true, true, false, true };
 96 | 
 97 | 		try {
 98 | 			InputStream inputStream = new FileInputStream(this.inputFile);
 99 | 			Sequencer sequencer = new Sequencer(inputStream,
100 | 					this.sequencerOutputDirectory, wordIndex, pattern,
101 | 					"<fs> <s> ", " </s>", "\t", false, 0);
102 | 			sequencer.splitIntoFiles();
103 | 
104 | 			// test file contents
105 | 			BufferedReader br0 = new BufferedReader(new FileReader(
106 | 					this.sequencerOutputDirectory.getAbsolutePath() + "/8"));
107 | 			for (int i = 0; i < 6; i++) {
108 | 				assertEquals("et justo dolores\t1", br0.readLine());
109 | 			}
110 | 			assertNull(br0.readLine());
111 | 			br0.close();
112 | 
113 | 			BufferedReader br10 = new BufferedReader(new FileReader(
114 | 					this.sequencerOutputDirectory.getAbsolutePath() + "/3"));
115 | 			for (int i = 0; i < 6; i++) {
116 | 				assertEquals("<s> Lorem dolor\t1", br10.readLine());
117 | 			}
118 | 			assertEquals("<s> Lorem </s>\t1", br10.readLine());
119 | 			br10.close();
120 | 		} catch (IOException e) {
121 | 			// TODO Auto-generated catch block
122 | 			e.printStackTrace();
123 | 		}
124 | 	}
125 | }
126 | 


--------------------------------------------------------------------------------