├── .gitignore ├── README.md ├── apps ├── pom.xml └── src │ └── test │ ├── java │ └── org │ │ └── trnltk │ │ └── apps │ │ ├── analysis │ │ └── FrequentWordAnalysis.java │ │ ├── commands │ │ ├── BulkParseCommand.java │ │ └── SingleParseCommand.java │ │ ├── commons │ │ ├── App.java │ │ ├── AppProperties.java │ │ ├── AppRunner.java │ │ ├── LoggingSettings.java │ │ └── SampleFiles.java │ │ ├── criticalsurface │ │ ├── CriticalSurfaceEntry.java │ │ ├── CriticalSurfaceFileHelper.java │ │ ├── SentenceCollectorForCriticalSurfaces.java │ │ └── SentenceIdentifier.java │ │ ├── experiments │ │ ├── AmbiguityMatrixApp.java │ │ ├── TurkishCollatorPerformanceTest.java │ │ └── YAMLExperiments.java │ │ ├── morphology │ │ └── contextless │ │ │ └── parser │ │ │ ├── CachingMorphologicParserApp.java │ │ │ ├── CharSuffixGraphExtractorDrawingApp.java │ │ │ └── FolderContextlessMorphologicParsingApp.java │ │ └── tokenizer │ │ ├── TextTokenizerCorpusApp.java │ │ ├── TextTokenizerDefaultTrainingApp.java │ │ ├── UniqueWordFinderApp.java │ │ └── WordCountFinderApp.java │ ├── octave │ └── wordHistogram.m │ └── resources │ └── log4j.xml ├── core ├── pom.xml └── src │ ├── doc │ └── org │ │ └── trnltk │ │ ├── cookbook │ │ ├── customrootfinder │ │ │ └── CustomRootFinder.java │ │ ├── oldturkishsuffixgraph │ │ │ └── Main.java │ │ ├── samplecorpusstats │ │ │ ├── Stats1.java │ │ │ └── Stats2.java │ │ └── spellcheck │ │ │ ├── SpellChecker.java │ │ │ └── Tolerance.java │ │ └── doc │ │ ├── advancedparsing │ │ └── AdvancedParsing.java │ │ ├── formattingoptions │ │ └── FormattingOptions.java │ │ └── simpleparsing │ │ └── SimpleParsing.java │ ├── main │ ├── java │ │ └── org │ │ │ └── trnltk │ │ │ ├── common │ │ │ ├── specification │ │ │ │ ├── AbstractSpecification.java │ │ │ │ ├── AndSpecification.java │ │ │ │ ├── FalseSpecification.java │ │ │ │ ├── NotSpecification.java │ │ │ │ ├── OrSpecification.java │ │ │ │ ├── Specification.java │ │ │ │ ├── Specifications.java │ │ │ │ └── TrueSpecification.java │ │ │ ├── 
structure │ │ │ │ ├── StringEnum.java │ │ │ │ └── StringEnumMap.java │ │ │ └── util │ │ │ │ └── Comparators.java │ │ │ ├── experiment │ │ │ ├── model │ │ │ │ └── ambiguity │ │ │ │ │ └── morphology │ │ │ │ │ ├── ParseResult.java │ │ │ │ │ ├── ParseResultDifference.java │ │ │ │ │ ├── ParseResultPart.java │ │ │ │ │ ├── ParseResultPartDifference.java │ │ │ │ │ ├── RootDifference.java │ │ │ │ │ └── WordParseResultEntry.java │ │ │ └── morphology │ │ │ │ └── ambiguity │ │ │ │ ├── AmbiguityClassifier.java │ │ │ │ ├── DataDiffUtil.java │ │ │ │ ├── ParseResultDiffTool.java │ │ │ │ └── ParseResultReader.java │ │ │ ├── model │ │ │ ├── letter │ │ │ │ ├── TurkicLetter.java │ │ │ │ ├── TurkishAlphabet.java │ │ │ │ ├── TurkishChar.java │ │ │ │ └── TurkishSequence.java │ │ │ ├── lexicon │ │ │ │ ├── DynamicLexeme.java │ │ │ │ ├── DynamicRoot.java │ │ │ │ ├── ImmutableLexeme.java │ │ │ │ ├── ImmutableRoot.java │ │ │ │ ├── Lexeme.java │ │ │ │ ├── LexemeAttribute.java │ │ │ │ ├── NumeralRoot.java │ │ │ │ ├── PhoneticAttribute.java │ │ │ │ ├── PhoneticAttributeMetadata.java │ │ │ │ ├── PhoneticExpectation.java │ │ │ │ ├── PrimaryPos.java │ │ │ │ ├── Root.java │ │ │ │ └── SecondaryPos.java │ │ │ ├── morpheme │ │ │ │ └── MorphemeContainer.java │ │ │ └── suffix │ │ │ │ ├── ConditionalFreeTransitionSuffix.java │ │ │ │ ├── FreeTransitionSuffix.java │ │ │ │ ├── Suffix.java │ │ │ │ ├── SuffixForm.java │ │ │ │ ├── SuffixFormApplication.java │ │ │ │ ├── SuffixFormSequence.java │ │ │ │ ├── SuffixGroup.java │ │ │ │ ├── SuffixTransition.java │ │ │ │ └── ZeroTransitionSuffix.java │ │ │ ├── morphology │ │ │ ├── contextless │ │ │ │ ├── parser │ │ │ │ │ ├── CachingMorphologicParser.java │ │ │ │ │ ├── ContextlessMorphologicParser.java │ │ │ │ │ ├── ContextlessMorphologicParserBuilder.java │ │ │ │ │ ├── ContextlessMorphologicParserListener.java │ │ │ │ │ ├── MandatoryTransitionApplier.java │ │ │ │ │ ├── MorphologicParser.java │ │ │ │ │ ├── PhoneticAttributeSets.java │ │ │ │ │ ├── 
PredefinedPathBuilder.java │ │ │ │ │ ├── PredefinedPaths.java │ │ │ │ │ ├── SuffixApplier.java │ │ │ │ │ ├── SuffixFormGraph.java │ │ │ │ │ ├── SuffixFormGraphExtractor.java │ │ │ │ │ ├── SuffixFormGraphNode.java │ │ │ │ │ ├── SuffixFormGraphNodeKey.java │ │ │ │ │ ├── SuffixFormGraphSuffixEdge.java │ │ │ │ │ └── cache │ │ │ │ │ │ ├── LRUMorphologicParserCache.java │ │ │ │ │ │ ├── MorphologicParserCache.java │ │ │ │ │ │ ├── SimpleOfflineCache.java │ │ │ │ │ │ └── TwoLevelMorphologicParserCache.java │ │ │ │ └── rootfinder │ │ │ │ │ ├── BruteForceCompoundNounRootFinder.java │ │ │ │ │ ├── BruteForceNounRootFinder.java │ │ │ │ │ ├── BruteForceVerbRootFinder.java │ │ │ │ │ ├── CardinalDigitsRootFinder.java │ │ │ │ │ ├── DictionaryRootFinder.java │ │ │ │ │ ├── OrdinalDigitsRootFinder.java │ │ │ │ │ ├── ProperNounFromApostropheRootFinder.java │ │ │ │ │ ├── ProperNounWithoutApostropheRootFinder.java │ │ │ │ │ ├── PuncRootFinder.java │ │ │ │ │ ├── RangeDigitsRootFinder.java │ │ │ │ │ ├── RootFinder.java │ │ │ │ │ ├── RootFinderChain.java │ │ │ │ │ └── RootValidator.java │ │ │ ├── lexicon │ │ │ │ ├── CircumflexConvertingRootGenerator.java │ │ │ │ ├── DictionaryLoader.java │ │ │ │ ├── ImmutableRootGenerator.java │ │ │ │ ├── LexemeCreator.java │ │ │ │ ├── RootMapFactory.java │ │ │ │ └── RootMapGenerator.java │ │ │ ├── morphotactics │ │ │ │ ├── BaseSuffixGraph.java │ │ │ │ ├── BasicSuffixGraph.java │ │ │ │ ├── CopulaSuffixGraph.java │ │ │ │ ├── EmptySuffixGraph.java │ │ │ │ ├── NumeralSuffixGraph.java │ │ │ │ ├── PrecachingSuffixFormSequenceApplier.java │ │ │ │ ├── ProperNounSuffixGraph.java │ │ │ │ ├── SuffixEdge.java │ │ │ │ ├── SuffixFormSequenceApplier.java │ │ │ │ ├── SuffixFormSequenceRuleApplier.java │ │ │ │ ├── SuffixGraph.java │ │ │ │ ├── SuffixGraphState.java │ │ │ │ ├── SuffixGraphStateType.java │ │ │ │ ├── reducedambiguity │ │ │ │ │ └── BasicRASuffixGraph.java │ │ │ │ └── suffixformspecifications │ │ │ │ │ ├── AppliesToRoot.java │ │ │ │ │ ├── 
DoesntHaveLexemeAttributes.java │ │ │ │ │ ├── HasLastNonBlankDerivation.java │ │ │ │ │ ├── HasLexemeAttributes.java │ │ │ │ │ ├── HasSuffixFormAsLastDerivation.java │ │ │ │ │ ├── HasSuffixFormSinceLastDerivation.java │ │ │ │ │ ├── LastSuffixGoesToStateWithType.java │ │ │ │ │ ├── RootHasPrimaryPos.java │ │ │ │ │ ├── RootHasSecondaryPos.java │ │ │ │ │ ├── RootHasVowelDrop.java │ │ │ │ │ └── SuffixFormSpecifications.java │ │ │ └── phonetics │ │ │ │ ├── PhoneticsAnalyzer.java │ │ │ │ └── PhoneticsEngine.java │ │ │ ├── numeral │ │ │ └── DigitsToTextConverter.java │ │ │ ├── tokenizer │ │ │ ├── MissingTokenizationRuleException.java │ │ │ ├── TextBlock.java │ │ │ ├── TextBlockGroup.java │ │ │ ├── TextBlockSplitter.java │ │ │ ├── TextBlockType.java │ │ │ ├── TextBlockTypeGroup.java │ │ │ ├── TextTokenizer.java │ │ │ ├── TextTokenizerTrainer.java │ │ │ ├── Token.java │ │ │ ├── TokenizationGraph.java │ │ │ ├── TokenizationGraphEdge.java │ │ │ ├── TokenizationGraphNode.java │ │ │ ├── TokenizationUtils.java │ │ │ └── data │ │ │ │ ├── TokenizerTrainingData.java │ │ │ │ └── TokenizerTrainingEntry.java │ │ │ └── util │ │ │ ├── Constants.java │ │ │ ├── DiffUtil.java │ │ │ ├── MorphemeContainerFormatter.java │ │ │ └── Utilities.java │ └── resources │ │ ├── master-dictionary.dict │ │ ├── master-numeral-dictionary.dict │ │ ├── tokenizer │ │ ├── abbreviations.txt │ │ └── training-data.yaml │ │ ├── top20kwords.txt │ │ └── top2kwords.txt │ └── test │ ├── java │ └── org │ │ └── trnltk │ │ ├── experiment │ │ ├── bruteforce │ │ │ └── BruteForceExperiments.java │ │ └── morphology │ │ │ ├── ambiguity │ │ │ ├── DataDiffUtilTest.java │ │ │ └── ParseResultDiffToolTest.java │ │ │ ├── contextless │ │ │ └── parser │ │ │ │ └── PhoneticAttributeSetsTest.java │ │ │ └── morphotactics │ │ │ └── SuffixGraphDrawingTest.java │ │ ├── model │ │ ├── letter │ │ │ └── TurkishAlphabetTest.java │ │ ├── lexicon │ │ │ └── PhoneticAttributeMetadataTest.java │ │ └── suffix │ │ │ ├── SuffixFormSequenceRuleStub.java │ 
│ │ └── SuffixFormSequenceTest.java │ │ ├── morphology │ │ ├── contextless │ │ │ ├── parser │ │ │ │ ├── MandatoryTransitionApplierTest.java │ │ │ │ ├── PredefinedPathsTest.java │ │ │ │ ├── parsing │ │ │ │ │ ├── ContextlessMorphologicParserBasicSuffixGraphTest.java │ │ │ │ │ ├── ContextlessMorphologicParserBruteForceNounCompoundTest.java │ │ │ │ │ ├── ContextlessMorphologicParserBruteForceNounTest.java │ │ │ │ │ ├── ContextlessMorphologicParserBruteForceVerbTest.java │ │ │ │ │ ├── ContextlessMorphologicParserCopulaSuffixGraphTest.java │ │ │ │ │ ├── ContextlessMorphologicParserNumeralSuffixGraphTest.java │ │ │ │ │ ├── ContextlessMorphologicParserProperNounSuffixGraphTest.java │ │ │ │ │ ├── ContextlessMorphologicParserPuncTest.java │ │ │ │ │ ├── ContextlessMorphologicParserSimpleParseSetCharacterTest.java │ │ │ │ │ ├── ContextlessMorphologicParserSimpleParseSetSpeedTest.java │ │ │ │ │ ├── MockPhoneticAttributeSets.java │ │ │ │ │ ├── SampleSuffixGraph.java │ │ │ │ │ ├── SimplifiedSampleSuffixGraph.java │ │ │ │ │ └── base │ │ │ │ │ │ ├── BaseContextlessMorphologicParserBasicSuffixGraphTest.java │ │ │ │ │ │ ├── BaseContextlessMorphologicParserBruteForceNounCompoundTest.java │ │ │ │ │ │ ├── BaseContextlessMorphologicParserBruteForceNounTest.java │ │ │ │ │ │ ├── BaseContextlessMorphologicParserBruteForceVerbTest.java │ │ │ │ │ │ ├── BaseContextlessMorphologicParserCopulaSuffixGraphTest.java │ │ │ │ │ │ ├── BaseContextlessMorphologicParserNumeralSuffixGraphTest.java │ │ │ │ │ │ ├── BaseContextlessMorphologicParserProperNounSuffixGraphTest.java │ │ │ │ │ │ ├── BaseContextlessMorphologicParserPuncTest.java │ │ │ │ │ │ ├── BaseContextlessMorphologicParserSimpleParseSetCharacterTest.java │ │ │ │ │ │ ├── BaseContextlessMorphologicParserSimpleParseSetSpeedTest.java │ │ │ │ │ │ └── BaseContextlessMorphologicParserTest.java │ │ │ │ └── reducedambiguity │ │ │ │ │ └── ContextlessMorphologicParserBasicSuffixGraphTest.java │ │ │ └── rootfinder │ │ │ │ ├── BaseRootFinderTest.java │ │ │ 
│ ├── BruteForceCompoundNounRootFinderTest.java │ │ │ │ ├── BruteForceNounRootFinderTest.java │ │ │ │ ├── BruteForceVerbRootFinderTest.java │ │ │ │ ├── CardinalDigitsRootFinderTest.java │ │ │ │ ├── DictionaryRootFinderTest.java │ │ │ │ ├── OrdinalDigitsRootFinderTest.java │ │ │ │ ├── ProperNounFromApostropheRootFinderTest.java │ │ │ │ ├── ProperNounWithoutApostropheRootFinderTest.java │ │ │ │ ├── PuncRootFinderTest.java │ │ │ │ ├── RangeDigitsRootFinderTest.java │ │ │ │ └── RootValidatorTest.java │ │ ├── lexicon │ │ │ ├── CircumflexConvertingRootGeneratorTest.java │ │ │ ├── DictionaryLoaderTest.java │ │ │ ├── ImmutableRootGeneratorTest.java │ │ │ └── LexemeCreatorTest.java │ │ ├── morphotactics │ │ │ ├── PrecachingSuffixFormSequenceApplierTest.java │ │ │ ├── SuffixFormSequenceApplierTest.java │ │ │ └── SuffixFormSequenceRuleApplierTest.java │ │ └── phonetics │ │ │ ├── PhoneticsAnalyzerDistinctionTest.java │ │ │ ├── PhoneticsAnalyzerTest.java │ │ │ ├── PhoneticsAnalyzerValidityBruteForceTest.java │ │ │ └── PhoneticsEngineTest.java │ │ ├── numeral │ │ └── DigitsToTextConverterTest.java │ │ ├── testutil │ │ ├── RegexMatcher.java │ │ ├── TestEnvironment.java │ │ └── testmatchers │ │ │ ├── BaseParseResultsMatcher.java │ │ │ ├── ParseResultsDontExistMatcher.java │ │ │ ├── ParseResultsEqualMatcher.java │ │ │ └── ParseResultsExistMatcher.java │ │ ├── tokenizer │ │ ├── TextBlockSplitterTest.java │ │ ├── TextTokenizerCorpusTest.java │ │ ├── TextTokenizerDefaultTrainingTest.java │ │ └── TextTokenizerTest.java │ │ └── util │ │ ├── DiffUtilTest.java │ │ └── MorphemeContainerFormatterTest.java │ └── resources │ ├── log4j.xml │ ├── simpleparsesets │ ├── .gitignore │ ├── simpleparseset001.txt │ ├── simpleparseset002.txt │ ├── simpleparseset003.txt │ ├── simpleparseset004.txt │ └── simpleparseset005.txt │ ├── tokenizer │ ├── sentence-boundary-text.txt │ ├── tbmm_b0241h.txt │ ├── tbmm_b0241h_lines.txt │ └── tbmm_b0241h_tokenized.txt │ └── trnltk.apps.properties ├── data ├── pom.xml 
└── src │ └── main │ └── resources │ └── criticalSurfaces.txt ├── docs ├── 102.md ├── README.md ├── cookbook │ ├── README.md │ ├── custom_root_finder.md │ ├── numeral_to_text.md │ ├── old_turkish_suffix_graph.md │ ├── sample_corpus_stats_1.md │ ├── sample_corpus_stats_2.md │ └── spell_check.md ├── resources_102 │ ├── rootFinders.png │ ├── suffixGraphHierarchy.png │ ├── z_01.png │ ├── z_02.png │ ├── z_03.png │ ├── z_04.png │ ├── z_05.png │ ├── z_06.png │ └── z_07.png └── tutorial │ ├── README.md │ ├── advanced_parsing.md │ ├── brute_force.md │ ├── caching.md │ ├── glossary.md │ ├── root_finders_explained.md │ ├── simple_parsing.md │ ├── suffix_graphs_explained.md │ ├── tokenization.md │ └── tokenization_resources │ ├── img01.png │ ├── img02.png │ ├── img03.png │ ├── img04.png │ └── img05.png ├── pom.xml ├── scripts ├── dictionary_ops.py └── dictionary_tools.py └── web ├── pom.xml └── src ├── main ├── java │ └── org │ │ └── trnltk │ │ └── web │ │ ├── common │ │ └── Constants.java │ │ ├── criticalsurface │ │ ├── CriticalSurfaceTaggingController.java │ │ ├── CriticalSurfaceTaggingData.java │ │ ├── CriticalSurfaceTaggingProgressData.java │ │ ├── ParseResultWithSentencesContainer.java │ │ └── SentenceContainer.java │ │ ├── morphology │ │ └── parser │ │ │ ├── ParserBean.java │ │ │ ├── RootFinderSelectionData.java │ │ │ ├── RootMapData.java │ │ │ └── SuffixGraphSelectionData.java │ │ └── training │ │ ├── TrainingFileCreator.java │ │ ├── TrainingFileData.java │ │ └── TrainingSetCreatorBean.java ├── resources │ ├── commons-logging.properties │ ├── log4j.xml │ └── trainingSets │ │ └── sample.trainingset └── webapp │ ├── WEB-INF │ ├── faces-config.xml │ └── web.xml │ ├── basetemplate.xhtml │ ├── createTrainingFile.xhtml │ ├── index.xhtml │ ├── resources │ ├── components │ │ └── criticalsurface │ │ │ └── sentence.xhtml │ ├── img │ │ └── ajaxloading.gif │ ├── style.css │ └── thirdparty │ │ ├── css │ │ ├── bootstrap.css │ │ └── bootstrap.min.css │ │ ├── img │ │ ├── 
glyphicons-halflings-white.png │ │ └── glyphicons-halflings.png │ │ └── js │ │ ├── bootstrap.js │ │ ├── bootstrap.min.js │ │ └── jquery.min.js │ └── tagCriticalSurfaces.xhtml └── test ├── java └── org │ └── trnltk │ └── web │ └── training │ └── TrainingSetCreatorParserSelectionTest.java └── resources └── trainingSetParserExpectation.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Maven target folder 2 | target/ 3 | 4 | # Intellij files 5 | .idea/ 6 | *.iml 7 | atlassian-ide-plugin.xml 8 | 9 | # 10 | *.class 11 | 12 | # Package Files # 13 | *.jar 14 | *.war 15 | *.ear 16 | 17 | # OS files: 18 | *.DS_Store 19 | 20 | */bin 21 | **/.project 22 | **/.classpath 23 | **/.settings/ 24 | **/target/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | """ Copyright 2012-2013 Ali Ok (aliokATapacheDOTorg) 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at 4 | 5 | http://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ 8 | 9 | Turkish Natural Language Toolkit 10 | ================================ 11 | This project provides a toolkit for computational linguistics work on Turkish. 12 | 13 | Currently a morphologic parser and a tokenizer are provided. The biggest challenge is providing an ambiguity resolver. 14 | 15 | The project was first implemented in Python, [TRNLTK Python](https://github.com/aliok/trnltk), then in Java. 16 | The Python project is obsolete. 
17 | 18 | **See [documentation, tutorial and cookbook](docs/README.md)** 19 | 20 | News: 21 | ----- 22 | * TRNLTK 1.0.2 is released : [Release notes](docs/102.md) 23 | 24 | 25 | Motivation 26 | ======================== 27 | Why another parsing tool and why FSM? 28 | 29 | I've inspected other approaches and I saw that tracking down problems is very hard with them. 30 | For example, one approach is creating a suffix graph by defining which suffix can come after which other suffix. 31 | But with that approach it is impossible to have an overview of the graph, since there would be thousands of nodes and edges. 32 | 33 | **See [documentation](docs/README.md) for more information.** 34 | 35 | 36 | Phonetic rules and phonetic implementation are similar to those of the open-source Java library Zemberek3. 37 | 38 | How is it tested? 39 | ================= 40 | There are thousands of parsing unit tests. Plus, I use the treebank from METU-Sabanci, but it is closed-source. 41 | Unfortunately, its license doesn't allow anyone to publish any portion of the treebank, 42 | thus I only test the parser against it in my local environment. 
43 | -------------------------------------------------------------------------------- /apps/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 21 | 4.0.0 22 | 23 | 24 | org.trnltk 25 | trnltk 26 | 1.0.3-SNAPSHOT 27 | 28 | 29 | apps 30 | TRNLTK Apps 31 | TRNLTK Apps 32 | 33 | 34 | 35 | 36 | org.apache.maven.plugins 37 | maven-jar-plugin 38 | 2.6 39 | 40 | 41 | 42 | test-jar 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | org.trnltk 53 | core 54 | 1.0.3-SNAPSHOT 55 | 56 | 57 | org.trnltk 58 | core 59 | 1.0.3-SNAPSHOT 60 | test-jar 61 | 62 | 63 | 64 | 65 | ${encoding} 66 | 67 | 68 | -------------------------------------------------------------------------------- /apps/src/test/java/org/trnltk/apps/commands/BulkParseCommand.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.apps.commands; 2 | 3 | import org.trnltk.model.morpheme.MorphemeContainer; 4 | import org.trnltk.morphology.contextless.parser.MorphologicParser; 5 | import org.trnltk.util.MorphemeContainerFormatter; 6 | 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | /** 11 | * @author Ali Ok (ali.ok@apache.org) 12 | */ 13 | public class BulkParseCommand implements Runnable { 14 | private final MorphologicParser parser; 15 | private final List subWordList; 16 | private final int wordIndex; 17 | private boolean printUnparseable; 18 | final Map> resultMap; 19 | // Full constructor: resultMap may be null; run() only records results when it is non-null 20 | public BulkParseCommand(final MorphologicParser parser, final List subWordList, final int wordIndex, boolean printUnparseable, Map> resultMap) { 21 | this.parser = parser; 22 | this.subWordList = subWordList; 23 | this.wordIndex = wordIndex; 24 | this.printUnparseable = printUnparseable; 25 | this.resultMap = resultMap; 26 | } 27 | // Convenience constructor: delegates to the full constructor with a null resultMap 28 | public BulkParseCommand(final MorphologicParser parser, final List subWordList, final int wordIndex, boolean printUnparseable) { 29 | this(parser, subWordList, wordIndex, printUnparseable, 
null); 30 | } 31 | // Parses all words with a single parser call, then optionally records results and reports unparsable words 32 | @Override 33 | public void run() { 34 | final List> results = parser.parseAllStr(subWordList); 35 | 36 | System.out.println("Finished " + wordIndex); 37 | // Only words with more than one parse result are put into resultMap 38 | if (resultMap != null) { 39 | for (int i = 0; i < results.size(); i++) { 40 | String surface = subWordList.get(i); 41 | List result = results.get(i); 42 | if (result.size() > 1) 43 | resultMap.put(surface, MorphemeContainerFormatter.formatMorphemeContainers(result)); 44 | } 45 | } 46 | // Print every word for which the parser returned an empty result 47 | if (printUnparseable) { 48 | for (int i = 0; i < results.size(); i++) { 49 | String surface = subWordList.get(i); 50 | List result = results.get(i); 51 | 52 | if (result.isEmpty()) 53 | System.out.println("Word is not parsable " + surface); 54 | } 55 | } 56 | 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /apps/src/test/java/org/trnltk/apps/commands/SingleParseCommand.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.apps.commands; 2 | 3 | import org.trnltk.model.morpheme.MorphemeContainer; 4 | import org.trnltk.morphology.contextless.parser.MorphologicParser; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | * @author Ali Ok (ali.ok@apache.org) 10 | */ 11 | public class SingleParseCommand implements Runnable { 12 | private final MorphologicParser parser; 13 | private final String word; 14 | private final int wordIndex; 15 | private boolean printUnparseable; 16 | 17 | public SingleParseCommand(final MorphologicParser parser, final String word, final int wordIndex, boolean printUnparseable) { 18 | this.parser = parser; 19 | this.word = word; 20 | this.wordIndex = wordIndex; 21 | this.printUnparseable = printUnparseable; 22 | } 23 | // Parses the single word; prints it when unparsable (if enabled) and prints progress when wordIndex is a multiple of 500 24 | @Override 25 | public void run() { 26 | final List morphemeContainers = parser.parseStr(word); 27 | if (printUnparseable) { 28 | if (morphemeContainers.isEmpty()) 29 | System.out.println("Word is not parsable " + word); 30 | } 31 | if (wordIndex % 500 == 0) 
32 | System.out.println("Finished " + wordIndex); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /apps/src/test/java/org/trnltk/apps/commons/App.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.apps.commons; 18 | 19 | import java.lang.annotation.ElementType; 20 | import java.lang.annotation.Retention; 21 | import java.lang.annotation.RetentionPolicy; 22 | import java.lang.annotation.Target; 23 | 24 | /** 25 | * Marks a method as an app. Not much different from the standard JUnit annotation {@link org.junit.Test}. 26 | *

27 | * It reads better, since apps are run as JUnit tests but are not really "tests" in the TDD sense. 28 | * 29 | * @see org.trnltk.apps.commons.AppRunner 30 | */ 31 | @Retention(RetentionPolicy.RUNTIME) 32 | @Target({ElementType.METHOD}) 33 | public @interface App { 34 | /** 35 | * A description of the app. Not actually used in the code. Only required for readability. 36 | * 37 | * @return description of the app; defaults to the empty string 38 | */ 39 | String value() default ""; 40 | } 41 | -------------------------------------------------------------------------------- /apps/src/test/java/org/trnltk/apps/commons/AppProperties.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.apps.commons; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.Properties; 6 | 7 | /** 8 | * @author Ali Ok (ali.ok@apache.org) 9 | */ 10 | public class AppProperties { 11 | private static Properties properties; 12 | // ^ stays null until the first lookup; getObject() calls loadProperties() when it is null 13 | public static String largeFilesFolder() { 14 | return getString("app.data.folder.large.files"); 15 | } 16 | 17 | public static String oneMillionSentencesFolder() { 18 | return getString("app.data.folder.1msentences"); 19 | } 20 | 21 | public static String generalFolder() { 22 | return getString("app.data.folder.general"); 23 | } 24 | 25 | public static String criticalSurfaceFolder() { 26 | return getString("app.data.folder.criticalSurface"); 27 | } 28 | // Returns the property value cast to String; getObject() throws RuntimeException when the key is absent 29 | private static String getString(String key) { 30 | return (String) getObject(key); 31 | } 32 | // Loads the properties on first use, then looks the key up; a missing key is treated as an error 33 | private static Object getObject(String key) { 34 | if (properties == null) 35 | loadProperties(); 36 | 37 | final Object value = properties.get(key); 38 | if (value == null) 39 | throw new RuntimeException("Cannot find key in properties file! 
Key : " + key + " properties file: " + properties.toString()); 40 | return value; 41 | } 42 | 43 | private static void loadProperties() { 44 | final ClassLoader classLoader = AppProperties.class.getClassLoader(); 45 | final InputStream stream = classLoader.getResourceAsStream("trnltk.apps.properties"); 46 | final Properties props = new Properties(); 47 | try { 48 | props.load(stream); 49 | } catch (IOException e) { 50 | throw new RuntimeException(e); 51 | } 52 | AppProperties.properties = props; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /apps/src/test/java/org/trnltk/apps/commons/AppRunner.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.apps.commons; 18 | 19 | import org.apache.log4j.Level; 20 | import org.junit.runner.notification.RunNotifier; 21 | import org.junit.runners.BlockJUnit4ClassRunner; 22 | import org.junit.runners.model.FrameworkMethod; 23 | import org.junit.runners.model.InitializationError; 24 | import org.junit.runners.model.Statement; 25 | 26 | import java.util.List; 27 | 28 | /** 29 | * A custom runner to run apps which are actually JUnit tests. 30 | *

31 | * The differences are: 32 | *

    33 | *
  • apps are not picked by surefire, so they're not run during maven build.
  • 34 | *
  • you cannot run multiple apps at once.
  • 35 | *
36 | *

37 | *

38 | * You should use the annotation {@link App} on the methods. 39 | */ 40 | public class AppRunner extends BlockJUnit4ClassRunner { 41 | 42 | public AppRunner(Class klass) throws InitializationError { 43 | super(klass); 44 | } 45 | // Methods annotated with @App (rather than @Test) are the runnable methods 46 | @Override 47 | protected List computeTestMethods() { 48 | return getTestClass().getAnnotatedMethods(App.class); 49 | } 50 | // Validate the @App-annotated methods with the public/void/no-arg check, collecting problems into errors 51 | @Override 52 | protected void validateTestMethods(List errors) { 53 | validatePublicVoidNoArgMethods(App.class, false, errors); 54 | } 55 | 56 | @Override 57 | protected Statement possiblyExpectingExceptions(FrameworkMethod method, Object test, Statement next) { 58 | // no expected exception: the statement is passed through unchanged 59 | return next; 60 | } 61 | 62 | @Override 63 | protected Statement withPotentialTimeout(FrameworkMethod method, Object test, Statement next) { 64 | // no timeout: the statement is passed through unchanged 65 | return next; 66 | } 67 | // Refuse to run when the runner's description has more than one child, i.e. more than one app 68 | @Override 69 | protected Statement classBlock(RunNotifier notifier) { 70 | if (this.getDescription().getChildren().size() > 1) 71 | throw new IllegalStateException("You cannot run multiple apps at once! 
This runner prevents that!"); 72 | else 73 | return super.classBlock(notifier); 74 | } 75 | // Sets the logger level to WARN via LoggingSettings before delegating to the default method block 76 | @Override 77 | protected Statement methodBlock(FrameworkMethod method) { 78 | LoggingSettings.setLoggerLevel(LoggingSettings.Piece.EVERYTHING, Level.WARN); 79 | return super.methodBlock(method); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /apps/src/test/java/org/trnltk/apps/commons/LoggingSettings.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.apps.commons; 2 | 3 | import com.google.common.collect.ImmutableList; 4 | import org.apache.log4j.Level; 5 | import org.apache.log4j.Logger; 6 | 7 | import java.util.Enumeration; 8 | 9 | /** 10 | * @author Ali Ok (ali.ok@apache.org) 11 | */ 12 | public class LoggingSettings { 13 | // Shortcut for setLoggerLevel(piece, Level.INFO) 14 | public static void turnOnLogger(Piece piece) { 15 | setLoggerLevel(piece, Level.INFO); 16 | } 17 | // Shortcut for setLoggerLevel(piece, Level.OFF) 18 | public static void turnOffLogger(Piece piece) { 19 | setLoggerLevel(piece, Level.OFF); 20 | } 21 | // NOTE(review): the level is applied to every logger returned by the repository's getCurrentLoggers(), with no filtering by relatedLogger name - presumably this affects all loggers, not only the piece's; confirm this is intended 22 | public static void setLoggerLevel(Piece piece, Level level) { 23 | for (String relatedLogger : piece.relatedLoggers) { 24 | final Enumeration currentLoggers = Logger.getLogger(relatedLogger).getLoggerRepository().getCurrentLoggers(); 25 | while (currentLoggers.hasMoreElements()) { 26 | final Logger logger = (Logger) currentLoggers.nextElement(); 27 | logger.setLevel(level); 28 | } 29 | } 30 | } 31 | // Each Piece names one or more logger names that are switched together 32 | public enum Piece { 33 | EVERYTHING("org.trnltk"), 34 | 35 | FrequentWordAnalysis( 36 | org.trnltk.apps.analysis.FrequentWordAnalysis.class.getName() 37 | ); 38 | 39 | 40 | private ImmutableList relatedLoggers; 41 | 42 | private Piece(String... 
relatedLoggers) { 43 | this.relatedLoggers = ImmutableList.copyOf(relatedLoggers); 44 | } 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /apps/src/test/java/org/trnltk/apps/criticalsurface/CriticalSurfaceEntry.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.apps.criticalsurface; 2 | 3 | import org.trnltk.common.util.Comparators; 4 | 5 | import java.io.Serializable; 6 | import java.util.Map; 7 | import java.util.TreeMap; 8 | import java.util.TreeSet; 9 | 10 | /** 11 | * @author Ali Ok (ali.ok@apache.org) 12 | */ 13 | public class CriticalSurfaceEntry implements Serializable { 14 | private final String criticalSurface; 15 | private TreeSet ignoredOccurrences = new TreeSet(); 16 | private TreeSet nonTaggedOccurrences = new TreeSet(); 17 | private TreeMap> parseResultSentences = new TreeMap>(Comparators.parseResultOrdering); 18 | // An entry is identified by its criticalSurface alone: equals() and hashCode() use only that field 19 | public CriticalSurfaceEntry(String criticalSurface) { 20 | this.criticalSurface = criticalSurface; 21 | } 22 | // Returns the greatest SentenceIdentifier (by compareTo) among ignored, non-tagged and parse-result occurrences, or null when there are none 23 | public SentenceIdentifier getLatestOccurrence() { 24 | SentenceIdentifier latestOne = null; 25 | 26 | for (SentenceIdentifier ignoredOccurrence : ignoredOccurrences) { 27 | if (latestOne == null || ignoredOccurrence.compareTo(latestOne) > 0) 28 | latestOne = ignoredOccurrence; 29 | } 30 | for (SentenceIdentifier nonTaggedOccurrence : nonTaggedOccurrences) { 31 | if (latestOne == null || nonTaggedOccurrence.compareTo(latestOne) > 0) 32 | latestOne = nonTaggedOccurrence; 33 | } 34 | for (Map.Entry> entry : parseResultSentences.entrySet()) { 35 | for (SentenceIdentifier identifier : entry.getValue()) { 36 | if (latestOne == null || identifier.compareTo(latestOne) > 0) 37 | latestOne = identifier; 38 | } 39 | } 40 | return latestOne; 41 | } 42 | 43 | public String getCriticalSurface() { 44 | return criticalSurface; 45 | } 46 | 47 | public TreeSet getIgnoredOccurrences() { 48 | return ignoredOccurrences; 49 | } 
50 | 51 | public TreeSet getNonTaggedOccurrences() { 52 | return nonTaggedOccurrences; 53 | } 54 | 55 | public TreeMap> getParseResultSentences() { 56 | return parseResultSentences; 57 | } 58 | 59 | @Override 60 | public boolean equals(Object o) { 61 | if (this == o) return true; 62 | if (!(o instanceof CriticalSurfaceEntry)) return false; 63 | 64 | CriticalSurfaceEntry that = (CriticalSurfaceEntry) o; 65 | 66 | if (!criticalSurface.equals(that.criticalSurface)) return false; 67 | 68 | return true; 69 | } 70 | 71 | @Override 72 | public int hashCode() { 73 | return criticalSurface.hashCode(); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /apps/src/test/java/org/trnltk/apps/criticalsurface/SentenceIdentifier.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.apps.criticalsurface; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * @author Ali Ok (ali.ok@apache.org) 7 | */ 8 | public class SentenceIdentifier implements Comparable, Serializable { 9 | private final String fileId; 10 | private final int line; 11 | 12 | public SentenceIdentifier(String fileId, int line) { 13 | this.fileId = fileId; 14 | this.line = line; 15 | } 16 | // String form is fileId + "#" + line; fromString() parses it back 17 | @Override 18 | public String toString() { 19 | return fileId + "#" + line; 20 | } 21 | /** Parses the toString() form. @throws IllegalArgumentException if '#' is missing or the text after the last '#' is not an int (NumberFormatException) */ 22 | public static SentenceIdentifier fromString(String input) { 23 | // toString() appends "#" + line last, so split on the LAST '#': the fileId itself may contain '#' 24 | final int endIndexOfFileId = input.lastIndexOf('#'); 25 | if (endIndexOfFileId < 0) 26 | throw new IllegalArgumentException("Invalid sentence identifier, missing '#': " + input); 27 | final String lineStr = input.substring(endIndexOfFileId + 1); 28 | return new SentenceIdentifier(input.substring(0, endIndexOfFileId), Integer.parseInt(lineStr)); 29 | } 30 | // Orders by fileId (String.compareTo) first, then by line number 31 | @Override 32 | public int compareTo(SentenceIdentifier other) { 33 | final int fileIdCompareResult = this.fileId.compareTo(other.fileId); 34 | if (fileIdCompareResult == 0) 35 | return Integer.compare(this.line, other.line); 36 | else 37 | return fileIdCompareResult; 38 | } 39 | 40 | 
@Override 41 | public boolean equals(Object o) { 42 | if (this == o) return true; 43 | if (!(o instanceof SentenceIdentifier)) return false; 44 | 45 | SentenceIdentifier that = (SentenceIdentifier) o; 46 | 47 | if (line != that.line) return false; 48 | if (!fileId.equals(that.fileId)) return false; 49 | 50 | return true; 51 | } 52 | 53 | @Override 54 | public int hashCode() { 55 | int result = fileId.hashCode(); 56 | result = 31 * result + line; 57 | return result; 58 | } 59 | 60 | public String getFileId() { 61 | return fileId; 62 | } 63 | 64 | public int getLine() { 65 | return line; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /apps/src/test/java/org/trnltk/apps/experiments/YAMLExperiments.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.apps.experiments; 18 | 19 | import com.google.common.io.Resources; 20 | import org.junit.Ignore; 21 | import org.junit.Test; 22 | import org.yaml.snakeyaml.TypeDescription; 23 | import org.yaml.snakeyaml.Yaml; 24 | import org.yaml.snakeyaml.constructor.Constructor; 25 | import org.trnltk.tokenizer.data.TokenizerTrainingData; 26 | import org.trnltk.tokenizer.data.TokenizerTrainingEntry; 27 | 28 | import java.io.File; 29 | import java.io.FileInputStream; 30 | import java.io.FileNotFoundException; 31 | 32 | @Ignore 33 | public class YAMLExperiments { 34 | 35 | @Test 36 | public void testYamlLoadNoType() throws FileNotFoundException { 37 | final Yaml yaml = new Yaml(); 38 | final Object load = yaml.load(new FileInputStream(new File(Resources.getResource("tokenizer/training-data.yaml").getFile()))); 39 | System.out.println(load); 40 | } 41 | 42 | @Test 43 | public void testYamlLoadWithType() throws FileNotFoundException { 44 | TypeDescription dataDescription = new TypeDescription(TokenizerTrainingData.class); 45 | dataDescription.putListPropertyType("entries", TokenizerTrainingEntry.class); 46 | 47 | Constructor constructor = new Constructor(TokenizerTrainingData.class); 48 | constructor.addTypeDescription(dataDescription); 49 | Yaml yaml = new Yaml(constructor); 50 | 51 | final FileInputStream fileInputStream = new FileInputStream(new File(Resources.getResource("tokenizer/training-data.yaml").getFile())); 52 | final TokenizerTrainingData data = (TokenizerTrainingData) yaml.load(fileInputStream); 53 | for (TokenizerTrainingEntry tokenizerTrainingEntry : data.getEntries()) { 54 | System.out.println(tokenizerTrainingEntry.getText() + " " + tokenizerTrainingEntry.getTknz()); 55 | } 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /apps/src/test/octave/wordHistogram.m: -------------------------------------------------------------------------------- 1 | % M = 
dlmread("/Users/ali/Desktop/devl-data/trnltk/largefiles/histograms/wordHistogram-a.txt", " "); 2 | M = dlmread("/Users/ali/Desktop/devl-data/trnltk/largefiles/wordCounts.txt", " "); 3 | 4 | counts = M(:,2); 5 | hist(counts,1:100); -------------------------------------------------------------------------------- /apps/src/test/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /core/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 21 | 4.0.0 22 | 23 | 24 | org.trnltk 25 | trnltk 26 | 1.0.3-SNAPSHOT 27 | 28 | 29 | core 30 | TRNLTK Core 31 | TRNLTK Core 32 | 33 | 34 | 35 | 36 | org.apache.maven.plugins 37 | maven-jar-plugin 38 | 2.6 39 | 40 | 41 | 42 | test-jar 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | diff_match_patch 53 | diff_match_patch 54 | current 55 | 56 | 57 | org.yaml 58 | snakeyaml 59 | 1.11 60 | 61 | 62 | 63 | 64 | 65 | google-diff-patch-match 66 | google-diff-patch-match 67 | http://google-diff-match-patch.googlecode.com/svn/trunk/maven/ 68 | 69 | 70 | Sonatype-public 71 | SnakeYAML repository 72 | http://oss.sonatype.org/content/groups/public/ 73 | 74 | 75 | 76 | 77 | ${encoding} 78 | 79 | 80 | -------------------------------------------------------------------------------- /core/src/doc/org/trnltk/cookbook/customrootfinder/CustomRootFinder.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.cookbook.customrootfinder; 2 | 3 | public class CustomRootFinder { 4 | } 5 | -------------------------------------------------------------------------------- /core/src/doc/org/trnltk/cookbook/oldturkishsuffixgraph/Main.java: -------------------------------------------------------------------------------- 1 | 
package org.trnltk.cookbook.oldturkishsuffixgraph; 2 | 3 | public class Main { 4 | } 5 | -------------------------------------------------------------------------------- /core/src/doc/org/trnltk/cookbook/samplecorpusstats/Stats1.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.cookbook.samplecorpusstats; 2 | 3 | public class Stats1 { 4 | } 5 | -------------------------------------------------------------------------------- /core/src/doc/org/trnltk/cookbook/samplecorpusstats/Stats2.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.cookbook.samplecorpusstats; 2 | 3 | public class Stats2 { 4 | } 5 | -------------------------------------------------------------------------------- /core/src/doc/org/trnltk/cookbook/spellcheck/Tolerance.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.cookbook.spellcheck; 2 | 3 | public enum Tolerance { 4 | ALLOW_PROPER_NOUNS, 5 | ALLOW_NON_DICTIONARY_NOUNS, 6 | ALLOW_NON_DICTIONARY_VERBS, 7 | ALLOW_NON_DICTIONARY_NOUN_COMPOUNDS, 8 | ALLOW_CONVERSION_OF_CIRCUMFLEXES, 9 | } 10 | -------------------------------------------------------------------------------- /core/src/doc/org/trnltk/doc/formattingoptions/FormattingOptions.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.doc.formattingoptions; 2 | 3 | import org.trnltk.model.morpheme.MorphemeContainer; 4 | import org.trnltk.morphology.contextless.parser.MorphologicParser; 5 | import org.trnltk.morphology.contextless.parser.ContextlessMorphologicParserBuilder; 6 | import org.trnltk.util.MorphemeContainerFormatter; 7 | 8 | import java.util.List; 9 | 10 | public class FormattingOptions { 11 | public static void main(String[] args) { 12 | final MorphologicParser parser = ContextlessMorphologicParserBuilder.createSimple(); 13 | final List morphemeContainers 
= parser.parseStr("kitaba"); 14 | 15 | // there should be only one, get it 16 | final MorphemeContainer result = morphemeContainers.get(0); 17 | 18 | // Oflazer format : kitap+Noun+A3sg+Pnon+Dat 19 | System.out.println(MorphemeContainerFormatter.formatMorphemeContainer(result)); 20 | // TRNLTK detailed format : {"Parts":[{"POS":"Noun","Suffixes":["A3sg","Pnon","Dat"]}],"LemmaRoot":"kitap","RootPos":"Noun","Root":"kitab"} 21 | System.out.println(MorphemeContainerFormatter.formatMorphemeContainerDetailed(result)); 22 | // Metu-Sabanci corpus format : (1,"kitap+Noun+A3sg+Pnon+Dat") 23 | System.out.println(MorphemeContainerFormatter.formatMorphemeContainerWithDerivationGrouping(result)); 24 | // TRNLTK format : kitab(kitap)+Noun+A3sg+Pnon+Dat(+yA[a]) 25 | System.out.println(MorphemeContainerFormatter.formatMorphemeContainerWithForms(result)); 26 | 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /core/src/doc/org/trnltk/doc/simpleparsing/SimpleParsing.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.doc.simpleparsing; 2 | 3 | import org.trnltk.model.morpheme.MorphemeContainer; 4 | import org.trnltk.morphology.contextless.parser.MorphologicParser; 5 | import org.trnltk.morphology.contextless.parser.ContextlessMorphologicParserBuilder; 6 | import org.trnltk.util.MorphemeContainerFormatter; 7 | 8 | import java.util.List; 9 | 10 | public class SimpleParsing { 11 | 12 | public static void main(String[] args) { 13 | // create a morphologic parser with simplest suffix graph and numeral suffix graph, roots from bundled dictionary 14 | MorphologicParser parser = ContextlessMorphologicParserBuilder.createSimple(); 15 | 16 | // parse surface 17 | List morphemeContainers = parser.parseStr("eti"); 18 | 19 | // print results 20 | for (MorphemeContainer morphemeContainer : morphemeContainers) { 21 | // printing format is the simplest one : no suffix form applications, no 
grouping 22 | System.out.println(MorphemeContainerFormatter.formatMorphemeContainer(morphemeContainer)); 23 | } 24 | } 25 | 26 | } -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/common/specification/AbstractSpecification.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.common.specification; 18 | 19 | public abstract class AbstractSpecification implements Specification { 20 | public Specification and(final Specification specification) { 21 | return new AndSpecification(this, specification); 22 | } 23 | 24 | public Specification or(final Specification specification) { 25 | return new OrSpecification(this, specification); 26 | } 27 | 28 | public Specification not() { 29 | return new NotSpecification(this); 30 | } 31 | 32 | @Override 33 | public String toString() { 34 | return this.describe(); 35 | } 36 | 37 | public abstract String describe(); 38 | } 39 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/common/specification/AndSpecification.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.common.specification; 18 | 19 | public class AndSpecification extends AbstractSpecification { 20 | private final Specification spec1; 21 | private final Specification spec2; 22 | 23 | public AndSpecification(final Specification spec1, final Specification spec2) { 24 | this.spec1 = spec1; 25 | this.spec2 = spec2; 26 | } 27 | 28 | @Override 29 | public boolean isSatisfiedBy(T object) { 30 | return spec1.isSatisfiedBy(object) && spec2.isSatisfiedBy(object); 31 | } 32 | 33 | @Override 34 | public String describe() { 35 | return spec1.toString() + " AND " + spec2.toString(); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/common/specification/FalseSpecification.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.common.specification; 18 | 19 | public class FalseSpecification extends AbstractSpecification { 20 | public static final FalseSpecification INSTANCE = new FalseSpecification(); 21 | 22 | private FalseSpecification() { 23 | } 24 | 25 | @Override 26 | public String describe() { 27 | return "FALSE"; 28 | } 29 | 30 | @Override 31 | public boolean isSatisfiedBy(T object) { 32 | return false; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/common/specification/NotSpecification.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.common.specification; 18 | 19 | public class NotSpecification extends AbstractSpecification { 20 | private final Specification wrapped; 21 | 22 | public NotSpecification(Specification wrapped) { 23 | this.wrapped = wrapped; 24 | } 25 | 26 | @Override 27 | public boolean isSatisfiedBy(T object) { 28 | return !wrapped.isSatisfiedBy(object); 29 | } 30 | 31 | @Override 32 | public String describe() { 33 | return "NOT " + wrapped.toString(); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/common/specification/OrSpecification.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.common.specification; 18 | 19 | public class OrSpecification extends AbstractSpecification { 20 | private Specification spec1; 21 | private Specification spec2; 22 | 23 | public OrSpecification(final Specification spec1, final Specification spec2) { 24 | this.spec1 = spec1; 25 | this.spec2 = spec2; 26 | } 27 | 28 | @Override 29 | public boolean isSatisfiedBy(final T object) { 30 | return spec1.isSatisfiedBy(object) || spec2.isSatisfiedBy(object); 31 | } 32 | 33 | @Override 34 | public String describe() { 35 | return spec1.toString() + " OR " + spec2.toString(); 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/common/specification/Specification.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.common.specification; 18 | 19 | /** 20 | * Contract for specifications. 
/**
 * Contract for specifications. See http://en.wikipedia.org/wiki/Specification_pattern for details.
 *
 * @param <T> class whose instances the specification is checked against
 */
public interface Specification<T> {

    /**
     * @param object the candidate to check
     * @return {@code true} if the candidate fulfils this specification
     */
    boolean isSatisfiedBy(T object);

    /**
     * @param other the specification to combine with
     * @return a specification for the conjunction of this one and {@code other}
     */
    Specification<T> and(Specification<T> other);

    /**
     * @return a specification for the negation of this one
     */
    Specification<T> not();

    /**
     * @param other the specification to combine with
     * @return a specification for the disjunction of this one and {@code other}
     */
    Specification<T> or(Specification<T> other);
}
15 | */ 16 | 17 | package org.trnltk.common.specification; 18 | 19 | public class TrueSpecification extends AbstractSpecification { 20 | public static TrueSpecification INSTANCE = new TrueSpecification(); 21 | 22 | private TrueSpecification() { 23 | } 24 | 25 | @Override 26 | public String describe() { 27 | return "TRUE"; 28 | } 29 | 30 | @Override 31 | public boolean isSatisfiedBy(T object) { 32 | return true; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/common/structure/StringEnum.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.common.structure; 18 | 19 | /** 20 | * Enums which can be represented as strings need to implement this interface. 21 | * 22 | * Then, while reading textual data, {@link StringEnumMap} could be used. 
/**
 * Enums which can be represented as strings need to implement this interface.
 * <p>
 * Then, while reading textual data, {@link StringEnumMap} could be used.
 *
 * @param <T> the implementing type.
 *            NOTE(review): the declaration in this copy had lost its type parameter; it is restored
 *            without a bound, which accepts every type argument a bounded declaration would. Confirm
 *            against the original source whether a bound such as {@code extends Enum} was present.
 */
public interface StringEnum<T> {

    /**
     * @return the textual representation of this enum value
     */
    String getStringForm();
}
30 | * 31 | * @param 32 | */ 33 | public class StringEnumMap { 34 | private final ImmutableMap map; 35 | private final Class clazz; 36 | 37 | private StringEnumMap(Class clazz) { 38 | this.clazz = clazz; 39 | final ImmutableMap.Builder mapBuilder = new ImmutableMap.Builder(); 40 | for (T senum : clazz.getEnumConstants()) { 41 | mapBuilder.put(senum.getStringForm(), senum); 42 | } 43 | this.map = mapBuilder.build(); 44 | } 45 | 46 | public static StringEnumMap get(Class clazz) { 47 | return new StringEnumMap(clazz); 48 | } 49 | 50 | public T getEnum(String s) { 51 | if (Strings.isNullOrEmpty(s)) 52 | throw new IllegalArgumentException("Input String must have content."); 53 | T res = map.get(s); 54 | if (res == null) 55 | throw new IllegalArgumentException("Cannot find a representation of :" + s + " for enum class:" + clazz.getName()); 56 | return res; 57 | } 58 | 59 | public Collection getEnums(Collection strs) { 60 | if (strs == null || strs.isEmpty()) 61 | return Collections.emptyList(); 62 | else 63 | return Collections2.transform(strs, new Function() { 64 | @Override 65 | public T apply(String input) { 66 | return getEnum(input); 67 | } 68 | }); 69 | } 70 | 71 | @SuppressWarnings("UnusedDeclaration") 72 | public boolean enumExists(String s) { 73 | return map.containsKey(s); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/common/util/Comparators.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.common.util; 2 | 3 | import com.google.common.collect.Ordering; 4 | import com.google.common.primitives.Ints; 5 | 6 | import java.util.Arrays; 7 | 8 | /** 9 | * @author Ali Ok (ali.ok@apache.org) 10 | */ 11 | public class Comparators { 12 | 13 | @SuppressWarnings("WeakerAccess") 14 | public static final Ordering byLengthOrdering = new Ordering() { 15 | public int compare(String left, String right) { 16 | return 
Ints.compare(left.length(), right.length()); 17 | } 18 | }; 19 | 20 | @SuppressWarnings("unchecked") 21 | public static final Ordering parseResultOrdering = Ordering.compound( 22 | Arrays.asList(byLengthOrdering, Ordering.natural()) 23 | ); 24 | 25 | } 26 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/experiment/model/ambiguity/morphology/ParseResult.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.experiment.model.ambiguity.morphology; 18 | 19 | import java.util.List; 20 | 21 | /** 22 | * Holds the parts of a morphologic parse result. 
23 | */ 24 | public class ParseResult { 25 | private final String str; 26 | private final String root; 27 | private final String lemmaRoot; 28 | private final String rootPos; 29 | private final String rootSpos; 30 | private final List parts; 31 | 32 | public ParseResult(String str, String root, String lemmaRoot, String rootPos, String rootSpos, List parts) { 33 | this.str = str; 34 | this.root = root; 35 | this.lemmaRoot = lemmaRoot; 36 | this.rootPos = rootPos; 37 | this.rootSpos = rootSpos; 38 | this.parts = parts; 39 | } 40 | 41 | public List getParts() { 42 | return parts; 43 | } 44 | 45 | public String getRoot() { 46 | return root; 47 | } 48 | 49 | public String getLemmaRoot() { 50 | return lemmaRoot; 51 | } 52 | 53 | public String getRootPos() { 54 | return rootPos; 55 | } 56 | 57 | public String getRootSpos() { 58 | return rootSpos; 59 | } 60 | 61 | public String getStr() { 62 | return str; 63 | } 64 | 65 | @Override 66 | public String toString() { 67 | return "ParseResult{" + 68 | "root='" + root + '\'' + 69 | ", lemmaRoot='" + lemmaRoot + '\'' + 70 | ", rootPos='" + rootPos + '\'' + 71 | ", rootSpos='" + rootSpos + '\'' + 72 | ", parts=" + parts + 73 | '}'; 74 | } 75 | 76 | @Override 77 | public boolean equals(Object o) { 78 | if (this == o) return true; 79 | if (o == null || getClass() != o.getClass()) return false; 80 | 81 | ParseResult that = (ParseResult) o; 82 | 83 | return str.equals(that.str); 84 | } 85 | 86 | @Override 87 | public int hashCode() { 88 | return str.hashCode(); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/experiment/model/ambiguity/morphology/ParseResultPart.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.experiment.model.ambiguity.morphology; 18 | 19 | import java.util.List; 20 | 21 | public class ParseResultPart { 22 | private final String primaryPos; 23 | private final String secondaryPos; 24 | private final List suffixes; 25 | 26 | public ParseResultPart(String primaryPos, String secondaryPos, List suffixes) { 27 | this.primaryPos = primaryPos; 28 | this.secondaryPos = secondaryPos; 29 | this.suffixes = suffixes; 30 | } 31 | 32 | @SuppressWarnings("UnusedDeclaration") 33 | public String getPrimaryPos() { 34 | return primaryPos; 35 | } 36 | 37 | @SuppressWarnings("UnusedDeclaration") 38 | public String getSecondaryPos() { 39 | return secondaryPos; 40 | } 41 | 42 | @SuppressWarnings("UnusedDeclaration") 43 | public List getSuffixes() { 44 | return suffixes; 45 | } 46 | 47 | @Override 48 | public String toString() { 49 | return "ParseResultPart{" + 50 | "primaryPos='" + primaryPos + '\'' + 51 | ", secondaryPos='" + secondaryPos + '\'' + 52 | ", suffixes=" + suffixes + 53 | '}'; 54 | } 55 | 56 | @Override 57 | public boolean equals(Object o) { 58 | if (this == o) return true; 59 | if (o == null || getClass() != o.getClass()) return false; 60 | 61 | ParseResultPart that = (ParseResultPart) o; 62 | 63 | if (primaryPos != null ? !primaryPos.equals(that.primaryPos) : that.primaryPos != null) return false; 64 | else if (secondaryPos != null ? !secondaryPos.equals(that.secondaryPos) : that.secondaryPos != null) return false; 65 | else if (suffixes != null ? 
!suffixes.equals(that.suffixes) : that.suffixes != null) return false; 66 | 67 | return true; 68 | } 69 | 70 | @Override 71 | public int hashCode() { 72 | int result = primaryPos != null ? primaryPos.hashCode() : 0; 73 | result = 31 * result + (secondaryPos != null ? secondaryPos.hashCode() : 0); 74 | result = 31 * result + (suffixes != null ? suffixes.hashCode() : 0); 75 | return result; 76 | } 77 | } -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/experiment/model/ambiguity/morphology/ParseResultPartDifference.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.experiment.model.ambiguity.morphology; 18 | 19 | import org.apache.commons.lang3.tuple.Pair; 20 | 21 | import java.util.List; 22 | 23 | /** 24 | * Holds the difference of parse result parts. 
25 | */ 26 | public class ParseResultPartDifference { 27 | private final Pair, List> parts; 28 | 29 | public ParseResultPartDifference(Pair, List> parts) { 30 | this.parts = parts; 31 | } 32 | 33 | public Pair, List> getParts() { 34 | return parts; 35 | } 36 | 37 | @Override 38 | public String toString() { 39 | return "ParseResultPartDifference{\n\t\t\t" + 40 | "parts=\n\t\t\t\t" + 41 | parts.getLeft() + "\n\t\t\t\t" + 42 | parts.getRight() + 43 | "\n\t\t}"; 44 | } 45 | 46 | @Override 47 | public boolean equals(Object o) { 48 | if (this == o) return true; 49 | if (o == null || getClass() != o.getClass()) return false; 50 | 51 | ParseResultPartDifference that = (ParseResultPartDifference) o; 52 | 53 | return !(parts != null ? !parts.equals(that.parts) : that.parts != null); 54 | } 55 | 56 | @Override 57 | public int hashCode() { 58 | return parts != null ? parts.hashCode() : 0; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/experiment/model/ambiguity/morphology/WordParseResultEntry.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.experiment.model.ambiguity.morphology; 18 | 19 | import java.util.ArrayList; 20 | import java.util.Collections; 21 | import java.util.List; 22 | 23 | /** 24 | * Holds a word (surface) and parse results for it. 25 | */ 26 | public class WordParseResultEntry { 27 | private final String word; 28 | private final List parseResults = new ArrayList(); 29 | 30 | public WordParseResultEntry(String word) { 31 | this.word = word; 32 | } 33 | 34 | @SuppressWarnings("UnusedDeclaration") 35 | public String getWord() { 36 | return word; 37 | } 38 | 39 | public List getParseResults() { 40 | return Collections.unmodifiableList(parseResults); 41 | } 42 | 43 | public void addParseResult(ParseResult parseResult){ 44 | this.parseResults.add(parseResult); 45 | } 46 | 47 | @Override 48 | public String toString() { 49 | return "WordParseResultEntry{" + 50 | "word='" + word + '\'' + 51 | ", parseResults=" + parseResults + 52 | '}'; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/model/letter/TurkishChar.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.model.letter; 18 | 19 | /** 20 | * Container to hold {@link TurkicLetter} and the actual char value. 
21 | * 22 | * Actual char value can be different because of the upper-lower cases. 23 | */ 24 | public class TurkishChar { 25 | 26 | private final char charValue; 27 | private final TurkicLetter letter; 28 | 29 | public TurkishChar(char charValue, TurkicLetter letter) { 30 | this.charValue = charValue; 31 | this.letter = letter; 32 | } 33 | 34 | public TurkicLetter getLetter() { 35 | return letter; 36 | } 37 | 38 | public char getCharValue() { 39 | return charValue; 40 | } 41 | 42 | @Override 43 | public boolean equals(Object o) { 44 | if (this == o) return true; 45 | if (o == null || getClass() != o.getClass()) return false; 46 | 47 | TurkishChar that = (TurkishChar) o; 48 | 49 | return charValue == that.charValue; 50 | } 51 | 52 | @Override 53 | public int hashCode() { 54 | return (int) charValue; 55 | } 56 | 57 | @Override 58 | public String toString() { 59 | return "TurkishChar{" + 60 | "charValue=" + charValue + 61 | '}'; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/model/lexicon/Lexeme.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.model.lexicon; 18 | 19 | import java.util.Set; 20 | 21 | /** 22 | * A lexeme is an entry in lexicon. 
It has lemma (e.g. yapmak), 23 | * lemmaRoot (e.g. yap), primaryPos (e.g. VERB), secondaryPos and attributes. 24 | *

25 | * Lexeme = lemma + POS 26 | *

27 | * It is important to know that there can be multiple lexemes for a lemmaRoot. 28 | * For example : yüz+VERB (denizde yüzmek), yüz+NUMERAL (yüz kişi) 29 | */ 30 | public interface Lexeme { 31 | String getLemma(); 32 | 33 | String getLemmaRoot(); 34 | 35 | PrimaryPos getPrimaryPos(); 36 | 37 | SecondaryPos getSecondaryPos(); 38 | 39 | Set getAttributes(); 40 | } 41 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/model/lexicon/PhoneticAttribute.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.model.lexicon; 18 | 19 | import org.trnltk.common.structure.StringEnum; 20 | import org.trnltk.common.structure.StringEnumMap; 21 | 22 | public enum PhoneticAttribute implements StringEnum { 23 | LastLetterVowel("LLV"), 24 | LastLetterConsonant("LLC"), 25 | 26 | LastVowelFrontal("LVF"), 27 | LastVowelBack("LVB"), 28 | LastVowelRounded("LVR"), 29 | LastVowelUnrounded("LVuR"), 30 | 31 | LastLetterVoiceless("LLVless"), 32 | LastLetterNotVoiceless("LLNotVless"), 33 | 34 | LastLetterVoicelessStop("LLStop"), 35 | 36 | FirstLetterVowel("FLV"), 37 | FirstLetterConsonant("FLC"), 38 | 39 | HasNoVowel("NoVow"); 40 | 41 | private final static StringEnumMap shortFormToPosMap = StringEnumMap.get(PhoneticAttribute.class); 42 | 43 | private final String shortForm; 44 | 45 | private PhoneticAttribute(String shortForm) { 46 | this.shortForm = shortForm; 47 | } 48 | 49 | @Override 50 | public String getStringForm() { 51 | return shortForm; 52 | } 53 | 54 | @SuppressWarnings("UnusedDeclaration") 55 | public static StringEnumMap converter() { 56 | return shortFormToPosMap; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/model/lexicon/PhoneticExpectation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.model.lexicon; 18 |
/**
 * The two phonetic expectations: {@link #VowelStart} and {@link #ConsonantStart}.
 * NOTE(review): presumably states how whatever follows must begin - confirm against the usages.
 */
public enum PhoneticExpectation {
    VowelStart, ConsonantStart
}
23 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/model/lexicon/PrimaryPos.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.model.lexicon; 18 | 19 | import org.trnltk.common.structure.StringEnum; 20 | import org.trnltk.common.structure.StringEnumMap; 21 | 22 | /** 23 | * Primary part of speech tags. 24 | *

25 | * See this Wikipedia article 26 | * for definition of the part of speech. 27 | */ 28 | @SuppressWarnings("UnusedDeclaration") 29 | public enum PrimaryPos implements StringEnum { 30 | Noun("Noun"), 31 | Adjective("Adj"), 32 | Adverb("Adv"), 33 | Conjunction("Conj"), 34 | Interjection("Interj"), 35 | Verb("Verb"), 36 | Pronoun("Pron"), 37 | Numeral("Num"), 38 | Determiner("Det"), 39 | PostPositive("Postp"), 40 | Question("Ques"), 41 | Duplicator("Dup"), 42 | Punctuation("Punc"), 43 | Unknown("Unk"); 44 | 45 | private final String shortForm; 46 | 47 | PrimaryPos(String shortForm) { 48 | this.shortForm = shortForm; 49 | } 50 | 51 | private final static StringEnumMap shortFormToPosMap = StringEnumMap.get(PrimaryPos.class); 52 | 53 | public static StringEnumMap converter() { 54 | return shortFormToPosMap; 55 | } 56 | 57 | public String getStringForm() { 58 | return shortForm; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/model/lexicon/Root.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.model.lexicon; 18 | 19 | import org.trnltk.model.letter.TurkishSequence; 20 | 21 | import java.util.Set; 22 | 23 | /** 24 | * A root is a possible beginning of a surface. 
Every root is derived from a lexeme. 25 | *

26 | * To illustrate: 27 | * 28 | * 29 | * 30 | * 31 | * 32 | * 33 | * 34 | * 35 | * 36 | * 37 | * 38 | * 39 | * 40 | * 41 | * 42 | * 43 | * 44 | * 45 | * 46 | * 47 | * 48 | * 49 | * 50 | * 51 | * 52 | * 53 | *
Lexeme — Roots — Usages for roots
kitap+Noun — kitap, kitab — kitapta, kitaba
omuz+Noun — omuz, omz — omuzlarda, omzunu
hak — hak, hakk — hak davası, hakkımı arıyorum
kek+Noun — kek* — keki
54 | *

* Please note that there is no keg or keğ
55 | *

56 | * In other words, a root is a text derived from a lexeme with one of the possible phonetic rules 57 | * respecting the phonetic attributes and lexeme attributes of the lexeme and the lemma root sequence. 58 | */
public interface Root {

    /**
     * @return the text of this root as a {@link TurkishSequence}
     */
    TurkishSequence getSequence();

    /**
     * @return the lexeme this root is derived from
     */
    Lexeme getLexeme();

    /**
     * @return the phonetic attributes of this root
     */
    Set getPhoneticAttributes();

    /**
     * @return the phonetic expectations of this root
     */
    Set getPhoneticExpectations();
}
68 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/model/lexicon/SecondaryPos.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.model.lexicon; 18 | 19 | import com.google.common.collect.ImmutableSet; 20 | import com.google.common.collect.Sets; 21 | import org.trnltk.common.structure.StringEnum; 22 | import org.trnltk.common.structure.StringEnumMap; 23 | 24 | /** 25 | * A secondary part of speech is a tag used for tagging the lexemes/roots/etc. with an additional information. 26 | *

27 | * A {@code SecondaryPos} is not meaningful alone, it is only valid with a {@link PrimaryPos}. 28 | * For example a {@link Lexeme} having {@link PrimaryPos#Adjective} and {@link SecondaryPos#Demonstrative} is a 29 | * Demonstrative Adjective whereas a {@link Lexeme} having {@link PrimaryPos#Pronoun} and {@link SecondaryPos#Demonstrative} is a 30 | * Demonstrative Pronoun. 31 | * 32 | * @see PrimaryPos 33 | */ 34 | @SuppressWarnings("UnusedDeclaration") 35 | public enum SecondaryPos implements StringEnum { 36 | Demonstrative("Demons"), 37 | Time("Time"), 38 | Quantitive("Quant"), 39 | Question("Ques"), 40 | ProperNoun("Prop"), 41 | Personal("Pers"), 42 | Reflexive("Reflex"), 43 | None("None"), 44 | Unknown("Unk"), 45 | Ordinal("Ord"), 46 | Cardinal("Card"), 47 | Percentage("Percent"), 48 | Ratio("Ratio"), 49 | Range("Range"), 50 | Real("Real"), 51 | Distribution("Dist"), 52 | Clock("Clock"), 53 | Date("Date"), 54 | 55 | Abbreviation("Abbr"), 56 | DigitsCardinal("DigitsC"), 57 | DigitsOrdinal("DigitsO"), 58 | 59 | Duplicator("Dup"); 60 | 61 | public static final ImmutableSet NUMERAL_APPLICABLE = Sets.immutableEnumSet(Cardinal, Ordinal, Range, DigitsCardinal, DigitsOrdinal); 62 | 63 | private final static StringEnumMap shortFormToPosMap = StringEnumMap.get(SecondaryPos.class); 64 | 65 | private final String shortForm; 66 | 67 | private SecondaryPos(String shortForm) { 68 | this.shortForm = shortForm; 69 | } 70 | 71 | @Override 72 | public String getStringForm() { 73 | return shortForm; 74 | } 75 | 76 | public static StringEnumMap converter() { 77 | return shortFormToPosMap; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/model/suffix/ConditionalFreeTransitionSuffix.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 
| * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.model.suffix; 18 | 19 | /** 20 | * A shorthand for having a {@link Suffix} that has a default {@link SuffixForm} that has conditions but has a 21 | * suffix form of empty string. 22 | *

23 | * That means, it is a conditional epsilon transition in the FSM. 24 | *

25 | * ConditionalFreeTransitionSuffixes are not put in the string representation of a parse result. 26 | */ 27 | public class ConditionalFreeTransitionSuffix extends Suffix { 28 | 29 | public ConditionalFreeTransitionSuffix(String name) { 30 | super(name, null, null, false); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/model/suffix/FreeTransitionSuffix.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.model.suffix; 18 | 19 | /** 20 | * A shorthand for having a {@link Suffix} that has a default {@link SuffixForm} that has no conditions and has a 21 | * suffix form of empty string. 22 | *

23 | * That means, it is an epsilon transition in the FSM. 24 | *

25 | * FreeTransitionSuffixes are not put in the string representation of a parse result. 26 | */ 27 | public class FreeTransitionSuffix extends Suffix { 28 | 29 | public FreeTransitionSuffix(String name) { 30 | super(name, null, null, false); 31 | this.addSuffixForm(""); 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/model/suffix/SuffixGroup.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.model.suffix; 18 | 19 | import java.util.LinkedHashSet; 20 | import java.util.Set; 21 | 22 | /** 23 | * Groups multiple {@link Suffix}es. 24 | *

25 | * This is useful when parsing (e.g. 2 suffixes from the same group cannot exist in a inflection group) and also 26 | * in graphical representation (e.g. suffixes from the same group are drawn with the same color). 27 | */ 28 | public class SuffixGroup { 29 | private final String name; 30 | private final Set suffixes = new LinkedHashSet(); 31 | 32 | public SuffixGroup(String name) { 33 | this.name = name; 34 | } 35 | 36 | /** 37 | * @return Unique name of the group 38 | */ 39 | public String getName() { 40 | return name; 41 | } 42 | 43 | /** 44 | * @return Suffixes that belong to this group 45 | */ 46 | public Set getSuffixes() { 47 | return suffixes; 48 | } 49 | 50 | @Override 51 | public boolean equals(Object o) { 52 | if (this == o) return true; 53 | if (o == null || getClass() != o.getClass()) return false; 54 | 55 | SuffixGroup that = (SuffixGroup) o; 56 | 57 | return name.equals(that.name); 58 | } 59 | 60 | @Override 61 | public int hashCode() { 62 | return name.hashCode(); 63 | } 64 | 65 | @Override 66 | public String toString() { 67 | return "SuffixGroup{" + 68 | "name='" + name + '\'' + 69 | '}'; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/model/suffix/ZeroTransitionSuffix.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.model.suffix; 18 | 19 | /** 20 | * A shorthand for having a {@link Suffix} which changes the POS and 21 | * has a default {@link SuffixForm} that has no conditions and has a 22 | * suffix form of empty string. 23 | *

24 | * That means, it is an epsilon transition in the FSM 25 | * which goes to a state with a different POS. Going to another POS is the difference from {@link FreeTransitionSuffix}. 26 | *

27 | * ZeroTransitionSuffixes are put in the string representation of a parse result. 28 | */ 29 | public class ZeroTransitionSuffix extends Suffix { 30 | private static final String PRETTY_NAME = "Zero"; 31 | 32 | public ZeroTransitionSuffix(String name) { 33 | super(name, null, PRETTY_NAME, false); 34 | this.addSuffixForm(""); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/contextless/parser/ContextlessMorphologicParserListener.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.morphology.contextless.parser; 18 | 19 | import org.trnltk.model.morpheme.MorphemeContainer; 20 | 21 | /** 22 | * Contract for a listener that is notified when something happens on {@link MorphemeContainer}s that are being traversed. 
23 | */
public interface ContextlessMorphologicParserListener {

    /**
     * Callback for the invalidation of a container.
     *
     * @param morphemeContainer the container that was invalidated
     */
    void onMorphemeContainerInvalidated(MorphemeContainer morphemeContainer);

}
29 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/contextless/parser/MorphologicParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.morphology.contextless.parser; 18 | 19 | import org.trnltk.model.morpheme.MorphemeContainer; 20 | import org.trnltk.model.letter.TurkishSequence; 21 | 22 | import java.util.List; 23 | 24 | /** 25 | * The contract for the morphologic parser implementations. 26 | *

27 | * A morphologic parser takes the input and fragments it into smaller parts, namely morphemes. 28 | * These smaller parts are root, suffixes, etc. 29 | * Morphemes are contained within a {@link MorphemeContainer}. 30 | */ 31 | public interface MorphologicParser { 32 | 33 | /** 34 | * Parses the given string and returns all of the possible morphologic parse results for it. 35 | */ 36 | public List parseStr(final String input); 37 | 38 | /** 39 | * Parses the given {@link TurkishSequence} and returns all of the possible morphologic parse results for it. 40 | */ 41 | public List parse(final TurkishSequence input); 42 | 43 | /** 44 | * Parses all of the given strings and returns all possible results for each. Returned results are in inputs' order. 45 | */ 46 | public List> parseAllStr(final List input); 47 | 48 | /** 49 | * Parses all of the given {@link TurkishSequence}s and returns all possible results for each. Returned results are in inputs' order. 50 | */ 51 | @SuppressWarnings("UnusedDeclaration") 52 | public List> parseAll(final List input); 53 | } 54 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/contextless/parser/SuffixFormGraphNodeKey.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.morphology.contextless.parser; 18 | 19 | import com.google.common.collect.ImmutableSet; 20 | import com.google.common.collect.Sets; 21 | import org.trnltk.morphology.morphotactics.SuffixGraphState; 22 | import org.trnltk.model.lexicon.PhoneticAttribute; 23 | 24 | import java.util.Set; 25 | 26 | /** 27 | * A unique key for a {@link SuffixFormGraphNode}. 28 | *

29 | * The key consists of a state and a phonetic attributes combination. 30 | */ 31 | public class SuffixFormGraphNodeKey { 32 | 33 | private final ImmutableSet phonAttrSet; 34 | private final SuffixGraphState state; 35 | 36 | public SuffixFormGraphNodeKey(SuffixGraphState state, Set phonAttrSet) { 37 | this.state = state; 38 | this.phonAttrSet = Sets.immutableEnumSet(phonAttrSet); 39 | } 40 | 41 | @Override 42 | public boolean equals(Object o) { 43 | if (this == o) return true; 44 | if (o == null || getClass() != o.getClass()) return false; 45 | 46 | SuffixFormGraphNodeKey suffixFormGraphNodeKey = (SuffixFormGraphNodeKey) o; 47 | 48 | if (!phonAttrSet.equals(suffixFormGraphNodeKey.phonAttrSet)) return false; 49 | else if (!state.equals(suffixFormGraphNodeKey.state)) return false; 50 | 51 | return true; 52 | } 53 | 54 | @Override 55 | public int hashCode() { 56 | int result = state.hashCode(); 57 | result = 31 * result + phonAttrSet.hashCode(); 58 | return result; 59 | } 60 | 61 | @Override 62 | public String toString() { 63 | return "SuffixFormGraphNodeKey{" + 64 | "state='" + state + '\'' + 65 | ", phonAttrSet=" + phonAttrSet + 66 | '}'; 67 | } 68 | 69 | public ImmutableSet getPhonAttrSet() { 70 | return phonAttrSet; 71 | } 72 | 73 | public SuffixGraphState getState() { 74 | return state; 75 | } 76 | } -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/contextless/parser/cache/MorphologicParserCache.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.morphology.contextless.parser.cache; 18 | 19 | import org.trnltk.model.morpheme.MorphemeContainer; 20 | import org.trnltk.morphology.contextless.parser.MorphologicParser; 21 | 22 | import java.util.List; 23 | import java.util.Map; 24 | 25 | /** 26 | * Contract for a cache that can be used by a {@link org.trnltk.morphology.contextless.parser.CachingMorphologicParser} 27 | *

28 | * A MorphologicParserCache helps remembering parse results for inputs. Different implementations can exist: 29 | * offline, online with LRU, online with MFU ... 30 | */ 31 | public interface MorphologicParserCache { 32 | List get(String input); 33 | 34 | void put(String input, List morphemeContainers); 35 | 36 | void putAll(Map> map); 37 | 38 | /** 39 | * Build the cache, ie. parse the values to be stored in the cache. 40 | */ 41 | void build(MorphologicParser parser); 42 | 43 | /** 44 | * Check if cache is already built. 45 | * @see MorphologicParserCache#build(org.trnltk.morphology.contextless.parser.MorphologicParser) 46 | */ 47 | boolean isNotBuilt(); 48 | } 49 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/contextless/rootfinder/DictionaryRootFinder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.morphology.contextless.rootfinder; 18 | 19 | import com.google.common.collect.Multimap; 20 | import org.apache.commons.collections.CollectionUtils; 21 | import org.apache.commons.lang3.Validate; 22 | import org.trnltk.model.letter.TurkishAlphabet; 23 | import org.trnltk.model.letter.TurkishSequence; 24 | import org.trnltk.model.lexicon.Root; 25 | 26 | import java.util.ArrayList; 27 | import java.util.Collection; 28 | import java.util.Collections; 29 |
/**
 * {@link RootFinder} that looks roots up in a multimap keyed by the root text.
 */
public class DictionaryRootFinder implements RootFinder {
    private final Multimap rootMap;

    /**
     * @param rootMap the lookup table; must not be {@code null}
     */
    public DictionaryRootFinder(Multimap rootMap) {
        Validate.notNull(rootMap);
        this.rootMap = rootMap;
    }

    /**
     * @return {@code true} for every partial input that is neither {@code null} nor blank
     */
    @Override
    public boolean handles(TurkishSequence partialInput, TurkishSequence input) {
        if (partialInput == null) {
            return false;
        }
        return !partialInput.isBlank();
    }

    /**
     * Returns the map entries stored under the text of {@code partialInput}. When that text starts with an
     * upper case character, the entries stored under its uncapitalized form are appended as well.
     *
     * @return the matching entries, or an empty list when there are none. For an input starting with an
     *         upper case character this is a new list, otherwise the collection obtained from the map itself.
     */
    @Override
    public Collection findRootsForPartialInput(TurkishSequence partialInput, TurkishSequence _input) {
        final String surface = partialInput.getUnderlyingString();
        final Collection exactMatches = rootMap.get(surface);

        final boolean startsWithUpperCase = Character.isUpperCase(partialInput.charAt(0).getCharValue());
        if (!startsWithUpperCase) {
            if (CollectionUtils.isEmpty(exactMatches)) {
                return Collections.emptyList();
            }
            return exactMatches;
        }

        // e.g. a capitalized word: accept entries stored in lower case, too
        final Collection lowerCaseMatches = rootMap.get(TurkishAlphabet.uncapitalize(surface));

        final ArrayList combined = new ArrayList();
        combined.addAll(exactMatches);
        combined.addAll(lowerCaseMatches);

        if (CollectionUtils.isEmpty(combined)) {
            return Collections.emptyList();
        }
        return combined;
    }
}
68 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/contextless/rootfinder/PuncRootFinder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright
2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.morphology.contextless.rootfinder; 18 | 19 | import org.apache.commons.lang3.Validate; 20 | import org.trnltk.model.lexicon.ImmutableLexeme; 21 | import org.trnltk.model.lexicon.ImmutableRoot; 22 | import org.trnltk.model.letter.TurkishSequence; 23 | import org.trnltk.model.lexicon.PrimaryPos; 24 | 25 | import java.util.Arrays; 26 | import java.util.List; 27 | import java.util.regex.Pattern; 28 | 29 | 30 | public class PuncRootFinder implements RootFinder { 31 | 32 | /** 33 | * You can check types defined in java.lang.Character class. 
such as START_PUNCTUATION 34 | * http://www.unicode.org/notes/tn36/Categories.txt 35 | * http://www.fileformat.info/info/unicode/category/index.htm 36 | * ALL Punc = 37 | * [Pc] Punctuation, Connector 38 | * [Pd] Punctuation, Dash 39 | * [Pe] Punctuation, Close 40 | * [Pf] Punctuation, Final quote (may behave like Ps or Pe depending on usage) 41 | * [Pi] Punctuation, Initial quote (may behave like Ps or Pe depending on usage) 42 | * [Po] Punctuation, Other 43 | * [Ps] Punctuation, Open 44 | * [Sm] Symbol, Math 45 | * [So] Symbol, Other 46 | */ 47 | private static final Pattern ALL_PUNC_PATTERN = Pattern.compile("^(\\p{Pc}|\\p{Pd}|\\p{Pe}|\\p{Pf}|\\p{Pi}|\\p{Po}|\\p{Ps}|\\p{Sm}|\\p{So})+$"); 48 | 49 | @Override 50 | public boolean handles(TurkishSequence partialInput, TurkishSequence input) { 51 | if (partialInput == null || partialInput.isBlank()) 52 | return false; 53 | 54 | if (partialInput.length() == input.length()) { 55 | Validate.isTrue(input.equals(partialInput)); 56 | return ALL_PUNC_PATTERN.matcher(partialInput.getUnderlyingString()).matches(); 57 | } 58 | return false; 59 | } 60 | 61 | @Override 62 | public List findRootsForPartialInput(TurkishSequence partialInput, TurkishSequence input) { 63 | final ImmutableLexeme lexeme = new ImmutableLexeme(partialInput.getUnderlyingString(), input.getUnderlyingString(), PrimaryPos.Punctuation, null, null); 64 | return Arrays.asList(new ImmutableRoot(partialInput, lexeme, null, null)); 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/contextless/rootfinder/RootFinder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.morphology.contextless.rootfinder; 18 | 19 | import org.trnltk.model.lexicon.Root; 20 | import org.trnltk.model.letter.TurkishSequence; 21 | 22 | import java.util.Collection; 23 | 24 | /** 25 | * Contract for finding roots from a part of a surface. 26 | */ 27 | public interface RootFinder { 28 | 29 | /** 30 | * A quick check if any roots could be created for the given input. A RootFinder impl doesn't have to 31 | * check everything, but it can use a regex to check a pattern quickly. 32 | *

33 | * A RootFinder implementation doesn't have to return any roots for an input which is marked as "could be handled" or vice-versa. 34 | *

35 | * The method {@link RootFinder#findRootsForPartialInput(org.trnltk.model.letter.TurkishSequence, org.trnltk.model.letter.TurkishSequence)} 36 | * can still return nothing, even if this method returns true. 37 | * 38 | * @param partialInput Partial surface 39 | * @param wholeSurface Whole surface 40 | * @return true if partial input could be handled 41 | */ 42 | @SuppressWarnings("BooleanMethodIsAlwaysInverted") 43 | public boolean handles(TurkishSequence partialInput, TurkishSequence wholeSurface); 44 | 45 | /** 46 | * Returns all roots found from the partial surface. Whole surface could be used for doing some look-ahead checks. 47 | *

48 | * Returned roots must pass the check defined in {@link RootValidator} 49 | */ 50 | public Collection findRootsForPartialInput(TurkishSequence partialInput, TurkishSequence wholeSurface); 51 | } 52 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/contextless/rootfinder/RootValidator.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.morphology.contextless.rootfinder; 2 | 3 | import org.apache.commons.lang3.Validate; 4 | import org.trnltk.model.letter.TurkishSequence; 5 | import org.trnltk.model.lexicon.LexemeAttribute; 6 | import org.trnltk.model.lexicon.Root; 7 | import org.trnltk.util.Constants; 8 | 9 | /** 10 | * Validates a root. 11 | */ 12 | public class RootValidator { 13 | 14 | /** 15 | * Checks if a root is valid for a partial surface. 16 | *

17 | * A root cannot be longer than the partial surface. 18 | *

19 | * A root must be the beginning of a partial surface, except when the found root is a noun compound with 20 | * implicit 3rd person possession. 21 | */ 22 | public boolean isValid(Root root, TurkishSequence partialSurface) { 23 | // a root for a partial sequence must be the beginning of the partialSurface 24 | // except when it is a compound. 25 | // in case of compounds, root is not the actual root, but the root of the lexeme 26 | // e.g. 27 | // partial_input = atkuyrugu 28 | // results_with_partial_input_one_char_missing : <'atkuyruk', 'atkuyrug', 'atkuyrugh'> 29 | 30 | // these are all roots of noun compound. so the lexeme can be derived from any of 31 | // atkuyruk+u, atkuyrug+u, atkuyrugh+u 32 | 33 | // we would like to see all of the following as the possible parse results: 34 | // atkuyruk+P3sg 35 | // atkuyrug+P3sg 36 | // atkuyrugh+P3sg 37 | 38 | Validate.notNull(root, "Root to validate cannot be null."); 39 | Validate.notNull(partialSurface, "Partial surface for the root cannot be null"); 40 | 41 | final TurkishSequence rootSequence = root.getSequence(); 42 | if (rootSequence.length() > partialSurface.length()) 43 | return false; 44 | 45 | if (root.getLexeme().getAttributes().contains(LexemeAttribute.CompoundP3sg)) 46 | return true; 47 | 48 | return partialSurface.startsWith(rootSequence) || 49 | partialSurface.getUnderlyingString().toLowerCase(Constants.TURKISH_LOCALE).startsWith(rootSequence.getUnderlyingString().toLowerCase(Constants.TURKISH_LOCALE)); 50 | 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/lexicon/RootMapGenerator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.morphology.lexicon; 18 | 19 | import com.google.common.collect.HashMultimap; 20 | import org.trnltk.model.lexicon.Root; 21 | 22 | import java.util.Collection; 23 | 24 | public class RootMapGenerator { 25 | 26 | public HashMultimap generate(Collection allRoots) { 27 | final HashMultimap map = HashMultimap.create(allRoots.size(), 2); 28 | for (Root root : allRoots) { 29 | final String rootStr = root.getSequence().getUnderlyingString(); 30 | map.put(rootStr, root); 31 | } 32 | 33 | return map; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/morphotactics/EmptySuffixGraph.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.morphology.morphotactics; 18 | 19 | import org.trnltk.model.lexicon.Root; 20 | import org.trnltk.model.suffix.Suffix; 21 | import org.trnltk.model.suffix.SuffixForm; 22 | 23 | import java.util.Collection; 24 | import java.util.Collections; 25 | 26 | public class EmptySuffixGraph implements SuffixGraph { 27 | @Override 28 | public SuffixGraphState getDefaultStateForRoot(Root root) { 29 | return null; 30 | } 31 | 32 | @Override 33 | public Collection getRootSuffixGraphStates() { 34 | return Collections.emptyList(); 35 | } 36 | 37 | @Override 38 | public void initialize() { 39 | // do nothing 40 | } 41 | 42 | @Override 43 | public Suffix getSuffix(String name) { 44 | return null; 45 | } 46 | 47 | @Override 48 | public SuffixGraphState getSuffixGraphState(String stateName) { 49 | return null; 50 | } 51 | 52 | @Override 53 | public Collection getAllSuffixes() { 54 | return Collections.emptyList(); 55 | } 56 | 57 | @Override 58 | public SuffixForm getSuffixForm(String suffixName, String suffixFormStr) { 59 | return null; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/morphotactics/SuffixEdge.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.morphology.morphotactics; 18 | 19 | import org.apache.commons.lang3.Validate; 20 | import org.trnltk.model.suffix.Suffix; 21 | 22 | public class SuffixEdge { 23 | private final Suffix suffix; 24 | private final SuffixGraphState targetState; 25 | 26 | public SuffixEdge(Suffix suffix, SuffixGraphState targetState) { 27 | Validate.notNull(suffix); 28 | Validate.notNull(targetState); 29 | 30 | this.suffix = suffix; 31 | this.targetState = targetState; 32 | } 33 | 34 | public Suffix getSuffix() { 35 | return suffix; 36 | } 37 | 38 | public SuffixGraphState getTargetState() { 39 | return targetState; 40 | } 41 | 42 | @Override 43 | public boolean equals(Object o) { 44 | if (this == o) return true; 45 | if (o == null || getClass() != o.getClass()) return false; 46 | 47 | SuffixEdge that = (SuffixEdge) o; 48 | 49 | if (!suffix.equals(that.suffix)) return false; 50 | else if (!targetState.equals(that.targetState)) return false; 51 | 52 | return true; 53 | } 54 | 55 | @Override 56 | public int hashCode() { 57 | int result = suffix.hashCode(); 58 | result = 31 * result + targetState.hashCode(); 59 | return result; 60 | } 61 | 62 | @Override 63 | public String toString() { 64 | return "SuffixEdge{" + 65 | "suffix=" + suffix + 66 | ", targetState=" + targetState + 67 | '}'; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/morphotactics/SuffixFormSequenceApplier.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.morphology.morphotactics; 18 | 19 | import com.google.common.collect.ImmutableList; 20 | import org.apache.commons.collections.CollectionUtils; 21 | import org.trnltk.model.suffix.SuffixFormSequence; 22 | import org.trnltk.model.lexicon.PhoneticAttribute; 23 | 24 | import java.util.Set; 25 | 26 | public class SuffixFormSequenceApplier { 27 | public String apply(final SuffixFormSequence suffixFormSequence, final Set phoneticAttributesOfSurface) { 28 | final StringBuilder builder = new StringBuilder(); 29 | for (SuffixFormSequence.SuffixFormSequenceRule rule : suffixFormSequence.getRules()) { 30 | final Character c = rule.apply(phoneticAttributesOfSurface); 31 | if (c != null) 32 | builder.append(c); 33 | } 34 | 35 | return builder.toString().trim(); 36 | } 37 | 38 | public boolean isApplicable(final SuffixFormSequence suffixFormSequence, final Set phoneticAttributesOfSurface) { 39 | final ImmutableList rules = suffixFormSequence.getRules(); 40 | if (CollectionUtils.isEmpty(rules)) 41 | return true; 42 | 43 | // the only case where the suffix form is not applicable is, having two vowels together 44 | // following code (unfortunately) assumes, in the suffix form, there are no 2 vowels in a row! 
45 | 46 | final boolean lastSurfaceLetterIsVowel = phoneticAttributesOfSurface.contains(PhoneticAttribute.LastLetterVowel); 47 | 48 | if (!lastSurfaceLetterIsVowel) 49 | return true; 50 | 51 | final SuffixFormSequence.SuffixFormSequenceRule firstRule = rules.get(0); 52 | 53 | return !firstRule.getRuleType().equals(SuffixFormSequence.SuffixFormSequenceRuleType.INSERT_VOWEL_WITHOUT_HARMONY) && 54 | !firstRule.getRuleType().equals(SuffixFormSequence.SuffixFormSequenceRuleType.INSERT_VOWEL_A_WITH_HARMONY) && 55 | !firstRule.getRuleType().equals(SuffixFormSequence.SuffixFormSequenceRuleType.INSERT_VOWEL_I_WITH_HARMONY) && 56 | !firstRule.getRuleType().equals(SuffixFormSequence.SuffixFormSequenceRuleType.INSERT_VOWEL_I_WITH_HARMONY_AND_NO_ROUNDING); 57 | 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/morphotactics/SuffixFormSequenceRuleApplier.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.morphology.morphotactics; 18 | 19 | import com.google.common.collect.ImmutableSet; 20 | import org.trnltk.model.suffix.SuffixFormSequence; 21 | import org.trnltk.model.lexicon.PhoneticAttribute; 22 | 23 | public class SuffixFormSequenceRuleApplier { 24 | 25 | public Character apply(SuffixFormSequence.SuffixFormSequenceRule rule, ImmutableSet phoneticAttributesOfSurface) { 26 | return rule.apply(phoneticAttributesOfSurface); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/morphotactics/SuffixGraph.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.morphology.morphotactics; 18 | 19 | import org.trnltk.model.lexicon.Root; 20 | import org.trnltk.model.suffix.Suffix; 21 | import org.trnltk.model.suffix.SuffixForm; 22 | 23 | import java.util.Collection; 24 | 25 | public interface SuffixGraph { 26 | 27 | SuffixGraphState getDefaultStateForRoot(Root root); 28 | 29 | Collection getRootSuffixGraphStates(); 30 | 31 | void initialize(); 32 | 33 | Suffix getSuffix(String name); 34 | 35 | SuffixGraphState getSuffixGraphState(String stateName); 36 | 37 | Collection getAllSuffixes(); 38 | 39 | SuffixForm getSuffixForm(String suffixName, String suffixFormStr); 40 | } 41 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/morphotactics/SuffixGraphState.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.morphology.morphotactics; 18 | 19 | import com.google.common.collect.ImmutableSet; 20 | import com.google.common.collect.Sets; 21 | 22 | import org.trnltk.model.lexicon.SecondaryPos; 23 | import org.trnltk.model.suffix.Suffix; 24 | import org.trnltk.model.lexicon.PrimaryPos; 25 | 26 | import java.util.HashSet; 27 | 28 | public class SuffixGraphState { 29 | private final String name; 30 | private final SuffixGraphStateType type; 31 | private final PrimaryPos primaryPos; 32 | private final SecondaryPos secondaryPos; 33 | private ImmutableSet outEdges; 34 | 35 | public SuffixGraphState(String name, SuffixGraphStateType suffixGraphStateType, PrimaryPos primaryPos, SecondaryPos secondaryPos) { 36 | this.name = name; 37 | this.type = suffixGraphStateType; 38 | this.primaryPos = primaryPos; 39 | this.secondaryPos = secondaryPos; 40 | this.outEdges = ImmutableSet.of(); 41 | } 42 | 43 | public String getName() { 44 | return name; 45 | } 46 | 47 | public PrimaryPos getPrimaryPos() { 48 | return primaryPos; 49 | } 50 | 51 | public SecondaryPos getSecondaryPos() { 52 | return secondaryPos; 53 | } 54 | 55 | public SuffixGraphStateType getType() { 56 | return type; 57 | } 58 | 59 | public ImmutableSet getOutEdges() { 60 | return this.outEdges; 61 | } 62 | 63 | public void addOutSuffix(Suffix suffix, SuffixGraphState suffixGraphState) { 64 | final HashSet tempSet = Sets.newHashSet(outEdges); 65 | tempSet.add(new SuffixEdge(suffix, suffixGraphState)); 66 | this.outEdges = ImmutableSet.copyOf(tempSet); 67 | } 68 | 69 | @Override 70 | public boolean equals(Object o) { 71 | if (this == o) return true; 72 | if (o == null || getClass() != o.getClass()) return false; 73 | 74 | SuffixGraphState that = (SuffixGraphState) o; 75 | 76 | return name.equals(that.name); 77 | } 78 | 79 | @Override 80 | public int hashCode() { 81 | return name.hashCode(); 82 | } 83 | 84 | @Override 85 | public String toString() { 86 | return "SuffixGraphState{" + 87 | "name='" 
+ name + '\'' + 88 | '}'; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/morphotactics/SuffixGraphStateType.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.morphology.morphotactics; 18 | 19 | public enum SuffixGraphStateType { 20 | DERIVATIONAL("DERIVATIONAL"), 21 | TRANSFER("TRANSFER"), 22 | TERMINAL("TERMINAL"); 23 | 24 | private final String str; 25 | 26 | SuffixGraphStateType(String str) { 27 | this.str = str; 28 | } 29 | 30 | @Override 31 | public String toString() { 32 | return "SuffixGraphStateType{" + 33 | "str='" + str + '\'' + 34 | '}'; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/morphotactics/suffixformspecifications/AppliesToRoot.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.morphology.morphotactics.suffixformspecifications; 18 | 19 | import org.apache.commons.lang3.Validate; 20 | import org.trnltk.common.specification.AbstractSpecification; 21 | import org.trnltk.model.morpheme.MorphemeContainer; 22 | 23 | public class AppliesToRoot extends AbstractSpecification { 24 | private final String rootStr; 25 | 26 | public AppliesToRoot(String rootStr) { 27 | this.rootStr = rootStr; 28 | } 29 | 30 | @Override 31 | public String describe() { 32 | return String.format("applies_to_root(%s)", this.rootStr); 33 | } 34 | 35 | @Override 36 | public boolean isSatisfiedBy(MorphemeContainer morphemeContainer) { 37 | Validate.notNull(morphemeContainer); 38 | 39 | return morphemeContainer.getRoot().getSequence().getUnderlyingString().equals(this.rootStr); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/morphotactics/suffixformspecifications/HasLastNonBlankDerivation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.morphology.morphotactics.suffixformspecifications; 18 | 19 | import org.apache.commons.lang3.Validate; 20 | import org.trnltk.model.suffix.Suffix; 21 | import org.trnltk.common.specification.AbstractSpecification; 22 | import org.trnltk.model.morpheme.MorphemeContainer; 23 | import org.trnltk.model.suffix.SuffixForm; 24 | import org.trnltk.model.suffix.SuffixTransition; 25 | 26 | public class HasLastNonBlankDerivation extends AbstractSpecification { 27 | private final Suffix suffix; 28 | private final String suffixFormStr; 29 | 30 | public HasLastNonBlankDerivation(Suffix suffix, String suffixFormStr) { 31 | this.suffix = suffix; 32 | this.suffixFormStr = suffixFormStr; 33 | } 34 | 35 | @Override 36 | public String describe() { 37 | if (this.suffixFormStr != null) // can be blank 38 | return String.format("has_last_non_blank_derivation(%s[%s])", this.suffix, this.suffixFormStr); 39 | else 40 | return String.format("has_last_non_blank_derivation(%s)", this.suffix); 41 | } 42 | 43 | @Override 44 | public boolean isSatisfiedBy(MorphemeContainer morphemeContainer) { 45 | Validate.notNull(morphemeContainer); 46 | 47 | SuffixTransition lastNonBlankDerivation = morphemeContainer.getLastNonBlankDerivation(); 48 | 49 | if (lastNonBlankDerivation == null) 50 | return false; 51 | 52 | final SuffixForm lastNonBlankDerivationSuffixForm = lastNonBlankDerivation.getSuffixFormApplication().getSuffixForm(); 53 | if (this.suffixFormStr != null) { //can be blank 54 | return 
this.suffix.equals(lastNonBlankDerivationSuffixForm.getSuffix()) && 55 | this.suffixFormStr.equals(lastNonBlankDerivationSuffixForm.getForm().getSuffixFormStr()); 56 | } else { 57 | return this.suffix.equals(lastNonBlankDerivationSuffixForm.getSuffix()); 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/morphotactics/suffixformspecifications/HasSuffixFormAsLastDerivation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.morphology.morphotactics.suffixformspecifications; 18 | 19 | import org.apache.commons.lang3.Validate; 20 | import org.trnltk.common.specification.AbstractSpecification; 21 | import org.trnltk.model.morpheme.MorphemeContainer; 22 | import org.trnltk.model.suffix.Suffix; 23 | import org.trnltk.model.suffix.SuffixTransition; 24 | 25 | public class HasSuffixFormAsLastDerivation extends AbstractSpecification { 26 | private final Suffix suffix; 27 | private final String suffixFormStr; 28 | 29 | HasSuffixFormAsLastDerivation(Suffix suffix, String suffixFormStr) { 30 | this.suffix = suffix; 31 | this.suffixFormStr = suffixFormStr; 32 | } 33 | 34 | @Override 35 | public String describe() { 36 | if (this.suffixFormStr != null) // can be blank 37 | return String.format("has_suffix_form_as_last_deriv(%s[%s])", this.suffix, this.suffixFormStr); 38 | else 39 | return String.format("has_suffix_form_as_last_deriv(%s)", this.suffix); 40 | } 41 | 42 | 43 | @Override 44 | public boolean isSatisfiedBy(MorphemeContainer morphemeContainer) { 45 | Validate.notNull(morphemeContainer); 46 | 47 | SuffixTransition lastDerivationSuffixTransition = morphemeContainer.getLastDerivationSuffixTransition(); 48 | if (lastDerivationSuffixTransition == null) 49 | return false; 50 | 51 | if (this.suffixFormStr != null) { //can be blank 52 | return lastDerivationSuffixTransition.getSuffixFormApplication().getSuffixForm().getSuffix().equals(this.suffix) 53 | && lastDerivationSuffixTransition.getSuffixFormApplication().getSuffixForm().getForm().getSuffixFormStr().equals(this.suffixFormStr); 54 | } else { 55 | return lastDerivationSuffixTransition.getSuffixFormApplication().getSuffixForm().getSuffix().equals(this.suffix); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/morphology/morphotactics/suffixformspecifications/LastSuffixGoesToStateWithType.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.morphology.morphotactics.suffixformspecifications; 18 | 19 | import org.apache.commons.lang3.Validate; 20 | import org.trnltk.model.suffix.SuffixTransition; 21 | import org.trnltk.common.specification.AbstractSpecification; 22 | import org.trnltk.model.morpheme.MorphemeContainer; 23 | import org.trnltk.morphology.morphotactics.SuffixGraphStateType; 24 | 25 | public class LastSuffixGoesToStateWithType extends AbstractSpecification { 26 | private final SuffixGraphStateType suffixGraphStateType; 27 | 28 | public LastSuffixGoesToStateWithType(SuffixGraphStateType suffixGraphStateType) { 29 | this.suffixGraphStateType = suffixGraphStateType; 30 | } 31 | 32 | @Override 33 | public String describe() { 34 | return String.format("suffix_goes_to_state_type(%s)", this.suffixGraphStateType); 35 | } 36 | 37 | @Override 38 | public boolean isSatisfiedBy(MorphemeContainer morphemeContainer) { 39 | Validate.notNull(morphemeContainer); 40 | 41 | SuffixTransition lastSuffixTransition = morphemeContainer.getLastSuffixTransition(); 42 | if (lastSuffixTransition == null) 43 | return false; 44 | 45 | return lastSuffixTransition.getTargetState().getType().equals(this.suffixGraphStateType); 46 | } 47 | } 48 | 
--------------------------------------------------------------------------------
/core/src/main/java/org/trnltk/morphology/morphotactics/suffixformspecifications/RootHasPrimaryPos.java:
--------------------------------------------------------------------------------
/*
 * Copyright 2013 Ali Ok (aliokATapacheDOTorg)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.trnltk.morphology.morphotactics.suffixformspecifications;

import org.apache.commons.lang3.Validate;
import org.trnltk.common.specification.AbstractSpecification;
import org.trnltk.model.morpheme.MorphemeContainer;
import org.trnltk.model.lexicon.PrimaryPos;

/**
 * Specification satisfied when the primary part of speech of the lexeme behind a
 * container's root equals the configured {@link PrimaryPos}.
 */
// NOTE(review): the supertype is written without type arguments here; a type
// argument may have been lost when this listing was produced — confirm against
// AbstractSpecification.
public class RootHasPrimaryPos extends AbstractSpecification {
    private final PrimaryPos primaryPos;

    /**
     * @param primaryPos primary part of speech the root's lexeme must have
     */
    public RootHasPrimaryPos(PrimaryPos primaryPos) {
        this.primaryPos = primaryPos;
    }

    @Override
    public String describe() {
        return String.format("root_has_pos(%s)", primaryPos);
    }

    /**
     * @param morphemeContainer container whose root is inspected; must not be null
     * @return whether the configured primary pos equals the one of the root's lexeme
     */
    @Override
    public boolean isSatisfiedBy(MorphemeContainer morphemeContainer) {
        Validate.notNull(morphemeContainer);

        final PrimaryPos actual = morphemeContainer.getRoot().getLexeme().getPrimaryPos();
        // equals() is invoked on the configured value, so a null 'actual' yields false
        return this.primaryPos.equals(actual);
    }
}

--------------------------------------------------------------------------------
/core/src/main/java/org/trnltk/morphology/morphotactics/suffixformspecifications/RootHasSecondaryPos.java:
--------------------------------------------------------------------------------
/*
 * Copyright 2013 Ali Ok (aliokATapacheDOTorg)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.trnltk.morphology.morphotactics.suffixformspecifications;

import org.apache.commons.lang3.Validate;
import org.trnltk.common.specification.AbstractSpecification;
import org.trnltk.model.lexicon.SecondaryPos;
import org.trnltk.model.morpheme.MorphemeContainer;


/**
 * Specification satisfied when the secondary part of speech of the lexeme behind
 * a container's root equals the configured {@link SecondaryPos}.
 */
// NOTE(review): the supertype is written without type arguments here; a type
// argument may have been lost when this listing was produced — confirm against
// AbstractSpecification.
public class RootHasSecondaryPos extends AbstractSpecification {
    private final SecondaryPos secondaryPos;

    /**
     * @param secondaryPos secondary part of speech the root's lexeme must have
     */
    public RootHasSecondaryPos(SecondaryPos secondaryPos) {
        this.secondaryPos = secondaryPos;
    }

    @Override
    public String describe() {
        return String.format("root_has_secondary_pos(%s)", secondaryPos);
    }

    /**
     * @param morphemeContainer container whose root is inspected; must not be null
     * @return whether the configured secondary pos equals the one of the root's lexeme
     */
    @Override
    public boolean isSatisfiedBy(MorphemeContainer morphemeContainer) {
        Validate.notNull(morphemeContainer);

        final SecondaryPos actual = morphemeContainer.getRoot().getLexeme().getSecondaryPos();
        // equals() is invoked on the configured value, so a null 'actual' yields false
        return this.secondaryPos.equals(actual);
    }
}

--------------------------------------------------------------------------------
/core/src/main/java/org/trnltk/morphology/morphotactics/suffixformspecifications/RootHasVowelDrop.java:
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.morphology.morphotactics.suffixformspecifications; 18 | 19 | import org.apache.commons.lang3.Validate; 20 | import org.trnltk.model.lexicon.Lexeme; 21 | import org.trnltk.model.lexicon.LexemeAttribute; 22 | import org.trnltk.model.lexicon.Root; 23 | import org.trnltk.model.morpheme.MorphemeContainer; 24 | import org.trnltk.common.specification.AbstractSpecification; 25 | 26 | public class RootHasVowelDrop extends AbstractSpecification { 27 | 28 | @Override 29 | public String describe() { 30 | return "root_has_vowel_drop()"; 31 | } 32 | 33 | @Override 34 | public boolean isSatisfiedBy(MorphemeContainer morphemeContainer) { 35 | Validate.notNull(morphemeContainer); 36 | 37 | final Root root = morphemeContainer.getRoot(); 38 | final Lexeme lexeme = root.getLexeme(); 39 | return lexeme.getAttributes().contains(LexemeAttribute.ProgressiveVowelDrop) && 40 | root.getSequence().length() == lexeme.getLemmaRoot().length() - 1; 41 | 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/tokenizer/MissingTokenizationRuleException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali 
Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.tokenizer; 18 | 19 | public class MissingTokenizationRuleException extends RuntimeException { 20 | private final TextBlockGroup leftTextBlockGroup; 21 | private final TextBlockGroup rightTextBlockGroup; 22 | private final TextBlockGroup contextBlockGroup; 23 | 24 | public MissingTokenizationRuleException(TextBlockGroup leftTextBlockGroup, TextBlockGroup rightTextBlockGroup, String msg, TextBlockGroup contextBlockGroup) { 25 | super(msg); 26 | this.leftTextBlockGroup = leftTextBlockGroup; 27 | this.rightTextBlockGroup = rightTextBlockGroup; 28 | this.contextBlockGroup = contextBlockGroup; 29 | } 30 | 31 | public TextBlockGroup getLeftTextBlockGroup() { 32 | return leftTextBlockGroup; 33 | } 34 | 35 | public TextBlockGroup getRightTextBlockGroup() { 36 | return rightTextBlockGroup; 37 | } 38 | 39 | public TextBlockGroup getContextBlockGroup() { 40 | return contextBlockGroup; 41 | } 42 | 43 | @Override 44 | public boolean equals(Object o) { 45 | if (this == o) return true; 46 | if (o == null || getClass() != o.getClass()) return false; 47 | 48 | MissingTokenizationRuleException that = (MissingTokenizationRuleException) o; 49 | 50 | if (!contextBlockGroup.equals(that.contextBlockGroup)) return false; 51 | else if (!leftTextBlockGroup.equals(that.leftTextBlockGroup)) return false; 52 | else if 
(!rightTextBlockGroup.equals(that.rightTextBlockGroup)) return false; 53 | 54 | return true; 55 | } 56 | 57 | @Override 58 | public int hashCode() { 59 | int result = leftTextBlockGroup.hashCode(); 60 | result = 31 * result + rightTextBlockGroup.hashCode(); 61 | result = 31 * result + contextBlockGroup.hashCode(); 62 | return result; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/tokenizer/TextBlock.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.tokenizer; 18 | 19 | /** 20 | * @author Ali Ok 21 | */ 22 | public class TextBlock { 23 | private String text; 24 | private TextBlockType textBlockType; 25 | 26 | public TextBlock(String text, TextBlockType textBlockType) { 27 | this.text = text; 28 | this.textBlockType = textBlockType; 29 | } 30 | 31 | public String getText() { 32 | return text; 33 | } 34 | 35 | public TextBlockType getTextBlockType() { 36 | return textBlockType; 37 | } 38 | 39 | @Override 40 | public String toString() { 41 | return "TextBlock{" + 42 | "text='" + text + '\'' + 43 | ", textBlockType=" + textBlockType + 44 | '}'; 45 | } 46 | 47 | @Override 48 | public boolean equals(Object o) { 49 | if (this == o) return true; 50 | if (o == null || getClass() != o.getClass()) return false; 51 | 52 | TextBlock textBlock = (TextBlock) o; 53 | 54 | if (!text.equals(textBlock.text)) return false; 55 | else if (textBlockType != textBlock.textBlockType) return false; 56 | 57 | return true; 58 | } 59 | 60 | @Override 61 | public int hashCode() { 62 | int result = text.hashCode(); 63 | result = 31 * result + textBlockType.hashCode(); 64 | return result; 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/tokenizer/TextBlockGroup.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.trnltk.tokenizer;

import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;

import java.util.List;

/**
 * An immutable, ordered group of {@link TextBlock}s, together with the
 * {@link TextBlockTypeGroup} built from their types and a shortcut to the first block.
 * Equality and hash code are based on the block list only.
 *
 * @author Ali Ok
 */
public class TextBlockGroup {
    // Type arguments restored: the anonymous Function classes below declare
    // apply(TextBlock) with @Override, which only compiles against a
    // Function<TextBlock, ...>, and get(0) is assigned to a TextBlock without a cast.
    private final ImmutableList<TextBlock> textBlocks;
    private final TextBlockTypeGroup textBlockTypeGroup;
    private final TextBlock firstTextBlock;

    /**
     * @param textBlocks blocks of the group; copied, so later changes to the list are not seen
     * @throws IndexOutOfBoundsException if {@code textBlocks} is empty (the first block is read eagerly)
     */
    public TextBlockGroup(List<TextBlock> textBlocks) {
        this.textBlocks = ImmutableList.copyOf(textBlocks);
        this.textBlockTypeGroup = new TextBlockTypeGroup(Lists.transform(textBlocks, new Function<TextBlock, TextBlockType>() {
            @Override
            public TextBlockType apply(TextBlock input) {
                return input.getTextBlockType();
            }
        }));
        this.firstTextBlock = textBlocks.get(0);
    }

    public ImmutableList<TextBlock> getTextBlocks() {
        return textBlocks;
    }

    public TextBlockTypeGroup getTextBlockTypeGroup() {
        return textBlockTypeGroup;
    }

    public TextBlock getFirstTextBlock() {
        return firstTextBlock;
    }

    /**
     * @return the texts of all blocks, concatenated in order without a separator
     */
    public String getText() {
        return Joiner.on("").join(Lists.transform(this.textBlocks, new Function<TextBlock, String>() {
            @Override
            public String apply(TextBlock input) {
                return input.getText();
            }
        }));
    }

    @Override
    public String toString() {
        return "TextBlockGroup{" +
                "textBlocks=" + textBlocks +
                '}';
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        TextBlockGroup that = (TextBlockGroup) o;

        return textBlocks.equals(that.textBlocks);
    }

    @Override
    public int hashCode() {
        return textBlocks.hashCode();
    }
}
-------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/tokenizer/TextBlockSplitter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.tokenizer; 18 | 19 | import org.apache.commons.lang3.StringUtils; 20 | 21 | import java.util.LinkedList; 22 | import java.util.List; 23 | 24 | /** 25 | * @author Ali Ok 26 | */ 27 | public class TextBlockSplitter { 28 | 29 | private static final TextBlock SENTENCE_START_TEXT_BLOCK = new TextBlock(StringUtils.EMPTY, TextBlockType.Sentence_Start); 30 | private static final TextBlock SENTENCE_END_TEXT_BLOCK = new TextBlock(StringUtils.EMPTY, TextBlockType.Sentence_End); 31 | 32 | protected LinkedList splitToTextParts(String text) { 33 | final LinkedList textBlocks = new LinkedList(); 34 | while (StringUtils.isNotBlank(text)) { 35 | boolean foundOneClass = false; 36 | for (TextBlockType textBlockType : TextBlockType.PHYSICAL_TYPES) { 37 | final String matchedStr = textBlockType.findMatchFromBeginning(text); 38 | if (matchedStr != null) { 39 | textBlocks.add(new TextBlock(matchedStr, textBlockType)); 40 | text = text.substring(matchedStr.length()); 41 | foundOneClass = true; 42 | break; 43 | } else { 44 | //noinspection UnnecessaryContinue 45 | continue; 46 | } 47 | } 48 
| if (!foundOneClass) { 49 | throw new IllegalArgumentException("Text is not matched with any of the classes: \"" + text + "\""); 50 | } 51 | } 52 | 53 | return textBlocks; 54 | } 55 | 56 | public TextBlockGroup getTextBlockGroup(List textBlocks, int blockSize, int startIndex) { 57 | return new TextBlockGroup(textBlocks.subList(startIndex, startIndex + blockSize)); 58 | } 59 | 60 | public void addTextStartsAndEnds(List textBlocks, int blockSize) { 61 | for (int i = 0; i < blockSize; i++) { 62 | textBlocks.add(0, SENTENCE_START_TEXT_BLOCK); 63 | } 64 | 65 | for (int i = 0; i < blockSize; i++) { 66 | textBlocks.add(SENTENCE_END_TEXT_BLOCK); 67 | } 68 | } 69 | } -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/tokenizer/TextBlockTypeGroup.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.tokenizer; 18 | 19 | import com.google.common.collect.ImmutableList; 20 | 21 | import java.util.List; 22 | 23 | /** 24 | * @author Ali Ok 25 | */ 26 | public class TextBlockTypeGroup { 27 | private final ImmutableList textBlockTypes; 28 | 29 | public TextBlockTypeGroup(List textBlockTypes) { 30 | this.textBlockTypes = ImmutableList.copyOf(textBlockTypes); 31 | } 32 | 33 | public ImmutableList getTextBlockTypes() { 34 | return textBlockTypes; 35 | } 36 | 37 | @Override 38 | public String toString() { 39 | return "TextBlockTypeGroup{" + 40 | "textBlockTypes=" + textBlockTypes + 41 | '}'; 42 | } 43 | 44 | @Override 45 | public boolean equals(Object o) { 46 | if (this == o) return true; 47 | if (o == null || getClass() != o.getClass()) return false; 48 | 49 | TextBlockTypeGroup that = (TextBlockTypeGroup) o; 50 | 51 | return textBlockTypes.equals(that.textBlockTypes); 52 | } 53 | 54 | @Override 55 | public int hashCode() { 56 | return textBlockTypes.hashCode(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/tokenizer/Token.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.tokenizer; 2 | 3 | import java.util.List; 4 | 5 | public class Token { 6 | private final String surface; 7 | private final List textBlockTypes; 8 | 9 | public Token(String surface, List textBlockTypes) { 10 | this.surface = surface; 11 | this.textBlockTypes = textBlockTypes; 12 | } 13 | 14 | public String getSurface() { 15 | return surface; 16 | } 17 | 18 | public List getTextBlockTypes() { 19 | return textBlockTypes; 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/tokenizer/TokenizationGraphEdge.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | 
* 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.tokenizer; 18 | 19 | import com.google.common.collect.Lists; 20 | 21 | import java.util.LinkedList; 22 | import java.util.List; 23 | 24 | /** 25 | * @author Ali Ok 26 | */ 27 | public class TokenizationGraphEdge { 28 | private final boolean inferred; 29 | private final TokenizationGraphNode target; 30 | private final boolean addSpace; 31 | private final List> examples; 32 | 33 | public TokenizationGraphEdge(boolean inferred, TokenizationGraphNode target, boolean addSpace) { 34 | this.inferred = inferred; 35 | this.target = target; 36 | this.addSpace = addSpace; 37 | this.examples = new LinkedList>(); 38 | } 39 | 40 | public void addExample(List example) { 41 | this.examples.add(example); 42 | } 43 | 44 | public boolean isInferred() { 45 | return inferred; 46 | } 47 | 48 | public TokenizationGraphNode getTarget() { 49 | return target; 50 | } 51 | 52 | public boolean isAddSpace() { 53 | return addSpace; 54 | } 55 | 56 | public List> getExamples() { 57 | return examples; 58 | } 59 | 60 | @Override 61 | public String toString() { 62 | return "TokenizationGraphEdge{" + 63 | "addSpace=" + addSpace + 64 | ", inferred=" + inferred + 65 | '}'; 66 | } 67 | 68 | @Override 69 | public boolean equals(Object o) { 70 | if (this == o) return true; 71 | if (o == null || getClass() != o.getClass()) return false; 72 | 73 | TokenizationGraphEdge that = 
(TokenizationGraphEdge) o; 74 | 75 | if (addSpace != that.addSpace) return false; 76 | else if (!target.equals(that.target)) return false; 77 | 78 | return true; 79 | } 80 | 81 | @Override 82 | public int hashCode() { 83 | int result = target.hashCode(); 84 | result = 31 * result + (addSpace ? 1 : 0); 85 | return result; 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/tokenizer/TokenizationUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.tokenizer; 18 | 19 | public class TokenizationUtils { 20 | public static String normalizeQuotesHyphens(String input) { 21 | // rdquo, ldquo, laquo, raquo, Prime sybols in unicode. 
22 | return input 23 | .replaceAll("[\u201C\u201D\u00BB\u00AB\u2033\u0093\u0094]|''", "\"") 24 | .replaceAll("[\u0091\u0092\u2032´`’‘]", "'") 25 | .replaceAll("[\u0096\u0097–]", "-"); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/tokenizer/data/TokenizerTrainingData.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
*/

package org.trnltk.tokenizer.data;

import com.google.common.io.ByteSource;
import com.google.common.io.Files;
import com.google.common.io.Resources;
import org.yaml.snakeyaml.TypeDescription;
import org.yaml.snakeyaml.Yaml;
import org.yaml.snakeyaml.constructor.Constructor;

import java.io.*;
import java.net.URL;
import java.util.List;

/**
 * Bean holding tokenizer training entries, with factory methods that load it
 * from YAML (a classpath resource, a file, or any {@link ByteSource}).
 */
public class TokenizerTrainingData {
    // NOTE(review): raw List in this listing; createFromYamlByteSource registers
    // TokenizerTrainingEntry as the list property type for "entries" — confirm the
    // declared element type.
    private List entries;

    public List getEntries() {
        return entries;
    }

    public void setEntries(List entries) {
        this.entries = entries;
    }

    /**
     * Loads the training data from the classpath resource {@code tokenizer/training-data.yaml}.
     */
    public static TokenizerTrainingData createDefaultTrainingData() throws IOException {
        URL resourceURL = Resources.getResource("tokenizer/training-data.yaml");
        ByteSource byteSource = Resources.asByteSource(resourceURL);
        return createFromYamlByteSource(byteSource);
    }

    /**
     * Loads the training data from the given YAML file.
     */
    public static TokenizerTrainingData createFromYamlFile(File file) throws FileNotFoundException {
        ByteSource byteSource = Files.asByteSource(file);
        return createFromYamlByteSource(byteSource);
    }

    /**
     * Parses the YAML content of {@code byteSource} into a {@link TokenizerTrainingData}.
     *
     * @param byteSource source of the YAML document
     * @return the parsed data, or {@code null} if the source could not be opened
     *         (the I/O error is printed to standard error, as before)
     */
    public static TokenizerTrainingData createFromYamlByteSource(ByteSource byteSource) {
        TypeDescription dataDescription = new TypeDescription(TokenizerTrainingData.class);
        dataDescription.putListPropertyType("entries", TokenizerTrainingEntry.class);

        // NOTE(review): only feed trusted YAML to this loader unless the SnakeYAML
        // version in use is known to restrict which classes a document may instantiate.
        Constructor constructor = new Constructor(TokenizerTrainingData.class);
        constructor.addTypeDescription(dataDescription);
        Yaml yaml = new Yaml(constructor);

        InputStream stream = null;
        try {
            stream = byteSource.openBufferedStream();
            return (TokenizerTrainingData) yaml.load(stream);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            // The stream opened here is ours to close; previously it was never closed.
            closeQuietly(stream);
        }
    }

    /**
     * Closes {@code closeable} if it is non-null, ignoring a failure of close() itself
     * so that it cannot mask the result or exception of the load.
     */
    private static void closeQuietly(Closeable closeable) {
        if (closeable == null) {
            return;
        }
        try {
            closeable.close();
        } catch (IOException ignored) {
            // nothing useful can be done if closing a read-only stream fails
        }
    }
}

--------------------------------------------------------------------------------
/core/src/main/java/org/trnltk/tokenizer/data/TokenizerTrainingEntry.java:
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.tokenizer.data; 18 | 19 | // ignore stuff w/o usage as it is used when YAML processor creates entries. 20 | @SuppressWarnings("UnusedDeclaration") 21 | public class TokenizerTrainingEntry { 22 | private String text; 23 | private String tknz; 24 | 25 | public TokenizerTrainingEntry(String text, String tknz) { 26 | this.text = text; 27 | this.tknz = tknz; 28 | } 29 | 30 | public TokenizerTrainingEntry(String text) { 31 | this(text, null); 32 | } 33 | 34 | public TokenizerTrainingEntry() { 35 | this(null); 36 | } 37 | 38 | public String getText() { 39 | return text; 40 | } 41 | 42 | public void setText(String T) { 43 | this.text = T; 44 | } 45 | 46 | public String getTknz() { 47 | return tknz; 48 | } 49 | 50 | public void setTknz(String tknz) { 51 | this.tknz = tknz; 52 | } 53 | 54 | @Override 55 | public boolean equals(Object o) { 56 | if (this == o) return true; 57 | if (o == null || getClass() != o.getClass()) return false; 58 | 59 | TokenizerTrainingEntry wheel = (TokenizerTrainingEntry) o; 60 | 61 | if (!text.equals(wheel.text)) return false; 62 | else if (!tknz.equals(wheel.tknz)) return false; 63 | 64 | return true; 65 | } 66 | 67 | @Override 68 | public int hashCode() { 69 | int result = text.hashCode(); 
70 | result = 31 * result + tknz.hashCode(); 71 | return result; 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/util/Constants.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.util; 2 | 3 | import java.util.Locale; 4 | 5 | public class Constants { 6 | 7 | public static final Locale TURKISH_LOCALE = new Locale("tr"); 8 | 9 | } 10 | -------------------------------------------------------------------------------- /core/src/main/java/org/trnltk/util/Utilities.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.util; 2 | 3 | import java.io.*; 4 | 5 | /** 6 | * @author Ali Ok (ali.ok@apache.org) 7 | */ 8 | public class Utilities { 9 | public static int lineCount(File file) throws IOException { 10 | InputStream is = new BufferedInputStream(new FileInputStream(file)); 11 | try { 12 | byte[] c = new byte[1024]; 13 | int count = 0; 14 | int readChars = 0; 15 | boolean empty = true; 16 | while ((readChars = is.read(c)) != -1) { 17 | empty = false; 18 | for (int i = 0; i < readChars; ++i) { 19 | if (c[i] == '\n') { 20 | ++count; 21 | } 22 | } 23 | } 24 | return (count == 0 && !empty) ? 1 : count; 25 | } finally { 26 | is.close(); 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /core/src/main/resources/master-numeral-dictionary.dict: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2012 Zemberek3 Developers (Original work) 3 | # Copyright 2012 Ali Ok (aliokATapacheDOTorg) (Derivative work) 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | altı [P:Num, Card] 18 | altıncı [P:Num, Ord] 19 | altmış [P:Num, Card] 20 | altmışıncı [P:Num, Ord] 21 | beş [P:Num, Card] 22 | beşinci [P:Num, Ord] 23 | bin [P:Num, Card] 24 | bininci [P:Num, Ord] 25 | bir [P:Num, Card] 26 | birinci [P:Num, Ord] 27 | doksan [P:Num, Card] 28 | doksanıncı [P:Num, Ord] 29 | dokuz [P:Num, Card] 30 | dokuzuncu [P:Num, Ord] 31 | dördüncü [P:Num, Ord] 32 | dört [P:Num, Card] 33 | elli [P:Num, Card] 34 | ellinci [P:Num, Ord] 35 | iki [P:Num, Card] 36 | ikinci [P:Num, Ord] 37 | katrilyon [P:Num, Card] 38 | katrilyonuncu [P:Num, Ord] 39 | kentilyon [P:Num, Card] 40 | kentilyonuncu [P:Num, Ord] 41 | kırk [P:Num, Card] 42 | kırkıncı [P:Num, Ord] 43 | milyar [P:Num, Card] 44 | milyarıncı [P:Num, Ord] 45 | milyon [P:Num, Card] 46 | milyonuncu [P:Num, Ord] 47 | on [P:Num, Card] 48 | onuncu [P:Num, Ord] 49 | otuz [P:Num, Card] 50 | otuzuncu [P:Num, Ord] 51 | sekiz [P:Num, Card] 52 | sekizinci [P:Num, Ord] 53 | seksen [P:Num, Card] 54 | sekseninci [P:Num, Ord] 55 | sıfır [P:Num, Card] 56 | sıfırıncı [P:Num, Ord] 57 | sonuncu [P:Num, Ord] 58 | trilyon [P:Num, Card] 59 | trilyonuncu [P:Num, Ord] 60 | üç [P:Num, Card; A:NoVoicing] 61 | üçüncü [P:Num, Ord] 62 | yedi [P:Num, Card] 63 | yedinci [P:Num, Ord] 64 | yetmiş [P:Num, Card] 65 | yetmişinci [P:Num, Ord] 66 | yirmi [P:Num, Card] 67 | yirminci [P:Num, Ord] 68 | yüz [P:Num, Card] 69 | yüzüncü [P:Num, Ord] -------------------------------------------------------------------------------- 
/core/src/test/java/org/trnltk/experiment/morphology/contextless/parser/PhoneticAttributeSetsTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.experiment.morphology.contextless.parser; 18 | 19 | import com.google.common.collect.ImmutableMap; 20 | import org.junit.Ignore; 21 | import org.junit.Test; 22 | import org.trnltk.morphology.contextless.parser.PhoneticAttributeSets; 23 | import org.trnltk.morphology.phonetics.PhoneticsAnalyzer; 24 | import org.trnltk.model.lexicon.PhoneticAttribute; 25 | 26 | import java.util.EnumSet; 27 | import java.util.Map; 28 | import java.util.Set; 29 | 30 | public class PhoneticAttributeSetsTest { 31 | 32 | @Ignore 33 | @Test 34 | public void printValidSets() { 35 | final PhoneticAttributeSets sets = new PhoneticAttributeSets(); 36 | final ImmutableMap> map = sets.getValidPhoneticAttributeSetsMap(); 37 | for (Map.Entry> entry : map.entrySet()) { 38 | System.out.println(entry.getKey() + "\t" + entry.getValue().toString()); 39 | } 40 | } 41 | 42 | @Ignore 43 | @Test 44 | public void printSetForWord() { 45 | final PhoneticAttributeSets sets = new PhoneticAttributeSets(); 46 | final EnumSet set = new PhoneticsAnalyzer().calculatePhoneticAttributes("keleğ", null); 47 | System.out.println(sets.getNumberForSet(set)); 48 | 
System.out.println(set); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /core/src/test/java/org/trnltk/model/letter/TurkishAlphabetTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.model.letter; 18 | 19 | import com.google.common.base.Function; 20 | import com.google.common.collect.Collections2; 21 | import com.google.common.collect.HashMultiset; 22 | import com.google.common.collect.Lists; 23 | import com.google.common.collect.Multiset; 24 | import org.junit.Assert; 25 | import org.junit.Test; 26 | 27 | import static org.hamcrest.core.Is.is; 28 | 29 | public class TurkishAlphabetTest { 30 | 31 | @Test 32 | public void getLetterByChar() { 33 | Assert.assertEquals(TurkishAlphabet.getLetter('c'), TurkishAlphabet.L_c); 34 | Assert.assertEquals(TurkishAlphabet.getLetter('a'), TurkishAlphabet.L_a); 35 | Assert.assertEquals(TurkishAlphabet.getLetter('w'), TurkishAlphabet.L_w); 36 | Assert.assertEquals(TurkishAlphabet.getLetter('z'), TurkishAlphabet.L_z); 37 | Assert.assertEquals(TurkishAlphabet.getLetter('x'), TurkishAlphabet.L_x); 38 | Assert.assertEquals(TurkishAlphabet.getLetter(TurkishAlphabet.C_cc), TurkishAlphabet.L_cc); 39 | 
Assert.assertEquals(TurkishAlphabet.getLetter(TurkishAlphabet.C_ii), TurkishAlphabet.L_ii); 40 | } 41 | 42 | @Test 43 | public void isVowelTest() { 44 | String vowels = "aeiuüıoöâîû"; 45 | for (char c : vowels.toCharArray()) { 46 | Assert.assertTrue(TurkishAlphabet.getLetter(c).isVowel()); 47 | } 48 | String nonvowels = "bcçdfgğjklmnprştvxwzq."; 49 | for (char c : nonvowels.toCharArray()) { 50 | Assert.assertFalse(TurkishAlphabet.getLetter(c).isVowel()); 51 | } 52 | } 53 | 54 | @Test 55 | public void alphabetShouldNotHaveDuplicateChars() { // counts the char value of every entry in TURKISH_LETTERS; each must occur exactly once 56 | final HashMultiset lowerCaseChars = HashMultiset.create(Collections2.transform(Lists.newArrayList(TurkishAlphabet.TURKISH_LETTERS), 57 | new Function() { 58 | @Override 59 | public Character apply(TurkicLetter input) { 60 | return input.charValue(); 61 | } 62 | })); 63 | 64 | for (Multiset.Entry characterEntry : lowerCaseChars.entrySet()) { 65 | Assert.assertThat("For char " + characterEntry.getElement() + ", count must be 1", characterEntry.getCount(), is(1)); 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /core/src/test/java/org/trnltk/model/lexicon/PhoneticAttributeMetadataTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.model.lexicon; 18 | 19 | 20 | import org.junit.Test; 21 | import org.trnltk.model.lexicon.PhoneticAttributeMetadata; 22 | 23 | import java.util.Arrays; 24 | 25 | import static org.junit.Assert.assertFalse; 26 | import static org.junit.Assert.assertTrue; 27 | import static org.trnltk.model.lexicon.PhoneticAttribute.*; 28 | 29 | public class PhoneticAttributeMetadataTest { 30 | //TODO 31 | // @Test 32 | // public void shouldCheckValidCases() { 33 | // assertTrue(PhoneticAttributeMetadata.isValid(Arrays.asList(FirstLetterConsonant))); 34 | // assertTrue(PhoneticAttributeMetadata.isValid(Arrays.asList(LastLetterConsonant, LastLetterVoiceless, FirstLetterVowel, LastVowelBack, LastVowelRounded))); 35 | // assertTrue(PhoneticAttributeMetadata.isValid(Arrays.asList(LastLetterVowel, LastVowelBack, LastVowelRounded, LastLetterNotVoiceless))); 36 | // assertTrue(PhoneticAttributeMetadata.isValid(Arrays.asList(FirstLetterConsonant, LastLetterConsonant, HasNoVowel, LastLetterNotVoiceless))); 37 | // } 38 | 39 | @Test 40 | public void shouldCheckInvalidCases() { 41 | assertFalse(PhoneticAttributeMetadata.isValid(Arrays.asList(FirstLetterConsonant, FirstLetterVowel))); 42 | assertFalse(PhoneticAttributeMetadata.isValid(Arrays.asList(LastLetterConsonant, LastLetterVowel))); 43 | assertFalse(PhoneticAttributeMetadata.isValid(Arrays.asList(LastLetterVowel, LastVowelBack, LastVowelRounded, LastLetterNotVoiceless, HasNoVowel))); 44 | assertFalse(PhoneticAttributeMetadata.isValid(Arrays.asList(FirstLetterConsonant, LastLetterConsonant, HasNoVowel))); 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /core/src/test/java/org/trnltk/model/suffix/SuffixFormSequenceRuleStub.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you 
may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.model.suffix; 18 | 19 | import org.trnltk.model.letter.TurkishAlphabet; 20 | 21 | public class SuffixFormSequenceRuleStub extends SuffixFormSequence.SuffixFormSequenceRule { 22 | public SuffixFormSequenceRuleStub(char charToAdd, SuffixFormSequence.SuffixFormSequenceRuleType ruleType) { 23 | super(TurkishAlphabet.getChar(charToAdd), ruleType); 24 | } 25 | 26 | public SuffixFormSequenceRuleStub(SuffixFormSequence.SuffixFormSequenceRuleType ruleType) { 27 | super(ruleType); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /core/src/test/java/org/trnltk/morphology/contextless/parser/parsing/base/BaseContextlessMorphologicParserPuncTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.morphology.contextless.parser.parsing.base; 18 | 19 | import org.junit.Test; 20 | 21 | public abstract class BaseContextlessMorphologicParserPuncTest extends BaseContextlessMorphologicParserTest { 22 | 23 | @Test 24 | public void shouldParsePuncStrings() { 25 | assertParseCorrect(".", ".(.)+Punc"); 26 | assertParseCorrect("-", "-(-)+Punc"); 27 | assertParseCorrect("....", "....(....)+Punc"); 28 | assertParseCorrect("‿﹎﹏»”>", "‿﹎﹏»”>(‿﹎﹏»”>)+Punc"); 29 | assertParseCorrect("„⁅{﹃⦅&_!§՜։܀܍෴៘‱⁂〽﹌@。;゠﹣︾҂©°", "„⁅{﹃⦅&_!§՜։܀܍෴៘‱⁂〽﹌@。;゠﹣︾҂©°(„⁅{﹃⦅&_!§՜։܀܍෴៘‱⁂〽﹌@。;゠﹣︾҂©°)+Punc"); 30 | } 31 | 32 | @Test 33 | public void shouldNotMarkAsPunc() { 34 | assertNotParsable(""); 35 | assertNotParsable(". "); 36 | assertNotParsable(" ."); 37 | assertNotParsable(".a"); 38 | assertNotParsable(".1"); 39 | assertNotParsable(".\n"); 40 | assertNotParsable(".\t"); 41 | assertNotParsable(".¨"); //has control char 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /core/src/test/java/org/trnltk/morphology/contextless/rootfinder/BaseRootFinderTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.morphology.contextless.rootfinder; 18 | 19 | import org.apache.commons.lang3.Validate; 20 | import org.junit.Before; 21 | import org.trnltk.model.lexicon.Root; 22 | import org.trnltk.model.letter.TurkishSequence; 23 | 24 | import java.util.ArrayList; 25 | import java.util.Collection; 26 | import java.util.Collections; 27 | import java.util.List; 28 | 29 | public abstract class BaseRootFinderTest { 30 | 31 | private RootFinder rootFinder; 32 | private RootValidator rootValidator; 33 | 34 | @Before 35 | public void setUp() throws Exception { 36 | this.rootFinder = this.createRootFinder(); 37 | this.rootValidator = new RootValidator(); 38 | } 39 | 40 | protected abstract RootFinder createRootFinder(); 41 | 42 | protected List findRootsForPartialInput(String partialInput, String wholeSurface) { 43 | final TurkishSequence partialInputSeq = partialInput != null ? new TurkishSequence(partialInput) : null; 44 | final TurkishSequence inputSeq = wholeSurface != null ? new TurkishSequence(wholeSurface) : null; 45 | if (!rootFinder.handles(partialInputSeq, inputSeq)) { 46 | return Collections.EMPTY_LIST; 47 | } else { 48 | List rootList = new ArrayList((Collection) rootFinder.findRootsForPartialInput(partialInputSeq, inputSeq)); 49 | for (R r : rootList) { 50 | Validate.isTrue(rootValidator.isValid(r, partialInputSeq), "Invalid root " + r.toString() + " for partial input " + partialInput); 51 | } 52 | return rootList; 53 | } 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /core/src/test/java/org/trnltk/morphology/contextless/rootfinder/DictionaryRootFinderTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.morphology.contextless.rootfinder; 18 | 19 | import com.google.common.collect.ImmutableMultimap; 20 | import org.hamcrest.Matcher; 21 | import org.junit.Before; 22 | import org.junit.Test; 23 | import org.junit.runner.RunWith; 24 | import org.mockito.Mock; 25 | import org.mockito.runners.MockitoJUnitRunner; 26 | import org.trnltk.model.lexicon.Root; 27 | import org.trnltk.model.letter.TurkishSequence; 28 | import org.trnltk.morphology.phonetics.PhoneticsAnalyzer; 29 | 30 | import java.util.Arrays; 31 | import java.util.Collection; 32 | 33 | import static org.hamcrest.MatcherAssert.assertThat; 34 | import static org.hamcrest.Matchers.hasItem; 35 | import static org.hamcrest.Matchers.hasSize; 36 | 37 | @RunWith(MockitoJUnitRunner.class) 38 | public class DictionaryRootFinderTest { 39 | 40 | DictionaryRootFinder finder; 41 | PhoneticsAnalyzer phoneticsAnalyzer; 42 | 43 | @Mock 44 | Root root1_1; 45 | @Mock 46 | Root root1_2; 47 | @Mock 48 | Root root2_1; 49 | @Mock 50 | Root root2_2; 51 | 52 | @Before 53 | public void setUp() throws Exception { 54 | final ImmutableMultimap map = new ImmutableMultimap.Builder() 55 | .putAll("root1", Arrays.asList(root1_1, root1_2)) 56 | .putAll("root2", Arrays.asList(root2_1, root2_2)) 57 | .build(); 58 | 59 | finder = new DictionaryRootFinder(map); 60 | phoneticsAnalyzer = new PhoneticsAnalyzer(); 61 | } 62 | 63 | @Test 64 | public void shouldFindRoots() { 65 | final String rootStr = "root1"; 66 | final Collection roots = finder.findRootsForPartialInput(new 
TurkishSequence(rootStr), null); 67 | assertThat(roots, hasSize(2)); 68 | assertThat(roots, (Matcher) hasItem(root1_1)); 69 | assertThat(roots, (Matcher) hasItem(root1_2)); 70 | } 71 | 72 | @Test 73 | public void shouldNotFindRoots() { 74 | final String rootStr = "UNKNOWN"; 75 | final Collection roots = finder.findRootsForPartialInput(new TurkishSequence(rootStr), null); 76 | assertThat(roots, hasSize(0)); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /core/src/test/java/org/trnltk/morphology/morphotactics/PrecachingSuffixFormSequenceApplierTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.morphology.morphotactics; 18 | 19 | import com.google.common.collect.ImmutableSet; 20 | import org.junit.Before; 21 | import org.junit.Test; 22 | import org.trnltk.model.suffix.SuffixFormSequence; 23 | import org.trnltk.model.lexicon.PhoneticAttribute; 24 | 25 | import static org.hamcrest.MatcherAssert.assertThat; 26 | import static org.hamcrest.Matchers.equalTo; 27 | 28 | public class PrecachingSuffixFormSequenceApplierTest { 29 | PrecachingSuffixFormSequenceApplier applier; 30 | SuffixFormSequenceApplier delegate = new SuffixFormSequenceApplier(); 31 | 32 | @Before 33 | public void setUp() throws Exception { 34 | final BasicSuffixGraph suffixGraph = new BasicSuffixGraph(); 35 | suffixGraph.initialize(); 36 | applier = new PrecachingSuffixFormSequenceApplier(suffixGraph, delegate); 37 | } 38 | 39 | @Test 40 | public void shouldHaveValue() throws Exception { 41 | // check only once, cannot test all! 42 | { 43 | final String str = applier.apply(new SuffixFormSequence("+Im"), ImmutableSet.of(PhoneticAttribute.LastLetterVowel)); 44 | assertThat(str, equalTo("m")); 45 | } 46 | { 47 | final String str = applier.apply(new SuffixFormSequence("+Im"), ImmutableSet.of(PhoneticAttribute.LastLetterVowel, PhoneticAttribute.FirstLetterVowel)); 48 | assertThat(str, equalTo("m")); 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /core/src/test/java/org/trnltk/morphology/morphotactics/SuffixFormSequenceApplierTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.morphology.morphotactics; 18 | 19 | import com.google.common.collect.ImmutableList; 20 | import com.google.common.collect.ImmutableSet; 21 | import org.junit.Before; 22 | import org.junit.Test; 23 | import org.junit.runner.RunWith; 24 | import org.mockito.Matchers; 25 | import org.mockito.Mock; 26 | import org.mockito.runners.MockitoJUnitRunner; 27 | import org.trnltk.model.suffix.SuffixFormSequence; 28 | import org.trnltk.model.lexicon.PhoneticAttribute; 29 | 30 | import static org.hamcrest.MatcherAssert.assertThat; 31 | import static org.hamcrest.Matchers.equalTo; 32 | import static org.mockito.Mockito.when; 33 | 34 | @RunWith(MockitoJUnitRunner.class) 35 | public class SuffixFormSequenceApplierTest { 36 | 37 | SuffixFormSequenceApplier applier; 38 | 39 | @Mock 40 | SuffixFormSequence.SuffixFormSequenceRule rule_A; 41 | 42 | @Mock 43 | SuffixFormSequence.SuffixFormSequenceRule rule_B; 44 | 45 | @Mock 46 | SuffixFormSequence suffixFormSequence; 47 | 48 | @Before 49 | public void setUp() throws Exception { 50 | applier = new SuffixFormSequenceApplier(); 51 | 52 | when(rule_A.apply(Matchers.>any())).thenReturn(null); 53 | 54 | when(rule_B.apply(ImmutableSet.of(PhoneticAttribute.LastLetterConsonant))).thenReturn('c'); 55 | 56 | when(rule_B.apply(ImmutableSet.of(PhoneticAttribute.LastLetterVoicelessStop))).thenReturn('b'); 57 | 58 | when(suffixFormSequence.getRules()).thenReturn(ImmutableList.of(rule_A, rule_B)); 59 | } 60 | 61 | @Test 62 | public void shouldApply() throws Exception { 63 | 
assertThat(applier.apply(suffixFormSequence, ImmutableSet.of()), equalTo("")); 64 | assertThat(applier.apply(suffixFormSequence, ImmutableSet.of(PhoneticAttribute.LastLetterConsonant)), equalTo("c")); 65 | assertThat(applier.apply(suffixFormSequence, ImmutableSet.of(PhoneticAttribute.LastLetterVoicelessStop)), equalTo("b")); 66 | assertThat(applier.apply(suffixFormSequence, ImmutableSet.of(PhoneticAttribute.LastLetterVowel)), equalTo("")); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /core/src/test/java/org/trnltk/testutil/TestEnvironment.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.testutil; 2 | 3 | public class TestEnvironment { 4 | 5 | private static final String HAS_BIG_PARSESETS = "hasBigParseSets"; 6 | 7 | public static boolean hasBigParseSets(){ 8 | return "true".equalsIgnoreCase(System.getProperty(HAS_BIG_PARSESETS)); 9 | } 10 | 11 | } 12 | -------------------------------------------------------------------------------- /core/src/test/java/org/trnltk/testutil/testmatchers/BaseParseResultsMatcher.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.testutil.testmatchers; 18 | 19 | import com.google.common.collect.Ordering; 20 | import com.google.common.primitives.Ints; 21 | import org.hamcrest.TypeSafeMatcher; 22 | 23 | import java.util.Arrays; 24 | import java.util.Collection; 25 | 26 | public abstract class BaseParseResultsMatcher extends TypeSafeMatcher> { 27 | 28 | public static final Ordering byLengthOrdering = new Ordering() { 29 | public int compare(String left, String right) { 30 | return Ints.compare(left.length(), right.length()); 31 | } 32 | }; 33 | 34 | public static final Ordering parseResultOrdering = Ordering.compound(Arrays.asList(byLengthOrdering, Ordering.natural())); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /core/src/test/java/org/trnltk/testutil/testmatchers/ParseResultsDontExistMatcher.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.testutil.testmatchers; 18 | 19 | import com.google.common.collect.Lists; 20 | import org.apache.commons.collections.CollectionUtils; 21 | import org.apache.commons.lang3.Validate; 22 | import org.hamcrest.Description; 23 | 24 | import java.util.Arrays; 25 | import java.util.Collection; 26 | import java.util.Collections; 27 | import java.util.List; 28 | 29 | public class ParseResultsDontExistMatcher extends BaseParseResultsMatcher { 30 | private final List expectedParseResults; 31 | 32 | public ParseResultsDontExistMatcher(String... expectedParseResults) { 33 | Validate.notNull(expectedParseResults); 34 | this.expectedParseResults = Arrays.asList(expectedParseResults); 35 | } 36 | 37 | @Override 38 | public boolean matchesSafely(Collection item) { 39 | return CollectionUtils.isNotEmpty(item) && !CollectionUtils.containsAny(item, expectedParseResults); 40 | } 41 | 42 | @Override 43 | public void describeTo(Description description) { 44 | Collections.sort(this.expectedParseResults, BaseParseResultsMatcher.parseResultOrdering); 45 | description.appendValueList("parse results not containing any of <", ",", ">", this.expectedParseResults); 46 | } 47 | 48 | @Override 49 | protected void describeMismatchSafely(Collection item, Description mismatchDescription) { 50 | List itemList = Lists.newArrayList(item); 51 | Collections.sort(itemList, BaseParseResultsMatcher.parseResultOrdering); 52 | mismatchDescription.appendValueList("was <", ",", ">", itemList); 53 | } 54 | } -------------------------------------------------------------------------------- /core/src/test/java/org/trnltk/testutil/testmatchers/ParseResultsEqualMatcher.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.testutil.testmatchers; 18 | 19 | import com.google.common.base.Predicates; 20 | import com.google.common.collect.Collections2; 21 | import com.google.common.collect.Lists; 22 | import org.apache.commons.collections.CollectionUtils; 23 | import org.apache.commons.lang3.Validate; 24 | import org.hamcrest.Description; 25 | 26 | import java.util.Arrays; 27 | import java.util.Collection; 28 | import java.util.Collections; 29 | import java.util.List; 30 | 31 | public class ParseResultsEqualMatcher extends BaseParseResultsMatcher { 32 | 33 | private final List expectedParseResults; 34 | private final boolean ignoreVerbPresA3Sg; 35 | 36 | public ParseResultsEqualMatcher(boolean ignoreVerbPresA3Sg, final String... 
expectedParseResults) { 37 | Validate.notNull(expectedParseResults); // validate before the array is wrapped, as the sibling matchers do 38 | this.ignoreVerbPresA3Sg = ignoreVerbPresA3Sg; 39 | this.expectedParseResults = Arrays.asList(expectedParseResults); 40 | } 41 | 42 | @Override 43 | public boolean matchesSafely(Collection item) { 44 | if (ignoreVerbPresA3Sg) // filter out some verb results so the test has fewer results. NOTE(review): in java.util.regex "\Z" is an end-of-input anchor, so the pattern below cannot match any result and this filter is effectively a no-op; "\\+Zero\\+Pres\\+" was presumably intended - confirm against the callers' expected results before changing it 45 | item = Collections2.filter(item, Predicates.not(Predicates.containsPattern("\\Zero\\+Pres\\+"))); 46 | return CollectionUtils.isEqualCollection(expectedParseResults, item); 47 | } 48 | 49 | @Override 50 | public void describeTo(Description description) { 51 | Collections.sort(this.expectedParseResults, BaseParseResultsMatcher.parseResultOrdering); 52 | description.appendValueList(" <", ",", ">", this.expectedParseResults); 53 | } 54 | 55 | @Override 56 | protected void describeMismatchSafely(Collection item, Description mismatchDescription) { 57 | List itemList = Lists.newArrayList(item); 58 | Collections.sort(itemList, BaseParseResultsMatcher.parseResultOrdering); 59 | mismatchDescription.appendValueList("was <", ",", ">", itemList); 60 | } 61 | } -------------------------------------------------------------------------------- /core/src/test/java/org/trnltk/testutil/testmatchers/ParseResultsExistMatcher.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.trnltk.testutil.testmatchers; 18 | 19 | import com.google.common.collect.Lists; 20 | import org.apache.commons.lang3.Validate; 21 | import org.hamcrest.Description; 22 | 23 | import java.util.Arrays; 24 | import java.util.Collection; 25 | import java.util.Collections; 26 | import java.util.List; 27 | 28 | public class ParseResultsExistMatcher extends BaseParseResultsMatcher { 29 | 30 | private final List expectedParseResults; 31 | 32 | public ParseResultsExistMatcher(String... expectedParseResults) { 33 | Validate.notNull(expectedParseResults); 34 | this.expectedParseResults = Arrays.asList(expectedParseResults); 35 | } 36 | 37 | @Override 38 | public boolean matchesSafely(Collection item) { 39 | return item.containsAll(expectedParseResults); 40 | } 41 | 42 | @Override 43 | public void describeTo(Description description) { 44 | Collections.sort(this.expectedParseResults, BaseParseResultsMatcher.parseResultOrdering); 45 | description.appendValueList("parse results containing <", ",", ">", this.expectedParseResults); 46 | } 47 | 48 | @Override 49 | protected void describeMismatchSafely(Collection item, Description mismatchDescription) { 50 | List itemList = Lists.newArrayList(item); 51 | Collections.sort(itemList, BaseParseResultsMatcher.parseResultOrdering); 52 | mismatchDescription.appendValueList("was <", ",", ">", itemList); 53 | } 54 | } -------------------------------------------------------------------------------- /core/src/test/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /core/src/test/resources/simpleparsesets/.gitignore: 
-------------------------------------------------------------------------------- 1 | simpleparseset999.txt 2 | simpleparseset9998.txt -------------------------------------------------------------------------------- /core/src/test/resources/tokenizer/sentence-boundary-text.txt: -------------------------------------------------------------------------------- 1 | Ali gel. Okul açıldı... sınavda 2. oldum. Werder Bremen geçen hafta Bayern'e deplasmanda beş gol atmış ve Mesut Özil maçın yıldızı olmuştu. 'Mızıkacılar' bu hafta da tribünlerin tam dokuz gol izlediği karşılaşmada Hoffenheim'ı 5-4 ile geçti. Perdeyi açan Mesut, 81'de bir kez daha sahneye çıkarak takımının düellodan galip ayrılmasını sağladı. Kicker'e de kapan olan Mesut'un Diego'nun yerini doldurabileceği bile konuşuluyor artık. Bu arada beş gol yiyen Hoffenheim kalecisi Ramazan Özcan'a (ki bu sezon ikinci kez beş gol birden yiyor) teknik direktöründen gelen eleştiri de ilginçti: Arada bir ellerini de kullanmalı! 2 | Haftanın bir başka ilginç skoru da Köln-Schalke maçında yaşandı. Köln, yenilgisiz lider Schalke karşısında ilk yarının sonlarında bulduğu tek golle üç puanın sahibi oldu. Halil, Kuranyi'nin yerine son yirmi dakikada sahadaydı ve bir topu da direktön döndü. Schalke bu mağlubiyetle liderlik koltuğundan inerken, zirvenin yeni sahibi Hamburg oldu. Konuğu Mönchengladbach'ı Petriç'in 11. dakikada bulduğu golle geçen Hamburg puanını 13'e yükseltti. 3 | Geçen hafta ligde Bremen'den beş gol yinen Bayern hafta içinde kupada aldığı galibiyetle kendine gelir gibi olmuştu. Ne var ki bu hafta Hannover deplasmanından 1-0'lık mağlubiyetle döndüler. Altı maçta iki galibiyet, iki beraberlik ve iki yenilgi... 'Bavyeralılar' artık Klinsmann'ın Almanya Milli Takımı'na yaptığı sihirli dokunuşun bir benzerini Münih'te gerçekleştirmesini bekliyorlar ve sabırlar da giderek azalıyor. 4 | Haftanın bol gollü maçlarından birinde Leverkusen, Bochum'u deplasmanda 3-2 mağlup etti. 
Bir saat içinde üç gol yiyen Bochum, 79'da Sestak ve 81'de Sinan Kaloğlu ile goller bulsa da Yahia üç dakika sonra atılınca rüzgârları da dindi. Leverkusen gibi üç puanı üç golle alan bir diğer takımsa Dortmund. Geçen hafta Hoffenheim karşısında alınan 4-1'lik şok yenilgiyi bu hafta Stuttgart'ı 3-0 mağlup ederek unutturdular. Wolfsburg'un Karlsruhe'ye 2-1 yenilmesiyle ligde namağlup takım kalmazken, Cottbus Hertha Berlin deplasmanından tek gollü galibiyetle dönerek bu sezon ilk kez kazanmış oldu. Eintracht Frankfurt-Arminia Bielefeld maçından ise haftanın tek beraberliği çıktı: 1-1. 5 | Bu hafta dokuz maçta 26 gol atıldı. Dört maçta sadece tek gol kaydedilirken Bremen ve Hoffenheim istatistiği yukarı taşıyan ekiplerdi.Leverkusen ve Cottbus haricinde hiçbir deplasman takımı üç puan alamadı. -------------------------------------------------------------------------------- /core/src/test/resources/trnltk.apps.properties: -------------------------------------------------------------------------------- 1 | app.data.folder.large.files=/Users/ali/Desktop/devl-data/trnltk/largefiles 2 | app.data.folder.1msentences=/Users/ali/Desktop/devl-data/trnltk/1MSentences 3 | app.data.folder.general=/Users/ali/Desktop/devl-data/trnltk/general 4 | app.data.folder.criticalSurface=/Users/ali/Desktop/personal-code/trnltk-java/data/src/main/resources -------------------------------------------------------------------------------- /data/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 21 | 4.0.0 22 | 23 | 24 | org.trnltk 25 | trnltk 26 | 1.0.3-SNAPSHOT 27 | 28 | 29 | data 30 | TRNLTK Data 31 | TRNLTK Data 32 | 33 | 34 | ${encoding} 35 | 36 | 37 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | **NOTICE** : Please make use of project being open. 
If you don't understand what a part is doing, please check its 2 | tests. We believe that is the best way of learning how to extend things. The naming convention for the tests is very easy. 3 | Just search for a class named "NameOfTheMysteryClass" + "Test" : e.g. "PhoneticsAnalyzer" -> "PhoneticsAnalyzerTest" 4 | 5 | ## Glossary ## 6 | * [Glossary](tutorial/glossary.md) 7 | 8 | ## Documentation ## 9 | * [Simple parsing](tutorial/simple_parsing.md) 10 | * [Advanced parsing](tutorial/advanced_parsing.md) 11 | * [Suffix graphs explained](tutorial/suffix_graphs_explained.md) 12 | * [Root finders explained](tutorial/root_finders_explained.md) 13 | * [Tokenization](tutorial/tokenization.md) 14 | * [Caching](tutorial/caching.md) 15 | * [Brute force](tutorial/brute_force.md) 16 | * [Logging](logging.md) 17 | 18 | ## Cookbook ## 19 | * [Spell check](cookbook/spell_check.md) 20 | * [Numeral to text conversion](cookbook/numeral_to_text.md) 21 | * [Sample corpus statistics I](cookbook/sample_corpus_stats_1.md) (root and suffix histograms) 22 | * [Sample corpus statistics II](cookbook/sample_corpus_stats_2.md) (how formal is a corpus) 23 | * [Suffix graph for old Turkish](cookbook/old_turkish_suffix_graph.md) 24 | * [Recognizing special roots](cookbook/custom_root_finder.md) (custom root finder) 25 | 26 | ## Resolving ambiguity ## 27 | TBA 28 | 29 | ## Release Notes ## 30 | * TRNLTK 1.0.2 : [Release notes](102.md) 31 | 32 | -------------------------------------------------------------------------------- /docs/cookbook/README.md: -------------------------------------------------------------------------------- 1 | ** See [documentation index](../README.md) -------------------------------------------------------------------------------- /docs/cookbook/custom_root_finder.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/cookbook/custom_root_finder.md 
-------------------------------------------------------------------------------- /docs/cookbook/numeral_to_text.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/cookbook/numeral_to_text.md -------------------------------------------------------------------------------- /docs/cookbook/old_turkish_suffix_graph.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/cookbook/old_turkish_suffix_graph.md -------------------------------------------------------------------------------- /docs/cookbook/sample_corpus_stats_1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/cookbook/sample_corpus_stats_1.md -------------------------------------------------------------------------------- /docs/cookbook/sample_corpus_stats_2.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/cookbook/sample_corpus_stats_2.md -------------------------------------------------------------------------------- /docs/cookbook/spell_check.md: -------------------------------------------------------------------------------- 1 | ## Problem ## 2 | Turkish is an agglutinative language. Thus it is problematic to do spell checking, since it is impossible to 3 | store all words and then check input against them. That is still a good option since storing 10 million words 4 | will cover almost all text. 5 | 6 | However for example on internet, unique words are much more than that. In that case, one must parse words on the fly. 
7 | 8 | ## Solution ## 9 | 10 | See [here](/core/src/doc/org/trnltk/cookbook/spellcheck/SpellChecker.java) 11 | 12 | ## Improvements ## 13 | One big improvement would be storing a large number of words in memory and doing the parsing whenever an input out of that 14 | list is received. 15 | 16 | This list must be chosen wisely. If the size is too big, then the system would need too much memory. 17 | If the number is too small, then the system would be slow since it has to do a lot of parsing. 18 | 19 | ## The right way ## 20 | The current solution finds unknown words, but it does not suggest correct words. 21 | 22 | A real world application would combine this approach with a keyboard centric approach. 23 | That means, if there is an unknown word, the system should suggest corrections based on the relevance in terms of keyboard 24 | layout. If you investigate spelling errors, they are caused by reasons like: 25 | * Mistyped letter 26 | * Missing letter 27 | * Extra letter 28 | 29 | The system should be able to detect these and suggest corrections in the order of distance to the given input. 
30 | 31 | For this solution, check spelling module of Zemberek3 at https://github.com/ahmetaa/zemberek-nlp 32 | 33 | -------------------------------------------------------------------------------- /docs/resources_102/rootFinders.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/resources_102/rootFinders.png -------------------------------------------------------------------------------- /docs/resources_102/suffixGraphHierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/resources_102/suffixGraphHierarchy.png -------------------------------------------------------------------------------- /docs/resources_102/z_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/resources_102/z_01.png -------------------------------------------------------------------------------- /docs/resources_102/z_02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/resources_102/z_02.png -------------------------------------------------------------------------------- /docs/resources_102/z_03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/resources_102/z_03.png -------------------------------------------------------------------------------- /docs/resources_102/z_04.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/resources_102/z_04.png -------------------------------------------------------------------------------- /docs/resources_102/z_05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/resources_102/z_05.png -------------------------------------------------------------------------------- /docs/resources_102/z_06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/resources_102/z_06.png -------------------------------------------------------------------------------- /docs/resources_102/z_07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/resources_102/z_07.png -------------------------------------------------------------------------------- /docs/tutorial/README.md: -------------------------------------------------------------------------------- 1 | ** See [documentation index](../README.md) -------------------------------------------------------------------------------- /docs/tutorial/brute_force.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/tutorial/brute_force.md -------------------------------------------------------------------------------- /docs/tutorial/caching.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/tutorial/caching.md -------------------------------------------------------------------------------- 
/docs/tutorial/glossary.md: -------------------------------------------------------------------------------- 1 | Some definitions 2 | ---------------- 3 | 26 | 29 |
ExampleTerm
gideceğiniSurface
gideceğini+NounSurface+SurfacePos
gidecekBody
gidecek+NounBody+BodyPos
gideceğBodySurface
gitmekLemma
gitmek+VerbLemma+LemmaPos=Lexeme
gitRoot
gidRootSurface
(y)AcAkSuffixForm
ecekSuffixFormBody
eceğSuffixFormSurface
30 | 31 |
TermValueValueValue
WordSurfacekitapçılığı
Stemskitapkitapçıkitapçılık
StemSurfaceskitapkitapçıkitapçılığ
Bodykitapçılık
BodySurfacekitapçılığ
32 | 33 | 34 | * Surface: Full word including the root and suffixes 35 | * Root : The root of a word; its atomic part. 36 | * Derivation : Deriving a new word from another word. 37 | * Inflection : Conjugating a word with a person agreement / possession / tense etc. 38 | * Suffix form : Form of a suffix. For example, suffix 'Progressive' has 2 suffix forms; '-iyor' and '-makta' 39 | * Body : Root + derivations. Doesn't include the inflections 40 | * POS (part of speech) : Verb, Noun, Adjective etc. 41 | * Inflectional suffix : A suffix that changes neither the body nor the POS of a surface 42 | * Derivational suffix : A suffix that changes the body and might change the POS of a surface 43 | * Morpheme : Elements of a surface; root and suffixes 44 | * Lemma : The root text that can be found in a dictionary 45 | * Lexeme : Lemma + POS of the lemma 46 | * Morphology : How a surface is constructed and how it can be decomposed into morphemes 47 | * Morphotactics : Rules for when a suffix can be applied. For example "Progressive suffix can only be applied to a 48 | Verb, and it can't be applied to a surface which has Progressive suffix already" 49 | * Orthographics : Rules of phonetics. For example rules for voicing (kitap+a --> kitaba), 50 | devoicing (kitap+cı --> kitapçı), vowel drop (omuz+u --> omzu), etc. 
-------------------------------------------------------------------------------- /docs/tutorial/root_finders_explained.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/tutorial/root_finders_explained.md -------------------------------------------------------------------------------- /docs/tutorial/suffix_graphs_explained.md: -------------------------------------------------------------------------------- 1 | decoration 2 | images for graphs 3 | plot graphs 4 | requirements while decorating : dependencies -------------------------------------------------------------------------------- /docs/tutorial/tokenization_resources/img01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/tutorial/tokenization_resources/img01.png -------------------------------------------------------------------------------- /docs/tutorial/tokenization_resources/img02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/tutorial/tokenization_resources/img02.png -------------------------------------------------------------------------------- /docs/tutorial/tokenization_resources/img03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/tutorial/tokenization_resources/img03.png -------------------------------------------------------------------------------- /docs/tutorial/tokenization_resources/img04.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/tutorial/tokenization_resources/img04.png -------------------------------------------------------------------------------- /docs/tutorial/tokenization_resources/img05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/docs/tutorial/tokenization_resources/img05.png -------------------------------------------------------------------------------- /web/src/main/java/org/trnltk/web/common/Constants.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.web.common; 2 | 3 | public class Constants { 4 | public static final String TRAINING_FILES_FOLDER_PATH = "D:\\devl\\personal\\trnltk-java\\web\\src\\main\\resources\\trainingSets"; 5 | 6 | public static final String TRAINING_FILE_EXTENSION = "trainingset"; 7 | } 8 | -------------------------------------------------------------------------------- /web/src/main/java/org/trnltk/web/criticalsurface/CriticalSurfaceTaggingData.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.web.criticalsurface; 2 | 3 | import org.trnltk.apps.criticalsurface.CriticalSurfaceEntry; 4 | 5 | import javax.faces.bean.ApplicationScoped; 6 | import javax.faces.bean.ManagedBean; 7 | import java.io.Serializable; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import java.util.Map; 11 | 12 | /** 13 | * @author Ali Ok (ali.ok@apache.org) 14 | */ 15 | @ManagedBean(name = "criticalSurfaceTaggingData") 16 | @ApplicationScoped 17 | public class CriticalSurfaceTaggingData implements Serializable { 18 | 19 | private Map> tokenizedSentencesOfFiles; 20 | private List criticalSurfaceEntries; 21 | 22 | public Map> getTokenizedSentencesOfFiles() { 23 | return tokenizedSentencesOfFiles; 24 | } 25 | 26 | public void 
setTokenizedSentencesOfFiles(Map> tokenizedSentencesOfFiles) { 27 | this.tokenizedSentencesOfFiles = tokenizedSentencesOfFiles; 28 | } 29 | 30 | public List getCriticalSurfaceEntries() { 31 | return criticalSurfaceEntries; 32 | } 33 | 34 | public void setCriticalSurfaceEntries(List criticalSurfaceEntries) { 35 | this.criticalSurfaceEntries = criticalSurfaceEntries; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /web/src/main/java/org/trnltk/web/criticalsurface/CriticalSurfaceTaggingProgressData.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.web.criticalsurface; 2 | 3 | import org.trnltk.apps.criticalsurface.CriticalSurfaceEntry; 4 | 5 | import javax.faces.bean.ApplicationScoped; 6 | import javax.faces.bean.ManagedBean; 7 | import java.io.Serializable; 8 | 9 | /** 10 | * @author Ali Ok (ali.ok@apache.org) 11 | */ 12 | @ManagedBean(name = "criticalSurfaceTaggingProgressData") 13 | @ApplicationScoped 14 | public class CriticalSurfaceTaggingProgressData implements Serializable { 15 | 16 | private int currentSurfaceIndex = -1; 17 | private int currentOccurrenceIndex = -1; 18 | private CriticalSurfaceEntry currentEntry = null; 19 | 20 | public boolean isAtTheEnd() { 21 | return currentEntry == null; 22 | } 23 | 24 | public int getCurrentSurfaceIndex() { 25 | return currentSurfaceIndex; 26 | } 27 | 28 | public int getCurrentOccurrenceIndex() { 29 | return currentOccurrenceIndex; 30 | } 31 | 32 | public CriticalSurfaceEntry getCurrentEntry() { 33 | return currentEntry; 34 | } 35 | 36 | public void setCurrentSurfaceIndex(int currentSurfaceIndex) { 37 | this.currentSurfaceIndex = currentSurfaceIndex; 38 | } 39 | 40 | public void setCurrentOccurrenceIndex(int currentOccurrenceIndex) { 41 | this.currentOccurrenceIndex = currentOccurrenceIndex; 42 | } 43 | 44 | public void setCurrentEntry(CriticalSurfaceEntry currentEntry) { 45 | this.currentEntry = currentEntry; 
46 | } 47 | } 48 | -------------------------------------------------------------------------------- /web/src/main/java/org/trnltk/web/criticalsurface/ParseResultWithSentencesContainer.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.web.criticalsurface; 2 | 3 | import org.apache.commons.lang3.tuple.Pair; 4 | 5 | import java.io.Serializable; 6 | import java.util.List; 7 | 8 | /** 9 | * @author Ali Ok (ali.ok@apache.org) 10 | */ 11 | public class ParseResultWithSentencesContainer implements Serializable { 12 | private final String parseResultStr; 13 | 14 | // list of Pair 15 | // Sentence : list of surfaces in a sentence 16 | // SurfaceIndexInSentence : index of the surface in the sentence. that means index of the surface that the parse result belongs to 17 | private final List> sentencesAndIndices; 18 | 19 | public ParseResultWithSentencesContainer(String parseResultStr, List> sentencesAndIndices) { 20 | this.parseResultStr = parseResultStr; 21 | this.sentencesAndIndices = sentencesAndIndices; 22 | } 23 | 24 | public String getParseResultStr() { 25 | return parseResultStr; 26 | } 27 | 28 | public List> getSentencesAndIndices() { 29 | return sentencesAndIndices; 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /web/src/main/java/org/trnltk/web/criticalsurface/SentenceContainer.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.web.criticalsurface; 2 | 3 | import org.trnltk.apps.criticalsurface.SentenceIdentifier; 4 | 5 | import java.io.Serializable; 6 | import java.util.List; 7 | 8 | /** 9 | * @author Ali Ok (ali.ok@apache.org) 10 | */ 11 | public class SentenceContainer implements Serializable { 12 | private final List surfaces; 13 | private final SentenceIdentifier sentenceIdentifier; 14 | 15 | public SentenceContainer(List surfaces, SentenceIdentifier sentenceIdentifier) { 16 | this.surfaces 
= surfaces; 17 | this.sentenceIdentifier = sentenceIdentifier; 18 | } 19 | 20 | public List getSurfaces() { 21 | return surfaces; 22 | } 23 | 24 | public SentenceIdentifier getSentenceIdentifier() { 25 | return sentenceIdentifier; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /web/src/main/java/org/trnltk/web/morphology/parser/SuffixGraphSelectionData.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.trnltk.web.morphology.parser; 18 | 19 | import org.trnltk.morphology.morphotactics.*; 20 | 21 | import javax.faces.bean.ManagedBean; 22 | import javax.faces.bean.SessionScoped; 23 | import java.io.Serializable; 24 | 25 | @ManagedBean(name = "suffixGraphSelectionData") 26 | @SessionScoped 27 | public class SuffixGraphSelectionData implements Serializable { 28 | private boolean includeNumeralGraph; 29 | private boolean includeProperNounGraph; 30 | private boolean includeCopulaGraph; 31 | 32 | 33 | public SuffixGraph getSelectedSuffixGraph() { 34 | SuffixGraph suffixGraph = new BasicSuffixGraph(); 35 | if (includeNumeralGraph) 36 | suffixGraph = new NumeralSuffixGraph(suffixGraph); 37 | if (includeProperNounGraph) 38 | suffixGraph = new ProperNounSuffixGraph(suffixGraph); 39 | if (includeCopulaGraph) 40 | suffixGraph = new CopulaSuffixGraph(suffixGraph); 41 | 42 | suffixGraph.initialize(); 43 | 44 | return suffixGraph; 45 | } 46 | 47 | public boolean isIncludeNumeralGraph() { 48 | return includeNumeralGraph; 49 | } 50 | 51 | public void setIncludeNumeralGraph(boolean includeNumeralGraph) { 52 | this.includeNumeralGraph = includeNumeralGraph; 53 | } 54 | 55 | public boolean isIncludeProperNounGraph() { 56 | return includeProperNounGraph; 57 | } 58 | 59 | public void setIncludeProperNounGraph(boolean includeProperNounGraph) { 60 | this.includeProperNounGraph = includeProperNounGraph; 61 | } 62 | 63 | public boolean isIncludeCopulaGraph() { 64 | return includeCopulaGraph; 65 | } 66 | 67 | public void setIncludeCopulaGraph(boolean includeCopulaGraph) { 68 | this.includeCopulaGraph = includeCopulaGraph; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /web/src/main/java/org/trnltk/web/training/TrainingFileData.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.web.training; 2 | 3 | import javax.faces.bean.ManagedBean; 4 | import 
javax.faces.bean.ViewScoped; 5 | import java.io.Serializable; 6 | 7 | @ManagedBean(name = "trainingFileData") 8 | @ViewScoped 9 | public class TrainingFileData implements Serializable { 10 | private String fileName; 11 | private String content; 12 | private boolean strictTokenization = true; 13 | 14 | public String getFileName() { 15 | return fileName; 16 | } 17 | 18 | public void setFileName(String fileName) { 19 | this.fileName = fileName; 20 | } 21 | 22 | public String getContent() { 23 | return content; 24 | } 25 | 26 | public void setContent(String content) { 27 | this.content = content; 28 | } 29 | 30 | public boolean isStrictTokenization() { 31 | return strictTokenization; 32 | } 33 | 34 | public void setStrictTokenization(boolean strictTokenization) { 35 | this.strictTokenization = strictTokenization; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /web/src/main/java/org/trnltk/web/training/TrainingSetCreatorBean.java: -------------------------------------------------------------------------------- 1 | package org.trnltk.web.training; 2 | 3 | public class TrainingSetCreatorBean { 4 | 5 | public void asd(){ 6 | // 1. provide text 7 | // 2. tokenize text 8 | // 3. write to file with BlockTypes : word blockType NOT_PARSED_YET 9 | // 4. parse : IMPORTANT - choose parser carefully. no results producing things like salak+Adj+Noun..+Verb+Cop+Pres 10 | // 5. write to file again! FLUSH button 11 | // 6. write SKIPPED as parse result for skipped 12 | // 7. 
introduce basic disambiguator and make use of it while training set creation process 13 | } 14 | 15 | 16 | 17 | } 18 | -------------------------------------------------------------------------------- /web/src/main/resources/commons-logging.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2013 Ali Ok (aliokATapacheDOTorg) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | org.apache.commons.logging.Log=org.apache.commons.logging.impl.Log4JLogger 18 | -------------------------------------------------------------------------------- /web/src/main/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /web/src/main/resources/trainingSets/sample.trainingset: -------------------------------------------------------------------------------- 1 | T elma WORD elma+Noun+Pnon+Nom 2 | T zirtirdim WORD SKIP 3 | T 1 NUMBER 1+Noun+Adj 4 | 2.'ye NUMBER+APOS+PERIOD+WORD -------------------------------------------------------------------------------- /web/src/main/webapp/WEB-INF/faces-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 22 | 23 | 24 | 
-------------------------------------------------------------------------------- /web/src/main/webapp/basetemplate.xhtml: -------------------------------------------------------------------------------- 1 | 16 | 17 | 22 | 23 | 24 | TRNLTK Web 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |

33 | 43 | 53 | 54 |
55 | 56 |
57 |
58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /web/src/main/webapp/resources/components/criticalsurface/sentence.xhtml: -------------------------------------------------------------------------------- 1 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | #{cc.attrs.sentenceIdentifier.toString()} 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /web/src/main/webapp/resources/img/ajaxloading.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/web/src/main/webapp/resources/img/ajaxloading.gif -------------------------------------------------------------------------------- /web/src/main/webapp/resources/style.css: -------------------------------------------------------------------------------- 1 | /*dark background starts*/ 2 | body{ 3 | background: #6e6e6e; 4 | } 5 | 6 | body, legend{ 7 | color: white; 8 | } 9 | 10 | .table tbody tr.info td{ 11 | background: #232941; 12 | } 13 | 14 | .table tbody tr.success td{ 15 | background: #575e53; 16 | } 17 | /*dark background ends*/ 18 | 19 | .criticalSurfaceSentence { 20 | 21 | } 22 | 23 | .sentenceSurface, .criticalSurface { 24 | display: inline-block; 25 | } 26 | 27 | .criticalSurface { 28 | color: #FFA300; 29 | } 30 | 31 | @media (max-width: 1199px) { 32 | *, .label, .badge { 33 | font-size: xx-small; 34 | } 35 | 36 | .btn, .navbar .nav > li > a { 37 | padding: 0; 38 | } 39 | 40 | .navbar .nav > li > a { 41 | padding-left: 5px; 42 | } 43 | 44 | fieldset { 45 | min-width: 0; 46 | } 47 | 48 | .sentenceSurface, .criticalSurface { 49 | margin-right: 2px; 50 | } 51 | 52 | h1, h2, h3, h4, h5, legend { 53 | font-size: smaller; 54 | } 55 | } 56 | 57 | @media (min-width: 1200px) { 58 | .sentenceSurface, .criticalSurface { 59 | font-size: larger; 60 | margin-right: 4px; 61 | } 62 | 
} 63 | 64 | 65 | -------------------------------------------------------------------------------- /web/src/main/webapp/resources/thirdparty/img/glyphicons-halflings-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/web/src/main/webapp/resources/thirdparty/img/glyphicons-halflings-white.png -------------------------------------------------------------------------------- /web/src/main/webapp/resources/thirdparty/img/glyphicons-halflings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliok/trnltk-java/6037051467b91de10faa2d442f37ca888115dd21/web/src/main/webapp/resources/thirdparty/img/glyphicons-halflings.png -------------------------------------------------------------------------------- /web/src/test/resources/trainingSetParserExpectation.txt: -------------------------------------------------------------------------------- 1 | W elma 2 | - elma+Noun+Pnon+A3sg 3 | - elma+Noun+Pnon+A3sg --------------------------------------------------------------------------------