├── solr ├── lib │ ├── zemberek-tr-2.1.3.jar │ └── zemberek-cekirdek-2.1.3.jar ├── zoo.cfg └── solr.xml ├── .travis.yml ├── .gitignore ├── src ├── main │ └── java │ │ └── org │ │ └── apache │ │ └── lucene │ │ └── analysis │ │ └── tr │ │ ├── util │ │ ├── Piper.java │ │ └── PatternTableFactory.java │ │ ├── DilbazStemFilterFactory.java │ │ ├── TurkishDeASCIIfyFilterFactory.java │ │ ├── TRMorphStemFilterFactory.java │ │ ├── TRMorphStemFilter.java │ │ ├── Zemberek2DeASCIIfyFilterFactory.java │ │ ├── Zemberek3StemFilter.java │ │ ├── Zemberek3StemFilterFactory.java │ │ ├── Zemberek2StemFilterFactory.java │ │ ├── MyTurkishMorphology.java │ │ └── TurkishDeASCIIfyFilter.java └── test │ └── java │ └── org │ └── apache │ └── lucene │ └── tr │ ├── TestTurkishDeASCIIfyFilter.java │ └── TestZemberek3StemFilter.java ├── pom.xml ├── README.md └── LICENSE /solr/lib/zemberek-tr-2.1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iorixxx/lucene-solr-analysis-turkish/HEAD/solr/lib/zemberek-tr-2.1.3.jar -------------------------------------------------------------------------------- /solr/lib/zemberek-cekirdek-2.1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iorixxx/lucene-solr-analysis-turkish/HEAD/solr/lib/zemberek-cekirdek-2.1.3.jar -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | 3 | sudo: false 4 | 5 | jdk: 6 | - oraclejdk8 7 | 8 | addons: 9 | apt: 10 | packages: 11 | - oracle-java8-installer 12 | 13 | 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Mobile Tools for Java (J2ME) 2 | .mtj.tmp/ 3 | 4 | # Package Files # 5 | *.jar 6 | *.war 7 | *.ear 8 | 9 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 10 | hs_err_pid* 11 | 12 | .idea 13 | target 14 | *.iml 15 | 16 | # Auto generated Java files 17 | src/main/java/org/apache/lucene/analysis/tr/util/Map*.java 18 | 19 | .DS_Store -------------------------------------------------------------------------------- /solr/zoo.cfg: -------------------------------------------------------------------------------- 1 | # The number of milliseconds of each tick 2 | tickTime=2000 3 | # The number of ticks that the initial 4 | # synchronization phase can take 5 | initLimit=10 6 | # The number of ticks that can pass between 7 | # sending a request and getting an acknowledgement 8 | syncLimit=5 9 | 10 | # the directory where the snapshot is stored. 
11 | # dataDir=/opt/zookeeper/data 12 | # NOTE: Solr defaults the dataDir to /zoo_data 13 | 14 | # the port at which the clients will connect 15 | # clientPort=2181 16 | # NOTE: Solr sets this based on zkRun / zkHost params 17 | 18 | -------------------------------------------------------------------------------- /solr/solr.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 28 | 29 | 30 | 31 | 32 | ${host:} 33 | ${jetty.port:8983} 34 | ${hostContext:solr} 35 | ${zkClientTimeout:30000} 36 | ${genericCoreNodeNames:true} 37 | 38 | 39 | 41 | ${socketTimeout:0} 42 | ${connTimeout:0} 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/tr/util/Piper.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.tr.util; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.io.InputStream; 21 | import java.io.OutputStream; 22 | 23 | /** 24 | * Modified from Piping Between Processes 25 | */ 26 | public final class Piper implements java.lang.Runnable { 27 | 28 | private final InputStream input; 29 | 30 | private final OutputStream output; 31 | 32 | public Piper(InputStream input, OutputStream output) { 33 | this.input = input; 34 | this.output = output; 35 | } 36 | 37 | @Override 38 | public void run() { 39 | try (InputStream input = this.input; OutputStream output = this.output) { 40 | // Create 512 bytes buffer 41 | byte[] b = new byte[512]; 42 | int read = 1; 43 | // As long as data is read; -1 means EOF 44 | while (read > -1) { 45 | // Read bytes into buffer 46 | read = input.read(b, 0, b.length); 47 | if (read > -1) { 48 | // Write bytes to output 49 | output.write(b, 0, read); 50 | } 51 | } 52 | } catch (Exception e) { 53 | // Something happened while reading or writing streams; pipe is broken 54 | throw new RuntimeException("Broken pipe", e); 55 | } 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /src/test/java/org/apache/lucene/tr/TestTurkishDeASCIIfyFilter.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.tr; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; 21 | import org.apache.lucene.analysis.TokenStream; 22 | import org.apache.lucene.analysis.tr.TurkishDeASCIIfyFilter; 23 | import org.junit.Test; 24 | 25 | /** 26 | * Simple tests to ensure Turkish deASCIIfy filter factory is working. 27 | */ 28 | public class TestTurkishDeASCIIfyFilter extends BaseTokenStreamTestCase { 29 | 30 | @Test 31 | public void testDeAscii2() throws Exception { 32 | TokenStream stream = whitespaceMockTokenizer("tatlises akgunduz sakip cernobil baslattigi dayanikliklarini"); 33 | stream = new TurkishDeASCIIfyFilter(stream, false); 34 | assertTokenStreamContents(stream, new String[]{"tatlıses", "akgündüz", "sakıp", "çernobil", "başlattığı", "dayanıklıklarını"}); 35 | } 36 | 37 | @Test 38 | public void testDeAscii() throws Exception { 39 | TokenStream stream = whitespaceMockTokenizer("kus fadil akgunduz dogalgaz ahmet"); 40 | stream = new TurkishDeASCIIfyFilter(stream, false); 41 | assertTokenStreamContents(stream, new String[]{"kuş", "fadıl", "akgündüz", "doğalgaz", "ahmet"}); 42 | } 43 | 44 | @Test 45 | public void testPreserveOriginal() throws Exception { 46 | TokenStream stream = whitespaceMockTokenizer("kus fadil akgunduz dogalgaz ahmet izmir"); 47 | stream = new TurkishDeASCIIfyFilter(stream, true); 48 | assertTokenStreamContents(stream, new String[]{ 49 | "kuş", "kus", 50 | "fadıl", "fadil", 51 | "akgündüz", "akgunduz", 52 | "doğalgaz", "dogalgaz", 53 | "ahmet", 54 | "izmir" 55 | }); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/tr/DilbazStemFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.tr; 2 | 3 | import MorphologicalAnalysis.FsmMorphologicalAnalyzer; 4 | import MorphologicalAnalysis.FsmParse; 5 | import MorphologicalAnalysis.FsmParseList; 6 | import MorphologicalDisambiguation.LongestRootFirstDisambiguation; 7 | import MorphologicalDisambiguation.MorphologicalDisambiguator; 8 | 9 | import java.util.List; 10 | import java.util.Locale; 11 | 12 | public class DilbazStemFilterFactory { 13 | static FsmMorphologicalAnalyzer fsm = new FsmMorphologicalAnalyzer(); 14 | 15 | public static void main(String[] args) { 16 | 17 | String a = "0.25 4p.05 4p.x kuş asisi ortaklar çekişme masalı İCARETİN DE ARTMASI BEKLENİYOR\n" + 18 | "Savinykh, Ege Bölgesi Sanayi Odası'nda (EBSO) düzenlenen \"Belarus Türkiye Yatırım ve İşbirliği Olanakları Semineri\"nde yaptığı konuşmada, \" 2 Haziran'dan itibaren Türk halkı vizesiz olarak Belarus'a gidip gelebilecek. İki ülke arasındaki ticaret bu anlaşma ile daha da artacak\" dedi. 
Türkiye ile Belarus arasında ticari, kültürel ve sosyal ilişkilerin gelişmesini arzu ettiklerini kaydeden Andrei Savinykh, ülkesinin Kırgızistan ve Kazakistan ile Gümrük Birliği anlaşması bulunduğunu, önümüzdeki kuku birliğ"; 19 | 20 | a = a.toLowerCase(Locale.forLanguageTag("tr")); 21 | 22 | for (String s : a.split("\\s+")) { 23 | parse(s); 24 | } 25 | } 26 | 27 | static void parse(String word) { 28 | 29 | FsmParseList fsmParseList = fsm.morphologicalAnalysis(word); 30 | 31 | System.out.println("found " + fsmParseList.size() + " many solutions for " + word); 32 | 33 | if (fsmParseList.size() == 0) return; 34 | 35 | 36 | System.out.println("longest " + fsmParseList.getParseWithLongestRootWord().getWord().getName() + " lemma " + fsmParseList.getParseWithLongestRootWord().getLastLemma()); 37 | 38 | for (int i = 0; i < fsmParseList.size(); i++) { 39 | System.out.println(fsmParseList.getFsmParse(i).transitionList()); 40 | } 41 | MorphologicalDisambiguator morphologicalDisambiguator = new LongestRootFirstDisambiguation(); 42 | 43 | List dis = morphologicalDisambiguator.disambiguate(new FsmParseList[]{fsmParseList}); 44 | System.out.println("====disambiguator found " + dis.size() + " many candidates"); 45 | 46 | for (FsmParse parse : dis) { 47 | System.out.println(parse.transitionList()); 48 | System.out.println("stem: " + parse.getWord().getName()); 49 | System.out.println("lemma: " + parse.getLastLemma()); 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/tr/TurkishDeASCIIfyFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.tr; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import org.apache.lucene.analysis.TokenStream; 21 | import org.apache.lucene.analysis.TokenFilterFactory; 22 | 23 | import java.util.Map; 24 | 25 | /** 26 | * Factory for {@link TurkishDeASCIIfyFilter}. 27 | *
28 |  * <fieldType name="text_tr_deascii" class="solr.TextField" positionIncrementGap="100">
29 |  * <analyzer type="index">
30 |  * <tokenizer class="solr.StandardTokenizerFactory"/>
31 |  * <filter class="solr.ApostropheFilterFactory"/>
32 |  * <filter class="solr.TurkishLowerCaseFilterFactory"/>
33 |  * <filter class="solr.Zemberek3StemFilterFactory"/>
34 |  * </analyzer>
35 |  * <analyzer type="query">
36 |  * <tokenizer class="solr.StandardTokenizerFactory"/>
37 |  * <filter class="solr.ApostropheFilterFactory"/>
38 |  * <filter class="solr.TurkishLowerCaseFilterFactory"/>
39 |  * <filter class="solr.TurkishDeASCIIfyFilterFactory" preserveOriginal="true"/>
40 |  * <filter class="solr.Zemberek3StemFilterFactory"/>
41 |  * </analyzer>
42 |  * </fieldType>
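 *
 * A minimal programmatic sketch, modelled on the CustomAnalyzer usage in this project's tests
 * (the chain shown is illustrative rather than a prescribed setup; "standard" and
 * "turkishlowercase" are the SPI names used in the tests, and preserveOriginal defaults to false):
 * Analyzer analyzer = CustomAnalyzer.builder()
 *     .withTokenizer("standard")
 *     .addTokenFilter("turkishlowercase")
 *     .addTokenFilter(TurkishDeASCIIfyFilterFactory.class, "preserveOriginal", "true")
 *     .build();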
43 | */ 44 | public class TurkishDeASCIIfyFilterFactory extends TokenFilterFactory { 45 | 46 | private final boolean preserveOriginal; 47 | 48 | /** 49 | * Creates a new TurkishDeASCIIfyFilterFactory 50 | */ 51 | public TurkishDeASCIIfyFilterFactory(Map args) { 52 | super(args); 53 | preserveOriginal = getBoolean(args, "preserveOriginal", false); 54 | if (!args.isEmpty()) { 55 | throw new IllegalArgumentException("Unknown parameters: " + args); 56 | } 57 | } 58 | 59 | @Override 60 | public TurkishDeASCIIfyFilter create(TokenStream input) { 61 | return new TurkishDeASCIIfyFilter(input, preserveOriginal); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/tr/TRMorphStemFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.tr; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import org.apache.lucene.analysis.TokenFilterFactory; 21 | import org.apache.lucene.analysis.TokenStream; 22 | import org.apache.lucene.util.ResourceLoader; 23 | import org.apache.lucene.util.ResourceLoaderAware; 24 | 25 | import java.io.File; 26 | import java.io.IOException; 27 | import java.util.Map; 28 | 29 | /** 30 | * Factory for {@link TRMorphStemFilterFactory}. 31 | *
32 |  * <fieldType name="text_tr_morph" class="solr.TextField" positionIncrementGap="100">
33 |  * <analyzer>
34 |  * <tokenizer class="solr.StandardTokenizerFactory"/>
35 |  * <filter class="solr.ApostropheFilterFactory"/>
36 |  * <filter class="solr.TurkishLowerCaseFilterFactory"/>
37 |  * <filter class="org.apache.lucene.analysis.tr.TRMorphStemFilterFactory" lookup="/Applications/foma/flookup" fst="/Volumes/datadisk/Desktop/TRmorph-master/stem.fst"/>
38 |  * </analyzer>
39 |  * </fieldType>
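 *
 * A direct construction sketch ("input" stands for an upstream TokenStream and the paths are the
 * illustrative ones from the example above). The factory simply concatenates lookup and fst into
 * the command line that {@link TRMorphStemFilter} executes for every token, with "max" as the
 * default aggregation strategy:
 * TokenStream stemmed = new TRMorphStemFilter(input,
 *     "/Applications/foma/flookup /Volumes/datadisk/Desktop/TRmorph-master/stem.fst", "max");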
40 | */ 41 | public class TRMorphStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { 42 | 43 | private final String strategy; 44 | private final String lookup_fst; 45 | 46 | 47 | public TRMorphStemFilterFactory(Map args) { 48 | super(args); 49 | 50 | final String lookup; 51 | final String fst; 52 | 53 | strategy = get(args, "strategy", "max"); 54 | lookup = require(args, "lookup"); 55 | fst = require(args, "fst"); 56 | 57 | 58 | if (!args.isEmpty()) 59 | throw new IllegalArgumentException("Unknown parameters: " + args); 60 | 61 | if (!"min".equals(strategy) && !"max".equals(strategy)) 62 | throw new IllegalArgumentException("unknown strategy " + strategy); 63 | 64 | if (lookup != null) { 65 | File f = new File(lookup); 66 | if (!f.isAbsolute()) { 67 | throw new IllegalArgumentException("AbsolutePath must be provided for lookup executable: " + lookup); 68 | } 69 | if (!(f.isFile() && f.canRead())) { 70 | throw new IllegalArgumentException("Cannot read lookup executable: " + lookup); 71 | } 72 | } 73 | 74 | if (fst != null) { 75 | File f = new File(fst); 76 | if (!f.isAbsolute()) { 77 | throw new IllegalArgumentException("AbsolutePath must be provided for fst: " + fst); 78 | } 79 | if (!(f.isFile() && f.canRead())) { 80 | throw new IllegalArgumentException("Cannot read fst: " + fst); 81 | } 82 | } 83 | 84 | lookup_fst = lookup + " " + fst; 85 | } 86 | 87 | @Override 88 | public void inform(ResourceLoader loader) throws IOException { 89 | } 90 | 91 | @Override 92 | public TokenStream create(TokenStream input) { 93 | return new TRMorphStemFilter(input, lookup_fst, strategy); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/test/java/org/apache/lucene/tr/TestZemberek3StemFilter.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.tr; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; 5 | import org.apache.lucene.analysis.TokenStream; 6 | import org.apache.lucene.analysis.custom.CustomAnalyzer; 7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 8 | import org.apache.lucene.analysis.tr.MyTurkishMorphology; 9 | import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; 10 | import org.apache.lucene.analysis.tr.Zemberek3StemFilter; 11 | import org.apache.lucene.analysis.tr.Zemberek3StemFilterFactory; 12 | import org.junit.Test; 13 | 14 | import java.io.IOException; 15 | import java.io.StringReader; 16 | import java.util.ArrayList; 17 | import java.util.List; 18 | 19 | public class TestZemberek3StemFilter extends BaseTokenStreamTestCase { 20 | 21 | private static final MyTurkishMorphology morphology = MyTurkishMorphology.createWithDefaults(); 22 | 23 | @Test 24 | public void testSomeWords() throws Exception { 25 | TokenStream stream = whitespaceMockTokenizer("kuş gribi aşısı ortaklar çekişme masalı TİCARETİN DE ARTMASI BEKLENİYOR"); 26 | stream = new Zemberek3StemFilter(stream, morphology, "maxLength"); 27 | assertTokenStreamContents(stream, new String[]{"kuş", "grip", "aşı", "ortak", "çekişme", "masal", "ticaret", "de", "artma", "beklen"}); 28 | } 29 | 30 | @Test 31 | public void testUnrecognizedWords() throws Exception { 32 | TokenStream stream = whitespaceMockTokenizer("kuku euro"); 33 | stream = new Zemberek3StemFilter(stream, morphology, "maxLength"); 34 | assertTokenStreamContents(stream, new String[]{"kuku", "euro"}); 35 | } 36 | 37 | @Test 38 | public void 
test4SP() throws Exception { 39 | 40 | Analyzer analyzer = CustomAnalyzer.builder() 41 | .withTokenizer("standard") 42 | .addTokenFilter("turkishlowercase") 43 | .addTokenFilter(Zemberek3StemFilterFactory.class) 44 | .build(); 45 | 46 | System.out.println(getAnalyzedTokens("4g.x", analyzer)); 47 | System.out.println(getAnalyzedTokens("0.25", analyzer)); 48 | System.out.println(getAnalyzedTokens(".", analyzer)); 49 | System.out.println(getAnalyzedTokens("bulun.duğunu", analyzer)); 50 | assertTrue(getAnalyzedTokens(".", analyzer).isEmpty()); 51 | 52 | identity("4g.x"); 53 | identity("0.25"); 54 | identity("."); 55 | 56 | TokenStream stream = whitespaceMockTokenizer("4S.P"); 57 | stream = new TurkishLowerCaseFilter(stream); 58 | stream = new Zemberek3StemFilter(stream, morphology, "maxLength"); 59 | assertTokenStreamContents(stream, new String[]{"4s.p"}); 60 | } 61 | 62 | private void identity(String word) throws Exception { 63 | TokenStream stream = whitespaceMockTokenizer(word); 64 | stream = new TurkishLowerCaseFilter(stream); 65 | stream = new Zemberek3StemFilter(stream, morphology, "maxLength"); 66 | assertTokenStreamContents(stream, new String[]{word}); 67 | } 68 | 69 | /** 70 | * Modified from : http://lucene.apache.org/core/4_10_2/core/org/apache/lucene/analysis/package-summary.html 71 | */ 72 | public static List getAnalyzedTokens(String text, Analyzer analyzer) { 73 | 74 | final List list = new ArrayList<>(); 75 | try (TokenStream ts = analyzer.tokenStream("FIELD", new StringReader(text))) { 76 | 77 | final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); 78 | ts.reset(); // Resets this stream to the beginning. (Required) 79 | while (ts.incrementToken()) 80 | list.add(termAtt.toString()); 81 | 82 | ts.end(); // Perform end-of-stream operations, e.g. set the final offset. 83 | } catch (IOException ioe) { 84 | throw new RuntimeException("happened during string analysis", ioe); 85 | } 86 | return list; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/tr/TRMorphStemFilter.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.tr; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | import org.apache.lucene.analysis.TokenFilter; 21 | import org.apache.lucene.analysis.TokenStream; 22 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 23 | import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; 24 | import org.apache.lucene.analysis.tr.util.Piper; 25 | import org.slf4j.Logger; 26 | import org.slf4j.LoggerFactory; 27 | 28 | import java.io.BufferedReader; 29 | import java.io.ByteArrayInputStream; 30 | import java.io.IOException; 31 | import java.nio.charset.StandardCharsets; 32 | import java.util.ArrayList; 33 | import java.util.List; 34 | import java.util.TreeSet; 35 | 36 | /** 37 | * Stemmer based on TRmorph 38 | */ 39 | public final class TRMorphStemFilter extends TokenFilter { 40 | 41 | private static final Logger log = LoggerFactory.getLogger(TRMorphStemFilter.class); 42 | 43 | private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); 44 | private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); 45 | 46 | private final String aggregation; 47 | private final String lookup_fst; 48 | 49 | public TRMorphStemFilter(TokenStream input, String lookup_fst, String aggregation) { 50 | super(input); 51 | this.lookup_fst = lookup_fst; 52 | this.aggregation = aggregation; 53 | } 54 | 55 | @Override 56 | public boolean incrementToken() throws IOException { 57 | 58 | if (!input.incrementToken()) return false; 59 | if (keywordAttribute.isKeyword()) return true; 60 | 61 | /* 62 | * copied from {@link org.apache.lucene.analysis.br.BrazilianStemFilter#incrementToken} 63 | */ 64 | final String term = termAttribute.toString(); 65 | final String s = stem(term); 66 | // If not stemmed, don't waste the time adjusting the token. 67 | if ((s != null) && !s.equals(term)) 68 | termAttribute.setEmpty().append(s); 69 | 70 | return true; 71 | } 72 | 73 | private String stem(String word) throws IOException { 74 | 75 | List<String> parses = parse(word); 76 | 77 | TreeSet<String> set = new TreeSet<>(); 78 | 79 | for (String parse : parses) { 80 | String[] parts = parse.split("\\s+"); 81 | if (parts.length < 2) { // need both the surface form and the analysis field 82 | log.warn("unexpected line " + parse); 83 | continue; 84 | } 85 | 86 | String stem = parts[1].trim(); 87 | 88 | int i = stem.indexOf("<"); 89 | 90 | if (i == -1) { 91 | if (stem.contains("+?")) 92 | return word; 93 | else { 94 | log.warn("unexpected stem " + stem); 95 | continue; 96 | } 97 | } 98 | 99 | set.add(stem.substring(0, i)); 100 | } 101 | 102 | if (set.size() == 1) return set.first(); 103 | 104 | switch (aggregation) { 105 | case "max": 106 | return set.pollLast(); 107 | case "min": 108 | return set.pollFirst(); 109 | default: 110 | throw new RuntimeException("unknown strategy " + aggregation); 111 | } 112 | } 113 | 114 | private List<String> parse(String word) throws IOException { 115 | List<String> list = new ArrayList<>(); 116 | java.lang.Runtime rt = java.lang.Runtime.getRuntime(); 117 | java.lang.Process p2 = rt.exec(lookup_fst); 118 | Piper pipe = new Piper(new ByteArrayInputStream(word.getBytes(StandardCharsets.UTF_8)), p2.getOutputStream()); 119 | new Thread(pipe).start(); 120 | try { 121 | p2.waitFor(); 122 | } catch (InterruptedException ie) { 123 | return list; 124 | } 125 | 126 | try (BufferedReader r = new BufferedReader(new java.io.InputStreamReader(p2.getInputStream()))) { 127 | String s; 128 | while ((s = r.readLine()) != null) { 129 | 130 | s = s.trim(); 131 | if (s.length() == 0) continue; 132 | 133 | if (s.startsWith(word)) 134 | list.add(s); 135 | else 136 | log.warn("unexpected line 
from word " + word + " " + s); 137 | } 138 | } 139 | return list; 140 | 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/tr/Zemberek2DeASCIIfyFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.tr; 2 | 3 | import net.zemberek.erisim.Zemberek; 4 | import net.zemberek.islemler.KelimeKokFrekansKiyaslayici; 5 | import net.zemberek.islemler.cozumleme.CozumlemeSeviyesi; 6 | import net.zemberek.tr.yapi.TurkiyeTurkcesi; 7 | import net.zemberek.yapi.Kelime; 8 | import org.apache.lucene.analysis.TokenFilter; 9 | import org.apache.lucene.analysis.TokenStream; 10 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 11 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 12 | import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; 13 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 14 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 15 | import org.apache.lucene.analysis.TokenFilterFactory; 16 | import org.apache.lucene.util.AttributeSource; 17 | 18 | import java.io.IOException; 19 | import java.io.StringReader; 20 | import java.util.ArrayList; 21 | import java.util.Arrays; 22 | import java.util.HashMap; 23 | import java.util.Map; 24 | 25 | /** 26 | * Factory for {@link Zemberek2DeASCIIfyFilter}. 27 | */ 28 | public class Zemberek2DeASCIIfyFilterFactory extends TokenFilterFactory { 29 | 30 | private final Zemberek zemberek = new Zemberek(new TurkiyeTurkcesi()); 31 | static final String DEASCII_TOKEN_TYPE = ""; 32 | 33 | public Zemberek2DeASCIIfyFilterFactory(Map args) { 34 | super(args); 35 | if (!args.isEmpty()) { 36 | throw new IllegalArgumentException("Unknown parameters: " + args); 37 | } 38 | } 39 | 40 | 41 | @Override 42 | public TokenStream create(TokenStream input) { 43 | return new Zemberek2DeASCIIfyFilter(input); 44 | } 45 | 46 | /** 47 | * DeASCIIfier based on Zemberek2 48 | * Modified from 49 | * org.apache.lucene.wordnet.SynonymTokenFilter 50 | */ 51 | private final class Zemberek2DeASCIIfyFilter extends TokenFilter { 52 | 53 | private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); 54 | private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); 55 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); 56 | private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); 57 | 58 | private String[] stack = null; 59 | private int index = 0; 60 | private AttributeSource.State current = null; 61 | private int todo = 0; 62 | 63 | public Zemberek2DeASCIIfyFilter(TokenStream input) { 64 | super(input); 65 | } 66 | 67 | @Override 68 | public boolean incrementToken() throws IOException { 69 | 70 | while (todo > 0 && index < stack.length) { // pop from stack 71 | if (createToken(stack[index++], current)) { 72 | todo--; 73 | return true; 74 | } 75 | } 76 | 77 | if (!input.incrementToken()) return false; 78 | if (keywordAttribute.isKeyword()) return true; 79 | 80 | // stack = zemberek.asciidenTurkceye(termAttribute.toString()); 81 | 82 | Kelime[] kelimeler = zemberek.asciiToleransliCozumleyici().cozumle(termAttribute.toString(), CozumlemeSeviyesi.TUM_KOKLER); 83 | Arrays.sort(kelimeler, new KelimeKokFrekansKiyaslayici()); 84 | 85 | ArrayList olusumlar = new ArrayList<>(kelimeler.length); 86 | 87 | for (Kelime kelime : 
kelimeler) { 88 | String olusum = kelime.icerikStr(); 89 | if (!olusumlar.contains(olusum)) 90 | olusumlar.add(olusum); 91 | } 92 | 93 | olusumlar.remove(termAttribute.toString()); 94 | stack = olusumlar.toArray(new String[olusumlar.size()]); 95 | 96 | index = 0; 97 | current = captureState(); 98 | todo = stack.length; 99 | return true; 100 | } 101 | 102 | private boolean createToken(String synonym, AttributeSource.State current) { 103 | restoreState(current); 104 | termAttribute.setEmpty().append(synonym); 105 | typeAtt.setType(DEASCII_TOKEN_TYPE); 106 | posIncrAtt.setPositionIncrement(0); 107 | return true; 108 | } 109 | 110 | @Override 111 | public void reset() throws IOException { 112 | super.reset(); 113 | stack = null; 114 | index = 0; 115 | current = null; 116 | todo = 0; 117 | } 118 | } 119 | 120 | 121 | public static void main(String[] args) throws IOException { 122 | 123 | StringReader reader = new StringReader("kus asisi ortaklar çekişme masali"); 124 | 125 | Map map = new HashMap<>(); 126 | 127 | 128 | Zemberek2DeASCIIfyFilterFactory factory = new Zemberek2DeASCIIfyFilterFactory(map); 129 | WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer(); 130 | whitespaceTokenizer.setReader(reader); 131 | 132 | TokenStream stream = factory.create(whitespaceTokenizer); 133 | 134 | CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class); 135 | 136 | stream.reset(); 137 | while (stream.incrementToken()) { 138 | 139 | String term = termAttribute.toString(); 140 | System.out.println(term); 141 | } 142 | stream.end(); 143 | reader.close(); 144 | } 145 | } 146 | 147 | 148 | -------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/tr/Zemberek3StemFilter.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.tr; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | import org.apache.lucene.analysis.TokenFilter; 21 | import org.apache.lucene.analysis.TokenStream; 22 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 23 | import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; 24 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 25 | import zemberek.morphology.analysis.SingleAnalysis; 26 | import zemberek.morphology.analysis.WordAnalysis; 27 | 28 | import java.io.IOException; 29 | import java.util.*; 30 | import java.util.stream.Collectors; 31 | 32 | /** 33 | * Stemmer based on Zemberek3 34 | */ 35 | public final class Zemberek3StemFilter extends TokenFilter { 36 | 37 | private static final HashSet skipTypes = new HashSet<>(Arrays.asList("", "", "", "", "", "", "")); 38 | private final MyTurkishMorphology morphology; 39 | private final String aggregation; 40 | 41 | private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); 42 | private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); 43 | private final TypeAttribute typeAtt = this.addAttribute(TypeAttribute.class); 44 | 45 | public Zemberek3StemFilter(TokenStream input, MyTurkishMorphology morphology, String aggregation) { 46 | super(input); 47 | this.morphology = morphology; 48 | this.aggregation = aggregation; 49 | } 50 | 51 | private static List selectMorphemes(WordAnalysis results, String strategy) { 52 | 53 | // if 0 or 1 54 | if (results.analysisCount() < 2) return results.stream().collect(Collectors.toList()); 55 | 56 | switch (strategy) { 57 | case "all": 58 | return results.stream().collect(Collectors.toList()); 59 | case "maxMorpheme": 60 | final int max = results.stream().map(morphParse -> morphParse.getMorphemeDataList().size()).max(Comparator.naturalOrder()).get(); 61 | return results.stream().filter(parse -> parse.getMorphemeDataList().size() == max).collect(Collectors.toList()); 62 | case "minMorpheme": 63 | final int min = results.stream().map(morphParse -> morphParse.getMorphemeDataList().size()).min(Comparator.naturalOrder()).get(); 64 | return results.stream().filter(parse -> parse.getMorphemeDataList().size() == min).collect(Collectors.toList()); 65 | default: 66 | throw new RuntimeException("unknown strategy " + strategy); 67 | 68 | } 69 | } 70 | 71 | private static List morphToString(List results, String methodName) { 72 | 73 | List list = new ArrayList<>(); 74 | 75 | switch (methodName) { 76 | case "stems": 77 | for (SingleAnalysis result : results) 78 | list.addAll(result.getStems()); 79 | return list; 80 | case "lemmas": 81 | for (SingleAnalysis result : results) { 82 | if (result.isUnknown()) { 83 | System.out.println("unknown"); 84 | list.addAll(result.getStems()); 85 | } else 86 | list.addAll(result.getLemmas()); 87 | } 88 | return list; 89 | default: 90 | throw new RuntimeException("unknown method name " + methodName); 91 | } 92 | 93 | 94 | } 95 | 96 | static String stem(WordAnalysis results, String aggregation) { 97 | 98 | List alternatives = selectMorphemes(results, "minMorpheme"); 99 | 100 | List candidates = morphToString(alternatives, "lemmas"); 101 | 102 | switch (aggregation) { 103 | case "maxLength": 104 | return Collections.max(candidates, Comparator.comparing(String::length)); 105 | case "minLength": 106 | return Collections.min(candidates, Comparator.comparing(String::length)); 107 | default: 108 | throw new RuntimeException("unknown strategy " + aggregation); 109 | } 110 | } 111 | 112 | @Override 113 | public boolean incrementToken() 
throws IOException { 114 | 115 | if (!input.incrementToken()) return false; 116 | if (keywordAttribute.isKeyword()) return true; 117 | if (skipTypes.contains(typeAtt.type())) return true; 118 | 119 | /* 120 | * copied from {@link org.apache.lucene.analysis.br.BrazilianStemFilter#incrementToken} 121 | */ 122 | final String word = termAttribute.toString(); 123 | 124 | final WordAnalysis parses = morphology.analyze(word); 125 | if (parses.analysisCount() == 0) return true; 126 | 127 | final String s = stem(parses, aggregation); 128 | // If not stemmed, don't waste the time adjusting the token. 129 | if ((s != null) && !s.equals(word)) 130 | termAttribute.setEmpty().append(s); 131 | 132 | return true; 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/tr/Zemberek3StemFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.tr; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | 21 | import org.apache.lucene.analysis.TokenStream; 22 | import org.apache.lucene.util.ResourceLoader; 23 | import org.apache.lucene.util.ResourceLoaderAware; 24 | import org.apache.lucene.analysis.TokenFilterFactory; 25 | import zemberek.morphology.analysis.SingleAnalysis; 26 | import zemberek.morphology.analysis.WordAnalysis; 27 | 28 | 29 | import java.io.IOException; 30 | import java.util.ArrayList; 31 | import java.util.List; 32 | import java.util.Locale; 33 | import java.util.Map; 34 | 35 | /** 36 | * Factory for {@link Zemberek3StemFilter}. 37 | *
 38 |  * <fieldType name="zemberek3" class="solr.TextField" positionIncrementGap="100">
 39 |  * <analyzer>
 40 |  * <tokenizer class="solr.StandardTokenizerFactory"/>
 41 |  * <filter class="solr.ApostropheFilterFactory"/>
 42 |  * <filter class="solr.TurkishLowerCaseFilterFactory"/>
 43 |  * <filter class="solr.Zemberek3StemFilterFactory" strategy="maxLength" dictionary="master-dictionary.dict,secondary-dictionary.dict,non-tdk.dict,proper.dict"/>
 44 |  * </analyzer>
 45 |  * </fieldType>
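 *
 * The same chain can also be built programmatically, as done in TestZemberek3StemFilter
 * (no dictionary files are passed, so the factory falls back to the default Zemberek3 lexicon):
 * Analyzer analyzer = CustomAnalyzer.builder()
 *     .withTokenizer("standard")
 *     .addTokenFilter("turkishlowercase")
 *     .addTokenFilter(Zemberek3StemFilterFactory.class)
 *     .build();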
46 | */ 47 | public class Zemberek3StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { 48 | 49 | private MyTurkishMorphology morphology; 50 | 51 | private final String strategy; 52 | private final String dictionaryFiles; 53 | 54 | public Zemberek3StemFilterFactory(Map args) { 55 | super(args); 56 | dictionaryFiles = get(args, "dictionary"); 57 | strategy = get(args, "strategy", "maxLength"); 58 | 59 | if (!args.isEmpty()) { 60 | throw new IllegalArgumentException("Unknown parameters: " + args); 61 | } 62 | } 63 | 64 | @Override 65 | public void inform(ResourceLoader loader) throws IOException { 66 | 67 | if (dictionaryFiles == null || dictionaryFiles.trim().isEmpty()) { 68 | this.morphology = MyTurkishMorphology.createWithDefaults(); 69 | // Use default dictionaries shipped with Zemberek3. 70 | return; 71 | } 72 | List lines = new ArrayList<>(); 73 | 74 | List files = splitFileNames(dictionaryFiles); 75 | if (files.size() > 0) { 76 | for (String file : files) { 77 | List wlist = getLines(loader, file.trim()); 78 | lines.addAll(wlist); 79 | } 80 | } 81 | 82 | if (lines.isEmpty()) { 83 | this.morphology = MyTurkishMorphology.createWithDefaults(); 84 | // Use default dictionaries shipped with Zemberek3. 85 | return; 86 | } 87 | 88 | 89 | String[] linesArray = new String[lines.size()]; 90 | linesArray = lines.toArray(linesArray); 91 | morphology = (new MyTurkishMorphology.Builder()).setLexicon(linesArray).build(); 92 | 93 | 94 | } 95 | 96 | @Override 97 | public TokenStream create(TokenStream input) { 98 | return new Zemberek3StemFilter(input, morphology, strategy); 99 | } 100 | 101 | private static void parse(String word, MyTurkishMorphology morphology) { 102 | 103 | WordAnalysis results = morphology.analyze(word); 104 | System.out.println("Word = " + word + " has " + results.analysisCount() + " many solutions"); 105 | 106 | if (results.analysisCount() == 0) return; 107 | 108 | System.out.println("Parses: "); 109 | 110 | for (SingleAnalysis result : results) { 111 | System.out.println("number of morphemes = " + result.getMorphemeDataList().size()); 112 | System.out.println(result.formatLong()); 113 | System.out.println("\tStems = " + result.getStems()); 114 | System.out.println("\tLemmas = " + result.getLemmas()); 115 | System.out.println("\tStemAndEnding = " + result.getStemAndEnding()); 116 | System.out.println("-------------------"); 117 | } 118 | 119 | System.out.println("final selected stem : " + Zemberek3StemFilter.stem(results, "maxLength")); 120 | System.out.println("=================================="); 121 | } 122 | 123 | public static void main(String[] args) throws IOException { 124 | 125 | MyTurkishMorphology morphology = MyTurkishMorphology.createWithDefaults(); 126 | 127 | 128 | String a = "0.25 4p.05 4p.x kuş asisi ortaklar çekişme masalı İCARETİN DE ARTMASI BEKLENİYOR\n" + 129 | "Savinykh, Ege Bölgesi Sanayi Odası'nda (EBSO) düzenlenen \"Belarus Türkiye Yatırım ve İşbirliği Olanakları Semineri\"nde yaptığı konuşmada, \" 2 Haziran'dan itibaren Türk halkı vizesiz olarak Belarus'a gidip gelebilecek. İki ülke arasındaki ticaret bu anlaşma ile daha da artacak\" dedi. 
Türkiye ile Belarus arasında ticari, kültürel ve sosyal ilişkilerin gelişmesini arzu ettiklerini kaydeden Andrei Savinykh, ülkesinin Kırgızistan ve Kazakistan ile Gümrük Birliği anlaşması bulunduğunu, önümüzdeki kuku birliğ"; 130 | 131 | a = a.toLowerCase(Locale.forLanguageTag("tr")); 132 | 133 | for (String s : a.split("\\s+")) { 134 | parse(s, morphology); 135 | } 136 | 137 | } 138 | } -------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/tr/Zemberek2StemFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.tr; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import net.zemberek.erisim.Zemberek; 21 | import net.zemberek.islemler.KelimeKokFrekansKiyaslayici; 22 | import net.zemberek.islemler.cozumleme.CozumlemeSeviyesi; 23 | import net.zemberek.tr.yapi.TurkiyeTurkcesi; 24 | import net.zemberek.yapi.Kelime; 25 | import net.zemberek.yapi.Kok; 26 | import org.apache.lucene.analysis.TokenFilter; 27 | import org.apache.lucene.analysis.TokenStream; 28 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 29 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 30 | import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; 31 | import org.apache.lucene.util.ResourceLoader; 32 | import org.apache.lucene.util.ResourceLoaderAware; 33 | import org.apache.lucene.analysis.TokenFilterFactory; 34 | 35 | import java.io.IOException; 36 | import java.io.StringReader; 37 | import java.util.Arrays; 38 | import java.util.Comparator; 39 | import java.util.HashMap; 40 | import java.util.Map; 41 | 42 | /** 43 | * Factory for {@link Zemberek2StemFilter}. 
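 *
 * A configuration sketch following the pattern of the other factories in this package (the field
 * type name is illustrative; strategy defaults to "maxLength" and also accepts "first",
 * "frequency", "minLength", "maxMorpheme" and "minMorpheme"):
 * <fieldType name="text_tr_zemberek2" class="solr.TextField" positionIncrementGap="100">
 * <analyzer>
 * <tokenizer class="solr.StandardTokenizerFactory"/>
 * <filter class="solr.ApostropheFilterFactory"/>
 * <filter class="solr.TurkishLowerCaseFilterFactory"/>
 * <filter class="solr.Zemberek2StemFilterFactory" strategy="maxLength"/>
 * </analyzer>
 * </fieldType>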
44 | */ 45 | public class Zemberek2StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { 46 | 47 | private static final RootLengthComparator ROOT_LENGTH_COMPARATOR = new RootLengthComparator(); 48 | private static final RootMorphemeComparator ROOT_MORPHEME_COMPARATOR = new RootMorphemeComparator(); 49 | private static final KelimeKokFrekansKiyaslayici FREQUENCY_COMPARATOR = new KelimeKokFrekansKiyaslayici(); 50 | 51 | private final Zemberek zemberek = new Zemberek(new TurkiyeTurkcesi()); 52 | private final String strategy; 53 | 54 | public Zemberek2StemFilterFactory(Map args) { 55 | super(args); 56 | strategy = get(args, "strategy", "maxLength"); 57 | if (!args.isEmpty()) { 58 | throw new IllegalArgumentException("Unknown parameters: " + args); 59 | } 60 | } 61 | 62 | @Override 63 | public void inform(ResourceLoader loader) throws IOException { 64 | } 65 | 66 | 67 | @Override 68 | public TokenStream create(TokenStream input) { 69 | return new Zemberek2StemFilter(input); 70 | } 71 | 72 | /** 73 | * Stemmer based on Zemberek2 74 | */ 75 | private final class Zemberek2StemFilter extends TokenFilter { 76 | 77 | private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); 78 | private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); 79 | 80 | public Zemberek2StemFilter(TokenStream input) { 81 | super(input); 82 | } 83 | 84 | private String stem(Kelime[] cozumler, String aggregation) { 85 | 86 | if ("first".equals(aggregation) || cozumler.length == 1) { 87 | return cozumler[0].kok().icerik(); 88 | } 89 | 90 | switch (aggregation) { 91 | case "frequency": 92 | Arrays.sort(cozumler, FREQUENCY_COMPARATOR); 93 | return cozumler[0].kok().icerik(); 94 | case "maxLength": 95 | Arrays.sort(cozumler, ROOT_LENGTH_COMPARATOR); 96 | return cozumler[0].kok().icerik(); 97 | case "minLength": 98 | Arrays.sort(cozumler, ROOT_LENGTH_COMPARATOR); 99 | return cozumler[cozumler.length - 1].kok().icerik(); 100 | case "maxMorpheme": 101 | Arrays.sort(cozumler, ROOT_MORPHEME_COMPARATOR); 102 | return cozumler[0].kok().icerik(); 103 | case "minMorpheme": 104 | Arrays.sort(cozumler, ROOT_MORPHEME_COMPARATOR); 105 | return cozumler[cozumler.length - 1].kok().icerik(); 106 | default: 107 | throw new RuntimeException("unknown strategy " + aggregation); 108 | } 109 | } 110 | 111 | @Override 112 | public boolean incrementToken() throws IOException { 113 | 114 | if (!input.incrementToken()) return false; 115 | if (keywordAttribute.isKeyword()) return true; 116 | 117 | final String term = termAttribute.toString(); 118 | final Kelime[] cozumler = zemberek.kelimeCozumle(term, CozumlemeSeviyesi.TUM_KOKLER); 119 | if (cozumler.length == 0) return true; 120 | 121 | final String s = stem(cozumler, strategy); 122 | // If not stemmed, don't waste the time adjusting the token. 
123 | if ((s != null) && !s.equals(term)) 124 | termAttribute.setEmpty().append(s); 125 | 126 | return true; 127 | } 128 | } 129 | 130 | private static class RootLengthComparator implements Comparator { 131 | @Override 132 | public int compare(Kelime o1, Kelime o2) { 133 | if (o1 == null || o2 == null) return -1; 134 | final Kok k1 = o1.kok(); 135 | final Kok k2 = o2.kok(); 136 | return k2.icerik().length() - k1.icerik().length(); 137 | } 138 | } 139 | 140 | private static class RootMorphemeComparator implements Comparator { 141 | @Override 142 | public int compare(Kelime o1, Kelime o2) { 143 | if (o1 == null || o2 == null) return -1; 144 | return o2.ekler().size() - o1.ekler().size(); 145 | } 146 | } 147 | 148 | public static void main(String[] args) throws IOException { 149 | 150 | StringReader reader = new StringReader("elması utansın ortaklar çekişme ile"); 151 | 152 | Map map = new HashMap<>(); 153 | map.put("strategy", "frequency"); 154 | 155 | Zemberek2StemFilterFactory factory = new Zemberek2StemFilterFactory(map); 156 | 157 | WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer(); 158 | whitespaceTokenizer.setReader(reader); 159 | 160 | TokenStream stream = factory.create(whitespaceTokenizer); 161 | 162 | CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class); 163 | 164 | stream.reset(); 165 | while (stream.incrementToken()) { 166 | 167 | String term = termAttribute.toString(); 168 | System.out.println(term); 169 | } 170 | stream.end(); 171 | reader.close(); 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | org.apache.lucene 6 | TurkishAnalysis 7 | 9.4.0 8 | jar 9 | 10 | TurkishAnalysis 11 | Turkish analysis components for Apache Lucene/Solr 12 | http://iorixxx.github.io 13 | 14 | 15 | 16 | The Apache License, Version 2.0 17 | http://www.apache.org/licenses/LICENSE-2.0.txt 18 | 19 | 20 | 21 | 22 | 23 | Ahmet Arslan 24 | aarslan2@eskisehir.edu.tr 25 | Eskisehir Technical University 26 | https://www.eskisehir.edu.tr 27 | 28 | 29 | 30 | 31 | scm:git:git://github.com/iorixxx/lucene-solr-analysis-turkish.git 32 | scm:git:ssh://github.com/iorixxx/lucene-solr-analysis-turkish.git 33 | https://github.com/iorixxx/lucene-solr-analysis-turkish/tree/master 34 | 35 | 36 | 37 | 1.8 38 | 1.8 39 | UTF-8 40 | 41 | 42 | 43 | 44 | ahmetaa-repo 45 | ahmetaa Maven Repo on Github 46 | https://raw.github.com/ahmetaa/maven-repo/master 47 | 48 | 49 | 50 | 51 | 52 | internal.repo 53 | Temporary Staging Repository 54 | file://${project.build.directory}/mvn-repo 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | org.apache.maven.plugins 63 | maven-source-plugin 64 | 3.2.1 65 | 66 | 67 | attach-sources 68 | 69 | jar 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | org.apache.maven.plugins 78 | maven-jar-plugin 79 | 3.3.0 80 | 81 | 82 | **/analysis/tr/**/* 83 | 84 | 85 | 86 | 87 | 88 | org.codehaus.mojo 89 | exec-maven-plugin 90 | 3.1.0 91 | 92 | 93 | build-maps 94 | generate-resources 95 | 96 | java 97 | 98 | 99 | 100 | 101 | org.apache.lucene.App 102 | 103 | 104 | 105 | 106 | org.apache.maven.plugins 107 | maven-compiler-plugin 108 | 3.10.1 109 | 110 | 111 | default-compile 112 | generate-resources 113 | 114 | 115 | org/apache/lucene/App.java 116 | 117 | 118 | 119 | 120 | default-cli 121 | compile 122 | 123 | compile 124 | 125 | 126 | 127 | 128 | 1.8 129 | 1.8 130 | UTF-8 131 | 132 | 133 | 134 | 135 | 136 | 
org.apache.maven.plugins 137 | maven-dependency-plugin 138 | 3.3.0 139 | 140 | compile 141 | zemberek-nlp 142 | zemberek-core,zemberek-morphology 143 | ${project.build.directory}/lib 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | io.github.starlangsoftware 155 | MorphologicalAnalysis 156 | 1.0.57 157 | jar 158 | compile 159 | 160 | 161 | 162 | io.github.starlangsoftware 163 | MorphologicalDisambiguation 164 | 1.0.21 165 | jar 166 | compile 167 | 168 | 169 | 170 | zemberek-nlp 171 | zemberek-morphology 172 | 0.17.1 173 | jar 174 | compile 175 | 176 | 177 | 178 | org.apache.lucene 179 | lucene-test-framework 180 | ${project.version} 181 | jar 182 | test 183 | 184 | 185 | 186 | org.apache.lucene 187 | lucene-analysis-common 188 | ${project.version} 189 | jar 190 | compile 191 | 192 | 193 | 194 | org.slf4j 195 | slf4j-api 196 | 2.0.3 197 | compile 198 | jar 199 | 200 | 201 | 202 | zemberek 203 | zemberek-tr 204 | 2.1.3 205 | jar 206 | system 207 | ${project.basedir}/solr/lib/zemberek-tr-2.1.3.jar 208 | 209 | 210 | 211 | zemberek 212 | zemberek-cekirdek 213 | 2.1.3 214 | jar 215 | system 216 | ${project.basedir}/solr/lib/zemberek-cekirdek-2.1.3.jar 217 | 218 | 219 | 220 | 221 | 222 | -------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/tr/MyTurkishMorphology.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.tr; 2 | 3 | import com.google.common.base.Stopwatch; 4 | import zemberek.core.logging.Log; 5 | import zemberek.core.text.TextUtil; 6 | import zemberek.core.turkish.PrimaryPos; 7 | import zemberek.core.turkish.StemAndEnding; 8 | import zemberek.core.turkish.Turkish; 9 | import zemberek.core.turkish.TurkishAlphabet; 10 | import zemberek.morphology.analysis.RuleBasedAnalyzer; 11 | import zemberek.morphology.analysis.SingleAnalysis; 12 | import zemberek.morphology.analysis.WordAnalysis; 13 | import zemberek.morphology.lexicon.RootLexicon; 14 | import zemberek.morphology.morphotactics.InformalTurkishMorphotactics; 15 | import zemberek.morphology.morphotactics.TurkishMorphotactics; 16 | 17 | import java.util.Collections; 18 | import java.util.List; 19 | import java.util.concurrent.TimeUnit; 20 | import java.util.stream.Collectors; 21 | 22 | /** 23 | * A variant of {@link zemberek.morphology.TurkishMorphology} simplified for a pre-tokenized input. 24 | */ 25 | public final class MyTurkishMorphology { 26 | 27 | private final RootLexicon lexicon; 28 | private final RuleBasedAnalyzer analyzer; 29 | private final TurkishMorphotactics morphotactics; 30 | 31 | 32 | private MyTurkishMorphology(MyTurkishMorphology.Builder builder) { 33 | 34 | this.lexicon = builder.lexicon; 35 | if (lexicon.isEmpty()) { 36 | Log.warn("TurkishMorphology class is being instantiated with empty root lexicon."); 37 | } 38 | 39 | this.morphotactics = builder.informalAnalysis ? 40 | new InformalTurkishMorphotactics(this.lexicon) : new TurkishMorphotactics(this.lexicon); 41 | 42 | this.analyzer = builder.ignoreDiacriticsInAnalysis ? 
43 | RuleBasedAnalyzer.ignoreDiacriticsInstance(morphotactics) : 44 | RuleBasedAnalyzer.instance(morphotactics); 45 | 46 | } 47 | 48 | 49 | public static MyTurkishMorphology createWithDefaults() { 50 | Stopwatch sw = Stopwatch.createStarted(); 51 | MyTurkishMorphology instance = new MyTurkishMorphology.Builder().setLexicon(RootLexicon.getDefault()).build(); 52 | Log.info("Initialized in %d ms.", sw.elapsed(TimeUnit.MILLISECONDS)); 53 | return instance; 54 | } 55 | 56 | public static zemberek.morphology.TurkishMorphology create(RootLexicon lexicon) { 57 | return new zemberek.morphology.TurkishMorphology.Builder().setLexicon(lexicon).build(); 58 | } 59 | 60 | public TurkishMorphotactics getMorphotactics() { 61 | return morphotactics; 62 | } 63 | 64 | public WordAnalysis analyze(String word) { 65 | return analyzeWithoutCache(word); 66 | } 67 | 68 | public RootLexicon getLexicon() { 69 | return lexicon; 70 | } 71 | 72 | /** 73 | * Normalizes the input word and analyses it. If word cannot be parsed following occurs: - if 74 | * input is a number, system tries to parse it by creating a number DictionaryEntry. - if input 75 | * starts with a capital letter, or contains ['] adds a Dictionary entry as a proper noun. - if 76 | * above options does not generate a result, it generates an UNKNOWN dictionary entry and returns 77 | * a parse with it. 78 | * 79 | * @param word input word. 80 | * @return WordAnalysis list. 81 | */ 82 | 83 | public static String normalizeForAnalysis(String word) { 84 | // TODO: This may cause problems for some foreign words with letter I. 85 | String s = word.toLowerCase(Turkish.LOCALE); 86 | s = TurkishAlphabet.INSTANCE.normalizeCircumflex(s); 87 | String noDot = s.replace(".", ""); 88 | if (noDot.length() == 0) { 89 | noDot = s; 90 | 91 | } 92 | return TextUtil.normalizeApostrophes(noDot); 93 | } 94 | 95 | /** 96 | * This should be the entry point to stemming 97 | * 98 | * @param word a word to be stemmed 99 | * @return the stem of the word 100 | */ 101 | List analyzeList(String word) { 102 | 103 | String s = normalizeForAnalysis(word); 104 | 105 | if (s.length() == 0) { 106 | System.out.println("empty " + word); 107 | return Collections.emptyList(); 108 | } 109 | 110 | List result; 111 | 112 | if (TurkishAlphabet.INSTANCE.containsApostrophe(s)) { 113 | s = TurkishAlphabet.INSTANCE.normalizeApostrophe(s); 114 | result = analyzeWordsWithApostrophe(s); 115 | } else { 116 | result = analyzer.analyze(s); 117 | } 118 | 119 | if (result.size() == 0) { 120 | System.out.println("unknown word: " + word); 121 | return Collections.emptyList(); 122 | } 123 | 124 | if (result.size() == 1 && result.get(0).getDictionaryItem().isUnknown()) { 125 | return Collections.emptyList(); 126 | } 127 | 128 | return result; 129 | } 130 | 131 | private WordAnalysis analyzeWithoutCache(String word) { 132 | 133 | String s = normalizeForAnalysis(word); 134 | 135 | if (s.length() == 0) { 136 | System.out.println("empty " + word); 137 | return WordAnalysis.EMPTY_INPUT_RESULT; 138 | } 139 | 140 | List result; 141 | 142 | if (TurkishAlphabet.INSTANCE.containsApostrophe(s)) { 143 | s = TurkishAlphabet.INSTANCE.normalizeApostrophe(s); 144 | result = analyzeWordsWithApostrophe(s); 145 | } else { 146 | result = analyzer.analyze(s); 147 | } 148 | 149 | if (result.size() == 0) { 150 | System.out.println("unknown word: " + word); 151 | result = Collections.emptyList(); 152 | } 153 | 154 | if (result.size() == 1 && result.get(0).getDictionaryItem().isUnknown()) { 155 | result = Collections.emptyList(); 156 | } 157 
| 158 | return new WordAnalysis(word, s, result); 159 | } 160 | 161 | public List analyzeWordsWithApostrophe(String word) { 162 | 163 | int index = word.indexOf('\''); 164 | 165 | if (index <= 0 || index == word.length() - 1) { 166 | return Collections.emptyList(); 167 | } 168 | 169 | StemAndEnding se = new StemAndEnding( 170 | word.substring(0, index), 171 | word.substring(index + 1)); 172 | 173 | String stem = TurkishAlphabet.INSTANCE.normalize(se.stem); 174 | 175 | String withoutQuote = word.replace("'", ""); 176 | 177 | List noQuotesParses = analyzer.analyze(withoutQuote); 178 | if (noQuotesParses.size() == 0) { 179 | return Collections.emptyList(); 180 | } 181 | 182 | // TODO: this is somewhat a hack.Correct here once we decide what to do about 183 | // words like "Hastanesi'ne". Should we accept Hastanesi or Hastane? 184 | return noQuotesParses.stream() 185 | .filter( 186 | a -> a.getDictionaryItem().primaryPos == PrimaryPos.Noun && 187 | (a.containsMorpheme(TurkishMorphotactics.p3sg) || a.getStem().equals(stem))) 188 | .collect(Collectors.toList()); 189 | } 190 | 191 | 192 | public static MyTurkishMorphology.Builder builder() { 193 | return new MyTurkishMorphology.Builder(); 194 | } 195 | 196 | public static MyTurkishMorphology.Builder builder(RootLexicon lexicon) { 197 | return new MyTurkishMorphology.Builder().setLexicon(lexicon); 198 | } 199 | 200 | public static class Builder { 201 | 202 | RootLexicon lexicon = new RootLexicon(); 203 | 204 | boolean informalAnalysis = false; 205 | boolean ignoreDiacriticsInAnalysis = false; 206 | 207 | public MyTurkishMorphology.Builder setLexicon(RootLexicon lexicon) { 208 | this.lexicon = lexicon; 209 | return this; 210 | } 211 | 212 | public MyTurkishMorphology.Builder setLexicon(String... dictionaryLines) { 213 | this.lexicon = RootLexicon.fromLines(dictionaryLines); 214 | return this; 215 | } 216 | 217 | public MyTurkishMorphology.Builder useInformalAnalysis() { 218 | this.informalAnalysis = true; 219 | return this; 220 | } 221 | 222 | public MyTurkishMorphology.Builder ignoreDiacriticsInAnalysis() { 223 | this.ignoreDiacriticsInAnalysis = true; 224 | return this; 225 | } 226 | 227 | public MyTurkishMorphology build() { 228 | return new MyTurkishMorphology(this); 229 | } 230 | } 231 | } -------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/tr/TurkishDeASCIIfyFilter.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.tr; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | import org.apache.lucene.analysis.CharArrayMap; 21 | import org.apache.lucene.analysis.TokenFilter; 22 | import org.apache.lucene.analysis.TokenStream; 23 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 24 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 25 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 26 | import org.apache.lucene.analysis.tr.util.PatternTableFactory; 27 | 28 | import java.io.IOException; 29 | import java.util.Arrays; 30 | 31 | import static org.apache.lucene.analysis.tr.util.PatternTableFactory.*; 32 | 33 | /** 34 | * Translation of Turkish Deasciifier from Lisp into Java 35 | */ 36 | public final class TurkishDeASCIIfyFilter extends TokenFilter { 37 | 38 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); 39 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 40 | private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class); 41 | private final boolean preserveOriginal; 42 | private State state; 43 | 44 | public TurkishDeASCIIfyFilter(TokenStream input, boolean preserveOriginal) { 45 | super(input); 46 | this.preserveOriginal = preserveOriginal; 47 | } 48 | 49 | /** 50 | * Determine if char at cursor needs correction. 51 | */ 52 | private static boolean turkish_need_correction(char c, int point, char[] turkish_string, int length) { 53 | 54 | final Character tr; 55 | 56 | if (turkish_asciify_table.containsKey(c)) 57 | tr = turkish_asciify_table.get(c); 58 | else 59 | tr = c; 60 | 61 | CharArrayMap pl = PatternTableFactory.getMap(Character.toLowerCase(tr)); 62 | 63 | boolean m = false; 64 | if (pl != null) { 65 | m = turkish_match_pattern(pl, point, turkish_string, length); 66 | } 67 | 68 | if (tr.equals('I')) { 69 | if (c == tr) { 70 | return !m; 71 | } else { 72 | return m; 73 | } 74 | } else { 75 | if (c == tr) { 76 | return m; 77 | } else { 78 | return !m; 79 | } 80 | } 81 | } 82 | 83 | private static char[] turkish_get_context(int size, int point, char[] turkish_string, int length) { 84 | 85 | char[] s = new char[1 + (2 * size)]; 86 | Arrays.fill(s, ' '); 87 | 88 | s[size] = 'X'; 89 | 90 | int i = size + 1; 91 | boolean space = false; 92 | int index = point; 93 | index++; 94 | 95 | char current_char; 96 | 97 | while (i < s.length && !space && index < length) { 98 | current_char = turkish_string[index]; 99 | 100 | Character x = turkish_downcase_asciify_table.get(current_char); 101 | 102 | if (x == null) { 103 | i++; 104 | space = true; 105 | 106 | } else { 107 | s[i] = x; 108 | i++; 109 | space = false; 110 | } 111 | index++; 112 | } 113 | 114 | /* 115 | System.out.println("before "); 116 | System.out.println(s.length); 117 | System.out.println(s); 118 | System.out.println(i); 119 | */ 120 | 121 | System.arraycopy(s, 0, s, 0, i); 122 | 123 | /* 124 | System.out.println("after "); 125 | 126 | System.out.println(s); 127 | System.out.println(s.length); 128 | */ 129 | index = point; 130 | i = size - 1; 131 | space = false; 132 | 133 | index--; 134 | 135 | while (i >= 0 && index >= 0) { 136 | current_char = turkish_string[index]; 137 | Character x = turkish_upcase_accents_table.get(current_char); 138 | 139 | if (x == null) { 140 | if (!space) { 141 | i--; 142 | space = true; 143 | } 144 | } else { 145 | s[i] = x; 146 | i--; 147 | space = false; 148 | } 149 | index--; 150 | } 151 | 152 | //System.out.println("return"); 153 | //System.out.println(s); 154 | return s; 155 | } 156 | 157 | 
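    /*
     * Worked example (illustrative values, small window size for readability): for
     * turkish_string = {'d','o','g','u','s'}, point = 1 (the 'o') and size = 3, the
     * first loop of turkish_get_context downcase-asciifies the right context "gus"
     * and the second loop upcase-accents the left context "d", so the returned
     * window is "  dXgus", where 'X' marks the character under inspection.
     * turkish_match_pattern below then scans substrings of such a window against
     * the decision-list tables produced by PatternTableFactory.
     */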
private static boolean turkish_match_pattern(CharArrayMap dlist, int point, char[] turkish_string, int length) { 158 | final int turkish_context_size = 10; 159 | int rank = dlist.size() * 2; 160 | char[] str = turkish_get_context(turkish_context_size, point, turkish_string, length); 161 | 162 | //System.out.println("length = " + str.length); 163 | int start = 0; 164 | int end; 165 | int _len = str.length; 166 | 167 | while (start <= turkish_context_size) { 168 | end = turkish_context_size + 1; 169 | while (end <= _len) { 170 | 171 | Integer r = dlist.get(str, start, end - start); 172 | 173 | if (r != null && Math.abs(r) < Math.abs(rank)) { 174 | rank = r; 175 | } 176 | end++; 177 | } 178 | start++; 179 | } 180 | return rank > 0; 181 | } 182 | 183 | /** 184 | * Adds necessary accents to the words in the region. 185 | */ 186 | public static String convert_to_turkish(char[] turkish_string) { 187 | 188 | for (int i = 0; i < turkish_string.length; i++) { 189 | char c = turkish_string[i]; 190 | if (turkish_toggle_accent_table.containsKey(c)) { 191 | if (turkish_need_correction(c, i, turkish_string, turkish_string.length)) { 192 | turkish_string[i] = turkish_toggle_accent_table.get(c); 193 | } 194 | } 195 | } 196 | return new String(turkish_string); 197 | } 198 | 199 | /** 200 | * Adds necessary accents to the words in the region. 201 | */ 202 | public boolean convert_to_turkish(char[] turkish_string, int length) { 203 | 204 | boolean returnValue = false; 205 | boolean flag = true; 206 | 207 | for (int i = 0; i < length; i++) { 208 | char c = turkish_string[i]; 209 | if (turkish_toggle_accent_table.containsKey(c)) { 210 | if (turkish_need_correction(c, i, turkish_string, length)) { 211 | /** works only once **/ 212 | if (flag && preserveOriginal) { 213 | // we are about to make a change 214 | // capture original state 215 | state = captureState(); 216 | flag = false; 217 | } 218 | turkish_string[i] = turkish_toggle_accent_table.get(c); 219 | returnValue = true; 220 | } 221 | } 222 | } 223 | 224 | return returnValue; 225 | } 226 | 227 | 228 | @Override 229 | public boolean incrementToken() throws IOException { 230 | if (state != null) { 231 | assert preserveOriginal : "state should only be captured if preserveOriginal is true"; 232 | restoreState(state); 233 | posIncAttr.setPositionIncrement(0); 234 | state = null; 235 | return true; 236 | } 237 | if (input.incrementToken()) { 238 | final char[] buffer = termAtt.buffer(); 239 | final int length = termAtt.length(); 240 | if (convert_to_turkish(buffer, length)) 241 | typeAtt.setType(Zemberek2DeASCIIfyFilterFactory.DEASCII_TOKEN_TYPE); 242 | return true; 243 | } else { 244 | return false; 245 | } 246 | } 247 | 248 | @Override 249 | public void reset() throws IOException { 250 | super.reset(); 251 | state = null; 252 | } 253 | } 254 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Turkish analysis components for Apache Lucene/Solr 2 | [![Build Status](https://travis-ci.org/iorixxx/lucene-solr-analysis-turkish.svg?branch=master)](https://travis-ci.org/iorixxx/lucene-solr-analysis-turkish) 3 | 4 | The use of *Open Source Software* is gaining increasing momentum in Turkey. 5 | Turkish users on Apache Lucene/Solr (and other [Apache Projects](https://projects.apache.org/projects.html)) mailing lists are increasing. 
6 | This project makes use of publicly available Turkish NLP tools to create [Apache Lucene/Solr plugins](https://cwiki.apache.org/confluence/display/solr/Solr+Plugins).
7 | I created this project in order to promote and support open source.
8 | Stock Lucene/Solr has [SnowballPorterFilter(Factory)](https://cwiki.apache.org/confluence/display/solr/Language+Analysis#LanguageAnalysis-Turkish) for the Turkish language.
9 | However, this stemmer performs poorly and has harmful collisions.
10 | For example, *altın*, *alim*, *alın*, *altan*, and *alıntı* are all reduced to the same stem.
11 | In other words, they are treated as if they were the same word even though they have completely different meanings.
12 | I will post some other harmful collisions here.
13 | 
14 | #### How to enable this plugin? Quick way :new: :purple_heart:
15 | If you do not want to build this library and patch Solr yourself, just download my solr-7.3.0.tgz build from [https://www.dropbox.com/s/yygdvwoe4cc7d46/solr-7.3.0.tgz](https://www.dropbox.com/s/yygdvwoe4cc7d46/solr-7.3.0.tgz)
16 | The link points to my Dropbox account, and the plugin is already enabled in this distribution. All you need to do is download it and run `bin/solr start`.
17 | It has a core named `zemberek` activated by default. Just go to the admin/analysis page, select the `text_tr` field type, and enter some Turkish text.
18 | When you press the analyze button, you will see the Zemberek stem filter working nicely.
19 | 
20 | If you are a Docker user, please use this [Dockerfile](https://github.com/docker-solr/docker-solr/blob/78b52ecefa3441518561bdd504a2ac8b53755540/7.3/Dockerfile) and override the Solr download location, e.g.:
21 | `docker build -t mine --build-arg SOLR_DOWNLOAD_SERVER=https://www.dropbox.com/s/yygdvwoe4cc7d46/solr-7.3.0.tgz .`
22 | 
23 | 
24 | To get the most out of this library quickly, without going into much detail, please do one of the following:
25 | 
26 | ##### TurkishAnalyzer for Solr Users
27 | If you are a Solr user, please use a field type definition for Turkish along the following lines (the analyzer chain mirrors the Lucene example below):
28 | ``` xml
29 | <!-- illustrative field type: standard tokenizer, apostrophe and Turkish lowercase filters, then the Zemberek3 stemmer -->
30 | <fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100">
31 |   <analyzer>
32 |     <tokenizer class="solr.StandardTokenizerFactory"/>
33 |     <filter class="solr.ApostropheFilterFactory"/>
34 |     <filter class="solr.TurkishLowerCaseFilterFactory"/>
35 |     <filter class="org.apache.lucene.analysis.tr.Zemberek3StemFilterFactory"/>
36 |   </analyzer>
37 | </fieldType>
38 | 
39 | ```
40 | 
41 | ##### TurkishAnalyzer for Lucene Users
42 | If you are a Lucene user, please use the following custom analyzer declaration to create an analyzer for Turkish.
43 | ```java
44 | Analyzer analyzer = CustomAnalyzer.builder()
45 |         .withTokenizer("standard")
46 |         .addTokenFilter("apostrophe")
47 |         .addTokenFilter("turkishlowercase")
48 |         .addTokenFilter(Zemberek3StemFilterFactory.class)
49 |         .build();
50 | ```
51 | 
52 | ##### How to obtain necessary JAR files?
53 | To obtain the JAR files required to activate the Turkish analysis plugin, use the `mvn clean package dependency:copy-dependencies` Maven command.
54 | It copies the required JAR files to the target/lib directory. You also need to manually copy target/TurkishAnalysis-*.jar to the lib directory.
55 | 
56 | 
57 | 
58 | Currently we have five custom TokenFilters.
59 | To load the plugins, place the specified JAR files (along with TurkishAnalysis-*.jar, which can be created by executing the `mvn package` command) in a `lib` directory under the Solr home directory.
60 | This directory does not exist in the distribution, so you will need to create it the first time.
61 | The `lib` directory goes next to the solr.xml file.
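For Lucene users, the filter factories documented below can be combined in the same `CustomAnalyzer` style as the quick-start example above. The snippet below is only an illustrative sketch: it assumes the plugin JARs are on the classpath, and both the position of the deasciify filter (here before lowercasing) and the `preserveOriginal` value are choices you should tune for your own corpus, not project defaults.

```java
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.tr.TurkishDeASCIIfyFilterFactory;
import org.apache.lucene.analysis.tr.Zemberek3StemFilterFactory;

public class TurkishAnalyzerSketch {

    /** Illustrative diacritics-insensitive chain; filter order and parameters are assumptions to adjust. */
    public static Analyzer create() throws IOException {
        return CustomAnalyzer.builder()
                .withTokenizer("standard")
                .addTokenFilter("apostrophe")
                // deasciify before lowercasing so both the original and corrected forms get lowercased
                .addTokenFilter(TurkishDeASCIIfyFilterFactory.class, "preserveOriginal", "true")
                .addTokenFilter("turkishlowercase")
                .addTokenFilter(Zemberek3StemFilterFactory.class)
                .build();
    }
}
```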
62 | #### TurkishDeASCIIfyFilter(Factory)
63 | ___
64 | Translation of [Emacs Turkish mode](http://www.denizyuret.com/2006/11/emacs-turkish-mode.html) from Lisp into Java.
65 | This filter is intended to be used to allow *diacritics-insensitive search* for Turkish.
66 | 
67 | **Arguments**:
68 | * `preserveOriginal`: (true/false) If **true**, the original token is preserved. The default is **false**.
69 | 
70 | **Example** (illustrative analyzer fragment):
71 | ``` xml
72 | <analyzer>
73 |   <tokenizer class="solr.StandardTokenizerFactory"/>
74 |   <filter class="org.apache.lucene.analysis.tr.TurkishDeASCIIfyFilterFactory" preserveOriginal="true"/>
75 | </analyzer>
76 | ```
77 | 
78 | #### Zemberek3StemFilter(Factory)
79 | ___
80 | Turkish Stemmer based on [Zemberek3](https://github.com/ahmetaa/zemberek-nlp).
81 | 
82 | **JARs**: zemberek-morphology-0.11.1.jar and zemberek-core-0.11.1.jar
83 | 
84 | **Arguments**:
85 | * `strategy`: Strategy to choose one of the multiple stem forms by selecting either the longest or the shortest stem. Valid values are maxLength (the default) or minLength.
86 | * `dictionary`: Zemberek3's dictionary (*.dict) files, which can be downloaded from [here](https://github.com/ahmetaa/zemberek-nlp/tree/master/morphology/src/main/resources/tr) and modified if required.
87 | You may want to add new dictionary items, especially for product search. Product titles and descriptions are usually not pure Turkish.
88 | When it comes to product search, you may well be familiar with product titles such as `Amigalar için oyun`, `iPadler için çanta`, and so on.
89 | If you want to handle such non-Turkish product names inflected with Turkish suffixes, the most elegant way is to modify the dictionaries.
90 | See the [example](https://github.com/ahmetaa/turkish-nlp-examples/blob/master/src/main/java/morphology/AddNewDictionaryItem.java) that adds `tweetlemek` as a verb to the dictionary, so that `tweetledim`, `tweetlemişler`, etc. are recognized and stemmed correctly.
91 | 
92 | **Example** (illustrative; adjust the dictionary path to your setup):
93 | ``` xml
94 | <analyzer>
95 |   <tokenizer class="solr.StandardTokenizerFactory"/>
96 |   <filter class="org.apache.lucene.analysis.tr.Zemberek3StemFilterFactory" strategy="maxLength" dictionary="tr/master-dictionary.dict"/>
97 | </analyzer>
98 | ```
99 | 
100 | If you are happy with the standard dictionaries that ship with Zemberek3, or you don't intend to alter them, you may prefer to use the no-args directive.
101 | ``` xml
102 | <filter class="org.apache.lucene.analysis.tr.Zemberek3StemFilterFactory"/>
103 | ```
104 | 
105 | #### Zemberek2StemFilter(Factory)
106 | ___
107 | Turkish Stemmer based on [Zemberek2](https://code.google.com/p/zemberek/).
108 | 
109 | **JARs**: zemberek-cekirdek-2.1.3.jar and zemberek-tr-2.1.3.jar
110 | 
111 | **Arguments**:
112 | * `strategy`: Strategy to choose one of the multiple stem forms. Valid values are maxLength (the default), minLength, maxMorpheme, minMorpheme, frequency, or first.
113 | 
114 | **Example** (illustrative analyzer fragment):
115 | ``` xml
116 | <analyzer>
117 |   <tokenizer class="solr.StandardTokenizerFactory"/>
118 |   <filter class="org.apache.lucene.analysis.tr.Zemberek2StemFilterFactory" strategy="maxLength"/>
119 | </analyzer>
120 | ```
121 | 
122 | #### Zemberek2DeASCIIfyFilter(Factory)
123 | ___
124 | Turkish DeASCIIfier based on [Zemberek2](https://code.google.com/p/zemberek/).
125 | 
126 | **JARs**: zemberek-cekirdek-2.1.3.jar and zemberek-tr-2.1.3.jar
127 | 
128 | **Arguments**: None
129 | 
130 | **Example** (illustrative analyzer fragment):
131 | ``` xml
132 | <analyzer>
133 |   <tokenizer class="solr.StandardTokenizerFactory"/>
134 |   <filter class="org.apache.lucene.analysis.tr.Zemberek2DeASCIIfyFilterFactory"/>
135 | </analyzer>
136 | ```
137 | 
138 | #### TRMorphStemFilter(Factory)
139 | ___
140 | Turkish Stemmer based on [TRmorph](https://github.com/coltekin/TRmorph).
141 | This one is not production ready yet.
142 | It requires an operating-system-specific [foma](https://code.google.com/p/foma/) executable.
143 | I couldn't find an elegant way to convert `foma` to Java.
144 | I am using the *"executing shell commands in Java to call `flookup`"* workaround advised in the [FAQ](http://code.google.com/p/foma/wiki/FAQ); a minimal Java sketch of this approach is shown right after the argument list below.
145 | If you know something better, please let me know.
146 | 
147 | **Arguments**:
148 | * `lookup`: Absolute path of the OS-specific [foma](https://code.google.com/p/foma/) executable.
149 | * `fst`: Absolute path of the stem.fst file.
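The sketch below illustrates the shell-command workaround mentioned above; it is not the code used by TRMorphStemFilter itself. It assumes that `flookup <fst>` reads one word per line on standard input and prints tab-separated `word<TAB>analysis` lines, with `+?` marking an unknown word, and that the paths passed in point to a working foma installation.

```java
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/** Minimal sketch of calling foma's flookup from Java; output format and paths are assumptions. */
public class FlookupSketch {

    public static List<String> analyze(String lookupPath, String fstPath, String word) throws IOException {
        // Start flookup against the given FST; it reads words from stdin and prints analyses on stdout.
        Process process = new ProcessBuilder(lookupPath, fstPath).start();
        List<String> analyses = new ArrayList<>();
        try (BufferedWriter stdin = new BufferedWriter(
                     new OutputStreamWriter(process.getOutputStream(), StandardCharsets.UTF_8));
             BufferedReader stdout = new BufferedReader(
                     new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
            stdin.write(word);
            stdin.newLine();
            stdin.flush();
            stdin.close(); // signal EOF so flookup flushes its results and exits
            String line;
            while ((line = stdout.readLine()) != null) {
                // Assumed output format: "<input>\t<analysis>"; "+?" means no analysis was found.
                String[] parts = line.split("\t");
                if (parts.length == 2 && !"+?".equals(parts[1])) {
                    analyses.add(parts[1]);
                }
            }
        } finally {
            process.destroy();
        }
        return analyses;
    }
}
```

A real integration would keep the `flookup` process alive and pipe tokens through it, rather than spawning one process per word as this sketch does.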
150 | 
151 | **Example** (illustrative; the paths are placeholders):
152 | ``` xml
153 | <analyzer>
154 |   <tokenizer class="solr.StandardTokenizerFactory"/>
155 |   <filter class="org.apache.lucene.analysis.tr.TRMorphStemFilterFactory" lookup="/usr/local/bin/flookup" fst="/path/to/stem.fst"/>
156 | </analyzer>
157 | ```
158 | 
159 | I will post benchmark results of different field types (different stemmers) designed for different use-cases.
160 | 
161 | ## Dependencies
162 | * JRE 1.8 or above
163 | * Apache Maven 3.0.3 or above
164 | * Apache Lucene (Solr) 6.2.1 or above
165 | 
166 | ## Author
167 | Please feel free to contact Ahmet Arslan at `iorixxx at yahoo dot com` if you have any questions, comments or contributions.
168 | 
169 | ## Citation Policy
170 | If you use this library for a research purpose, please use the following citation:
171 | 
172 | ``` tex
173 | @article{arslan2016deasciification,
174 | author = "Ahmet Arslan",
175 | title = "DeASCIIfication approach to handle diacritics in Turkish information retrieval",
176 | journal = "Information Processing & Management",
177 | volume = "52",
178 | number = "2",
179 | pages = "326 - 339",
180 | year = "2016",
181 | doi = "http://dx.doi.org/10.1016/j.ipm.2015.08.004",
182 | url = "http://www.sciencedirect.com/science/article/pii/S0306457315001053"
183 | }
184 | ```
185 | 
-------------------------------------------------------------------------------- /src/main/java/org/apache/lucene/analysis/tr/util/PatternTableFactory.java: --------------------------------------------------------------------------------
1 | package org.apache.lucene.analysis.tr.util;
2 | 
3 | /*
4 |  * Licensed to the Apache Software Foundation (ASF) under one or more
5 |  * contributor license agreements. See the NOTICE file distributed with
6 |  * this work for additional information regarding copyright ownership.
7 |  * The ASF licenses this file to You under the Apache License, Version 2.0
8 |  * (the "License"); you may not use this file except in compliance with
9 |  * the License. You may obtain a copy of the License at
10 |  *
11 |  *     http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  * Unless required by applicable law or agreed to in writing, software
14 |  * distributed under the License is distributed on an "AS IS" BASIS,
15 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  * See the License for the specific language governing permissions and
17 |  * limitations under the License.
18 |  */
19 | 
20 | import org.apache.lucene.analysis.CharArrayMap;
21 | 
22 | import java.util.Collections;
23 | import java.util.HashMap;
24 | import java.util.Map;
25 | 
26 | /**
27 |  * Compiles a decision list into a hash where keys are patterns and
28 |  * values give the rank and the classification of the decision list. The
29 |  * sign of a value gives the classification (positive implies t, negative
30 |  * implies nil), and the absolute value gives the rank (smaller rank
31 |  * means higher priority).
32 | */ 33 | public final class PatternTableFactory { 34 | 35 | /** 36 | * Converts turkish characters into ascii equivalent 37 | */ 38 | public static final Map turkish_asciify_table = 39 | 40 | Collections.unmodifiableMap( 41 | new HashMap() {{ 42 | 43 | put('ç', 'c'); 44 | put('Ç', 'C'); 45 | put('ğ', 'g'); 46 | put('Ğ', 'G'); 47 | put('ö', 'o'); 48 | put('Ö', 'O'); 49 | put('ü', 'u'); 50 | put('Ü', 'U'); 51 | put('ı', 'i'); 52 | put('İ', 'I'); 53 | put('ş', 's'); 54 | put('Ş', 'S'); 55 | 56 | }} 57 | ); 58 | public static final Map turkish_downcase_asciify_table = 59 | 60 | Collections.unmodifiableMap( 61 | new HashMap() {{ 62 | 63 | put('ç', 'c'); 64 | put('Ç', 'c'); 65 | put('ğ', 'g'); 66 | put('Ğ', 'g'); 67 | put('ö', 'o'); 68 | put('Ö', 'o'); 69 | put('ı', 'i'); 70 | put('İ', 'i'); 71 | put('ş', 's'); 72 | put('Ş', 's'); 73 | put('ü', 'u'); 74 | put('Ü', 'u'); 75 | 76 | // for ch in string.uppercase: 77 | // turkish_downcase_asciify_table[ch] = ch.lower() 78 | // turkish_downcase_asciify_table[ch.lower()] = ch.lower() 79 | 80 | 81 | put('A', 'a'); 82 | put('B', 'b'); 83 | put('C', 'c'); 84 | put('D', 'd'); 85 | put('E', 'e'); 86 | put('F', 'f'); 87 | put('G', 'g'); 88 | put('H', 'h'); 89 | put('I', 'i'); 90 | put('J', 'j'); 91 | put('K', 'k'); 92 | put('L', 'l'); 93 | put('M', 'm'); 94 | put('N', 'n'); 95 | put('O', 'o'); 96 | put('P', 'p'); 97 | put('Q', 'q'); 98 | put('R', 'r'); 99 | put('S', 's'); 100 | put('T', 't'); 101 | put('U', 'u'); 102 | put('V', 'v'); 103 | put('W', 'w'); 104 | put('X', 'x'); 105 | put('Y', 'y'); 106 | put('Z', 'z'); 107 | 108 | put('a', 'a'); 109 | put('b', 'b'); 110 | put('c', 'c'); 111 | put('d', 'd'); 112 | put('e', 'e'); 113 | put('f', 'f'); 114 | put('g', 'g'); 115 | put('h', 'h'); 116 | put('i', 'i'); 117 | put('j', 'j'); 118 | put('k', 'k'); 119 | put('l', 'l'); 120 | put('m', 'm'); 121 | put('n', 'n'); 122 | put('o', 'o'); 123 | put('p', 'p'); 124 | put('q', 'q'); 125 | put('r', 'r'); 126 | put('s', 's'); 127 | put('t', 't'); 128 | put('u', 'u'); 129 | put('v', 'v'); 130 | put('w', 'w'); 131 | put('x', 'x'); 132 | put('y', 'y'); 133 | put('z', 'z'); 134 | 135 | 136 | }} 137 | ); 138 | /** 139 | * Lowercase the string except for Turkish accented characters which are converted to uppercase ascii equivalent. 140 | * Useful for pattern matching. Handles all 3 encodings. 
141 | * The confusing case of i is as follows: i => i, dotted I => i, dotless i => I, I => I" 142 | */ 143 | public static final Map turkish_upcase_accents_table = 144 | 145 | Collections.unmodifiableMap( 146 | new HashMap() {{ 147 | 148 | put('ç', 'C'); 149 | put('Ç', 'C'); 150 | put('ğ', 'G'); 151 | put('Ğ', 'G'); 152 | put('ö', 'O'); 153 | put('Ö', 'O'); 154 | put('ı', 'I'); 155 | put('İ', 'i'); 156 | put('ş', 'S'); 157 | put('Ş', 'S'); 158 | put('ü', 'U'); 159 | put('Ü', 'U'); 160 | 161 | put('A', 'a'); 162 | put('B', 'b'); 163 | put('C', 'c'); 164 | put('D', 'd'); 165 | put('E', 'e'); 166 | put('F', 'f'); 167 | put('G', 'g'); 168 | put('H', 'h'); 169 | put('I', 'i'); 170 | put('J', 'j'); 171 | put('K', 'k'); 172 | put('L', 'l'); 173 | put('M', 'm'); 174 | put('N', 'n'); 175 | put('O', 'o'); 176 | put('P', 'p'); 177 | put('Q', 'q'); 178 | put('R', 'r'); 179 | put('S', 's'); 180 | put('T', 't'); 181 | put('U', 'u'); 182 | put('V', 'v'); 183 | put('W', 'w'); 184 | put('X', 'x'); 185 | put('Y', 'y'); 186 | put('Z', 'z'); 187 | 188 | put('a', 'a'); 189 | put('b', 'b'); 190 | put('c', 'c'); 191 | put('d', 'd'); 192 | put('e', 'e'); 193 | put('f', 'f'); 194 | put('g', 'g'); 195 | put('h', 'h'); 196 | put('i', 'i'); 197 | put('j', 'j'); 198 | put('k', 'k'); 199 | put('l', 'l'); 200 | put('m', 'm'); 201 | put('n', 'n'); 202 | put('o', 'o'); 203 | put('p', 'p'); 204 | put('q', 'q'); 205 | put('r', 'r'); 206 | put('s', 's'); 207 | put('t', 't'); 208 | put('u', 'u'); 209 | put('v', 'v'); 210 | put('w', 'w'); 211 | put('x', 'x'); 212 | put('y', 'y'); 213 | put('z', 'z'); 214 | 215 | }} 216 | ); 217 | /** 218 | * Converts turkish characters into ascii equivalent and appropriate 219 | * ascii characters to utf-8 turkish accented versions. 220 | */ 221 | public static final Map turkish_toggle_accent_table = 222 | 223 | Collections.unmodifiableMap( 224 | new HashMap() {{ 225 | 226 | put('c', 'ç'); 227 | put('C', 'Ç'); 228 | put('g', 'ğ'); 229 | put('G', 'Ğ'); 230 | put('o', 'ö'); 231 | put('O', 'Ö'); 232 | put('u', 'ü'); 233 | put('U', 'Ü'); 234 | put('i', 'ı'); 235 | put('I', 'İ'); 236 | put('s', 'ş'); 237 | put('S', 'Ş'); 238 | put('ç', 'c'); 239 | put('Ç', 'C'); 240 | put('ğ', 'g'); 241 | put('Ğ', 'G'); 242 | put('ö', 'o'); 243 | put('Ö', 'O'); 244 | put('ü', 'u'); 245 | put('Ü', 'U'); 246 | put('ı', 'i'); 247 | put('İ', 'I'); 248 | put('ş', 's'); 249 | put('Ş', 'S'); 250 | 251 | }} 252 | ); 253 | 254 | static final boolean ignoreCase = false; 255 | 256 | private PatternTableFactory() { 257 | } 258 | 259 | public static CharArrayMap getMap(char c) { 260 | switch (c) { 261 | case 'c': 262 | return MapC.map; 263 | case 'g': 264 | return MapG.map; 265 | case 'i': 266 | return MapI.map; 267 | case 'o': 268 | return MapO.map; 269 | case 's': 270 | return MapS.map; 271 | case 'u': 272 | return MapU.map; 273 | default: 274 | return null; 275 | } 276 | } 277 | 278 | @Override 279 | public Object clone() throws CloneNotSupportedException { 280 | throw new CloneNotSupportedException(); 281 | } 282 | } 283 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2016 Ahmet Arslan 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. --------------------------------------------------------------------------------