├── .gitignore
├── LICENSE
├── README.md
├── pom.xml
└── src
 ├── main
 ├── java
 │ └── com
 │ │ └── nicholasding
 │ │ └── search
 │ │ ├── lemmatization
 │ │ ├── ExceptionList.java
 │ │ ├── Lemmatizer.java
 │ │ ├── POS.java
 │ │ ├── WordNetReader.java
 │ │ └── impl
 │ │ │ ├── DetachmentRules.java
 │ │ │ ├── DirectoryWordNetReader.java
 │ │ │ ├── PackagedWordNetReader.java
 │ │ │ └── WordNetLemmatizer.java
 │ │ ├── solr
 │ │ ├── LemmatizerFilter.java
 │ │ └── LemmatizerFilterFactory.java
 │ │ └── util
 │ │ ├── RTrie.java
 │ │ ├── TernarySearchTree.java
 │ │ └── Trie.java
 └── resources
 │ └── wordnet.zip
 └── test
 └── java
 └── com
 └── nicholasding
 └── search
 ├── lemmatization
 ├── ExceptionListTest.java
 ├── LemmatizerTest.java
 ├── WordNetReaderStub.java
 └── impl
 │ └── BenchmarkTest.java
 ├── solr
 ├── LemmatizerFilterFactoryTest.java
 └── LemmatizerFilterTest.java
 └── util
 └── TrieTest.java

/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | 
3 | # Mobile Tools for Java (J2ME)
4 | .mtj.tmp/
5 | 
6 | # Package Files #
7 | *.jar
8 | *.war
9 | *.ear
10 | 
11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
12 | hs_err_pid*
13 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2016 Nicholas Ding
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# solr-lemmatizer

A TokenFilter that applies [lemmatization](http://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html) to *English* words. Unlike a stemmer, which strips suffixes algorithmically, it looks up the base form of each word (its lemma) in the [WordNet](https://wordnet.princeton.edu/wordnet/documentation/) dictionary.

For example, *better* is lemmatized to *good*, and *radii* to *radius*.

To use it in Solr, add the filter factory to an analyzer chain. A minimal field type using the bundled WordNet database looks roughly like this (field type name and tokenizer are illustrative):

```
<fieldType name="text_lemma" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="com.nicholasding.search.solr.LemmatizerFilterFactory"/>
  </analyzer>
</fieldType>
```

By default the filter uses the bundled WordNet database, but you can point it at your own WordNet *dict* directory via the `dictPath` argument:

```
<fieldType name="text_lemma" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="com.nicholasding.search.solr.LemmatizerFilterFactory" dictPath="/path/to/wordnet/dict"/>
  </analyzer>
</fieldType>
```

For better performance, the dictionary is held in a [Ternary Search Tree](https://en.wikipedia.org/wiki/Ternary_search_tree), which reduces memory usage while providing average O(log n) lookups.

Benchmark with the bundled WordNet database:

```
CPU: Intel(R) Core(TM) i7-4770K CPU @ 3.50GHz

TST initialize: 431 ms
Total access time: 143 ms, 147306 lookups, 970.7683 ns/lookup

RTrie initialize: 533 ms
Total access time: 3725 ms, 147306 lookups, 25287.496 ns/lookup
```
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.nicholasding</groupId>
  <artifactId>solr-lemmatizer</artifactId>
  <version>1.0.0</version>
  <packaging>jar</packaging>

  <name>${project.groupId}:${project.artifactId}</name>
  <description>A TokenFilter that applies lemmatization to English words.</description>
  <url>https://github.com/nicholasding/solr-lemmatizer</url>

  <licenses>
    <license>
      <name>The MIT License (MIT)</name>
      <url>https://github.com/nicholasding/solr-lemmatizer/blob/master/LICENSE</url>
    </license>
  </licenses>

  <developers>
    <developer>
      <name>Nicholas Ding</name>
      <email>nicholasdsj@gmail.com</email>
    </developer>
  </developers>

  <scm>
    <connection>scm:git:git://github.com/nicholasding/solr-lemmatizer.git</connection>
    <url>https://github.com/nicholasding/solr-lemmatizer</url>
  </scm>

  <distributionManagement>
    <snapshotRepository>
      <id>ossrh</id>
      <url>https://oss.sonatype.org/content/repositories/snapshots</url>
    </snapshotRepository>
    <repository>
      <id>ossrh</id>
      <url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
    </repository>
  </distributionManagement>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.7</source>
          <target>1.7</target>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-gpg-plugin</artifactId>
        <version>1.5</version>
        <executions>
          <execution>
            <id>sign-artifacts</id>
            <phase>verify</phase>
            <goals>
              <goal>sign</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.sonatype.plugins</groupId>
        <artifactId>nexus-staging-maven-plugin</artifactId>
        <version>1.6.7</version>
        <extensions>true</extensions>
        <configuration>
          <serverId>ossrh</serverId>
          <nexusUrl>https://oss.sonatype.org/</nexusUrl>
          <autoReleaseAfterClose>true</autoReleaseAfterClose>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <properties>
    <lucene.version>6.1.0</lucene.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>${lucene.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-analyzers-common</artifactId>
      <version>${lucene.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-test-framework</artifactId>
      <version>${lucene.version}</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>
--------------------------------------------------------------------------------
/src/main/java/com/nicholasding/search/lemmatization/ExceptionList.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.lemmatization;
2 | 
3 | import java.util.HashMap;
4 | import java.util.Map;
5 | 
6 | /**
7 |  * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29
8 |  */
9 | public class ExceptionList {
10 | 
11 |     private Map<String, String> nounExc, verbExc, adjectiveExc, adverbExc;
12 | 
13 |     public ExceptionList() {}
14 | 
15 |     public void addExceptionList(POS pos, Map<String, String> map) {
16 |         switch (pos) {
17 |             case NOUN:
18 |                 if (nounExc == null) nounExc = new HashMap<>();
19 |                 nounExc.putAll(map);
20 |                 break;
21 |             case ADJECTIVE:
22 |                 if (adjectiveExc == null) adjectiveExc = new HashMap<>();
23 |                 adjectiveExc.putAll(map);
24 |                 break;
25 |             case VERB:
26 |                 if (verbExc == null) verbExc = new HashMap<>();
27 |                 verbExc.putAll(map);
28 |                 break;
29 |             case ADVERB:
30 |                 if (adverbExc == null) adverbExc = new HashMap<>();
31 |                 adverbExc.putAll(map);
32 |                 break;
33 |         }
34 |     }
35 | 
36 |     public String lookupException(String word) {
37 |         return lookupException(word, null);
38 |     }
39 | 
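    // When no POS is supplied, the one-argument overload above falls through to
    // lookupLists(...), which consults the noun, verb, adjective and adverb
    // exception lists in that order and returns the first match.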
40 | public String lookupException(String word, POS pos) { 41 | if (pos == null) { 42 | String exc = lookupLists(word, nounExc, verbExc, adjectiveExc, adverbExc); 43 | if (exc != null) { 44 | return exc; 45 | } 46 | 47 | return null; 48 | } 49 | 50 | switch (pos) { 51 | case NOUN: return lookupList(word, nounExc); 52 | case ADJECTIVE: return lookupList(word, adjectiveExc); 53 | case VERB: return lookupList(word, verbExc); 54 | case ADVERB: return lookupList(word, adverbExc); 55 | } 56 | 57 | return null; 58 | } 59 | 60 | private String lookupList(String word, Map list) { 61 | if (list != null) { 62 | return list.get(word); 63 | } 64 | 65 | return null; 66 | } 67 | 68 | private String lookupLists(String word, Map... lists) { 69 | for (Map list : lists) { 70 | if (list != null && list.containsKey(word)) return list.get(word); 71 | } 72 | return null; 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/com/nicholasding/search/lemmatization/Lemmatizer.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.lemmatization; 2 | 3 | /** 4 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28 5 | */ 6 | public interface Lemmatizer { 7 | 8 | /** 9 | * It tries to find the basic form (lemma) for a given word. 10 | * 11 | * @param word 12 | * @return the original form of the word 13 | */ 14 | String stem(String word, POS pos); 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/nicholasding/search/lemmatization/POS.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.lemmatization; 2 | 3 | /** 4 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28. 5 | */ 6 | public enum POS { 7 | NOUN, VERB, ADJECTIVE, ADVERB 8 | } 9 | -------------------------------------------------------------------------------- /src/main/java/com/nicholasding/search/lemmatization/WordNetReader.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.lemmatization; 2 | 3 | import java.util.Collection; 4 | 5 | /** 6 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29 7 | */ 8 | public interface WordNetReader { 9 | 10 | /** 11 | * @return exception list for four categories 12 | */ 13 | ExceptionList readExceptionList(); 14 | 15 | /** 16 | * @return all the lemmas 17 | */ 18 | Collection readLemmas(); 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/com/nicholasding/search/lemmatization/impl/DetachmentRules.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.lemmatization.impl; 2 | 3 | import com.nicholasding.search.lemmatization.POS; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | /** 9 | * Detachment Rules in WordNet's morphological processing. 
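 *
 * For reference, the suffix substitutions defined by morphy are roughly the
 * following (this class implements a subset of them):
 *
 *   NOUN: "s" -> "", "ses" -> "s", "xes" -> "x", "zes" -> "z",
 *         "ches" -> "ch", "shes" -> "sh", "men" -> "man", "ies" -> "y"
 *   VERB: "s" -> "", "ies" -> "y", "es" -> "e", "es" -> "",
 *         "ed" -> "e", "ed" -> "", "ing" -> "e", "ing" -> ""
 *   ADJ:  "er" -> "", "est" -> "", "er" -> "e", "est" -> "e"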
10 | * 11 | * @see morphy (7WN) 12 | * 13 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29 14 | */ 15 | public class DetachmentRules { 16 | public String[] apply(String word, POS pos) { 17 | List candidates = new ArrayList<>(); 18 | switch (pos) { 19 | case NOUN: 20 | if (word.endsWith("s")) { 21 | candidates.add(word.substring(0, word.length() - 1)); 22 | } 23 | if (word.endsWith("ses")) { 24 | candidates.add(word.substring(0, word.length() - 2)); 25 | break; 26 | } 27 | if (word.endsWith("xes")) { 28 | candidates.add(word.substring(0, word.length() - 2)); 29 | break; 30 | } 31 | if (word.endsWith("zes")) { 32 | candidates.add(word.substring(0, word.length() - 2)); 33 | break; 34 | } 35 | if (word.endsWith("ches")) { 36 | candidates.add(word.substring(0, word.length() - 2)); 37 | break; 38 | } 39 | if (word.endsWith("shes")) { 40 | candidates.add(word.substring(0, word.length() - 2)); 41 | break; 42 | } 43 | if (word.endsWith("men")) { 44 | candidates.add(word.substring(0, word.length() - 3) + "man"); 45 | break; 46 | } 47 | if (word.endsWith("ies")) { 48 | candidates.add(word.substring(0, word.length() - 3) + "y"); 49 | break; 50 | } 51 | case VERB: 52 | if (word.endsWith("s")) { 53 | candidates.add(word.substring(0, word.length() - 1)); 54 | } 55 | if (word.endsWith("ies")) { 56 | candidates.add(word.substring(0, word.length() - 3) + "y"); 57 | } 58 | if (word.endsWith("es")) { 59 | candidates.add(word.substring(0, word.length() - 1)); 60 | candidates.add(word); 61 | } 62 | if (word.endsWith("ed")) { 63 | candidates.add(word.substring(0, word.length() - 1)); 64 | candidates.add(word); 65 | break; 66 | } 67 | if (word.endsWith("ing")) { 68 | candidates.add(word.substring(0, word.length() - 3) + "e"); 69 | candidates.add(word); 70 | break; 71 | } 72 | break; 73 | case ADJECTIVE: 74 | if (word.endsWith("er") || word.endsWith("est")) { 75 | candidates.add(word); 76 | } 77 | if (word.endsWith("er")) { 78 | candidates.add(word.substring(0, word.length() - 1)); 79 | } 80 | if (word.endsWith("est")) { 81 | candidates.add(word.substring(0, word.length() - 2)); 82 | } 83 | break; 84 | default: 85 | candidates.add(word); 86 | } 87 | 88 | return candidates.toArray(new String[0]); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/java/com/nicholasding/search/lemmatization/impl/DirectoryWordNetReader.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.lemmatization.impl; 2 | 3 | import com.nicholasding.search.lemmatization.ExceptionList; 4 | import com.nicholasding.search.lemmatization.POS; 5 | import com.nicholasding.search.lemmatization.WordNetReader; 6 | import java.io.BufferedReader; 7 | import java.io.File; 8 | import java.io.FileFilter; 9 | import java.io.FileNotFoundException; 10 | import java.io.FileReader; 11 | import java.io.IOException; 12 | import java.util.ArrayList; 13 | import java.util.Collection; 14 | import java.util.HashMap; 15 | import java.util.Iterator; 16 | import java.util.LinkedList; 17 | import java.util.List; 18 | import java.util.Map; 19 | 20 | /** 21 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28. 
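 *
 * Reads a WordNet "dict" directory from disk: the *.exc files feed the
 * exception lists, and the index.* files (excluding *.sense) supply the
 * lemmas. The directory is the one selected by the factory's dictPath
 * argument, e.g. dictPath="/usr/local/WordNet-3.0/dict" (path illustrative).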
22 | */ 23 | public class DirectoryWordNetReader implements WordNetReader { 24 | 25 | private String dictPath; 26 | 27 | public DirectoryWordNetReader(String dictPath) { 28 | this.dictPath = dictPath; 29 | } 30 | 31 | protected Map loadExceptionList(File file) { 32 | Map m = new HashMap<>(); 33 | try { 34 | BufferedReader reader = new BufferedReader(new FileReader(file)); 35 | String line = null; 36 | while ((line = reader.readLine()) != null) { 37 | String[] cols = line.split("\\s+"); 38 | m.put(cols[0], cols[1]); 39 | } 40 | } catch (FileNotFoundException e) { 41 | e.printStackTrace(); 42 | } catch (IOException e) { 43 | e.printStackTrace(); 44 | } 45 | 46 | return m; 47 | } 48 | 49 | protected void readIndexFile(File file, Collection collector) { 50 | try { 51 | BufferedReader reader = new BufferedReader(new FileReader(file)); 52 | String line = null; 53 | while ((line = reader.readLine()) != null) { 54 | if (!line.startsWith(" ")) { 55 | String[] cols = line.split("\\s+"); 56 | collector.add(cols[0]); 57 | } 58 | } 59 | } catch (FileNotFoundException e) { 60 | e.printStackTrace(); 61 | } catch (IOException e) { 62 | e.printStackTrace(); 63 | } 64 | } 65 | 66 | @Override 67 | public ExceptionList readExceptionList() { 68 | ExceptionList list = new ExceptionList(); 69 | 70 | File dir = new File(dictPath); 71 | File[] files = dir.listFiles(new FileFilter() { 72 | @Override 73 | public boolean accept(File pathname) { 74 | return pathname.getName().endsWith(".exc"); 75 | } 76 | }); 77 | 78 | for (File file : files) { 79 | Map m = loadExceptionList(file); 80 | if ("noun.exc".equals(file.getName())) { 81 | list.addExceptionList(POS.NOUN, m); 82 | } else if ("adj.exc".equals(file.getName())) { 83 | list.addExceptionList(POS.ADJECTIVE, m); 84 | } else if ("adv.exc".equals(file.getName())) { 85 | list.addExceptionList(POS.ADVERB, m); 86 | } else if ("verb.exc".equals(file.getName())) { 87 | list.addExceptionList(POS.VERB, m); 88 | } 89 | } 90 | 91 | return list; 92 | } 93 | 94 | @Override 95 | public Collection readLemmas() { 96 | Collection lemmas = new LinkedList<>(); 97 | 98 | File dir = new File(dictPath); 99 | File[] files = dir.listFiles(new FileFilter() { 100 | @Override 101 | public boolean accept(File pathname) { 102 | return pathname.getName().startsWith("index.") && !pathname.getName().endsWith(".sense"); 103 | } 104 | }); 105 | 106 | for (File f : files) { 107 | readIndexFile(f, lemmas); 108 | } 109 | 110 | return lemmas; 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/com/nicholasding/search/lemmatization/impl/PackagedWordNetReader.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.lemmatization.impl; 2 | 3 | import com.nicholasding.search.lemmatization.ExceptionList; 4 | import com.nicholasding.search.lemmatization.POS; 5 | import com.nicholasding.search.lemmatization.WordNetReader; 6 | import java.io.BufferedReader; 7 | import java.io.FileNotFoundException; 8 | import java.io.IOException; 9 | import java.io.InputStream; 10 | import java.io.InputStreamReader; 11 | import java.util.Collection; 12 | import java.util.HashMap; 13 | import java.util.LinkedList; 14 | import java.util.List; 15 | import java.util.Map; 16 | import java.util.zip.ZipEntry; 17 | import java.util.zip.ZipInputStream; 18 | 19 | /** 20 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29 21 | */ 22 | public class PackagedWordNetReader implements 
WordNetReader { 23 | 24 | private ExceptionList exceptionList = new ExceptionList(); 25 | private Collection lemmas = new LinkedList<>(); 26 | 27 | public PackagedWordNetReader(String resource) { 28 | InputStream zipfile = getClass().getClassLoader().getResourceAsStream(resource); 29 | ZipInputStream zipIn = new ZipInputStream(zipfile); 30 | ZipEntry entry = null; 31 | 32 | try { 33 | while ((entry = zipIn.getNextEntry()) != null) { 34 | String name = entry.getName(); 35 | if (name.endsWith(".exc")) { 36 | Map m = loadExceptionList(zipIn); 37 | if ("noun.exc".equals(name)) { 38 | exceptionList.addExceptionList(POS.NOUN, m); 39 | } else if ("adj.exc".equals(name)) { 40 | exceptionList.addExceptionList(POS.ADJECTIVE, m); 41 | } else if ("adv.exc".equals(name)) { 42 | exceptionList.addExceptionList(POS.ADVERB, m); 43 | } else if ("verb.exc".equals(name)) { 44 | exceptionList.addExceptionList(POS.VERB, m); 45 | } 46 | } else { 47 | readIndexFile(zipIn, lemmas); 48 | } 49 | } 50 | } catch (IOException e) { 51 | e.printStackTrace(); 52 | } 53 | 54 | } 55 | 56 | protected Map loadExceptionList(ZipInputStream stream) { 57 | Map m = new HashMap<>(); 58 | try { 59 | BufferedReader reader = new BufferedReader(new InputStreamReader(stream)); 60 | String line = null; 61 | while ((line = reader.readLine()) != null) { 62 | String[] cols = line.split("\\s+"); 63 | m.put(cols[0], cols[1]); 64 | } 65 | } catch (FileNotFoundException e) { 66 | e.printStackTrace(); 67 | } catch (IOException e) { 68 | e.printStackTrace(); 69 | } 70 | 71 | return m; 72 | } 73 | 74 | protected void readIndexFile(ZipInputStream stream, Collection collector) { 75 | List list = new LinkedList<>(); 76 | try { 77 | BufferedReader reader = new BufferedReader(new InputStreamReader(stream)); 78 | String line = null; 79 | while ((line = reader.readLine()) != null) { 80 | if (!line.startsWith(" ")) { 81 | String[] cols = line.split("\\s+"); 82 | collector.add(cols[0]); 83 | } 84 | } 85 | } catch (FileNotFoundException e) { 86 | e.printStackTrace(); 87 | } catch (IOException e) { 88 | e.printStackTrace(); 89 | } 90 | } 91 | 92 | @Override 93 | public ExceptionList readExceptionList() { 94 | return exceptionList; 95 | } 96 | 97 | @Override 98 | public Collection readLemmas() { 99 | return lemmas; 100 | } 101 | 102 | } 103 | -------------------------------------------------------------------------------- /src/main/java/com/nicholasding/search/lemmatization/impl/WordNetLemmatizer.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.lemmatization.impl; 2 | 3 | import com.nicholasding.search.lemmatization.ExceptionList; 4 | import com.nicholasding.search.lemmatization.Lemmatizer; 5 | import com.nicholasding.search.lemmatization.POS; 6 | import com.nicholasding.search.lemmatization.WordNetReader; 7 | import com.nicholasding.search.util.RTrie; 8 | import com.nicholasding.search.util.TernarySearchTree; 9 | import com.nicholasding.search.util.Trie; 10 | 11 | import java.util.ArrayList; 12 | import java.util.Collections; 13 | import java.util.List; 14 | 15 | /** 16 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28 17 | */ 18 | public class WordNetLemmatizer implements Lemmatizer { 19 | 20 | private Trie trie; 21 | private ExceptionList exceptionList; 22 | private DetachmentRules rules; 23 | 24 | /** 25 | * Default constructor will load the required resources and construct the trie. 
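 *
 * A typical setup, mirroring what LemmatizerFilterFactory does:
 * <pre>
 * Lemmatizer lemmatizer = new WordNetLemmatizer(
 *         new PackagedWordNetReader("wordnet.zip"), new TernarySearchTree());
 * lemmatizer.stem("radii", POS.NOUN); // returns "radius"
 * </pre>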
26 | * 27 | * @param reader 28 | */ 29 | public WordNetLemmatizer(WordNetReader reader, Trie trie) { 30 | this.exceptionList = reader.readExceptionList(); 31 | this.trie = trie; 32 | this.rules = new DetachmentRules(); 33 | 34 | buildTrie(reader, trie); 35 | } 36 | 37 | private void buildTrie(WordNetReader reader, Trie trie) { 38 | for (String word : reader.readLemmas()) { 39 | trie.put(word, Boolean.TRUE); 40 | } 41 | } 42 | 43 | public String stem(String word, POS pos) { 44 | if (word == null || word.length() == 0) return null; 45 | 46 | String exception = checkExceptionList(word, pos); 47 | if (exception != null) return exception; 48 | 49 | String[] candidates = transform(word, pos); 50 | for (String candidate : candidates) { 51 | if (!candidate.isEmpty() && trie.contains(candidate)) return candidate; 52 | } 53 | 54 | return word; 55 | } 56 | 57 | private String[] transform(String word, POS pos) { 58 | if (pos != null) { 59 | return rules.apply(word, pos); 60 | } else { 61 | List candidates = new ArrayList<>(); 62 | Collections.addAll(candidates, rules.apply(word, POS.NOUN)); 63 | if (candidates.size() == 0) Collections.addAll(candidates, rules.apply(word, POS.VERB)); 64 | if (candidates.size() == 0) Collections.addAll(candidates, rules.apply(word, POS.ADJECTIVE)); 65 | if (candidates.size() == 0) Collections.addAll(candidates, rules.apply(word, POS.ADVERB)); 66 | return candidates.toArray(new String[0]); 67 | } 68 | } 69 | 70 | protected String checkExceptionList(String word, POS pos) { 71 | return exceptionList.lookupException(word, pos); 72 | } 73 | 74 | protected Trie getTrie() { 75 | return trie; 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java/com/nicholasding/search/solr/LemmatizerFilter.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.solr; 2 | 3 | import com.nicholasding.search.lemmatization.Lemmatizer; 4 | import com.nicholasding.search.lemmatization.POS; 5 | import java.io.IOException; 6 | import org.apache.lucene.analysis.TokenFilter; 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 9 | 10 | /** 11 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29 12 | */ 13 | public class LemmatizerFilter extends TokenFilter { 14 | 15 | private final CharTermAttribute termAtt = (CharTermAttribute)this.addAttribute(CharTermAttribute.class); 16 | private final Lemmatizer lemmatizer; 17 | 18 | public LemmatizerFilter(TokenStream tokenStream, Lemmatizer lemmatizer) { 19 | super(tokenStream); 20 | this.lemmatizer = lemmatizer; 21 | } 22 | 23 | @Override 24 | public final boolean incrementToken() throws IOException { 25 | if (!this.input.incrementToken()) { 26 | return false; 27 | } else { 28 | char[] termBuffer = this.termAtt.buffer(); 29 | String lemma = lemmatizer.stem(new String(termBuffer, 0, this.termAtt.length()), null); 30 | this.termAtt.copyBuffer(lemma.toCharArray(), 0, lemma.length()); 31 | return true; 32 | } 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/com/nicholasding/search/solr/LemmatizerFilterFactory.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.solr; 2 | 3 | import com.nicholasding.search.lemmatization.Lemmatizer; 4 | import com.nicholasding.search.lemmatization.WordNetReader; 5 | import 
com.nicholasding.search.lemmatization.impl.DirectoryWordNetReader; 6 | import com.nicholasding.search.lemmatization.impl.PackagedWordNetReader; 7 | import com.nicholasding.search.lemmatization.impl.WordNetLemmatizer; 8 | import java.util.Map; 9 | 10 | import com.nicholasding.search.util.TernarySearchTree; 11 | import org.apache.lucene.analysis.TokenStream; 12 | import org.apache.lucene.analysis.util.TokenFilterFactory; 13 | 14 | /** 15 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29 16 | */ 17 | public class LemmatizerFilterFactory extends TokenFilterFactory { 18 | 19 | private static final String KEY_DICT_PATH = "dictPath"; 20 | 21 | private Lemmatizer lemmatizer; 22 | 23 | public LemmatizerFilterFactory(Map args) { 24 | super(args); 25 | 26 | WordNetReader reader; 27 | 28 | if (args.containsKey(KEY_DICT_PATH)) { 29 | String path = args.get(KEY_DICT_PATH); 30 | reader = new DirectoryWordNetReader(path); 31 | } else { 32 | reader = new PackagedWordNetReader("wordnet.zip"); 33 | } 34 | 35 | lemmatizer = new WordNetLemmatizer(reader, new TernarySearchTree()); 36 | } 37 | 38 | @Override 39 | public TokenStream create(TokenStream tokenStream) { 40 | return new LemmatizerFilter(tokenStream, lemmatizer); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/nicholasding/search/util/RTrie.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.util; 2 | 3 | import java.util.HashMap; 4 | import java.util.Iterator; 5 | import java.util.LinkedList; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | /** 10 | * This is a R way trie implementation. 11 | * 12 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28. 
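 *
 * Each node keeps its children in a HashMap keyed by character, so put/get
 * walk one map per character of the key. See BenchmarkTest for how this
 * compares with TernarySearchTree on build time and lookup cost.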
13 | */ 14 | public class RTrie implements Trie { 15 | 16 | private static class Node { 17 | Object value; 18 | Character key; 19 | Map children = new HashMap<>(); 20 | } 21 | 22 | private Node root; 23 | 24 | @Override 25 | public void put(String key, Object value) { 26 | root = put(root, key, value, 0); 27 | } 28 | 29 | protected Node put(Node node, String key, Object value, int level) { 30 | if (node == null) { node = new Node(); } 31 | if (level == key.length()) { node.value = value; return node; } 32 | Character c = key.charAt(level); 33 | 34 | node.children.put(c, put(node.children.get(c), key, value, level + 1)); 35 | node.children.get(c).key = c; 36 | 37 | return node; 38 | } 39 | 40 | @Override 41 | public Object get(String key) { 42 | Node node = get(root, key, 0); 43 | if (node == null) return null; 44 | return node.value; 45 | } 46 | 47 | protected Node get(Node node, String key, int level) { 48 | if (node == null) return null; 49 | if (level == key.length()) return node; 50 | Character c = key.charAt(level); 51 | return get(node.children.get(c), key, level + 1); 52 | } 53 | 54 | @Override 55 | public boolean contains(String key) { 56 | return get(key) != null; 57 | } 58 | 59 | @Override 60 | public Iterator keys() { 61 | List collector = new LinkedList(); 62 | 63 | for (Node n : root.children.values()) { 64 | collect(n, "", 1, collector); 65 | } 66 | 67 | return collector.iterator(); 68 | } 69 | 70 | private void collect(Node node, String prefix, int level, List collector) { 71 | if (node == null) return; 72 | if (node.value != null) collector.add(prefix + node.key); 73 | 74 | for (Node n : node.children.values()) { 75 | collect(n, prefix + node.key, level + 1, collector); 76 | } 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/com/nicholasding/search/util/TernarySearchTree.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.util; 2 | 3 | import java.util.Iterator; 4 | import java.util.LinkedList; 5 | import java.util.List; 6 | 7 | /** 8 | * An simple implementation of @see Ternary Search Tree. 
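 *
 * Each node stores a single character plus left, middle and right links:
 * left/right are followed when the search character sorts before/after the
 * node's key, and middle advances to the next character of the key. A value
 * is attached only to the node where its key ends.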
9 | * 10 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29 11 | */ 12 | public class TernarySearchTree implements Trie { 13 | 14 | private static class Node { 15 | Object value; 16 | Character key; 17 | Node left, middle, right; 18 | } 19 | 20 | private Node root; 21 | 22 | @Override 23 | public void put(String key, Object value) { 24 | root = put(root, key, value, 0); 25 | } 26 | 27 | protected Node put(Node node, String key, Object value, int level) { 28 | Character c = key.charAt(level); 29 | 30 | if (node == null) { node = new Node(); node.key = c; } 31 | 32 | if (c < node.key) node.left = put(node.left, key, value, level); 33 | else if (c > node.key) node.right = put(node.right, key, value, level); 34 | else if (level < key.length() - 1) node.middle = put(node.middle, key, value, level + 1); 35 | else node.value = value; 36 | 37 | return node; 38 | } 39 | 40 | @Override 41 | public Object get(String key) { 42 | Node node = get(root, key, 0); 43 | if (node == null) return null; 44 | return node.value; 45 | } 46 | 47 | protected Node get(Node node, String key, int level) { 48 | if (node == null) return null; 49 | 50 | Character c = key.charAt(level); 51 | if (c < node.key) return get(node.left, key, level); 52 | else if (c > node.key) return get(node.right, key, level); 53 | else if (level < key.length() - 1) return get(node.middle, key, level + 1); 54 | else return node; 55 | } 56 | 57 | @Override 58 | public Iterator keys() { 59 | List collector = new LinkedList(); 60 | collect(root, "", 0, collector); 61 | return collector.iterator(); 62 | } 63 | 64 | @Override 65 | public boolean contains(String key) { 66 | return get(key) != null; 67 | } 68 | 69 | private void collect(Node node, String prefix, int level, List collector) { 70 | if (node == null) return; 71 | if (node.value != null) collector.add(prefix + node.key); 72 | 73 | collect(node.left, prefix, level, collector); 74 | collect(node.middle, prefix + node.key, level + 1, collector); 75 | collect(node.right, prefix, level, collector); 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java/com/nicholasding/search/util/Trie.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.util; 2 | 3 | import java.util.Iterator; 4 | 5 | /** 6 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28. 
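 *
 * Minimal usage (see TrieTest):
 * <pre>
 * Trie trie = new TernarySearchTree();
 * trie.put("ab", "ab");
 * trie.contains("ab"); // true
 * trie.get("ab");      // "ab"
 * </pre>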
7 | */ 8 | public interface Trie { 9 | 10 | /** 11 | * Store the object into the Trie by providing a string key 12 | * 13 | * @param key 14 | * @param value 15 | */ 16 | void put(String key, Object value); 17 | 18 | /** 19 | * Retrieve the object stored in the Trie 20 | * 21 | * @param key 22 | * @return null if key doesn't exist 23 | */ 24 | Object get(String key); 25 | 26 | /** 27 | * @return iterable over all the keys 28 | */ 29 | Iterator keys(); 30 | 31 | /** 32 | * @param key 33 | * @return true if key was found in the trie 34 | */ 35 | boolean contains(String key); 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/resources/wordnet.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nicholasding/solr-lemmatizer/37feb0961bda036823a7ac46658e84dd6246f354/src/main/resources/wordnet.zip -------------------------------------------------------------------------------- /src/test/java/com/nicholasding/search/lemmatization/ExceptionListTest.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.lemmatization; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | 9 | /** 10 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29 11 | */ 12 | public class ExceptionListTest { 13 | 14 | @Test 15 | public void testInsertAndLookup() { 16 | ExceptionList excList = new ExceptionList(); 17 | Map m = new HashMap<>(); 18 | m.put("a", "b"); 19 | excList.addExceptionList(POS.NOUN, m); 20 | 21 | Assert.assertEquals("b", excList.lookupException("a")); 22 | Assert.assertEquals("b", excList.lookupException("a", POS.NOUN)); 23 | Assert.assertEquals(null, excList.lookupException("a", POS.VERB)); 24 | } 25 | 26 | @Test 27 | public void testMergeInput() { 28 | ExceptionList excList = new ExceptionList(); 29 | Map m = new HashMap<>(); 30 | m.put("a", "b"); 31 | 32 | excList.addExceptionList(POS.NOUN, m); 33 | 34 | m.put("c", "d"); 35 | excList.addExceptionList(POS.NOUN, m); 36 | 37 | Assert.assertEquals("b", excList.lookupException("a")); 38 | Assert.assertEquals("d", excList.lookupException("c")); 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/test/java/com/nicholasding/search/lemmatization/LemmatizerTest.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.lemmatization; 2 | 3 | import com.nicholasding.search.lemmatization.impl.PackagedWordNetReader; 4 | import com.nicholasding.search.lemmatization.impl.WordNetLemmatizer; 5 | import com.nicholasding.search.util.RTrie; 6 | import com.nicholasding.search.util.TernarySearchTree; 7 | import org.junit.Assert; 8 | import org.junit.Test; 9 | 10 | /** 11 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28 12 | */ 13 | public class LemmatizerTest { 14 | 15 | @Test 16 | public void testStemShouldReturnNull() { 17 | Lemmatizer lemmatizer = new WordNetLemmatizer(new WordNetReaderStub(), new RTrie()); 18 | Assert.assertNull(lemmatizer.stem(null, null)); 19 | } 20 | 21 | @Test 22 | public void testStemShouldGetResultFromExceptionList() { 23 | Lemmatizer lemmatizer = new WordNetLemmatizer(new WordNetReaderStub(), new RTrie()); 24 | Assert.assertEquals("radius", lemmatizer.stem("radii", null)); 25 | } 26 | 27 | @Test 28 | public void testStemShouldGetResultFromTrie() { 
29 | Lemmatizer lemmatizer = new WordNetLemmatizer(new WordNetReaderStub(), new RTrie()); 30 | Assert.assertEquals("toy", lemmatizer.stem("toys", POS.NOUN)); 31 | } 32 | 33 | @Test 34 | public void testStemShouldReturnOriginalForm() { 35 | Lemmatizer lemmatizer = new WordNetLemmatizer(new WordNetReaderStub(), new RTrie()); 36 | Assert.assertEquals("toy", lemmatizer.stem("toy", POS.NOUN)); 37 | } 38 | 39 | @Test 40 | public void testRealFilesFromResources() { 41 | Lemmatizer lemmatizer = new WordNetLemmatizer(new PackagedWordNetReader("wordnet.zip"), new RTrie()); 42 | Assert.assertEquals("toy", lemmatizer.stem("toy", POS.NOUN)); 43 | Assert.assertEquals("plant", lemmatizer.stem("plants", POS.NOUN)); 44 | Assert.assertEquals("radius", lemmatizer.stem("radii", POS.NOUN)); 45 | } 46 | 47 | @Test 48 | public void testWhenInputIsS() { 49 | Lemmatizer lemmatizer = new WordNetLemmatizer(new PackagedWordNetReader("wordnet.zip"), new TernarySearchTree()); 50 | Assert.assertEquals("s", lemmatizer.stem("s", POS.NOUN)); 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/test/java/com/nicholasding/search/lemmatization/WordNetReaderStub.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.lemmatization; 2 | 3 | import java.util.*; 4 | 5 | /** 6 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28 7 | */ 8 | public class WordNetReaderStub implements WordNetReader { 9 | 10 | private Map createExceptionList(POS noun) { 11 | Map m = new HashMap(); 12 | m.put("radii", "radius"); 13 | return m; 14 | } 15 | 16 | @Override 17 | public ExceptionList readExceptionList() { 18 | ExceptionList list = new ExceptionList(); 19 | list.addExceptionList(POS.NOUN, createExceptionList(POS.NOUN)); 20 | return list; 21 | } 22 | 23 | @Override 24 | public Collection readLemmas() { 25 | List data = new ArrayList(); 26 | data.add("toy"); 27 | return data; 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/test/java/com/nicholasding/search/lemmatization/impl/BenchmarkTest.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.lemmatization.impl; 2 | 3 | import com.nicholasding.search.util.RTrie; 4 | import com.nicholasding.search.util.TernarySearchTree; 5 | import com.nicholasding.search.util.Trie; 6 | import org.junit.Test; 7 | 8 | import java.util.Iterator; 9 | 10 | /** 11 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29 12 | */ 13 | public class BenchmarkTest { 14 | 15 | @Test 16 | public void benchmarkRTrie() { 17 | long start = System.currentTimeMillis(); 18 | WordNetLemmatizer lemmatizer = new WordNetLemmatizer(new PackagedWordNetReader("wordnet.zip"), new RTrie()); 19 | System.out.println("RTrie initialize: " + (System.currentTimeMillis() - start) + " ms"); 20 | 21 | Trie trie = lemmatizer.getTrie(); 22 | benchmark(trie); 23 | } 24 | 25 | @Test 26 | public void benchmarkTST() { 27 | long start = System.currentTimeMillis(); 28 | WordNetLemmatizer lemmatizer = new WordNetLemmatizer(new PackagedWordNetReader("wordnet.zip"), new TernarySearchTree()); 29 | System.out.println("TST initialize: " + (System.currentTimeMillis() - start) + " ms"); 30 | 31 | Trie trie = lemmatizer.getTrie(); 32 | benchmark(trie); 33 | } 34 | 35 | private void benchmark(Trie trie) { 36 | long start = System.currentTimeMillis(); 37 | 38 | long counter = 0; 39 | Iterator keys = 
trie.keys(); 40 | while (keys.hasNext()) { 41 | trie.get(keys.next()); 42 | counter++; 43 | } 44 | 45 | long end = System.currentTimeMillis(); 46 | 47 | float avg = (float) (end - start) / counter * 1000000; 48 | System.out.println("Total access time: " + (end - start) + " ms, " + counter + " lookups, " + avg + " ns/lookup"); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/test/java/com/nicholasding/search/solr/LemmatizerFilterFactoryTest.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.solr; 2 | 3 | import org.apache.lucene.analysis.BaseTokenStreamTestCase; 4 | import org.apache.lucene.analysis.MockTokenizer; 5 | import org.apache.lucene.analysis.TokenStream; 6 | import org.junit.Test; 7 | 8 | import java.io.IOException; 9 | import java.io.StringReader; 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | 13 | /** 14 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29 15 | */ 16 | public class LemmatizerFilterFactoryTest extends BaseTokenStreamTestCase { 17 | 18 | @Test 19 | public void testUsingPackagedWordNetReader() throws IOException { 20 | Map args = new HashMap<>(); 21 | LemmatizerFilterFactory factory = new LemmatizerFilterFactory(args); 22 | StringReader reader = new StringReader("it better works"); 23 | final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false); 24 | in.setReader(reader); 25 | TokenStream stream = factory.create(in); 26 | assertTokenStreamContents(stream, new String[] { "it", "good", "work" }); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/test/java/com/nicholasding/search/solr/LemmatizerFilterTest.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.solr; 2 | 3 | import com.nicholasding.search.lemmatization.impl.PackagedWordNetReader; 4 | import com.nicholasding.search.lemmatization.impl.WordNetLemmatizer; 5 | import java.io.IOException; 6 | import java.io.StringReader; 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | 10 | import com.nicholasding.search.util.RTrie; 11 | import com.nicholasding.search.util.TernarySearchTree; 12 | import org.apache.lucene.analysis.BaseTokenStreamTestCase; 13 | import org.apache.lucene.analysis.MockTokenizer; 14 | import org.apache.lucene.analysis.TokenStream; 15 | import org.junit.Test; 16 | 17 | /** 18 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29 19 | */ 20 | public class LemmatizerFilterTest extends BaseTokenStreamTestCase { 21 | 22 | @Test 23 | public void testWithSamplePhrase() throws IOException { 24 | StringReader reader = new StringReader("it better works"); 25 | final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false); 26 | in.setReader(reader); 27 | TokenStream stream = new LemmatizerFilter(in, new WordNetLemmatizer(new PackagedWordNetReader("wordnet.zip"), new RTrie())); 28 | assertTokenStreamContents(stream, new String[] { "it", "good", "work" }); 29 | } 30 | 31 | @Test 32 | public void testUsingPackagedWordNetReaderFromFilterFactory() throws IOException { 33 | Map args = new HashMap<>(); 34 | LemmatizerFilterFactory factory = new LemmatizerFilterFactory(args); 35 | 36 | StringReader reader = new StringReader("it better works"); 37 | final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false); 38 | in.setReader(reader); 39 | TokenStream stream = 
factory.create(in); 40 | assertTokenStreamContents(stream, new String[] { "it", "good", "work" }); 41 | } 42 | 43 | @Test 44 | public void testUsingDirectoryWordNetReaderWithDummyPathShouldFailSilently() throws IOException { 45 | Map args = new HashMap<>(); 46 | args.put("dictPath", "/tmp"); 47 | LemmatizerFilterFactory factory = new LemmatizerFilterFactory(args); 48 | 49 | StringReader reader = new StringReader("it better works"); 50 | final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false); 51 | in.setReader(reader); 52 | TokenStream stream = factory.create(in); 53 | assertTokenStreamContents(stream, new String[] { "it", "better", "works" }); 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/test/java/com/nicholasding/search/util/TrieTest.java: -------------------------------------------------------------------------------- 1 | package com.nicholasding.search.util; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | import java.util.Iterator; 7 | 8 | /** 9 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28. 10 | */ 11 | public class TrieTest { 12 | 13 | @Test 14 | public void testPut() { 15 | testPut(new RTrie()); 16 | testPut(new TernarySearchTree()); 17 | } 18 | 19 | private void testPut(Trie trie) { 20 | trie.put("ab", "ab"); 21 | Assert.assertTrue(trie.contains("ab")); 22 | Assert.assertEquals("ab", trie.get("ab")); 23 | } 24 | 25 | @Test 26 | public void testIterateAllKeys() { 27 | testTrieKeys(new RTrie()); 28 | testTrieKeys(new TernarySearchTree()); 29 | } 30 | 31 | private void testTrieKeys(Trie trie) { 32 | trie.put("ab", "ab"); 33 | trie.put("bc", "bc"); 34 | trie.put("cd", "cd"); 35 | 36 | Iterator keys = trie.keys(); 37 | Assert.assertEquals("ab", keys.next()); 38 | Assert.assertEquals("bc", keys.next()); 39 | Assert.assertEquals("cd", keys.next()); 40 | } 41 | } 42 | --------------------------------------------------------------------------------