├── .gitignore
├── LICENSE
├── README.md
├── pom.xml
└── src
    ├── main
    │   ├── java
    │   │   └── com
    │   │       └── nicholasding
    │   │           └── search
    │   │               ├── lemmatization
    │   │               │   ├── ExceptionList.java
    │   │               │   ├── Lemmatizer.java
    │   │               │   ├── POS.java
    │   │               │   ├── WordNetReader.java
    │   │               │   └── impl
    │   │               │       ├── DetachmentRules.java
    │   │               │       ├── DirectoryWordNetReader.java
    │   │               │       ├── PackagedWordNetReader.java
    │   │               │       └── WordNetLemmatizer.java
    │   │               ├── solr
    │   │               │   ├── LemmatizerFilter.java
    │   │               │   └── LemmatizerFilterFactory.java
    │   │               └── util
    │   │                   ├── RTrie.java
    │   │                   ├── TernarySearchTree.java
    │   │                   └── Trie.java
    │   └── resources
    │       └── wordnet.zip
    └── test
        └── java
            └── com
                └── nicholasding
                    └── search
                        ├── lemmatization
                        │   ├── ExceptionListTest.java
                        │   ├── LemmatizerTest.java
                        │   ├── WordNetReaderStub.java
                        │   └── impl
                        │       └── BenchmarkTest.java
                        ├── solr
                        │   ├── LemmatizerFilterFactoryTest.java
                        │   └── LemmatizerFilterTest.java
                        └── util
                            └── TrieTest.java
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
3 | # Mobile Tools for Java (J2ME)
4 | .mtj.tmp/
5 |
6 | # Package Files #
7 | *.jar
8 | *.war
9 | *.ear
10 |
11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
12 | hs_err_pid*
13 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Nicholas Ding
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # solr-lemmatizer
2 |
3 | A TokenFilter that applies [lemmatization](http://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html) to *English* words. Unlike a stemmer, which algorithmically strips suffixes, it looks up each word's base form (lemma) in the [WordNet](https://wordnet.princeton.edu/wordnet/documentation/) dictionary.
4 |
5 | For example, *better* is lemmatized to *good*, and *radii* to *radius*. To use the filter in Solr, add the filter factory to your field type's analyzer chain:
6 |
7 | ```
8 | <!-- schema.xml: illustrative analyzer chain using the bundled WordNet database -->
9 |
10 | <fieldType name="text_lemma" class="solr.TextField" positionIncrementGap="100">
11 |   <analyzer>
12 |     <tokenizer class="solr.StandardTokenizerFactory"/>
13 |     <filter class="solr.LowerCaseFilterFactory"/>
14 |     <filter class="com.nicholasding.search.solr.LemmatizerFilterFactory"/>
15 |   </analyzer>
16 | </fieldType>
17 | ```
18 |
19 | By default the filter uses the WordNet database bundled in the jar, but you can point it at your own WordNet *dict* directory:
20 |
21 | ```
22 | <!-- schema.xml: point dictPath at a local WordNet "dict" directory (path is illustrative) -->
23 |
24 | <fieldType name="text_lemma" class="solr.TextField" positionIncrementGap="100">
25 |   <analyzer>
26 |     <tokenizer class="solr.StandardTokenizerFactory"/>
27 |     <filter class="solr.LowerCaseFilterFactory"/>
28 |     <filter class="com.nicholasding.search.solr.LemmatizerFilterFactory" dictPath="/usr/local/WordNet-3.0/dict"/>
29 |   </analyzer>
30 | </fieldType>
31 | ```
32 |
33 | For performance, the dictionary is loaded into a [ternary search tree](https://en.wikipedia.org/wiki/Ternary_search_tree), which keeps memory usage low and provides average O(log n) lookups.
34 |
35 | Benchmark against the bundled WordNet database:
36 |
37 | ```
38 | CPU: Intel(R) Core(TM) i7-4770K CPU @ 3.50GHz
39 |
40 | TST initialize: 431 ms
41 | Total access time: 143 ms, 147306 lookups, 970.7683 ns/lookup
42 |
43 | RTrie initialize: 533 ms
44 | Total access time: 3725 ms, 147306 lookups, 25287.496 ns/lookup
45 | ```
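46 |
47 | The lemmatizer can also be used directly from Java. Here is a minimal sketch (class names and constructor signatures are those in this repository; the sample words and results mirror the unit tests):
48 |
49 | ```
50 | import com.nicholasding.search.lemmatization.Lemmatizer;
51 | import com.nicholasding.search.lemmatization.POS;
52 | import com.nicholasding.search.lemmatization.impl.PackagedWordNetReader;
53 | import com.nicholasding.search.lemmatization.impl.WordNetLemmatizer;
54 | import com.nicholasding.search.util.TernarySearchTree;
55 |
56 | public class LemmatizerExample {
57 |     public static void main(String[] args) {
58 |         // Load the WordNet database bundled in the jar and index it in a ternary search tree.
59 |         Lemmatizer lemmatizer = new WordNetLemmatizer(
60 |                 new PackagedWordNetReader("wordnet.zip"), new TernarySearchTree());
61 |
62 |         System.out.println(lemmatizer.stem("radii", POS.NOUN));  // radius (exception list)
63 |         System.out.println(lemmatizer.stem("plants", POS.NOUN)); // plant  (detachment rules)
64 |         System.out.println(lemmatizer.stem("better", null));     // good   (POS not specified)
65 |     }
66 | }
67 | ```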
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>com.nicholasding</groupId>
8 |     <artifactId>solr-lemmatizer</artifactId>
9 |     <version>1.0.0</version>
10 |     <packaging>jar</packaging>
11 |
12 |     <name>${project.groupId}:${project.artifactId}</name>
13 |     <description>A TokenFilter that applies lemmatization to lemmatize English words.</description>
14 |     <url>https://github.com/nicholasding/solr-lemmatizer</url>
15 |
16 |     <licenses>
17 |         <license>
18 |             <name>The MIT License (MIT)</name>
19 |             <url>https://github.com/nicholasding/solr-lemmatizer/blob/master/LICENSE</url>
20 |         </license>
21 |     </licenses>
22 |
23 |     <developers>
24 |         <developer>
25 |             <name>Nicholas Ding</name>
26 |             <email>nicholasdsj@gmail.com</email>
27 |         </developer>
28 |     </developers>
29 |
30 |     <scm>
31 |         <connection>scm:git:git://github.com/nicholasding/solr-lemmatizer.git</connection>
32 |         <url>https://github.com/nicholasding/solr-lemmatizer</url>
33 |     </scm>
34 |
35 |     <distributionManagement>
36 |         <snapshotRepository>
37 |             <id>ossrh</id>
38 |             <url>https://oss.sonatype.org/content/repositories/snapshots</url>
39 |         </snapshotRepository>
40 |         <repository>
41 |             <id>ossrh</id>
42 |             <url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
43 |         </repository>
44 |     </distributionManagement>
45 |
46 |     <build>
47 |         <plugins>
48 |             <plugin>
49 |                 <groupId>org.apache.maven.plugins</groupId>
50 |                 <artifactId>maven-compiler-plugin</artifactId>
51 |                 <configuration>
52 |                     <source>1.7</source>
53 |                     <target>1.7</target>
54 |                 </configuration>
55 |             </plugin>
56 |             <plugin>
57 |                 <groupId>org.apache.maven.plugins</groupId>
58 |                 <artifactId>maven-gpg-plugin</artifactId>
59 |                 <version>1.5</version>
60 |                 <executions>
61 |                     <execution>
62 |                         <id>sign-artifacts</id>
63 |                         <phase>verify</phase>
64 |                         <goals>
65 |                             <goal>sign</goal>
66 |                         </goals>
67 |                     </execution>
68 |                 </executions>
69 |             </plugin>
70 |             <plugin>
71 |                 <groupId>org.sonatype.plugins</groupId>
72 |                 <artifactId>nexus-staging-maven-plugin</artifactId>
73 |                 <version>1.6.7</version>
74 |                 <extensions>true</extensions>
75 |                 <configuration>
76 |                     <serverId>ossrh</serverId>
77 |                     <nexusUrl>https://oss.sonatype.org/</nexusUrl>
78 |                     <autoReleaseAfterClose>true</autoReleaseAfterClose>
79 |                 </configuration>
80 |             </plugin>
81 |         </plugins>
82 |     </build>
83 |
84 |     <properties>
85 |         <lucene.version>6.1.0</lucene.version>
86 |     </properties>
87 |
88 |     <dependencies>
89 |         <dependency>
90 |             <groupId>org.apache.lucene</groupId>
91 |             <artifactId>lucene-core</artifactId>
92 |             <version>${lucene.version}</version>
93 |         </dependency>
94 |         <dependency>
95 |             <groupId>org.apache.lucene</groupId>
96 |             <artifactId>lucene-analyzers-common</artifactId>
97 |             <version>${lucene.version}</version>
98 |         </dependency>
99 |         <dependency>
100 |             <groupId>org.apache.lucene</groupId>
101 |             <artifactId>lucene-test-framework</artifactId>
102 |             <version>${lucene.version}</version>
103 |             <scope>test</scope>
104 |         </dependency>
105 |         <dependency>
106 |             <groupId>junit</groupId>
107 |             <artifactId>junit</artifactId>
108 |             <version>4.12</version>
109 |             <scope>test</scope>
110 |         </dependency>
111 |     </dependencies>
112 |
113 | </project>
--------------------------------------------------------------------------------
/src/main/java/com/nicholasding/search/lemmatization/ExceptionList.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.lemmatization;
2 |
3 | import java.util.HashMap;
4 | import java.util.Map;
5 |
6 | /**
7 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29
8 | */
9 | public class ExceptionList {
10 |
11 |     private Map<String, String> nounExc, verbExc, adjectiveExc, adverbExc;
12 |
13 | public ExceptionList() {}
14 |
15 |     public void addExceptionList(POS pos, Map<String, String> map) {
16 | switch (pos) {
17 | case NOUN:
18 | if (nounExc == null) nounExc = new HashMap<>();
19 | nounExc.putAll(map);
20 | break;
21 | case ADJECTIVE:
22 | if (adjectiveExc == null) adjectiveExc = new HashMap<>();
23 | adjectiveExc.putAll(map);
24 | break;
25 | case VERB:
26 | if (verbExc == null) verbExc = new HashMap<>();
27 | verbExc.putAll(map);
28 | break;
29 | case ADVERB:
30 | if (adverbExc == null) adverbExc = new HashMap<>();
31 | adverbExc.putAll(map);
32 | break;
33 | }
34 | }
35 |
36 | public String lookupException(String word) {
37 | return lookupException(word, null);
38 | }
39 |
40 | public String lookupException(String word, POS pos) {
41 | if (pos == null) {
42 | String exc = lookupLists(word, nounExc, verbExc, adjectiveExc, adverbExc);
43 | if (exc != null) {
44 | return exc;
45 | }
46 |
47 | return null;
48 | }
49 |
50 | switch (pos) {
51 | case NOUN: return lookupList(word, nounExc);
52 | case ADJECTIVE: return lookupList(word, adjectiveExc);
53 | case VERB: return lookupList(word, verbExc);
54 | case ADVERB: return lookupList(word, adverbExc);
55 | }
56 |
57 | return null;
58 | }
59 |
60 |     private String lookupList(String word, Map<String, String> list) {
61 | if (list != null) {
62 | return list.get(word);
63 | }
64 |
65 | return null;
66 | }
67 |
68 |     private String lookupLists(String word, Map<String, String>... lists) {
69 |         for (Map<String, String> list : lists) {
70 | if (list != null && list.containsKey(word)) return list.get(word);
71 | }
72 | return null;
73 | }
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/java/com/nicholasding/search/lemmatization/Lemmatizer.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.lemmatization;
2 |
3 | /**
4 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28
5 | */
6 | public interface Lemmatizer {
7 |
8 | /**
9 |  * Tries to find the base form (lemma) of the given word.
10 |  * @param word the word to look up
11 |  * @param pos the part of speech, or null to try all categories
12 |  * @return the lemma if found, otherwise the original word
13 | */
14 | String stem(String word, POS pos);
15 |
16 | }
17 |
--------------------------------------------------------------------------------
/src/main/java/com/nicholasding/search/lemmatization/POS.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.lemmatization;
2 |
3 | /**
4 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28.
5 | */
6 | public enum POS {
7 | NOUN, VERB, ADJECTIVE, ADVERB
8 | }
9 |
--------------------------------------------------------------------------------
/src/main/java/com/nicholasding/search/lemmatization/WordNetReader.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.lemmatization;
2 |
3 | import java.util.Collection;
4 |
5 | /**
6 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29
7 | */
8 | public interface WordNetReader {
9 |
10 | /**
11 | * @return exception list for four categories
12 | */
13 | ExceptionList readExceptionList();
14 |
15 | /**
16 | * @return all the lemmas
17 | */
18 |     Collection<String> readLemmas();
19 |
20 | }
21 |
--------------------------------------------------------------------------------
/src/main/java/com/nicholasding/search/lemmatization/impl/DetachmentRules.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.lemmatization.impl;
2 |
3 | import com.nicholasding.search.lemmatization.POS;
4 |
5 | import java.util.ArrayList;
6 | import java.util.List;
7 |
8 | /**
9 | * Detachment Rules in WordNet's morphological processing.
10 | *
11 | * @see morphy (7WN)
12 | *
13 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29
14 | */
15 | public class DetachmentRules {
16 | public String[] apply(String word, POS pos) {
17 |         List<String> candidates = new ArrayList<>();
18 | switch (pos) {
19 | case NOUN:
20 | if (word.endsWith("s")) {
21 | candidates.add(word.substring(0, word.length() - 1));
22 | }
23 | if (word.endsWith("ses")) {
24 | candidates.add(word.substring(0, word.length() - 2));
25 | break;
26 | }
27 | if (word.endsWith("xes")) {
28 | candidates.add(word.substring(0, word.length() - 2));
29 | break;
30 | }
31 | if (word.endsWith("zes")) {
32 | candidates.add(word.substring(0, word.length() - 2));
33 | break;
34 | }
35 | if (word.endsWith("ches")) {
36 | candidates.add(word.substring(0, word.length() - 2));
37 | break;
38 | }
39 | if (word.endsWith("shes")) {
40 | candidates.add(word.substring(0, word.length() - 2));
41 | break;
42 | }
43 | if (word.endsWith("men")) {
44 | candidates.add(word.substring(0, word.length() - 3) + "man");
45 | break;
46 | }
47 | if (word.endsWith("ies")) {
48 | candidates.add(word.substring(0, word.length() - 3) + "y");
49 | break;
50 | }
51 | case VERB:
52 | if (word.endsWith("s")) {
53 | candidates.add(word.substring(0, word.length() - 1));
54 | }
55 | if (word.endsWith("ies")) {
56 | candidates.add(word.substring(0, word.length() - 3) + "y");
57 | }
58 | if (word.endsWith("es")) {
59 | candidates.add(word.substring(0, word.length() - 1));
60 | candidates.add(word);
61 | }
62 | if (word.endsWith("ed")) {
63 | candidates.add(word.substring(0, word.length() - 1));
64 | candidates.add(word);
65 | break;
66 | }
67 | if (word.endsWith("ing")) {
68 | candidates.add(word.substring(0, word.length() - 3) + "e");
69 | candidates.add(word);
70 | break;
71 | }
72 | break;
73 | case ADJECTIVE:
74 | if (word.endsWith("er") || word.endsWith("est")) {
75 | candidates.add(word);
76 | }
77 | if (word.endsWith("er")) {
78 | candidates.add(word.substring(0, word.length() - 1));
79 | }
80 | if (word.endsWith("est")) {
81 | candidates.add(word.substring(0, word.length() - 2));
82 | }
83 | break;
84 | default:
85 | candidates.add(word);
86 | }
87 |
88 | return candidates.toArray(new String[0]);
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/src/main/java/com/nicholasding/search/lemmatization/impl/DirectoryWordNetReader.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.lemmatization.impl;
2 |
3 | import com.nicholasding.search.lemmatization.ExceptionList;
4 | import com.nicholasding.search.lemmatization.POS;
5 | import com.nicholasding.search.lemmatization.WordNetReader;
6 | import java.io.BufferedReader;
7 | import java.io.File;
8 | import java.io.FileFilter;
9 | import java.io.FileNotFoundException;
10 | import java.io.FileReader;
11 | import java.io.IOException;
12 | import java.util.ArrayList;
13 | import java.util.Collection;
14 | import java.util.HashMap;
15 | import java.util.Iterator;
16 | import java.util.LinkedList;
17 | import java.util.List;
18 | import java.util.Map;
19 |
20 | /**
21 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28.
22 | */
23 | public class DirectoryWordNetReader implements WordNetReader {
24 |
25 | private String dictPath;
26 |
27 | public DirectoryWordNetReader(String dictPath) {
28 | this.dictPath = dictPath;
29 | }
30 |
31 |     protected Map<String, String> loadExceptionList(File file) {
32 |         Map<String, String> m = new HashMap<>();
33 | try {
34 | BufferedReader reader = new BufferedReader(new FileReader(file));
35 | String line = null;
36 | while ((line = reader.readLine()) != null) {
37 | String[] cols = line.split("\\s+");
38 | m.put(cols[0], cols[1]);
39 | }
40 | } catch (FileNotFoundException e) {
41 | e.printStackTrace();
42 | } catch (IOException e) {
43 | e.printStackTrace();
44 | }
45 |
46 | return m;
47 | }
48 |
49 |     protected void readIndexFile(File file, Collection<String> collector) {
50 | try {
51 | BufferedReader reader = new BufferedReader(new FileReader(file));
52 | String line = null;
53 | while ((line = reader.readLine()) != null) {
54 | if (!line.startsWith(" ")) {
55 | String[] cols = line.split("\\s+");
56 | collector.add(cols[0]);
57 | }
58 | }
59 | } catch (FileNotFoundException e) {
60 | e.printStackTrace();
61 | } catch (IOException e) {
62 | e.printStackTrace();
63 | }
64 | }
65 |
66 | @Override
67 | public ExceptionList readExceptionList() {
68 | ExceptionList list = new ExceptionList();
69 |
70 | File dir = new File(dictPath);
71 | File[] files = dir.listFiles(new FileFilter() {
72 | @Override
73 | public boolean accept(File pathname) {
74 | return pathname.getName().endsWith(".exc");
75 | }
76 | });
77 |
78 | for (File file : files) {
79 |             Map<String, String> m = loadExceptionList(file);
80 | if ("noun.exc".equals(file.getName())) {
81 | list.addExceptionList(POS.NOUN, m);
82 | } else if ("adj.exc".equals(file.getName())) {
83 | list.addExceptionList(POS.ADJECTIVE, m);
84 | } else if ("adv.exc".equals(file.getName())) {
85 | list.addExceptionList(POS.ADVERB, m);
86 | } else if ("verb.exc".equals(file.getName())) {
87 | list.addExceptionList(POS.VERB, m);
88 | }
89 | }
90 |
91 | return list;
92 | }
93 |
94 | @Override
95 |     public Collection<String> readLemmas() {
96 |         Collection<String> lemmas = new LinkedList<>();
97 |
98 | File dir = new File(dictPath);
99 | File[] files = dir.listFiles(new FileFilter() {
100 | @Override
101 | public boolean accept(File pathname) {
102 | return pathname.getName().startsWith("index.") && !pathname.getName().endsWith(".sense");
103 | }
104 | });
105 |
106 | for (File f : files) {
107 | readIndexFile(f, lemmas);
108 | }
109 |
110 | return lemmas;
111 | }
112 |
113 | }
114 |
--------------------------------------------------------------------------------
/src/main/java/com/nicholasding/search/lemmatization/impl/PackagedWordNetReader.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.lemmatization.impl;
2 |
3 | import com.nicholasding.search.lemmatization.ExceptionList;
4 | import com.nicholasding.search.lemmatization.POS;
5 | import com.nicholasding.search.lemmatization.WordNetReader;
6 | import java.io.BufferedReader;
7 | import java.io.FileNotFoundException;
8 | import java.io.IOException;
9 | import java.io.InputStream;
10 | import java.io.InputStreamReader;
11 | import java.util.Collection;
12 | import java.util.HashMap;
13 | import java.util.LinkedList;
14 | import java.util.List;
15 | import java.util.Map;
16 | import java.util.zip.ZipEntry;
17 | import java.util.zip.ZipInputStream;
18 |
19 | /**
20 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29
21 | */
22 | public class PackagedWordNetReader implements WordNetReader {
23 |
24 | private ExceptionList exceptionList = new ExceptionList();
25 |     private Collection<String> lemmas = new LinkedList<>();
26 |
27 | public PackagedWordNetReader(String resource) {
28 | InputStream zipfile = getClass().getClassLoader().getResourceAsStream(resource);
29 | ZipInputStream zipIn = new ZipInputStream(zipfile);
30 | ZipEntry entry = null;
31 |
32 | try {
33 | while ((entry = zipIn.getNextEntry()) != null) {
34 | String name = entry.getName();
35 | if (name.endsWith(".exc")) {
36 |                     Map<String, String> m = loadExceptionList(zipIn);
37 | if ("noun.exc".equals(name)) {
38 | exceptionList.addExceptionList(POS.NOUN, m);
39 | } else if ("adj.exc".equals(name)) {
40 | exceptionList.addExceptionList(POS.ADJECTIVE, m);
41 | } else if ("adv.exc".equals(name)) {
42 | exceptionList.addExceptionList(POS.ADVERB, m);
43 | } else if ("verb.exc".equals(name)) {
44 | exceptionList.addExceptionList(POS.VERB, m);
45 | }
46 | } else {
47 | readIndexFile(zipIn, lemmas);
48 | }
49 | }
50 | } catch (IOException e) {
51 | e.printStackTrace();
52 | }
53 |
54 | }
55 |
56 |     protected Map<String, String> loadExceptionList(ZipInputStream stream) {
57 |         Map<String, String> m = new HashMap<>();
58 | try {
59 | BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
60 | String line = null;
61 | while ((line = reader.readLine()) != null) {
62 | String[] cols = line.split("\\s+");
63 | m.put(cols[0], cols[1]);
64 | }
65 | } catch (FileNotFoundException e) {
66 | e.printStackTrace();
67 | } catch (IOException e) {
68 | e.printStackTrace();
69 | }
70 |
71 | return m;
72 | }
73 |
74 |     protected void readIndexFile(ZipInputStream stream, Collection<String> collector) {
75 |         List<String> list = new LinkedList<>();
76 | try {
77 | BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
78 | String line = null;
79 | while ((line = reader.readLine()) != null) {
80 | if (!line.startsWith(" ")) {
81 | String[] cols = line.split("\\s+");
82 | collector.add(cols[0]);
83 | }
84 | }
85 | } catch (FileNotFoundException e) {
86 | e.printStackTrace();
87 | } catch (IOException e) {
88 | e.printStackTrace();
89 | }
90 | }
91 |
92 | @Override
93 | public ExceptionList readExceptionList() {
94 | return exceptionList;
95 | }
96 |
97 | @Override
98 |     public Collection<String> readLemmas() {
99 | return lemmas;
100 | }
101 |
102 | }
103 |
--------------------------------------------------------------------------------
/src/main/java/com/nicholasding/search/lemmatization/impl/WordNetLemmatizer.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.lemmatization.impl;
2 |
3 | import com.nicholasding.search.lemmatization.ExceptionList;
4 | import com.nicholasding.search.lemmatization.Lemmatizer;
5 | import com.nicholasding.search.lemmatization.POS;
6 | import com.nicholasding.search.lemmatization.WordNetReader;
7 | import com.nicholasding.search.util.RTrie;
8 | import com.nicholasding.search.util.TernarySearchTree;
9 | import com.nicholasding.search.util.Trie;
10 |
11 | import java.util.ArrayList;
12 | import java.util.Collections;
13 | import java.util.List;
14 |
15 | /**
16 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28
17 | */
18 | public class WordNetLemmatizer implements Lemmatizer {
19 |
20 | private Trie trie;
21 | private ExceptionList exceptionList;
22 | private DetachmentRules rules;
23 |
24 | /**
25 |      * Loads the exception lists from the reader and indexes all lemmas into the given trie.
26 |      * @param reader source of the WordNet exception lists and lemmas
27 |      * @param trie trie implementation used to index the lemmas
28 | */
29 | public WordNetLemmatizer(WordNetReader reader, Trie trie) {
30 | this.exceptionList = reader.readExceptionList();
31 | this.trie = trie;
32 | this.rules = new DetachmentRules();
33 |
34 | buildTrie(reader, trie);
35 | }
36 |
37 | private void buildTrie(WordNetReader reader, Trie trie) {
38 | for (String word : reader.readLemmas()) {
39 | trie.put(word, Boolean.TRUE);
40 | }
41 | }
42 |
43 | public String stem(String word, POS pos) {
44 | if (word == null || word.length() == 0) return null;
45 |
46 | String exception = checkExceptionList(word, pos);
47 | if (exception != null) return exception;
48 |
49 | String[] candidates = transform(word, pos);
50 | for (String candidate : candidates) {
51 | if (!candidate.isEmpty() && trie.contains(candidate)) return candidate;
52 | }
53 |
54 | return word;
55 | }
56 |
57 | private String[] transform(String word, POS pos) {
58 | if (pos != null) {
59 | return rules.apply(word, pos);
60 | } else {
61 |             List<String> candidates = new ArrayList<>();
62 | Collections.addAll(candidates, rules.apply(word, POS.NOUN));
63 | if (candidates.size() == 0) Collections.addAll(candidates, rules.apply(word, POS.VERB));
64 | if (candidates.size() == 0) Collections.addAll(candidates, rules.apply(word, POS.ADJECTIVE));
65 | if (candidates.size() == 0) Collections.addAll(candidates, rules.apply(word, POS.ADVERB));
66 | return candidates.toArray(new String[0]);
67 | }
68 | }
69 |
70 | protected String checkExceptionList(String word, POS pos) {
71 | return exceptionList.lookupException(word, pos);
72 | }
73 |
74 | protected Trie getTrie() {
75 | return trie;
76 | }
77 |
78 | }
79 |
--------------------------------------------------------------------------------
/src/main/java/com/nicholasding/search/solr/LemmatizerFilter.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.solr;
2 |
3 | import com.nicholasding.search.lemmatization.Lemmatizer;
4 | import com.nicholasding.search.lemmatization.POS;
5 | import java.io.IOException;
6 | import org.apache.lucene.analysis.TokenFilter;
7 | import org.apache.lucene.analysis.TokenStream;
8 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
9 |
10 | /**
11 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29
12 | */
13 | public class LemmatizerFilter extends TokenFilter {
14 |
15 | private final CharTermAttribute termAtt = (CharTermAttribute)this.addAttribute(CharTermAttribute.class);
16 | private final Lemmatizer lemmatizer;
17 |
18 | public LemmatizerFilter(TokenStream tokenStream, Lemmatizer lemmatizer) {
19 | super(tokenStream);
20 | this.lemmatizer = lemmatizer;
21 | }
22 |
23 | @Override
24 | public final boolean incrementToken() throws IOException {
25 | if (!this.input.incrementToken()) {
26 | return false;
27 | } else {
28 | char[] termBuffer = this.termAtt.buffer();
29 |             String lemma = lemmatizer.stem(new String(termBuffer, 0, this.termAtt.length()), null);
30 |             if (lemma != null) this.termAtt.copyBuffer(lemma.toCharArray(), 0, lemma.length());
31 |             return true;
32 | }
33 | }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/java/com/nicholasding/search/solr/LemmatizerFilterFactory.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.solr;
2 |
3 | import com.nicholasding.search.lemmatization.Lemmatizer;
4 | import com.nicholasding.search.lemmatization.WordNetReader;
5 | import com.nicholasding.search.lemmatization.impl.DirectoryWordNetReader;
6 | import com.nicholasding.search.lemmatization.impl.PackagedWordNetReader;
7 | import com.nicholasding.search.lemmatization.impl.WordNetLemmatizer;
8 | import java.util.Map;
9 |
10 | import com.nicholasding.search.util.TernarySearchTree;
11 | import org.apache.lucene.analysis.TokenStream;
12 | import org.apache.lucene.analysis.util.TokenFilterFactory;
13 |
14 | /**
15 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29
16 | */
17 | public class LemmatizerFilterFactory extends TokenFilterFactory {
18 |
19 | private static final String KEY_DICT_PATH = "dictPath";
20 |
21 | private Lemmatizer lemmatizer;
22 |
23 |     public LemmatizerFilterFactory(Map<String, String> args) {
24 | super(args);
25 |
26 | WordNetReader reader;
27 |
28 | if (args.containsKey(KEY_DICT_PATH)) {
29 | String path = args.get(KEY_DICT_PATH);
30 | reader = new DirectoryWordNetReader(path);
31 | } else {
32 | reader = new PackagedWordNetReader("wordnet.zip");
33 | }
34 |
35 | lemmatizer = new WordNetLemmatizer(reader, new TernarySearchTree());
36 | }
37 |
38 | @Override
39 | public TokenStream create(TokenStream tokenStream) {
40 | return new LemmatizerFilter(tokenStream, lemmatizer);
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/java/com/nicholasding/search/util/RTrie.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.util;
2 |
3 | import java.util.HashMap;
4 | import java.util.Iterator;
5 | import java.util.LinkedList;
6 | import java.util.List;
7 | import java.util.Map;
8 |
9 | /**
10 |  * This is an R-way trie implementation.
11 | *
12 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28.
13 | */
14 | public class RTrie implements Trie {
15 |
16 | private static class Node {
17 | Object value;
18 | Character key;
19 |         Map<Character, Node> children = new HashMap<>();
20 | }
21 |
22 | private Node root;
23 |
24 | @Override
25 | public void put(String key, Object value) {
26 | root = put(root, key, value, 0);
27 | }
28 |
29 | protected Node put(Node node, String key, Object value, int level) {
30 | if (node == null) { node = new Node(); }
31 | if (level == key.length()) { node.value = value; return node; }
32 | Character c = key.charAt(level);
33 |
34 | node.children.put(c, put(node.children.get(c), key, value, level + 1));
35 | node.children.get(c).key = c;
36 |
37 | return node;
38 | }
39 |
40 | @Override
41 | public Object get(String key) {
42 | Node node = get(root, key, 0);
43 | if (node == null) return null;
44 | return node.value;
45 | }
46 |
47 | protected Node get(Node node, String key, int level) {
48 | if (node == null) return null;
49 | if (level == key.length()) return node;
50 | Character c = key.charAt(level);
51 | return get(node.children.get(c), key, level + 1);
52 | }
53 |
54 | @Override
55 | public boolean contains(String key) {
56 | return get(key) != null;
57 | }
58 |
59 | @Override
60 |     public Iterator<String> keys() {
61 |         List<String> collector = new LinkedList<>();
62 |
63 | for (Node n : root.children.values()) {
64 | collect(n, "", 1, collector);
65 | }
66 |
67 | return collector.iterator();
68 | }
69 |
70 |     private void collect(Node node, String prefix, int level, List<String> collector) {
71 | if (node == null) return;
72 | if (node.value != null) collector.add(prefix + node.key);
73 |
74 | for (Node n : node.children.values()) {
75 | collect(n, prefix + node.key, level + 1, collector);
76 | }
77 | }
78 |
79 | }
80 |
--------------------------------------------------------------------------------
/src/main/java/com/nicholasding/search/util/TernarySearchTree.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.util;
2 |
3 | import java.util.Iterator;
4 | import java.util.LinkedList;
5 | import java.util.List;
6 |
7 | /**
8 |  * A simple implementation of a <a href="https://en.wikipedia.org/wiki/Ternary_search_tree">ternary search tree</a>.
9 | *
10 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29
11 | */
12 | public class TernarySearchTree implements Trie {
13 |
14 | private static class Node {
15 | Object value;
16 | Character key;
17 | Node left, middle, right;
18 | }
19 |
20 | private Node root;
21 |
22 | @Override
23 | public void put(String key, Object value) {
24 | root = put(root, key, value, 0);
25 | }
26 |
27 | protected Node put(Node node, String key, Object value, int level) {
28 | Character c = key.charAt(level);
29 |
30 | if (node == null) { node = new Node(); node.key = c; }
31 |
32 | if (c < node.key) node.left = put(node.left, key, value, level);
33 | else if (c > node.key) node.right = put(node.right, key, value, level);
34 | else if (level < key.length() - 1) node.middle = put(node.middle, key, value, level + 1);
35 | else node.value = value;
36 |
37 | return node;
38 | }
39 |
40 | @Override
41 | public Object get(String key) {
42 | Node node = get(root, key, 0);
43 | if (node == null) return null;
44 | return node.value;
45 | }
46 |
47 | protected Node get(Node node, String key, int level) {
48 | if (node == null) return null;
49 |
50 | Character c = key.charAt(level);
51 | if (c < node.key) return get(node.left, key, level);
52 | else if (c > node.key) return get(node.right, key, level);
53 | else if (level < key.length() - 1) return get(node.middle, key, level + 1);
54 | else return node;
55 | }
56 |
57 | @Override
58 |     public Iterator<String> keys() {
59 |         List<String> collector = new LinkedList<>();
60 | collect(root, "", 0, collector);
61 | return collector.iterator();
62 | }
63 |
64 | @Override
65 | public boolean contains(String key) {
66 | return get(key) != null;
67 | }
68 |
69 |     private void collect(Node node, String prefix, int level, List<String> collector) {
70 | if (node == null) return;
71 | if (node.value != null) collector.add(prefix + node.key);
72 |
73 | collect(node.left, prefix, level, collector);
74 | collect(node.middle, prefix + node.key, level + 1, collector);
75 | collect(node.right, prefix, level, collector);
76 | }
77 |
78 | }
79 |
--------------------------------------------------------------------------------
/src/main/java/com/nicholasding/search/util/Trie.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.util;
2 |
3 | import java.util.Iterator;
4 |
5 | /**
6 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28.
7 | */
8 | public interface Trie {
9 |
10 | /**
11 | * Store the object into the Trie by providing a string key
12 | *
13 | * @param key
14 | * @param value
15 | */
16 | void put(String key, Object value);
17 |
18 | /**
19 | * Retrieve the object stored in the Trie
20 | *
21 | * @param key
22 | * @return null if key doesn't exist
23 | */
24 | Object get(String key);
25 |
26 | /**
27 |  * @return an iterator over all keys stored in the trie
28 | */
29 |     Iterator<String> keys();
30 |
31 | /**
32 | * @param key
33 | * @return true if key was found in the trie
34 | */
35 | boolean contains(String key);
36 |
37 | }
38 |
--------------------------------------------------------------------------------
/src/main/resources/wordnet.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nicholasding/solr-lemmatizer/37feb0961bda036823a7ac46658e84dd6246f354/src/main/resources/wordnet.zip
--------------------------------------------------------------------------------
/src/test/java/com/nicholasding/search/lemmatization/ExceptionListTest.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.lemmatization;
2 |
3 | import org.junit.Assert;
4 | import org.junit.Test;
5 |
6 | import java.util.HashMap;
7 | import java.util.Map;
8 |
9 | /**
10 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29
11 | */
12 | public class ExceptionListTest {
13 |
14 | @Test
15 | public void testInsertAndLookup() {
16 | ExceptionList excList = new ExceptionList();
17 |         Map<String, String> m = new HashMap<>();
18 | m.put("a", "b");
19 | excList.addExceptionList(POS.NOUN, m);
20 |
21 | Assert.assertEquals("b", excList.lookupException("a"));
22 | Assert.assertEquals("b", excList.lookupException("a", POS.NOUN));
23 | Assert.assertEquals(null, excList.lookupException("a", POS.VERB));
24 | }
25 |
26 | @Test
27 | public void testMergeInput() {
28 | ExceptionList excList = new ExceptionList();
29 |         Map<String, String> m = new HashMap<>();
30 | m.put("a", "b");
31 |
32 | excList.addExceptionList(POS.NOUN, m);
33 |
34 | m.put("c", "d");
35 | excList.addExceptionList(POS.NOUN, m);
36 |
37 | Assert.assertEquals("b", excList.lookupException("a"));
38 | Assert.assertEquals("d", excList.lookupException("c"));
39 | }
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/src/test/java/com/nicholasding/search/lemmatization/LemmatizerTest.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.lemmatization;
2 |
3 | import com.nicholasding.search.lemmatization.impl.PackagedWordNetReader;
4 | import com.nicholasding.search.lemmatization.impl.WordNetLemmatizer;
5 | import com.nicholasding.search.util.RTrie;
6 | import com.nicholasding.search.util.TernarySearchTree;
7 | import org.junit.Assert;
8 | import org.junit.Test;
9 |
10 | /**
11 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28
12 | */
13 | public class LemmatizerTest {
14 |
15 | @Test
16 | public void testStemShouldReturnNull() {
17 | Lemmatizer lemmatizer = new WordNetLemmatizer(new WordNetReaderStub(), new RTrie());
18 | Assert.assertNull(lemmatizer.stem(null, null));
19 | }
20 |
21 | @Test
22 | public void testStemShouldGetResultFromExceptionList() {
23 | Lemmatizer lemmatizer = new WordNetLemmatizer(new WordNetReaderStub(), new RTrie());
24 | Assert.assertEquals("radius", lemmatizer.stem("radii", null));
25 | }
26 |
27 | @Test
28 | public void testStemShouldGetResultFromTrie() {
29 | Lemmatizer lemmatizer = new WordNetLemmatizer(new WordNetReaderStub(), new RTrie());
30 | Assert.assertEquals("toy", lemmatizer.stem("toys", POS.NOUN));
31 | }
32 |
33 | @Test
34 | public void testStemShouldReturnOriginalForm() {
35 | Lemmatizer lemmatizer = new WordNetLemmatizer(new WordNetReaderStub(), new RTrie());
36 | Assert.assertEquals("toy", lemmatizer.stem("toy", POS.NOUN));
37 | }
38 |
39 | @Test
40 | public void testRealFilesFromResources() {
41 | Lemmatizer lemmatizer = new WordNetLemmatizer(new PackagedWordNetReader("wordnet.zip"), new RTrie());
42 | Assert.assertEquals("toy", lemmatizer.stem("toy", POS.NOUN));
43 | Assert.assertEquals("plant", lemmatizer.stem("plants", POS.NOUN));
44 | Assert.assertEquals("radius", lemmatizer.stem("radii", POS.NOUN));
45 | }
46 |
47 | @Test
48 | public void testWhenInputIsS() {
49 | Lemmatizer lemmatizer = new WordNetLemmatizer(new PackagedWordNetReader("wordnet.zip"), new TernarySearchTree());
50 | Assert.assertEquals("s", lemmatizer.stem("s", POS.NOUN));
51 | }
52 |
53 | }
54 |
--------------------------------------------------------------------------------
/src/test/java/com/nicholasding/search/lemmatization/WordNetReaderStub.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.lemmatization;
2 |
3 | import java.util.*;
4 |
5 | /**
6 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28
7 | */
8 | public class WordNetReaderStub implements WordNetReader {
9 |
10 |     private Map<String, String> createExceptionList(POS noun) {
11 |         Map<String, String> m = new HashMap<>();
12 | m.put("radii", "radius");
13 | return m;
14 | }
15 |
16 | @Override
17 | public ExceptionList readExceptionList() {
18 | ExceptionList list = new ExceptionList();
19 | list.addExceptionList(POS.NOUN, createExceptionList(POS.NOUN));
20 | return list;
21 | }
22 |
23 | @Override
24 |     public Collection<String> readLemmas() {
25 |         List<String> data = new ArrayList<>();
26 | data.add("toy");
27 | return data;
28 | }
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/src/test/java/com/nicholasding/search/lemmatization/impl/BenchmarkTest.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.lemmatization.impl;
2 |
3 | import com.nicholasding.search.util.RTrie;
4 | import com.nicholasding.search.util.TernarySearchTree;
5 | import com.nicholasding.search.util.Trie;
6 | import org.junit.Test;
7 |
8 | import java.util.Iterator;
9 |
10 | /**
11 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29
12 | */
13 | public class BenchmarkTest {
14 |
15 | @Test
16 | public void benchmarkRTrie() {
17 | long start = System.currentTimeMillis();
18 | WordNetLemmatizer lemmatizer = new WordNetLemmatizer(new PackagedWordNetReader("wordnet.zip"), new RTrie());
19 | System.out.println("RTrie initialize: " + (System.currentTimeMillis() - start) + " ms");
20 |
21 | Trie trie = lemmatizer.getTrie();
22 | benchmark(trie);
23 | }
24 |
25 | @Test
26 | public void benchmarkTST() {
27 | long start = System.currentTimeMillis();
28 | WordNetLemmatizer lemmatizer = new WordNetLemmatizer(new PackagedWordNetReader("wordnet.zip"), new TernarySearchTree());
29 | System.out.println("TST initialize: " + (System.currentTimeMillis() - start) + " ms");
30 |
31 | Trie trie = lemmatizer.getTrie();
32 | benchmark(trie);
33 | }
34 |
35 | private void benchmark(Trie trie) {
36 | long start = System.currentTimeMillis();
37 |
38 | long counter = 0;
39 |         Iterator<String> keys = trie.keys();
40 | while (keys.hasNext()) {
41 | trie.get(keys.next());
42 | counter++;
43 | }
44 |
45 | long end = System.currentTimeMillis();
46 |
47 | float avg = (float) (end - start) / counter * 1000000;
48 | System.out.println("Total access time: " + (end - start) + " ms, " + counter + " lookups, " + avg + " ns/lookup");
49 | }
50 |
51 | }
52 |
--------------------------------------------------------------------------------
/src/test/java/com/nicholasding/search/solr/LemmatizerFilterFactoryTest.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.solr;
2 |
3 | import org.apache.lucene.analysis.BaseTokenStreamTestCase;
4 | import org.apache.lucene.analysis.MockTokenizer;
5 | import org.apache.lucene.analysis.TokenStream;
6 | import org.junit.Test;
7 |
8 | import java.io.IOException;
9 | import java.io.StringReader;
10 | import java.util.HashMap;
11 | import java.util.Map;
12 |
13 | /**
14 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29
15 | */
16 | public class LemmatizerFilterFactoryTest extends BaseTokenStreamTestCase {
17 |
18 | @Test
19 | public void testUsingPackagedWordNetReader() throws IOException {
20 |         Map<String, String> args = new HashMap<>();
21 | LemmatizerFilterFactory factory = new LemmatizerFilterFactory(args);
22 | StringReader reader = new StringReader("it better works");
23 | final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
24 | in.setReader(reader);
25 | TokenStream stream = factory.create(in);
26 | assertTokenStreamContents(stream, new String[] { "it", "good", "work" });
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/src/test/java/com/nicholasding/search/solr/LemmatizerFilterTest.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.solr;
2 |
3 | import com.nicholasding.search.lemmatization.impl.PackagedWordNetReader;
4 | import com.nicholasding.search.lemmatization.impl.WordNetLemmatizer;
5 | import java.io.IOException;
6 | import java.io.StringReader;
7 | import java.util.HashMap;
8 | import java.util.Map;
9 |
10 | import com.nicholasding.search.util.RTrie;
11 | import com.nicholasding.search.util.TernarySearchTree;
12 | import org.apache.lucene.analysis.BaseTokenStreamTestCase;
13 | import org.apache.lucene.analysis.MockTokenizer;
14 | import org.apache.lucene.analysis.TokenStream;
15 | import org.junit.Test;
16 |
17 | /**
18 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-29
19 | */
20 | public class LemmatizerFilterTest extends BaseTokenStreamTestCase {
21 |
22 | @Test
23 | public void testWithSamplePhrase() throws IOException {
24 | StringReader reader = new StringReader("it better works");
25 | final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
26 | in.setReader(reader);
27 | TokenStream stream = new LemmatizerFilter(in, new WordNetLemmatizer(new PackagedWordNetReader("wordnet.zip"), new RTrie()));
28 | assertTokenStreamContents(stream, new String[] { "it", "good", "work" });
29 | }
30 |
31 | @Test
32 | public void testUsingPackagedWordNetReaderFromFilterFactory() throws IOException {
33 |         Map<String, String> args = new HashMap<>();
34 | LemmatizerFilterFactory factory = new LemmatizerFilterFactory(args);
35 |
36 | StringReader reader = new StringReader("it better works");
37 | final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
38 | in.setReader(reader);
39 | TokenStream stream = factory.create(in);
40 | assertTokenStreamContents(stream, new String[] { "it", "good", "work" });
41 | }
42 |
43 | @Test
44 | public void testUsingDirectoryWordNetReaderWithDummyPathShouldFailSilently() throws IOException {
45 |         Map<String, String> args = new HashMap<>();
46 | args.put("dictPath", "/tmp");
47 | LemmatizerFilterFactory factory = new LemmatizerFilterFactory(args);
48 |
49 | StringReader reader = new StringReader("it better works");
50 | final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
51 | in.setReader(reader);
52 | TokenStream stream = factory.create(in);
53 | assertTokenStreamContents(stream, new String[] { "it", "better", "works" });
54 | }
55 |
56 | }
57 |
--------------------------------------------------------------------------------
/src/test/java/com/nicholasding/search/util/TrieTest.java:
--------------------------------------------------------------------------------
1 | package com.nicholasding.search.util;
2 |
3 | import org.junit.Assert;
4 | import org.junit.Test;
5 |
6 | import java.util.Iterator;
7 |
8 | /**
9 | * @author Nicholas Ding (nicholasdsj@gmail.com) on 2016-06-28.
10 | */
11 | public class TrieTest {
12 |
13 | @Test
14 | public void testPut() {
15 | testPut(new RTrie());
16 | testPut(new TernarySearchTree());
17 | }
18 |
19 | private void testPut(Trie trie) {
20 | trie.put("ab", "ab");
21 | Assert.assertTrue(trie.contains("ab"));
22 | Assert.assertEquals("ab", trie.get("ab"));
23 | }
24 |
25 | @Test
26 | public void testIterateAllKeys() {
27 | testTrieKeys(new RTrie());
28 | testTrieKeys(new TernarySearchTree());
29 | }
30 |
31 | private void testTrieKeys(Trie trie) {
32 | trie.put("ab", "ab");
33 | trie.put("bc", "bc");
34 | trie.put("cd", "cd");
35 |
36 |         Iterator<String> keys = trie.keys();
37 | Assert.assertEquals("ab", keys.next());
38 | Assert.assertEquals("bc", keys.next());
39 | Assert.assertEquals("cd", keys.next());
40 | }
41 | }
42 |
--------------------------------------------------------------------------------