├── .travis.yml ├── .gitignore ├── src ├── main │ └── java │ │ └── com │ │ └── abahgat │ │ └── suffixtree │ │ ├── Edge.java │ │ ├── Utils.java │ │ ├── EdgeBag.java │ │ ├── Node.java │ │ └── GeneralizedSuffixTree.java └── test │ └── java │ └── com │ └── abahgat │ └── suffixtree │ ├── UtilsTest.java │ ├── EdgeBagTest.java │ └── SuffixTreeTest.java ├── pom.xml ├── README.md └── LICENSE.txt /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | *.ipr 3 | *.iws 4 | target 5 | .idea 6 | -------------------------------------------------------------------------------- /src/main/java/com/abahgat/suffixtree/Edge.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012 Alessandro Bahgat Shehata 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.abahgat.suffixtree; 17 | 18 | /** 19 | * Represents an Edge in the Suffix Tree. 
20 | * It has a label and a destination Node 21 | */ 22 | class Edge { 23 | private String label; 24 | private Node dest; 25 | 26 | public String getLabel() { 27 | return label; 28 | } 29 | 30 | public void setLabel(String label) { 31 | this.label = label; 32 | } 33 | 34 | public Node getDest() { 35 | return dest; 36 | } 37 | 38 | public void setDest(Node dest) { 39 | this.dest = dest; 40 | } 41 | 42 | public Edge(String label, Node dest) { 43 | this.label = label; 44 | this.dest = dest; 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/test/java/com/abahgat/suffixtree/UtilsTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012 Alessandro Bahgat Shehata 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.abahgat.suffixtree; 17 | 18 | import java.util.Set; 19 | 20 | import com.abahgat.suffixtree.Utils; 21 | import junit.framework.TestCase; 22 | 23 | public class UtilsTest extends TestCase { 24 | 25 | public UtilsTest(String testName) { 26 | super(testName); 27 | } 28 | 29 | @Override 30 | protected void setUp() throws Exception { 31 | super.setUp(); 32 | } 33 | 34 | @Override 35 | protected void tearDown() throws Exception { 36 | super.tearDown(); 37 | } 38 | 39 | public void testNormalize() { 40 | System.out.println("normalize"); 41 | 42 | String[] ins = new String[]{"200 S Main St", "Lakeshore Dr.", "lake-view", "St. Jacob's Cathedral"}; 43 | String[] outs = new String[]{"200smainst", "lakeshoredr", "lakeview", "stjacobscathedral"}; 44 | 45 | for (int i = 0; i < ins.length; ++i) { 46 | String result = Utils.normalize(ins[i]); 47 | assertEquals(outs[i], result); 48 | } 49 | } 50 | 51 | public void testGetSubstrings() { 52 | System.out.println("getsubstrings"); 53 | 54 | String in = "banana"; 55 | Set out = Utils.getSubstrings(in); 56 | String[] outArr = new String[] { "b" , "a", "n", "ba", "an", "na", "ban", "ana", "nan", "bana", "anan", "nana", "banan", "anana", "banana"}; 57 | 58 | for (String s : outArr) { 59 | assertTrue(out.remove(s)); 60 | } 61 | 62 | 63 | assertTrue(out.isEmpty()); 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/com/abahgat/suffixtree/Utils.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012 Alessandro Bahgat Shehata 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
/**
 * Small string helpers used by the suffix tree and by its tests.
 */
public class Utils {

    /**
     * Normalize an input string.
     *
     * The string is lower-cased and every character other than the ASCII
     * letters {@code a-z} and the digits {@code 0-9} is dropped.
     *
     * Lower-casing uses the locale-neutral rules of {@code Locale.ROOT}, so
     * the result does not depend on the JVM's default locale. With the
     * default locale, a Turkish or Azeri environment maps {@code 'I'} to the
     * dotless {@code 'ı'}, which would then be filtered out.
     *
     * @param in the input string to normalize
     * @return in all lower-case, without any non alphanumeric character
     * @throws NullPointerException if {@code in} is null
     */
    public static String normalize(String in) {
        String lowered = in.toLowerCase(java.util.Locale.ROOT);
        StringBuilder out = new StringBuilder(lowered.length());
        for (int i = 0; i < lowered.length(); ++i) {
            char c = lowered.charAt(i);
            if (c >= 'a' && c <= 'z' || c >= '0' && c <= '9') {
                out.append(c);
            }
        }
        return out.toString();
    }

    /**
     * Computes the set of all the non-empty substrings contained within str.
     *
     * It is fairly inefficient (it materializes every substring), but it is
     * used just in tests ;)
     *
     * @param str the string to compute substrings of
     * @return the set of all possible substrings of str; empty when str is empty
     * @throws NullPointerException if {@code str} is null
     */
    public static Set<String> getSubstrings(String str) {
        Set<String> ret = new HashSet<String>();
        // enumerate by length first, then by start offset
        for (int len = 1; len <= str.length(); ++len) {
            for (int start = 0; start + len <= str.length(); ++start) {
                ret.add(str.substring(start, start + len));
            }
        }

        return ret;
    }
}
| http://github.com/abahgat/suffixtree 11 | 12 | 13 | 14 | junit 15 | junit 16 | 4.13.1 17 | test 18 | 19 | 20 | 21 | 22 | 23 | 24 | org.apache.maven.plugins 25 | maven-compiler-plugin 26 | 2.0.2 27 | 28 | 1.6 29 | 1.6 30 | true 31 | 32 | 33 | 34 | 35 | org.apache.maven.plugins 36 | maven-surefire-plugin 37 | 38 | once 39 | -Xms1024m -Xmx1024m 40 | 41 | 42 | 43 | 44 | org.apache.maven.plugins 45 | maven-resources-plugin 46 | 2.6 47 | 48 | UTF-8 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | Apache 2 57 | http://www.apache.org/licenses/LICENSE-2.0.txt 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /src/test/java/com/abahgat/suffixtree/EdgeBagTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012 Alessandro Bahgat Shehata 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.abahgat.suffixtree; 17 | 18 | import org.junit.After; 19 | import org.junit.AfterClass; 20 | import org.junit.Before; 21 | import org.junit.BeforeClass; 22 | import org.junit.Test; 23 | import static org.junit.Assert.*; 24 | 25 | public class EdgeBagTest { 26 | 27 | public EdgeBagTest() { 28 | } 29 | 30 | @BeforeClass 31 | public static void setUpClass() throws Exception { 32 | } 33 | 34 | @AfterClass 35 | public static void tearDownClass() throws Exception { 36 | } 37 | 38 | @Before 39 | public void setUp() { 40 | } 41 | 42 | @After 43 | public void tearDown() { 44 | } 45 | 46 | 47 | @Test 48 | public void testPut() { 49 | EdgeBag bag = new EdgeBag(); 50 | Edge e1 = new Edge("asd", null); 51 | Edge e2 = new Edge("errimo", null); 52 | Edge e3 = new Edge("foo", null); 53 | Edge e4 = new Edge("bar", null); 54 | bag.put('a', e1); 55 | bag.put('e', e2); 56 | bag.put('f', e3); 57 | bag.put('b', e4); 58 | assertTrue("Bag contains " + bag.values().size() + " elements", bag.values().size() == 4); 59 | assertTrue(bag.get('a').equals(e1)); 60 | assertTrue(bag.get('e').equals(e2)); 61 | assertTrue(bag.get('f').equals(e3)); 62 | assertTrue(bag.get('b').equals(e4)); 63 | } 64 | 65 | @Test 66 | public void testCast() { 67 | for (char c = '0'; c <= '9'; ++c) { 68 | assertEquals(c, (char)(byte)c); 69 | } 70 | 71 | for (char c = 'a'; c <= 'z'; ++c) { 72 | assertEquals(c, (char)(byte)c); 73 | } 74 | } 75 | 76 | public void testSort() { 77 | 78 | } 79 | 80 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Generalized Suffix Tree 2 | [![Build Status](https://travis-ci.org/abahgat/suffixtree.png?branch=master)](https://travis-ci.org/abahgat/suffixtree) 3 | 4 | A Generalized Suffix Tree, based on Ukkonen's paper "On-line construction of suffix trees" 5 | http://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf 6 | 7 | Allows for 
fast storage and fast(er) retrieval by creating a tree-based index out of a set of strings. 8 | Unlike common suffix trees, which are generally used to build an index out of one (very) long string, a *Generalized Suffix Tree* can be used to build an index over many strings. 9 | 10 | Its main operations are `put` and `search`: 11 | 12 | * `put` adds the given key to the index, allowing for later retrieval of the given value. 13 | * `search` can be used to retrieve the set of all the values that were put in the index with keys that contain a given input. 14 | 15 | In particular, after `put(K, V)`, `search(H)` will return a set containing `V` for any string `H` that is a substring of `K`. 16 | 17 | The overall complexity of the retrieval operation (`search`) is *O(m)* where *m* is the length of the string to search within the index. 18 | 19 | ## Differences from the original suffix tree 20 | 21 | Although the implementation is based on the original design by Ukkonen, there are a few aspects where it differs significantly. 22 | 23 | The tree is composed of a set of nodes and labeled edges. The labels on the edges can have any length as long as it's greater than 0. 24 | The only constraint is that no two edges going out from the same node start with the same character. 25 | 26 | Because of this, a given _(startNode, stringSuffix)_ pair can denote a unique path within the tree, and it is the path (if any) that can be composed by sequentially traversing all the edges _(e1, e2, …)_ starting from _startNode_ such that _(e1.label + e2.label + …)_ is equal to the _stringSuffix_. 27 | See the `GeneralizedSuffixTree#search` method for details. 28 | 29 | The union of all the edge labels from the root to a given leaf node denotes the set of the strings explicitly contained within the GST.
30 | In addition to those strings, there is a set of different strings that are implicitly contained within the GST; it is composed of the strings built by concatenating _e1.label + e2.label + ... + $end_, where _e1, e2, …_ is a proper path and _$end_ is a prefix of any of the labels of the edges starting from the last node of the path. 31 | 32 | This kind of "implicit path" is important in the testAndSplit method. 33 | 34 | ## License 35 | 36 | This Generalized Suffix Tree is released under the Apache License 2.0 37 | 38 | Copyright 2012 Alessandro Bahgat Shehata 39 | 40 | Licensed under the Apache License, Version 2.0 (the "License"); 41 | you may not use this file except in compliance with the License. 42 | You may obtain a copy of the License at 43 | 44 | http://www.apache.org/licenses/LICENSE-2.0 45 | 46 | Unless required by applicable law or agreed to in writing, software 47 | distributed under the License is distributed on an "AS IS" BASIS, 48 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 49 | See the License for the specific language governing permissions and 50 | limitations under the License. 51 | -------------------------------------------------------------------------------- /src/main/java/com/abahgat/suffixtree/EdgeBag.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012 Alessandro Bahgat Shehata 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.abahgat.suffixtree; 17 | 18 | import java.util.Arrays; 19 | import java.util.Collection; 20 | import java.util.Map; 21 | import java.util.Set; 22 | 23 | /** 24 | * A specialized implementation of Map that uses native char types and sorted 25 | * arrays to keep minimize the memory footprint. 26 | * Implements only the operations that are needed within the suffix tree context. 27 | */ 28 | class EdgeBag implements Map { 29 | private byte[] chars; 30 | private Edge[] values; 31 | private static final int BSEARCH_THRESHOLD = 6; 32 | 33 | @Override 34 | public Edge put(Character character, Edge e) { 35 | char c = character.charValue(); 36 | if (c != (char) (byte) c) { 37 | throw new IllegalArgumentException("Illegal input character " + c + "."); 38 | } 39 | 40 | if (chars == null) { 41 | chars = new byte[0]; 42 | values = new Edge[0]; 43 | } 44 | int idx = search(c); 45 | Edge previous = null; 46 | 47 | if (idx < 0) { 48 | int currsize = chars.length; 49 | byte[] copy = new byte[currsize + 1]; 50 | System.arraycopy(chars, 0, copy, 0, currsize); 51 | chars = copy; 52 | Edge[] copy1 = new Edge[currsize + 1]; 53 | System.arraycopy(values, 0, copy1, 0, currsize); 54 | values = copy1; 55 | chars[currsize] = (byte) c; 56 | values[currsize] = e; 57 | currsize++; 58 | if (currsize > BSEARCH_THRESHOLD) { 59 | sortArrays(); 60 | } 61 | } else { 62 | previous = values[idx]; 63 | values[idx] = e; 64 | } 65 | return previous; 66 | } 67 | 68 | @Override 69 | public Edge get(Object maybeCharacter) { 70 | return get(((Character) maybeCharacter).charValue()); // throws if cast fails. 
71 | } 72 | 73 | public Edge get(char c) { 74 | if (c != (char) (byte) c) { 75 | throw new IllegalArgumentException("Illegal input character " + c + "."); 76 | } 77 | 78 | int idx = search(c); 79 | if (idx < 0) { 80 | return null; 81 | } 82 | return values[idx]; 83 | } 84 | 85 | private int search(char c) { 86 | if (chars == null) 87 | return -1; 88 | 89 | if (chars.length > BSEARCH_THRESHOLD) { 90 | return java.util.Arrays.binarySearch(chars, (byte) c); 91 | } 92 | 93 | for (int i = 0; i < chars.length; i++) { 94 | if (c == chars[i]) { 95 | return i; 96 | } 97 | } 98 | return -1; 99 | } 100 | 101 | @Override 102 | public Collection values() { 103 | return Arrays.asList(values == null ? new Edge[0] : values); 104 | } 105 | 106 | /** 107 | * A trivial implementation of sort, used to sort chars[] and values[] according to the data in chars. 108 | * 109 | * It was preferred to faster sorts (like qsort) because of the small sizes (<=36) of the collections involved. 110 | */ 111 | private void sortArrays() { 112 | for (int i = 0; i < chars.length; i++) { 113 | for (int j = i; j > 0; j--) { 114 | if (chars[j-1] > chars[j]) { 115 | byte swap = chars[j]; 116 | chars[j] = chars[j-1]; 117 | chars[j-1] = swap; 118 | 119 | Edge swapEdge = values[j]; 120 | values[j] = values[j-1]; 121 | values[j-1] = swapEdge; 122 | } 123 | } 124 | } 125 | } 126 | 127 | @Override 128 | public boolean isEmpty() { 129 | return chars == null || chars.length == 0; 130 | } 131 | 132 | @Override 133 | public int size() { 134 | return chars == null ? 
0 : chars.length; 135 | } 136 | 137 | @Override 138 | public Set> entrySet() { 139 | throw new UnsupportedOperationException("Not implemented"); 140 | } 141 | 142 | @Override 143 | public Set keySet() { 144 | throw new UnsupportedOperationException("Not implemented"); 145 | } 146 | 147 | @Override 148 | public void clear() { 149 | throw new UnsupportedOperationException("Not implemented"); 150 | } 151 | 152 | @Override 153 | public void putAll(Map m) { 154 | throw new UnsupportedOperationException("Not implemented"); 155 | } 156 | 157 | @Override 158 | public Edge remove(Object key) { 159 | throw new UnsupportedOperationException("Not implemented"); 160 | } 161 | 162 | @Override 163 | public boolean containsKey(Object key) { 164 | throw new UnsupportedOperationException("Not implemented"); 165 | } 166 | 167 | @Override 168 | public boolean containsValue(Object key) { 169 | throw new UnsupportedOperationException("Not implemented"); 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /src/main/java/com/abahgat/suffixtree/Node.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012 Alessandro Bahgat Shehata 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.abahgat.suffixtree; 17 | 18 | import java.util.Collection; 19 | import java.util.HashSet; 20 | import java.util.Map; 21 | import java.util.Set; 22 | 23 | /** 24 | * Represents a node of the generalized suffix tree graph 25 | * @see GeneralizedSuffixTree 26 | */ 27 | class Node { 28 | 29 | /** 30 | * The payload array used to store the data (indexes) associated with this node. 31 | * In this case, it is used to store all property indexes. 32 | * 33 | * As it is handled, it resembles an ArrayList: when it becomes full it 34 | * is copied to another bigger array (whose size is equals to data.length + 35 | * INCREMENT). 36 | * 37 | * Originally it was a List but it took too much memory, changing 38 | * it to int[] take less memory because indexes are stored using native 39 | * types. 40 | */ 41 | private int[] data; 42 | /** 43 | * Represents index of the last position used in the data int[] array. 44 | * 45 | * It should always be less than data.length 46 | */ 47 | private int lastIdx = 0; 48 | /** 49 | * The starting size of the int[] array containing the payload 50 | */ 51 | private static final int START_SIZE = 0; 52 | /** 53 | * The increment in size used when the payload array is full 54 | */ 55 | private static final int INCREMENT = 1; 56 | /** 57 | * The set of edges starting from this node 58 | */ 59 | private final Map edges; 60 | /** 61 | * The suffix link as described in Ukkonen's paper. 62 | * if str is the string denoted by the path from the root to this, this.suffix 63 | * is the node denoted by the path that corresponds to str without the first char. 64 | */ 65 | private Node suffix; 66 | /** 67 | * The total number of different results that are stored in this 68 | * node and in underlying ones (i.e. nodes that can be reached through paths 69 | * starting from this. 
70 | * 71 | * This must be calculated explicitly using computeAndCacheCount 72 | * @see Node#computeAndCacheCount() 73 | */ 74 | private int resultCount = -1; 75 | 76 | /** 77 | * Creates a new Node 78 | */ 79 | Node() { 80 | edges = new EdgeBag(); 81 | suffix = null; 82 | data = new int[START_SIZE]; 83 | } 84 | 85 | /** 86 | * Returns all the indexes associated to this node and its children. 87 | * @return all the indexes associated to this node and its children 88 | */ 89 | Collection getData() { 90 | return getData(-1); 91 | } 92 | 93 | /** 94 | * Returns the first numElements elements from the ones associated to this node. 95 | * 96 | * Gets data from the payload of both this node and its children, the string representation 97 | * of the path to this node is a substring of the one of the children nodes. 98 | * 99 | * @param numElements the number of results to return. Use -1 to get all 100 | * @return the first numElements associated to this node and children 101 | */ 102 | Collection getData(int numElements) { 103 | Set ret = new HashSet(); 104 | for (int num : data) { 105 | ret.add(num); 106 | if (ret.size() == numElements) { 107 | return ret; 108 | } 109 | } 110 | // need to get more matches from child nodes. 
This is what may waste time 111 | for (Edge e : edges.values()) { 112 | if (-1 == numElements || ret.size() < numElements) { 113 | for (int num : e.getDest().getData()) { 114 | ret.add(num); 115 | if (ret.size() == numElements) { 116 | return ret; 117 | } 118 | } 119 | } 120 | } 121 | return ret; 122 | } 123 | 124 | /** 125 | * Adds the given index to the set of indexes associated with this 126 | */ 127 | void addRef(int index) { 128 | if (contains(index)) { 129 | return; 130 | } 131 | 132 | addIndex(index); 133 | 134 | // add this reference to all the suffixes as well 135 | Node iter = this.suffix; 136 | while (iter != null) { 137 | if (iter.contains(index)) { 138 | break; 139 | } 140 | iter.addRef(index); 141 | iter = iter.suffix; 142 | } 143 | 144 | } 145 | 146 | /** 147 | * Tests whether a node contains a reference to the given index. 148 | * 149 | * IMPORTANT: it works because the array is sorted by construction 150 | * 151 | * @param index the index to look for 152 | * @return true this contains a reference to index 153 | */ 154 | private boolean contains(int index) { 155 | int low = 0; 156 | int high = lastIdx - 1; 157 | 158 | while (low <= high) { 159 | int mid = (low + high) >>> 1; 160 | int midVal = data[mid]; 161 | 162 | if (midVal < index) 163 | low = mid + 1; 164 | else if (midVal > index) 165 | high = mid - 1; 166 | else 167 | return true; 168 | } 169 | return false; 170 | // Java 5 equivalent to 171 | // return java.util.Arrays.binarySearch(data, 0, lastIdx, index) >= 0; 172 | } 173 | 174 | /** 175 | * Computes the number of results that are stored on this node and on its 176 | * children, and caches the result. 
177 | * 178 | * Performs the same operation on subnodes as well 179 | * @return the number of results 180 | */ 181 | protected int computeAndCacheCount() { 182 | computeAndCacheCountRecursive(); 183 | return resultCount; 184 | } 185 | 186 | private Set computeAndCacheCountRecursive() { 187 | Set ret = new HashSet(); 188 | for (int num : data) { 189 | ret.add(num); 190 | } 191 | for (Edge e : edges.values()) { 192 | for (int num : e.getDest().computeAndCacheCountRecursive()) { 193 | ret.add(num); 194 | } 195 | } 196 | 197 | resultCount = ret.size(); 198 | return ret; 199 | } 200 | 201 | /** 202 | * Returns the number of results that are stored on this node and on its 203 | * children. 204 | * Should be called after having called computeAndCacheCount. 205 | * 206 | * @throws IllegalStateException when this method is called without having called 207 | * computeAndCacheCount first 208 | * @see Node#computeAndCacheCount() 209 | * @todo this should raise an exception when the subtree is changed but count 210 | * wasn't updated 211 | */ 212 | public int getResultCount() throws IllegalStateException { 213 | if (-1 == resultCount) { 214 | throw new IllegalStateException("getResultCount() shouldn't be called without calling computeCount() first"); 215 | } 216 | 217 | return resultCount; 218 | } 219 | 220 | void addEdge(char ch, Edge e) { 221 | edges.put(ch, e); 222 | } 223 | 224 | Edge getEdge(char ch) { 225 | return edges.get(ch); 226 | } 227 | 228 | Map getEdges() { 229 | return edges; 230 | } 231 | 232 | Node getSuffix() { 233 | return suffix; 234 | } 235 | 236 | void setSuffix(Node suffix) { 237 | this.suffix = suffix; 238 | } 239 | 240 | private void addIndex(int index) { 241 | if (lastIdx == data.length) { 242 | int[] copy = new int[data.length + INCREMENT]; 243 | System.arraycopy(data, 0, copy, 0, data.length); 244 | data = copy; 245 | } 246 | data[lastIdx++] = index; 247 | } 248 | } 249 | 
-------------------------------------------------------------------------------- /src/test/java/com/abahgat/suffixtree/SuffixTreeTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012 Alessandro Bahgat Shehata 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.abahgat.suffixtree; 17 | 18 | import java.util.Arrays; 19 | import java.util.Collection; 20 | import java.util.HashSet; 21 | import junit.framework.TestCase; 22 | import static com.abahgat.suffixtree.Utils.getSubstrings; 23 | 24 | public class SuffixTreeTest extends TestCase { 25 | 26 | public static void assertEmpty(Collection collection) { 27 | assertTrue("Expected empty collection.", collection.isEmpty()); 28 | } 29 | 30 | public void testBasicTreeGeneration() { 31 | GeneralizedSuffixTree in = new GeneralizedSuffixTree(); 32 | 33 | String word = "cacao"; 34 | in.put(word, 0); 35 | 36 | /* test that every substring is contained within the tree */ 37 | for (String s : getSubstrings(word)) { 38 | assertTrue(in.search(s).contains(0)); 39 | } 40 | assertEmpty(in.search("caco")); 41 | assertEmpty(in.search("cacaoo")); 42 | assertEmpty(in.search("ccacao")); 43 | 44 | in = new GeneralizedSuffixTree(); 45 | word = "bookkeeper"; 46 | in.put(word, 0); 47 | for (String s : getSubstrings(word)) { 48 | assertTrue(in.search(s).contains(0)); 49 | } 50 | assertEmpty(in.search("books")); 51 | 
assertEmpty(in.search("boke")); 52 | assertEmpty(in.search("ookepr")); 53 | } 54 | 55 | public void testWeirdword() { 56 | GeneralizedSuffixTree in = new GeneralizedSuffixTree(); 57 | 58 | String word = "cacacato"; 59 | in.put(word, 0); 60 | 61 | /* test that every substring is contained within the tree */ 62 | for (String s : getSubstrings(word)) { 63 | assertTrue(in.search(s).contains(0)); 64 | } 65 | } 66 | 67 | public void testDouble() { 68 | // test whether the tree can handle repetitions 69 | GeneralizedSuffixTree in = new GeneralizedSuffixTree(); 70 | String word = "cacao"; 71 | in.put(word, 0); 72 | in.put(word, 1); 73 | 74 | for (String s : getSubstrings(word)) { 75 | assertTrue(in.search(s).contains(0)); 76 | assertTrue(in.search(s).contains(1)); 77 | } 78 | } 79 | 80 | public void testBananaAddition() { 81 | GeneralizedSuffixTree in = new GeneralizedSuffixTree(); 82 | String[] words = new String[] {"banana", "bano", "ba"}; 83 | for (int i = 0; i < words.length; ++i) { 84 | in.put(words[i], i); 85 | 86 | for (String s : getSubstrings(words[i])) { 87 | Collection result = in.search(s); 88 | assertNotNull("result null for string " + s + " after adding " + words[i], result); 89 | assertTrue("substring " + s + " not found after adding " + words[i], result.contains(i)); 90 | } 91 | 92 | } 93 | 94 | // verify post-addition 95 | for (int i = 0; i < words.length; ++i) { 96 | for (String s : getSubstrings(words[i])) { 97 | assertTrue(in.search(s).contains(i)); 98 | } 99 | } 100 | 101 | // add again, to see if it's stable 102 | for (int i = 0; i < words.length; ++i) { 103 | in.put(words[i], i + words.length); 104 | 105 | for (String s : getSubstrings(words[i])) { 106 | assertTrue(in.search(s).contains(i + words.length)); 107 | } 108 | } 109 | 110 | } 111 | 112 | public void testAddition() { 113 | GeneralizedSuffixTree in = new GeneralizedSuffixTree(); 114 | String[] words = new String[] {"cacaor" , "caricato", "cacato", "cacata", "caricata", "cacao", "banana"}; 115 
| for (int i = 0; i < words.length; ++i) { 116 | in.put(words[i], i); 117 | 118 | for (String s : getSubstrings(words[i])) { 119 | Collection result = in.search(s); 120 | assertNotNull("result null for string " + s + " after adding " + words[i], result); 121 | assertTrue("substring " + s + " not found after adding " + words[i], result.contains(i)); 122 | } 123 | } 124 | // verify post-addition 125 | for (int i = 0; i < words.length; ++i) { 126 | for (String s : getSubstrings(words[i])) { 127 | Collection result = in.search(s); 128 | assertNotNull("result null for string " + s + " after adding " + words[i], result); 129 | assertTrue("substring " + s + " not found after adding " + words[i], result.contains(i)); 130 | } 131 | } 132 | 133 | // add again, to see if it's stable 134 | for (int i = 0; i < words.length; ++i) { 135 | in.put(words[i], i + words.length); 136 | 137 | for (String s : getSubstrings(words[i])) { 138 | assertTrue(in.search(s).contains(i + words.length)); 139 | } 140 | } 141 | 142 | in.computeCount(); 143 | testResultsCount(in.getRoot()); 144 | 145 | assertEmpty(in.search("aoca")); 146 | } 147 | 148 | public void testSampleAddition() { 149 | GeneralizedSuffixTree in = new GeneralizedSuffixTree(); 150 | String[] words = new String[] {"libertypike", 151 | "franklintn", 152 | "carothersjohnhenryhouse", 153 | "carothersezealhouse", 154 | "acrossthetauntonriverfromdightonindightonrockstatepark", 155 | "dightonma", 156 | "dightonrock", 157 | "6mineoflowgaponlowgapfork", 158 | "lowgapky", 159 | "lemasterjohnjandellenhouse", 160 | "lemasterhouse", 161 | "70wilburblvd", 162 | "poughkeepsieny", 163 | "freerhouse", 164 | "701laurelst", 165 | "conwaysc", 166 | "hollidayjwjrhouse", 167 | "mainandappletonsts", 168 | "menomoneefallswi", 169 | "mainstreethistoricdistrict", 170 | "addressrestricted", 171 | "brownsmillsnj", 172 | "hanoverfurnace", 173 | "hanoverbogironfurnace", 174 | "sofsavannahatfergusonaveandbethesdard", 175 | "savannahga", 176 | 
"bethesdahomeforboys", 177 | "bethesda"}; 178 | for (int i = 0; i < words.length; ++i) { 179 | in.put(words[i], i); 180 | 181 | for (String s : getSubstrings(words[i])) { 182 | Collection result = in.search(s); 183 | assertNotNull("result null for string " + s + " after adding " + words[i], result); 184 | assertTrue("substring " + s + " not found after adding " + words[i], result.contains(i)); 185 | } 186 | 187 | 188 | } 189 | // verify post-addition 190 | for (int i = 0; i < words.length; ++i) { 191 | for (String s : getSubstrings(words[i])) { 192 | assertTrue(in.search(s).contains(i)); 193 | } 194 | } 195 | 196 | // add again, to see if it's stable 197 | for (int i = 0; i < words.length; ++i) { 198 | in.put(words[i], i + words.length); 199 | 200 | for (String s : getSubstrings(words[i])) { 201 | assertTrue(in.search(s).contains(i + words.length)); 202 | } 203 | } 204 | 205 | in.computeCount(); 206 | testResultsCount(in.getRoot()); 207 | 208 | assertEmpty(in.search("aoca")); 209 | } 210 | 211 | private void testResultsCount(Node n) { 212 | for (Edge e : n.getEdges().values()) { 213 | assertEquals(n.getData(-1).size(), n.getResultCount()); 214 | testResultsCount(e.getDest()); 215 | } 216 | } 217 | 218 | /* testing a test method :) */ 219 | public void testGetSubstrings() { 220 | Collection exp = new HashSet(); 221 | exp.addAll(Arrays.asList(new String[] {"w", "r", "d", "wr", "rd", "wrd"})); 222 | Collection ret = getSubstrings("wrd"); 223 | assertTrue(ret.equals(exp)); 224 | } 225 | 226 | } 227 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 
9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2012 Alessandro Bahgat Shehata 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /src/main/java/com/abahgat/suffixtree/GeneralizedSuffixTree.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012 Alessandro Bahgat Shehata 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.abahgat.suffixtree; 17 | 18 | import java.util.Collection; 19 | import java.util.Collections; 20 | 21 | /** 22 | * A Generalized Suffix Tree, based on the Ukkonen's paper "On-line construction of suffix trees" 23 | * http://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf 24 | * 25 | * Allows for fast storage and fast(er) retrieval by creating a tree-based index out of a set of strings. 26 | * Unlike common suffix trees, which are generally used to build an index out of one (very) long string, 27 | * a Generalized Suffix Tree can be used to build an index over many strings. 28 | * 29 | * Its main operations are put and search: 30 | * Put adds the given key to the index, allowing for later retrieval of the given value. 31 | * Search can be used to retrieve the set of all the values that were put in the index with keys that contain a given input. 32 | * 33 | * In particular, after put(K, V), search(H) will return a set containing V for any string H that is substring of K. 34 | * 35 | * The overall complexity of the retrieval operation (search) is O(m) where m is the length of the string to search within the index. 36 | * 37 | * Although the implementation is based on the original design by Ukkonen, there are a few aspects where it differs significantly. 38 | * 39 | * The tree is composed of a set of nodes and labeled edges. The labels on the edges can have any length as long as it's greater than 0. 40 | * The only constraint is that no two edges going out from the same node will start with the same character. 41 | * 42 | * Because of this, a given (startNode, stringSuffix) pair can denote a unique path within the tree, and it is the path (if any) that can be 43 | * composed by sequentially traversing all the edges (e1, e2, ...) starting from startNode such that (e1.label + e2.label + ...) is equal 44 | * to the stringSuffix. 45 | * See the search method for details. 
46 | * 47 | * The union of all the edge labels from the root to a given leaf node denotes the set of the strings explicitly contained within the GST. 48 | * In addition to those Strings, there are a set of different strings that are implicitly contained within the GST, and it is composed of 49 | * the strings built by concatenating e1.label + e2.label + ... + $end, where e1, e2, ... is a proper path and $end is prefix of any of 50 | * the labels of the edges starting from the last node of the path. 51 | * 52 | * This kind of "implicit path" is important in the testAndSplit method. 53 | * 54 | */ 55 | public class GeneralizedSuffixTree { 56 | 57 | /** 58 | * The index of the last item that was added to the GST 59 | */ 60 | private int last = 0; 61 | /** 62 | * The root of the suffix tree 63 | */ 64 | private final Node root = new Node(); 65 | /** 66 | * The last leaf that was added during the update operation 67 | */ 68 | private Node activeLeaf = root; 69 | 70 | /** 71 | * Searches for the given word within the GST. 72 | * 73 | * Returns all the indexes for which the key contains the word that was 74 | * supplied as input. 75 | * 76 | * @param word the key to search for 77 | * @return the collection of indexes associated with the input word 78 | */ 79 | public Collection search(String word) { 80 | return search(word, -1); 81 | } 82 | 83 | /** 84 | * Searches for the given word within the GST and returns at most the given number of matches. 85 | * 86 | * @param word the key to search for 87 | * @param results the max number of results to return 88 | * @return at most results values for the given word 89 | */ 90 | public Collection search(String word, int results) { 91 | Node tmpNode = searchNode(word); 92 | if (tmpNode == null) { 93 | return Collections.EMPTY_LIST; 94 | } 95 | return tmpNode.getData(results); 96 | } 97 | 98 | /** 99 | * Searches for the given word within the GST and returns at most the given number of matches. 
100 | * 101 | * @param word the key to search for 102 | * @param to the max number of results to return 103 | * @return at most results values for the given word 104 | * @see GeneralizedSuffixTree#ResultInfo 105 | */ 106 | public ResultInfo searchWithCount(String word, int to) { 107 | Node tmpNode = searchNode(word); 108 | if (tmpNode == null) { 109 | return new ResultInfo(Collections.EMPTY_LIST, 0); 110 | } 111 | 112 | return new ResultInfo(tmpNode.getData(to), tmpNode.getResultCount()); 113 | } 114 | 115 | /** 116 | * Returns the tree node (if present) that corresponds to the given string. 117 | */ 118 | private Node searchNode(String word) { 119 | /* 120 | * Verifies if exists a path from the root to a node such that the concatenation 121 | * of all the labels on the path is a superstring of the given word. 122 | * If such a path is found, the last node on it is returned. 123 | */ 124 | Node currentNode = root; 125 | Edge currentEdge; 126 | 127 | for (int i = 0; i < word.length(); ++i) { 128 | char ch = word.charAt(i); 129 | // follow the edge corresponding to this char 130 | currentEdge = currentNode.getEdge(ch); 131 | if (null == currentEdge) { 132 | // there is no edge starting with this char 133 | return null; 134 | } else { 135 | String label = currentEdge.getLabel(); 136 | int lenToMatch = Math.min(word.length() - i, label.length()); 137 | if (!word.regionMatches(i, label, 0, lenToMatch)) { 138 | // the label on the edge does not correspond to the one in the string to search 139 | return null; 140 | } 141 | 142 | if (label.length() >= word.length() - i) { 143 | return currentEdge.getDest(); 144 | } else { 145 | // advance to next node 146 | currentNode = currentEdge.getDest(); 147 | i += lenToMatch - 1; 148 | } 149 | } 150 | } 151 | 152 | return null; 153 | } 154 | 155 | /** 156 | * Adds the specified index to the GST under the given key. 
157 | * 158 | * Entries must be inserted so that their indexes are in non-decreasing order, 159 | * otherwise an IllegalStateException will be raised. 160 | * 161 | * @param key the string key that will be added to the index 162 | * @param index the value that will be added to the index 163 | * @throws IllegalStateException if an invalid index is passed as input 164 | */ 165 | public void put(String key, int index) throws IllegalStateException { 166 | if (index < last) { 167 | throw new IllegalStateException("The input index must not be less than any of the previously inserted ones. Got " + index + ", expected at least " + last); 168 | } else { 169 | last = index; 170 | } 171 | 172 | // reset activeLeaf 173 | activeLeaf = root; 174 | 175 | String remainder = key; 176 | Node s = root; 177 | 178 | // proceed with tree construction (closely related to procedure in 179 | // Ukkonen's paper) 180 | String text = ""; 181 | // iterate over the string, one char at a time 182 | for (int i = 0; i < remainder.length(); i++) { 183 | // line 6 184 | text += remainder.charAt(i); 185 | // use intern to make sure the resulting string is in the pool. 186 | text = text.intern(); 187 | 188 | // line 7: update the tree with the new transitions due to this new char 189 | Pair active = update(s, text, remainder.substring(i), index); 190 | // line 8: make sure the active pair is canonical 191 | active = canonize(active.getFirst(), active.getSecond()); 192 | 193 | s = active.getFirst(); 194 | text = active.getSecond(); 195 | } 196 | 197 | // add leaf suffix link, is necessary 198 | if (null == activeLeaf.getSuffix() && activeLeaf != root && activeLeaf != s) { 199 | activeLeaf.setSuffix(s); 200 | } 201 | 202 | } 203 | 204 | /** 205 | * Tests whether the string stringPart + t is contained in the subtree that has inputs as root. 206 | * If that's not the case, and there exists a path of edges e1, e2, ... such that 207 | * e1.label + e2.label + ... 
+ $end = stringPart 208 | * and there is an edge g such that 209 | * g.label = stringPart + rest 210 | * 211 | * Then g will be split in two different edges, one having $end as label, and the other one 212 | * having rest as label. 213 | * 214 | * @param inputs the starting node 215 | * @param stringPart the string to search 216 | * @param t the following character 217 | * @param remainder the remainder of the string to add to the index 218 | * @param value the value to add to the index 219 | * @return a pair containing 220 | * true/false depending on whether (stringPart + t) is contained in the subtree starting in inputs 221 | * the last node that can be reached by following the path denoted by stringPart starting from inputs 222 | * 223 | */ 224 | private Pair testAndSplit(final Node inputs, final String stringPart, final char t, final String remainder, final int value) { 225 | // descend the tree as far as possible 226 | Pair ret = canonize(inputs, stringPart); 227 | Node s = ret.getFirst(); 228 | String str = ret.getSecond(); 229 | 230 | if (!"".equals(str)) { 231 | Edge g = s.getEdge(str.charAt(0)); 232 | 233 | String label = g.getLabel(); 234 | // must see whether "str" is substring of the label of an edge 235 | if (label.length() > str.length() && label.charAt(str.length()) == t) { 236 | return new Pair(true, s); 237 | } else { 238 | // need to split the edge 239 | String newlabel = label.substring(str.length()); 240 | assert (label.startsWith(str)); 241 | 242 | // build a new node 243 | Node r = new Node(); 244 | // build a new edge 245 | Edge newedge = new Edge(str, r); 246 | 247 | g.setLabel(newlabel); 248 | 249 | // link s -> r 250 | r.addEdge(newlabel.charAt(0), g); 251 | s.addEdge(str.charAt(0), newedge); 252 | 253 | return new Pair(false, r); 254 | } 255 | 256 | } else { 257 | Edge e = s.getEdge(t); 258 | if (null == e) { 259 | // if there is no t-transtion from s 260 | return new Pair(false, s); 261 | } else { 262 | if 
(remainder.equals(e.getLabel())) { 263 | // update payload of destination node 264 | e.getDest().addRef(value); 265 | return new Pair(true, s); 266 | } else if (remainder.startsWith(e.getLabel())) { 267 | return new Pair(true, s); 268 | } else if (e.getLabel().startsWith(remainder)) { 269 | // need to split as above 270 | Node newNode = new Node(); 271 | newNode.addRef(value); 272 | 273 | Edge newEdge = new Edge(remainder, newNode); 274 | 275 | e.setLabel(e.getLabel().substring(remainder.length())); 276 | 277 | newNode.addEdge(e.getLabel().charAt(0), e); 278 | 279 | s.addEdge(t, newEdge); 280 | 281 | return new Pair(false, s); 282 | } else { 283 | // they are different words. No prefix. but they may still share some common substr 284 | return new Pair(true, s); 285 | } 286 | } 287 | } 288 | 289 | } 290 | 291 | /** 292 | * Return a (Node, String) (n, remainder) pair such that n is a farthest descendant of 293 | * s (the input node) that can be reached by following a path of edges denoting 294 | * a prefix of inputstr and remainder will be string that must be 295 | * appended to the concatenation of labels from s to n to get inpustr. 296 | */ 297 | private Pair canonize(final Node s, final String inputstr) { 298 | 299 | if ("".equals(inputstr)) { 300 | return new Pair(s, inputstr); 301 | } else { 302 | Node currentNode = s; 303 | String str = inputstr; 304 | Edge g = s.getEdge(str.charAt(0)); 305 | // descend the tree as long as a proper label is found 306 | while (g != null && str.startsWith(g.getLabel())) { 307 | str = str.substring(g.getLabel().length()); 308 | currentNode = g.getDest(); 309 | if (str.length() > 0) { 310 | g = currentNode.getEdge(str.charAt(0)); 311 | } 312 | } 313 | 314 | return new Pair(currentNode, str); 315 | } 316 | } 317 | 318 | /** 319 | * Updates the tree starting from inputNode and by adding stringPart. 320 | * 321 | * Returns a reference (Node, String) pair for the string that has been added so far. 
322 | * This means: 323 | * - the Node will be the Node that can be reached by the longest path string (S1) 324 | * that can be obtained by concatenating consecutive edges in the tree and 325 | * that is a substring of the string added so far to the tree. 326 | * - the String will be the remainder that must be added to S1 to get the string 327 | * added so far. 328 | * 329 | * @param inputNode the node to start from 330 | * @param stringPart the string to add to the tree 331 | * @param rest the rest of the string 332 | * @param value the value to add to the index 333 | */ 334 | private Pair update(final Node inputNode, final String stringPart, final String rest, final int value) { 335 | Node s = inputNode; 336 | String tempstr = stringPart; 337 | char newChar = stringPart.charAt(stringPart.length() - 1); 338 | 339 | // line 1 340 | Node oldroot = root; 341 | 342 | // line 1b 343 | Pair ret = testAndSplit(s, tempstr.substring(0, tempstr.length() - 1), newChar, rest, value); 344 | 345 | Node r = ret.getSecond(); 346 | boolean endpoint = ret.getFirst(); 347 | 348 | Node leaf; 349 | // line 2 350 | while (!endpoint) { 351 | // line 3 352 | Edge tempEdge = r.getEdge(newChar); 353 | if (null != tempEdge) { 354 | // such a node is already present. This is one of the main differences from Ukkonen's case: 355 | // the tree can contain deeper nodes at this stage because different strings were added by previous iterations. 
356 | leaf = tempEdge.getDest(); 357 | } else { 358 | // must build a new leaf 359 | leaf = new Node(); 360 | leaf.addRef(value); 361 | Edge newedge = new Edge(rest, leaf); 362 | r.addEdge(newChar, newedge); 363 | } 364 | 365 | // update suffix link for newly created leaf 366 | if (activeLeaf != root) { 367 | activeLeaf.setSuffix(leaf); 368 | } 369 | activeLeaf = leaf; 370 | 371 | // line 4 372 | if (oldroot != root) { 373 | oldroot.setSuffix(r); 374 | } 375 | 376 | // line 5 377 | oldroot = r; 378 | 379 | // line 6 380 | if (null == s.getSuffix()) { // root node 381 | assert (root == s); 382 | // this is a special case to handle what is referred to as node _|_ on the paper 383 | tempstr = tempstr.substring(1); 384 | } else { 385 | Pair canret = canonize(s.getSuffix(), safeCutLastChar(tempstr)); 386 | s = canret.getFirst(); 387 | // use intern to ensure that tempstr is a reference from the string pool 388 | tempstr = (canret.getSecond() + tempstr.charAt(tempstr.length() - 1)).intern(); 389 | } 390 | 391 | // line 7 392 | ret = testAndSplit(s, safeCutLastChar(tempstr), newChar, rest, value); 393 | r = ret.getSecond(); 394 | endpoint = ret.getFirst(); 395 | 396 | } 397 | 398 | // line 8 399 | if (oldroot != root) { 400 | oldroot.setSuffix(r); 401 | } 402 | oldroot = root; 403 | 404 | return new Pair(s, tempstr); 405 | } 406 | 407 | Node getRoot() { 408 | return root; 409 | } 410 | 411 | private String safeCutLastChar(String seq) { 412 | if (seq.length() == 0) { 413 | return ""; 414 | } 415 | return seq.substring(0, seq.length() - 1); 416 | } 417 | 418 | public int computeCount() { 419 | return root.computeAndCacheCount(); 420 | } 421 | 422 | /** 423 | * An utility object, used to store the data returned by the GeneralizedSuffixTree GeneralizedSuffixTree.searchWithCount method. 424 | * It contains a collection of results and the total number of results present in the GST. 
425 | * @see GeneralizedSuffixTree#searchWithCount(java.lang.String, int) 426 | */ 427 | public static class ResultInfo { 428 | 429 | /** 430 | * The total number of results present in the database 431 | */ 432 | public int totalResults; 433 | /** 434 | * The collection of (some) results present in the GST 435 | */ 436 | public Collection results; 437 | 438 | public ResultInfo(Collection results, int totalResults) { 439 | this.totalResults = totalResults; 440 | this.results = results; 441 | } 442 | } 443 | 444 | /** 445 | * A private class used to return a tuples of two elements 446 | */ 447 | private class Pair { 448 | 449 | private final A first; 450 | private final B second; 451 | 452 | public Pair(A first, B second) { 453 | this.first = first; 454 | this.second = second; 455 | } 456 | 457 | public A getFirst() { 458 | return first; 459 | } 460 | 461 | public B getSecond() { 462 | return second; 463 | } 464 | } 465 | } 466 | --------------------------------------------------------------------------------