├── LICENSE ├── README.md ├── docs └── uberdoc.html ├── project.clj ├── src ├── clojure │ └── clj_fst │ │ ├── core.clj │ │ ├── enum.clj │ │ └── utils.clj └── java │ └── clj_fst │ └── CljUtils.java └── test └── clj_fst └── core_test.clj /LICENSE: -------------------------------------------------------------------------------- 1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC 2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 4 | 5 | 1. DEFINITIONS 6 | 7 | "Contribution" means: 8 | 9 | a) in the case of the initial Contributor, the initial code and 10 | documentation distributed under this Agreement, and 11 | 12 | b) in the case of each subsequent Contributor: 13 | 14 | i) changes to the Program, and 15 | 16 | ii) additions to the Program; 17 | 18 | where such changes and/or additions to the Program originate from and are 19 | distributed by that particular Contributor. A Contribution 'originates' from 20 | a Contributor if it was added to the Program by such Contributor itself or 21 | anyone acting on such Contributor's behalf. Contributions do not include 22 | additions to the Program which: (i) are separate modules of software 23 | distributed in conjunction with the Program under their own license 24 | agreement, and (ii) are not derivative works of the Program. 25 | 26 | "Contributor" means any person or entity that distributes the Program. 27 | 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are 29 | necessarily infringed by the use or sale of its Contribution alone or when 30 | combined with the Program. 31 | 32 | "Program" means the Contributions distributed in accordance with this 33 | Agreement. 34 | 35 | "Recipient" means anyone who receives the Program under this Agreement, 36 | including all Contributors. 37 | 38 | 2. 
GRANT OF RIGHTS 39 | 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 42 | reproduce, prepare derivative works of, publicly display, publicly perform, 43 | distribute and sublicense the Contribution of such Contributor, if any, and 44 | such derivative works, in source code and object code form. 45 | 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise 49 | transfer the Contribution of such Contributor, if any, in source code and 50 | object code form. This patent license shall apply to the combination of the 51 | Contribution and the Program if, at the time the Contribution is added by the 52 | Contributor, such addition of the Contribution causes such combination to be 53 | covered by the Licensed Patents. The patent license shall not apply to any 54 | other combinations which include the Contribution. No hardware per se is 55 | licensed hereunder. 56 | 57 | c) Recipient understands that although each Contributor grants the licenses 58 | to its Contributions set forth herein, no assurances are provided by any 59 | Contributor that the Program does not infringe the patent or other 60 | intellectual property rights of any other entity. Each Contributor disclaims 61 | any liability to Recipient for claims brought by any other entity based on 62 | infringement of intellectual property rights or otherwise. As a condition to 63 | exercising the rights and licenses granted hereunder, each Recipient hereby 64 | assumes sole responsibility to secure any other intellectual property rights 65 | needed, if any. 
For example, if a third party patent license is required to 66 | allow Recipient to distribute the Program, it is Recipient's responsibility 67 | to acquire that license before distributing the Program. 68 | 69 | d) Each Contributor represents that to its knowledge it has sufficient 70 | copyright rights in its Contribution, if any, to grant the copyright license 71 | set forth in this Agreement. 72 | 73 | 3. REQUIREMENTS 74 | 75 | A Contributor may choose to distribute the Program in object code form under 76 | its own license agreement, provided that: 77 | 78 | a) it complies with the terms and conditions of this Agreement; and 79 | 80 | b) its license agreement: 81 | 82 | i) effectively disclaims on behalf of all Contributors all warranties and 83 | conditions, express and implied, including warranties or conditions of title 84 | and non-infringement, and implied warranties or conditions of merchantability 85 | and fitness for a particular purpose; 86 | 87 | ii) effectively excludes on behalf of all Contributors all liability for 88 | damages, including direct, indirect, special, incidental and consequential 89 | damages, such as lost profits; 90 | 91 | iii) states that any provisions which differ from this Agreement are offered 92 | by that Contributor alone and not by any other party; and 93 | 94 | iv) states that source code for the Program is available from such 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on 96 | or through a medium customarily used for software exchange. 97 | 98 | When the Program is made available in source code form: 99 | 100 | a) it must be made available under this Agreement; and 101 | 102 | b) a copy of this Agreement must be included with each copy of the Program. 103 | 104 | Contributors may not remove or alter any copyright notices contained within 105 | the Program. 
106 | 107 | Each Contributor must identify itself as the originator of its Contribution, 108 | if any, in a manner that reasonably allows subsequent Recipients to identify 109 | the originator of the Contribution. 110 | 111 | 4. COMMERCIAL DISTRIBUTION 112 | 113 | Commercial distributors of software may accept certain responsibilities with 114 | respect to end users, business partners and the like. While this license is 115 | intended to facilitate the commercial use of the Program, the Contributor who 116 | includes the Program in a commercial product offering should do so in a 117 | manner which does not create potential liability for other Contributors. 118 | Therefore, if a Contributor includes the Program in a commercial product 119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend 120 | and indemnify every other Contributor ("Indemnified Contributor") against any 121 | losses, damages and costs (collectively "Losses") arising from claims, 122 | lawsuits and other legal actions brought by a third party against the 123 | Indemnified Contributor to the extent caused by the acts or omissions of such 124 | Commercial Contributor in connection with its distribution of the Program in 125 | a commercial product offering. The obligations in this section do not apply 126 | to any claims or Losses relating to any actual or alleged intellectual 127 | property infringement. In order to qualify, an Indemnified Contributor must: 128 | a) promptly notify the Commercial Contributor in writing of such claim, and 129 | b) allow the Commercial Contributor tocontrol, and cooperate with the 130 | Commercial Contributor in, the defense and any related settlement 131 | negotiations. The Indemnified Contributor may participate in any such claim 132 | at its own expense. 133 | 134 | For example, a Contributor might include the Program in a commercial product 135 | offering, Product X. That Contributor is then a Commercial Contributor. 
If 136 | that Commercial Contributor then makes performance claims, or offers 137 | warranties related to Product X, those performance claims and warranties are 138 | such Commercial Contributor's responsibility alone. Under this section, the 139 | Commercial Contributor would have to defend claims against the other 140 | Contributors related to those performance claims and warranties, and if a 141 | court requires any other Contributor to pay any damages as a result, the 142 | Commercial Contributor must pay those damages. 143 | 144 | 5. NO WARRANTY 145 | 146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON 147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER 148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR 149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A 150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the 151 | appropriateness of using and distributing the Program and assumes all risks 152 | associated with its exercise of rights under this Agreement , including but 153 | not limited to the risks and costs of program errors, compliance with 154 | applicable laws, damage to or loss of data, programs or equipment, and 155 | unavailability or interruption of operations. 156 | 157 | 6. DISCLAIMER OF LIABILITY 158 | 159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION 162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE 165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY 166 | OF SUCH DAMAGES. 167 | 168 | 7. 
GENERAL 169 | 170 | If any provision of this Agreement is invalid or unenforceable under 171 | applicable law, it shall not affect the validity or enforceability of the 172 | remainder of the terms of this Agreement, and without further action by the 173 | parties hereto, such provision shall be reformed to the minimum extent 174 | necessary to make such provision valid and enforceable. 175 | 176 | If Recipient institutes patent litigation against any entity (including a 177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself 178 | (excluding combinations of the Program with other software or hardware) 179 | infringes such Recipient's patent(s), then such Recipient's rights granted 180 | under Section 2(b) shall terminate as of the date such litigation is filed. 181 | 182 | All Recipient's rights under this Agreement shall terminate if it fails to 183 | comply with any of the material terms or conditions of this Agreement and 184 | does not cure such failure in a reasonable period of time after becoming 185 | aware of such noncompliance. If all Recipient's rights under this Agreement 186 | terminate, Recipient agrees to cease use and distribution of the Program as 187 | soon as reasonably practicable. However, Recipient's obligations under this 188 | Agreement and any licenses granted by Recipient relating to the Program shall 189 | continue and survive. 190 | 191 | Everyone is permitted to copy and distribute copies of this Agreement, but in 192 | order to avoid inconsistency the Agreement is copyrighted and may only be 193 | modified in the following manner. The Agreement Steward reserves the right to 194 | publish new versions (including revisions) of this Agreement from time to 195 | time. No one other than the Agreement Steward has the right to modify this 196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. 
The 197 | Eclipse Foundation may assign the responsibility to serve as the Agreement 198 | Steward to a suitable separate entity. Each new version of the Agreement will 199 | be given a distinguishing version number. The Program (including 200 | Contributions) may always be distributed subject to the version of the 201 | Agreement under which it was received. In addition, after a new version of 202 | the Agreement is published, Contributor may elect to distribute the Program 203 | (including its Contributions) under the new version. Except as expressly 204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or 205 | licenses to the intellectual property of any Contributor under this 206 | Agreement, whether expressly, by implication, estoppel or otherwise. All 207 | rights in the Program not expressly granted under this Agreement are 208 | reserved. 209 | 210 | This Agreement is governed by the laws of the State of New York and the 211 | intellectual property laws of the United States of America. No party to this 212 | Agreement will bring a legal action under this Agreement more than one year 213 | after the cause of action arose. Each party waives its rights to a jury trial 214 | in any resulting litigation. 215 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # clj-fst 2 | 3 | This Clojure FST implementation is a wrapper above the Lucene FST package which is part of Lucene core. 4 | 5 | Finite state transducers are finite state machines with two tapes: an input and an output tape. The automaton map an input string to an output. The output can be another string or an integer. 6 | 7 | The FST produced by this application are implemented as a bytes array which makes FST really effective indexes in terms of speed and memory consumption. 
In fact, a 10-million-term index takes roughly 256 MB of memory (depending on the composition of the input strings, and on whether the outputs are integers or strings). 8 | 9 | `clj-fst` is a lightning-fast and memory-efficient way to figure out whether something belongs to a really huge set of things, or to get the output of an input. This is really simple but has profound implications. 10 | 11 | ## Installation 12 | 13 | ### Using Leiningen 14 | 15 | You can easily install `clj-fst` using Leiningen. The only thing you have to do is to add `[clj-fst "0.1.0"]` as a dependency to your `project.clj`. 16 | 17 | Then make sure that you downloaded this dependency by running the `lein deps` command. 18 | 19 | ## Documentation 20 | 21 | [The complete `clj-fst` documentation is available here.](http://structureddynamics.github.io/clj-fst/) 22 | 23 | ## Usage 24 | 25 | Here is how you can create, populate and save a FST: 26 | 27 | ```clojure 28 | ;; The first thing to do is to create the Builder 29 | (def builder (create-builder! :type :int)) 30 | 31 | ;; This small `sorted-map` defines the things 32 | ;; to add to the FST 33 | (def values (into (sorted-map) {"cat" 1 34 | "dog" 2 35 | "mice" 3})) 36 | 37 | ;; Populate the FST using that `sorted-map` 38 | (doseq [[input output] values] 39 | (add! builder {input output})) 40 | 41 | ;; Creating a new FST 42 | (def fst (create-fst! builder)) 43 | 44 | ;; Save a FST on the file system 45 | (save! "resources/fst.srz" fst) 46 | ``` 47 | 48 | Additionally you can load a previously saved FST: 49 | 50 | ```clojure 51 | ;; Load a FST from the file system (`:output-type` must match the saved FST, here `:int`) 52 | (load! "resources/fst.srz" :output-type :int) 53 | ``` 54 | 55 | You can query a FST to get the output related to an input: 56 | 57 | ```clojure 58 | ;; Query the FST 59 | (get-output "cat" fst) 60 | ``` 61 | 62 | You can iterate over a FST using FST enumerations: 63 | 64 | ```clojure 65 | ;; Create the FST enumeration 66 | (def enum (create-enum! 
fst))

;; Get the first item in the FST
(next! enum)

;; Get the current FST item pointed by the enumerator
(current! enum)

;; Search for different input terms
(get-ceil-term! "cat" enum)

(get-floor-term! "cat" enum)

(get-exact-term! "cat" enum)
```

--------------------------------------------------------------------------------
/project.clj:
--------------------------------------------------------------------------------
;; Leiningen project definition for clj-fst: Clojure sources live under
;; `src/clojure`, the Java helper class under `src/java`.
(defproject clj-fst "0.1.2"
  :description "Finite State Transducers (FST) for Clojure"
  :url "https://github.com/structureddynamics/clj-fst"
  :license {:name "Eclipse Public License"
            :url "http://www.eclipse.org/legal/epl-v10.html"}
  :dependencies [[org.clojure/clojure "1.9.0"]
                 [org.apache.lucene/lucene-core "7.3.1"]
                 [org.apache.lucene/lucene-misc "7.3.1"]]
  ;; lein-marginalia is a Leiningen plugin (documentation generator). Declared
  ;; under `:plugins` it stays a build-time tool instead of being pulled in as
  ;; a transitive runtime dependency by every consumer of the library.
  :plugins [[lein-marginalia "0.9.1"]]
  :source-paths ["src/clojure"]
  :java-source-paths ["src/java"]
  :target-path "target/%s"
  :marginalia {:exclude ["utils.clj"]})

--------------------------------------------------------------------------------
/src/clojure/clj_fst/core.clj:
--------------------------------------------------------------------------------
;; # Clojure Finite State Transducer (FST)
;;
;; This Clojure FST implementation is a wrapper above the Lucene FST package which is part of Lucene core.
;;
;; Finite state transducers are finite state machines with two tapes: an input and an output tape. The automaton
;; maps an input string to an output. The output can be another string or an integer.
;;
;; The FSTs produced by this application are implemented as a byte array which makes FSTs really effective
;; indexes in terms of speed and memory consumption.
;; In fact, a 10-million-term index takes roughly
;; 256 MB of memory (depending on the composition of the input strings, and on whether the outputs are
;; integers or strings).
;;
;; ## Limitations
;;
;; The main limitation is that a FST index cannot be updated once it is created. This means that it cannot
;; evolve over time. If you want to add or remove inputs/outputs, then you have to re-create the FST
;; entirely.
;;

(ns clj-fst.core
  (:use [clj-fst.utils])
  (:refer-clojure :exclude [load])
  (:import (org.apache.lucene.util.fst PositiveIntOutputs CharSequenceOutputs ListOfOutputs Builder FST Util)
           (org.apache.lucene.util BytesRef BytesRefBuilder)
           (org.apache.lucene.util IntsRef IntsRefBuilder)
           (org.apache.lucene.util CharsRef CharsRefBuilder)))

;; The utility functions are defined at the bottom of this file but are used by
;; the functions above them, hence the forward declarations.
(declare int-outputs char-outputs chars-ref chars-ref-builder ints-ref ints-ref-builder bytes-ref bytes-ref-builder builder!)

;; ## Building and Populating a FST
;;
;; The creation of a new FST is a 3 steps process:
;;
;; 1. Create a new builder using `(create-builder!)`. The builder is used to populate the index with
;;    `<input, output>` tuples
;; 2. Populate the index using `(add!)`
;; 3. Create the FST using `(create-fst!)`
;;
;; ### Builder Creation
;;
;; The first step is to create the FST's builder by using the `(create-builder!)` function. There are two types of
;; outputs currently supported by this wrapper:
;;
;; 1. integers
;; 2. unicode strings
;;
;; Here is some code that shows you how to create the builder:
;;
;;     ;; Create a new FST builder with `:char` outputs
;;     (def builder (create-builder! :type :char))
;;
;;     ;; Create a new FST builder with `:int` outputs
;;     (def builder (create-builder! :type :int))
;;

(defn create-builder!
  "Create a new FST builder map.

  * `[type]` *(optional)*: Output type of the FST. Can be `:int` or `:char` (default)

  Returns a map holding the Lucene builder under `:builder` and the output
  type under `:type`. This map is what `(add!)` and `(create-fst!)` expect."
  [& {:keys [type]
      :or {type :char}}]
  {:builder (builder! type)
   :type type})

;; ### Populating the FST
;;
;; Populating the FST with `<input, output>` tuples is quite simple. The only thing you have to do
;; once your builder is created, is to add the tuples iteratively by calling the `(add!)` function
;; multiple times.
;;
;; However, there is quite an important thing to keep in mind:
;;
;; **You have to sort the index you want to create by their input keys**
;;
;; If you fail to perform this step, then you will end up with unexpected results.
;;
;; Here is a code example that will populate a FST using a `sorted-map`:
;;
;;     ;; The first thing to do is to create the Builder
;;     (def builder (create-builder! :type :int))
;;
;;     ;; This small `sorted-map` defines the things
;;     ;; to add to the FST
;;     (def values (into (sorted-map) {"cat" 1
;;                                     "dog" 2
;;                                     "mice" 3}))
;;
;;     ;; Populate the FST using that `sorted-map`
;;     (doseq [[input output] values]
;;       (add! builder {input output}))
;;
;; What this code shows you is how you can iteratively
;; populate the FST. However, if you already have a single
;; `sorted-map` that does have all the `<input, output>`
;; tuples that you want in your FST, then you can simply
;; use `(add!)` this way:
;;
;;     ;; Populate the FST using that `sorted-map`
;;     (add! builder values)
;;
;;

(defn add!
  "Populate a FST with `<input, output>` tuples. This function can be called iteratively
  multiple times before the `(create-fst!)` function is called to actually create the
  FST.

  * `[builder]`: builder map (as returned by `(create-builder!)`) used to populate the FST
  * `[values]`: map of the inputs->outputs. The keys of the map are the inputs,
                and their values are the outputs. For a `:char` builder each output is
                handed to the `CharsRef` constructor; for an `:int` builder it is handed
                to the Lucene builder as-is.

  Returns `nil`.

  **Note:** if `(add!)` is used iteratively, then you have to make sure that the
  structure it iterates over has been previously sorted by the input keys."
  [builder values]
  (let [scratch-bytes-builder (bytes-ref-builder)
        scratch-ints (ints-ref-builder)
        fst-builder (builder :builder)]
    (doseq [[word index] values]
      (case (builder :type)
        :int (do
               ;; `copyChars` *replaces* the scratch content with the UTF-8 bytes of
               ;; `word`. Appending to the shared scratch buffer instead would keep the
               ;; bytes of the previous keys, so every entry after the first one would
               ;; be indexed under the concatenation of all the keys seen so far.
               (.copyChars scratch-bytes-builder word)
               (.add fst-builder
                     (. Util toIntsRef (.get scratch-bytes-builder) scratch-ints)
                     index))
        :char (.add fst-builder
                    (. Util toUTF16 word scratch-ints)
                    (new CharsRef index))))))

;; ### FST Creation
;;
;; Once the builder is created and populated with the inputs and outputs, then the final
;; step is to create the actual FST. Once the FST is created, there is no way to add
;; or remove any inputs/outputs. If this becomes necessary, the FST needs to be re-created.
;;
;; Creating the FST is as simple as calling `(create-fst!)`:
;;
;;     ;; Creating a new FST
;;     (def fst (create-fst! builder))

(defn create-fst!
  "Create a new FST based on a builder that has been populated with inputs/outputs

  * `[builder]`: builder map that has been created with `(create-builder!)` and
                 populated with `(add!)`

  Returns whatever the Lucene builder's `finish` returns.
  NOTE(review): Lucene documents that `finish` returns null when nothing was
  added to the builder - confirm callers handle a nil FST."
  [builder]
  (.finish (builder :builder)))

;; ### Querying the FST
;;
;; Now that we have a fully operational FST, the end goal is to be able to use it.
;; What we want to do is to know if there exists an output for a given input, and if
;; there is one to return the output associated with the input. Finally, if there is
;; no output for a given input, we want to get a `nil` value.
;;
;; Querying the FST is as simple as:
;;
;;     ;; Query the FST
;;     (get-output "cat" fst)
;;
;; This will return the output of the input string `"cat"`
;;

(defn get-output
  "Get the output for a given input.

  * `[input]`: input for which we want its output
  * `[fst]`: FST object where to look for the output

  Returns the outputs as a vector of strings (see `process-output`), or `nil`
  when the FST has no entry for `input`."
  [input fst]
  ;; NOTE(review): the lookup key is made of UTF-16 code units, whereas `(add!)`
  ;; indexes `:int` FSTs by UTF-8 bytes. Both agree for ASCII inputs - confirm
  ;; the behaviour for non-ASCII inputs on `:int` FSTs.
  (let [result (. Util get fst (. Util toUTF16 input (ints-ref-builder)))]
    (when (some? result)
      (process-output result))))

;; ### Loading and Saving FST
;;
;; It is possible to save FST on the file system. That way, it is
;; possible to reload a FST from the file system when your
;; application starts.
;;
;;     ;; Save a FST on the file system
;;     (save! "resources/fst.srz" fst)
;;
;;     ;; Load a FST from the file system
;;     (load! "resources/fst.srz")

(defn save!
  "Save a FST to a file on the file system

  * `[file]` is the file path on the file system
  * `[fst]` is the FST instance"
  [file fst]
  (.save fst (.toPath (clojure.java.io/file file))))

(defn load!
  "Load a FST from a file on the file system

  * `[file]` is the file path on the file system
  * `[output-type]` *(optional)*: `:char` (default) when the outputs of the FST
                    file are characters, `:int` when the outputs of the FST file
                    are integers. It should match the type the saved FST was
                    built with.

  Returns the loaded FST"
  [file & {:keys [output-type]
           :or {output-type :char}}]
  (let [outputs (if (= output-type :int)
                  (int-outputs)
                  (char-outputs))]
    (. FST read (.toPath (clojure.java.io/file file)) outputs)))

;; ## Utility functions
;;
;; This section lists a series of utility functions used by the
;; core `clj-fst` functions

(defn builder!
  "Create a builder object.

  You can directly use this function instead of the `(create-builder!)` function
  if you require really specific settings.

  * `[type]`: type of the output. `:int` creates a builder with integer outputs;
              any other value creates a builder with character outputs (`:char`)
  * `[min-suffix-count-1]` (optional): If pruning the input graph during construction, this threshold is used for telling
                                       if a node is kept or pruned. If transition_count(node) >= minSuffixCount1, the node
                                       is kept.
  * `[min-suffix-count-2]` (optional): (Note: only Mike McCandless knows what this one is really doing...)
  * `[share-suffix]` (optional): If true, the shared suffixes will be compacted into unique paths. This requires an
                                 additional RAM-intensive hash map for lookups in memory. Setting this parameter to
                                 false creates a single suffix path for all input sequences. This will result in a
                                 larger FST, but requires substantially less memory and CPU during building.
  * `[share-non-singleton-nodes]` (optional): Only used if share-suffix is true. Set this to true to ensure FST is
                                              fully minimal, at cost of more CPU and more RAM during building.
  * `[share-max-tail-length]` (optional): Only used if share-suffix is true. Set this to
                                          Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more
                                          CPU and more RAM during building.
  * `[allow-array-arcs]` (optional): Pass false to disable the array arc optimization while building the FST;
                                     this will make the resulting FST smaller but slower to traverse.
  * `[bytes-page-bits]` (optional): How many bits wide to make each byte[] block in the BytesStore; if you know
                                    the FST will be large then make this larger. For example 15 bits = 32768
                                    byte pages.

  The `:pack-fst` and `:acceptable-overhead-ratio` options can still be passed
  but are ignored: they are not handed to the Lucene `Builder` constructor."
  [type & {:keys [min-suffix-count-1
                  min-suffix-count-2
                  share-suffix
                  share-non-singleton-nodes
                  share-max-tail-length
                  allow-array-arcs
                  bytes-page-bits]
           :or {min-suffix-count-1 0
                min-suffix-count-2 0
                share-suffix true
                share-non-singleton-nodes true
                share-max-tail-length Integer/MAX_VALUE
                allow-array-arcs true
                bytes-page-bits 15}}]
  ;; Only the input type and the outputs implementation depend on `type`;
  ;; every other constructor argument is shared by both kinds of builder.
  (let [int-output? (= type :int)
        input-type (if int-output?
                     org.apache.lucene.util.fst.FST$INPUT_TYPE/BYTE1
                     org.apache.lucene.util.fst.FST$INPUT_TYPE/BYTE4)
        outputs (if int-output?
                  (int-outputs)
                  (char-outputs))]
    (Builder. input-type
              min-suffix-count-1
              min-suffix-count-2
              share-suffix
              share-non-singleton-nodes
              share-max-tail-length
              outputs
              allow-array-arcs
              bytes-page-bits)))

(defn int-outputs
  "Create a ListOfOutputs wrapping the PositiveIntOutputs singleton"
  []
  (ListOfOutputs. (PositiveIntOutputs/getSingleton)))

(defn char-outputs
  "Create a ListOfOutputs wrapping the CharSequenceOutputs singleton"
  []
  (ListOfOutputs. (CharSequenceOutputs/getSingleton)))

(defn bytes-ref
  "Create an empty BytesRef"
  []
  (BytesRef.))

(defn bytes-ref-builder
  "Create an empty BytesRefBuilder"
  []
  (BytesRefBuilder.))

(defn ints-ref
  "Create an empty IntsRef"
  []
  (IntsRef.))

(defn ints-ref-builder
  "Create an empty IntsRefBuilder"
  []
  (IntsRefBuilder.))

(defn chars-ref
  "Create an empty CharsRef"
  []
  (CharsRef.))

(defn chars-ref-builder
  "Create an empty CharsRefBuilder"
  []
  (CharsRefBuilder.))

--------------------------------------------------------------------------------
/src/clojure/clj_fst/enum.clj:
--------------------------------------------------------------------------------
;; # FST Enumerations
;;
;; FST enumerations are used to search for the existence of inputs in the FST.
;; The first thing you have to do is to create an enumeration from a FST using
;; the `create-enum!` function. Then you have a series of utility functions
;; that you can use to seek for a given input term.

(ns clj-fst.enum
  (:use [clj-fst.core]
        [clj-fst.utils])
  (:import (org.apache.lucene.util.fst BytesRefFSTEnum
                                       IntsRefFSTEnum
                                       Util)
           (org.apache.lucene.util BytesRef BytesRefBuilder
                                   IntsRef IntsRefBuilder)
           (org.apache.lucene.util.clj CljUtils)))

;; ## Enumeration Creation
;;
;; Before creating an enumeration, you have to have a FST. Once your FST
;; is created, you can create its enumeration instance. Creating an enum
;; is as simple as:
;;
;;     (def enum (create-enum! fst))
;;

(defn create-enum!
  "Create an enumeration of all the tuples that
   compose the FST. Returns nil when `fst` is nil."
  [fst]
  (when fst
    (IntsRefFSTEnum. fst)))

;; ## Iterating & Searching the Enumeration
;;
;; There are two things you can do with the Enumerations API:
;;
;;   1. Iterating over the content of the FST
;;   2. Searching over the content of the FST
;;

;; ### Iterating the content of a FST
;;
;; To iterate the content of the FST, you have to use the
;; `(next!)` function. What this function does is to return
;; the next `<input, output>` tuple of the FST:
;;
;;     (next! enum)
;;
;; will return the tuple in the form of a Clojure map:
;;
;;     {:input "some input", :output ["some output"]}
;;
;; then you can always use the `(current!)` function to
;; return the current tuple pointed by the internal enumeration
;; pointer without moving it to the next tuple:
;;
;;     (current! enum)
;;
;;

(defn current!
  "Returns the tuple currently pointed by the enumeration, as a map with the
   input string under `:input` and the vector of outputs under `:output`.
   Returns nil when `enum` is nil or when there is no current tuple."
  [enum]
  (when enum
    (when-let [input-output (.current enum)]
      {:input (CljUtils/inputToString (.input input-output) true)
       ;; `process-output` always returns a vector, no extra wrapping needed
       :output (process-output (.output input-output))})))

(defn next!
  "Moves the enumeration to its next tuple and returns it, as a map with the
   input string under `:input` and the vector of outputs under `:output`.
   Returns nil when `enum` is nil or when the enumeration is exhausted."
  [enum]
  (when enum
    ;; `.next` returns null once every tuple has been visited; without this
    ;; guard the end of the iteration ends in a NullPointerException.
    (when-let [input-output (.next enum)]
      {:input (CljUtils/inputToString (.input input-output) true)
       ;; `process-output` always returns a vector, no extra wrapping needed
       :output (process-output (.output input-output))})))

;; ### Searching the content of a FST
;;
;; Three functions will let you search the content of a FST: `(get-ceil-term!)`,
;; `(get-floor-term!)`, `(get-exact-term!)`. These functions will search the
;; FST in different ways. They will move the internal pointer to whatever input
;; they find.
;; This means that if you call the `(current!)` function right after one of
;; these functions, the same result will be returned because the internal
;; pointer got moved.

(defn- seek-result->map
  "Turn the tuple returned by one of the enumeration's seek methods into a map
   with the input string under `:input` and the vector of outputs under
   `:output`. Returns nil when the seek did not find anything."
  [input-output]
  (when input-output
    {:input (CljUtils/inputToString (.input input-output) true)
     ;; `process-output` always returns a vector, no extra wrapping needed
     :output (process-output (.output input-output))}))

(defn get-ceil-term!
  "Returns the smallest term that is greater or equal to the input term, nil otherwise."
  [input enum]
  (when (and enum input)
    (seek-result->map
      (.seekCeil enum (. Util toUTF16 input (ints-ref-builder))))))

(defn get-floor-term!
  "Returns the biggest term that is smaller or equal to the input term, nil otherwise."
  [input enum]
  (when (and enum input)
    (seek-result->map
      (.seekFloor enum (. Util toUTF16 input (ints-ref-builder))))))

(defn get-exact-term!
  "Returns the term if the exact input term exists, nil otherwise."
  [input enum]
  (when (and enum input)
    (seek-result->map
      (.seekExact enum (. Util toUTF16 input (ints-ref-builder))))))

--------------------------------------------------------------------------------
/src/clojure/clj_fst/utils.clj:
--------------------------------------------------------------------------------
(ns clj-fst.utils)

(defn process-output
  "Process the output of a FST lookup or enumeration to generate a vector of outputs.

  * `[output]`: a single output object, or a `java.util.ArrayList` of them

  Returns a vector with the `toString` rendering of every non-nil output. The
  vector is empty when `output` is nil."
  [output]
  (let [outputs (if (instance? java.util.ArrayList output)
                  output
                  [output])]
    (->> outputs
         (remove nil?)
         (mapv str))))

--------------------------------------------------------------------------------
/src/java/clj_fst/CljUtils.java:
--------------------------------------------------------------------------------
package org.apache.lucene.util.clj;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;

/** Helper class to test FSTs. */
public class CljUtils {

  /**
   * Render an FST input term as a string.
   *
   * @param term the input term, as stored in the FST
   * @param isValidUnicode when true the ints of the term are handed to
   *        {@code UnicodeUtil.newString} to build the string; when false the
   *        {@code toString()} rendering of the term is returned instead
   * @return the string rendering of the term
   */
  public static String inputToString(IntsRef term, boolean isValidUnicode) {
    if (!isValidUnicode) {
      return term.toString();
    } else {
      // utf8
      //return toBytesRef(term).utf8ToString() + " " + term;
      return UnicodeUtil.newString(term.ints, term.offset, term.length);
    }
  }

  /**
   * Copy the ints of a term into a new BytesRef, one byte per int. Every int
   * is expected to fit in an unsigned byte (checked by an assertion only).
   * Currently only referenced from the commented-out code above.
   */
  private static BytesRef toBytesRef(IntsRef ir) {
    BytesRef br = new BytesRef(ir.length);
    for (int i = 0; i < ir.length; i++) {
      int x = ir.ints[ir.offset + i];
      assert x >= 0 && x <= 255;
      br.bytes[i] = (byte) x;
    }
    br.length = ir.length;
    return br;
  }
}

--------------------------------------------------------------------------------
/test/clj_fst/core_test.clj:
--------------------------------------------------------------------------------
(ns clj-fst.core-test
  (:require [clojure.test :refer :all]
            [clj-fst.core :refer :all]))

;; Smoke test of the build -> query round trip documented in the README. The
;; tuples are added one at a time, in sorted order, as the README does.
(deftest int-fst-lookup-test
  (testing "an :int FST returns the outputs of known inputs and nil otherwise"
    (let [builder (create-builder! :type :int)]
      (doseq [[input output] (sorted-map "cat" 1 "dog" 2 "mice" 3)]
        (add! builder {input output}))
      (let [fst (create-fst! builder)]
        (is (= ["1"] (get-output "cat" fst)))
        (is (= ["3"] (get-output "mice" fst)))
        (is (nil? (get-output "bird" fst)))))))

--------------------------------------------------------------------------------