├── LICENSE
├── README.md
├── docs
└── uberdoc.html
├── project.clj
├── src
├── clojure
│ └── clj_fst
│ │ ├── core.clj
│ │ ├── enum.clj
│ │ └── utils.clj
└── java
│ └── clj_fst
│ └── CljUtils.java
└── test
└── clj_fst
└── core_test.clj
/LICENSE:
--------------------------------------------------------------------------------
1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC
2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
4 |
5 | 1. DEFINITIONS
6 |
7 | "Contribution" means:
8 |
9 | a) in the case of the initial Contributor, the initial code and
10 | documentation distributed under this Agreement, and
11 |
12 | b) in the case of each subsequent Contributor:
13 |
14 | i) changes to the Program, and
15 |
16 | ii) additions to the Program;
17 |
18 | where such changes and/or additions to the Program originate from and are
19 | distributed by that particular Contributor. A Contribution 'originates' from
20 | a Contributor if it was added to the Program by such Contributor itself or
21 | anyone acting on such Contributor's behalf. Contributions do not include
22 | additions to the Program which: (i) are separate modules of software
23 | distributed in conjunction with the Program under their own license
24 | agreement, and (ii) are not derivative works of the Program.
25 |
26 | "Contributor" means any person or entity that distributes the Program.
27 |
28 | "Licensed Patents" mean patent claims licensable by a Contributor which are
29 | necessarily infringed by the use or sale of its Contribution alone or when
30 | combined with the Program.
31 |
32 | "Program" means the Contributions distributed in accordance with this
33 | Agreement.
34 |
35 | "Recipient" means anyone who receives the Program under this Agreement,
36 | including all Contributors.
37 |
38 | 2. GRANT OF RIGHTS
39 |
40 | a) Subject to the terms of this Agreement, each Contributor hereby grants
41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to
42 | reproduce, prepare derivative works of, publicly display, publicly perform,
43 | distribute and sublicense the Contribution of such Contributor, if any, and
44 | such derivative works, in source code and object code form.
45 |
46 | b) Subject to the terms of this Agreement, each Contributor hereby grants
47 | Recipient a non-exclusive, worldwide, royalty-free patent license under
48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise
49 | transfer the Contribution of such Contributor, if any, in source code and
50 | object code form. This patent license shall apply to the combination of the
51 | Contribution and the Program if, at the time the Contribution is added by the
52 | Contributor, such addition of the Contribution causes such combination to be
53 | covered by the Licensed Patents. The patent license shall not apply to any
54 | other combinations which include the Contribution. No hardware per se is
55 | licensed hereunder.
56 |
57 | c) Recipient understands that although each Contributor grants the licenses
58 | to its Contributions set forth herein, no assurances are provided by any
59 | Contributor that the Program does not infringe the patent or other
60 | intellectual property rights of any other entity. Each Contributor disclaims
61 | any liability to Recipient for claims brought by any other entity based on
62 | infringement of intellectual property rights or otherwise. As a condition to
63 | exercising the rights and licenses granted hereunder, each Recipient hereby
64 | assumes sole responsibility to secure any other intellectual property rights
65 | needed, if any. For example, if a third party patent license is required to
66 | allow Recipient to distribute the Program, it is Recipient's responsibility
67 | to acquire that license before distributing the Program.
68 |
69 | d) Each Contributor represents that to its knowledge it has sufficient
70 | copyright rights in its Contribution, if any, to grant the copyright license
71 | set forth in this Agreement.
72 |
73 | 3. REQUIREMENTS
74 |
75 | A Contributor may choose to distribute the Program in object code form under
76 | its own license agreement, provided that:
77 |
78 | a) it complies with the terms and conditions of this Agreement; and
79 |
80 | b) its license agreement:
81 |
82 | i) effectively disclaims on behalf of all Contributors all warranties and
83 | conditions, express and implied, including warranties or conditions of title
84 | and non-infringement, and implied warranties or conditions of merchantability
85 | and fitness for a particular purpose;
86 |
87 | ii) effectively excludes on behalf of all Contributors all liability for
88 | damages, including direct, indirect, special, incidental and consequential
89 | damages, such as lost profits;
90 |
91 | iii) states that any provisions which differ from this Agreement are offered
92 | by that Contributor alone and not by any other party; and
93 |
94 | iv) states that source code for the Program is available from such
95 | Contributor, and informs licensees how to obtain it in a reasonable manner on
96 | or through a medium customarily used for software exchange.
97 |
98 | When the Program is made available in source code form:
99 |
100 | a) it must be made available under this Agreement; and
101 |
102 | b) a copy of this Agreement must be included with each copy of the Program.
103 |
104 | Contributors may not remove or alter any copyright notices contained within
105 | the Program.
106 |
107 | Each Contributor must identify itself as the originator of its Contribution,
108 | if any, in a manner that reasonably allows subsequent Recipients to identify
109 | the originator of the Contribution.
110 |
111 | 4. COMMERCIAL DISTRIBUTION
112 |
113 | Commercial distributors of software may accept certain responsibilities with
114 | respect to end users, business partners and the like. While this license is
115 | intended to facilitate the commercial use of the Program, the Contributor who
116 | includes the Program in a commercial product offering should do so in a
117 | manner which does not create potential liability for other Contributors.
118 | Therefore, if a Contributor includes the Program in a commercial product
119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend
120 | and indemnify every other Contributor ("Indemnified Contributor") against any
121 | losses, damages and costs (collectively "Losses") arising from claims,
122 | lawsuits and other legal actions brought by a third party against the
123 | Indemnified Contributor to the extent caused by the acts or omissions of such
124 | Commercial Contributor in connection with its distribution of the Program in
125 | a commercial product offering. The obligations in this section do not apply
126 | to any claims or Losses relating to any actual or alleged intellectual
127 | property infringement. In order to qualify, an Indemnified Contributor must:
128 | a) promptly notify the Commercial Contributor in writing of such claim, and
129 | b) allow the Commercial Contributor to control, and cooperate with the
130 | Commercial Contributor in, the defense and any related settlement
131 | negotiations. The Indemnified Contributor may participate in any such claim
132 | at its own expense.
133 |
134 | For example, a Contributor might include the Program in a commercial product
135 | offering, Product X. That Contributor is then a Commercial Contributor. If
136 | that Commercial Contributor then makes performance claims, or offers
137 | warranties related to Product X, those performance claims and warranties are
138 | such Commercial Contributor's responsibility alone. Under this section, the
139 | Commercial Contributor would have to defend claims against the other
140 | Contributors related to those performance claims and warranties, and if a
141 | court requires any other Contributor to pay any damages as a result, the
142 | Commercial Contributor must pay those damages.
143 |
144 | 5. NO WARRANTY
145 |
146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON
147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR
149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A
150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the
151 | appropriateness of using and distributing the Program and assumes all risks
152 | associated with its exercise of rights under this Agreement , including but
153 | not limited to the risks and costs of program errors, compliance with
154 | applicable laws, damage to or loss of data, programs or equipment, and
155 | unavailability or interruption of operations.
156 |
157 | 6. DISCLAIMER OF LIABILITY
158 |
159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY
160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL,
161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION
162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE
165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY
166 | OF SUCH DAMAGES.
167 |
168 | 7. GENERAL
169 |
170 | If any provision of this Agreement is invalid or unenforceable under
171 | applicable law, it shall not affect the validity or enforceability of the
172 | remainder of the terms of this Agreement, and without further action by the
173 | parties hereto, such provision shall be reformed to the minimum extent
174 | necessary to make such provision valid and enforceable.
175 |
176 | If Recipient institutes patent litigation against any entity (including a
177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself
178 | (excluding combinations of the Program with other software or hardware)
179 | infringes such Recipient's patent(s), then such Recipient's rights granted
180 | under Section 2(b) shall terminate as of the date such litigation is filed.
181 |
182 | All Recipient's rights under this Agreement shall terminate if it fails to
183 | comply with any of the material terms or conditions of this Agreement and
184 | does not cure such failure in a reasonable period of time after becoming
185 | aware of such noncompliance. If all Recipient's rights under this Agreement
186 | terminate, Recipient agrees to cease use and distribution of the Program as
187 | soon as reasonably practicable. However, Recipient's obligations under this
188 | Agreement and any licenses granted by Recipient relating to the Program shall
189 | continue and survive.
190 |
191 | Everyone is permitted to copy and distribute copies of this Agreement, but in
192 | order to avoid inconsistency the Agreement is copyrighted and may only be
193 | modified in the following manner. The Agreement Steward reserves the right to
194 | publish new versions (including revisions) of this Agreement from time to
195 | time. No one other than the Agreement Steward has the right to modify this
196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. The
197 | Eclipse Foundation may assign the responsibility to serve as the Agreement
198 | Steward to a suitable separate entity. Each new version of the Agreement will
199 | be given a distinguishing version number. The Program (including
200 | Contributions) may always be distributed subject to the version of the
201 | Agreement under which it was received. In addition, after a new version of
202 | the Agreement is published, Contributor may elect to distribute the Program
203 | (including its Contributions) under the new version. Except as expressly
204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or
205 | licenses to the intellectual property of any Contributor under this
206 | Agreement, whether expressly, by implication, estoppel or otherwise. All
207 | rights in the Program not expressly granted under this Agreement are
208 | reserved.
209 |
210 | This Agreement is governed by the laws of the State of New York and the
211 | intellectual property laws of the United States of America. No party to this
212 | Agreement will bring a legal action under this Agreement more than one year
213 | after the cause of action arose. Each party waives its rights to a jury trial
214 | in any resulting litigation.
215 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # clj-fst
2 |
3 | This Clojure FST implementation is a wrapper above the Lucene FST package which is part of Lucene core.
4 |
5 | Finite state transducers are finite state machines with two tapes: an input and an output tape. The automaton maps an input string to an output. The output can be another string or an integer.
6 |
7 | The FSTs produced by this application are implemented as byte arrays, which makes them really effective indexes in terms of speed and memory consumption. In fact, a 10 million term index will take roughly 256 MB of memory (depending on the composition of the input strings, and on whether the outputs are integers or strings).
8 |
9 | `clj-fst` is a lightning-fast and memory-efficient way to figure out if something belongs to a really huge set of things, or to get the output of an input. This is really simple but has profound implications.
10 |
11 | ## Installation
12 |
13 | ### Using Leiningen
14 |
15 | You can easily install `clj-fst` using Leiningen. The only thing you have to do is to add `[clj-fst "0.1.0"]` as a dependency to your `project.clj`.
16 |
17 | Then make sure that you downloaded this dependency by running the `lein deps` command.
18 |
19 | ## Documentation
20 |
21 | [The complete `clj-fst` documentation is available here.](http://structureddynamics.github.io/clj-fst/)
22 |
23 | ## Usage
24 |
25 | Here is how you can create, populate and save a FST:
26 |
27 | ```clojure
28 | ;; The first thing to do is to create the Builder
29 | (def builder (create-builder! :type :int))
30 |
31 | ;; This small `sorted-map` defines the things
32 | ;; to add to the FST
33 | (def values (into (sorted-map) {"cat" 1
34 | "dog" 2
35 | "mice" 3}))
36 |
37 | ;; Populate the FST using that `sorted-map`
38 | (doseq [[input output] values]
39 | (add! builder {input output}))
40 |
41 | ;; Creating a new FST
42 | (def fst (create-fst! builder))
43 |
44 | ;; Save a FST on the file system
45 | (save! "resources/fst.srz" fst)
46 | ```
47 |
48 | Additionally you can load a previously saved FST:
49 |
50 | ```clojure
51 | ;; Load a FST from the file system
52 | (load! "resources/fst.srz")
53 | ```
54 |
55 | You can query a FST to get the output related to an input:
56 |
57 | ```clojure
58 | ;; Query the FST
59 | (get-output "cat" fst)
60 | ```
61 |
62 | You can iterate over a FST using FST enumerations:
63 |
64 | ```clojure
65 | ;; Create the FST enumeration
66 | (def enum (create-enum! fst))
67 |
68 | ;; Get the first item in the FST
69 | (next! enum)
70 |
71 | ;; Get the current FST item pointed by the enumerator
72 | (current! enum)
73 |
74 | ;; Search for different input terms
75 | (get-ceil-term! "cat" enum)
76 |
77 | (get-floor-term! "cat" enum)
78 |
79 | (get-exact-term! "cat" enum)
80 | ```
81 |
--------------------------------------------------------------------------------
/project.clj:
--------------------------------------------------------------------------------
1 | (defproject clj-fst "0.1.2"
2 |   :description "Finite State Transducers (FST) for Clojure"
3 |   :url "https://github.com/structureddynamics/clj-fst"
4 |   :license {:name "Eclipse Public License"
5 |             :url "http://www.eclipse.org/legal/epl-v10.html"}
6 |   :dependencies [[org.clojure/clojure "1.9.0"]
7 |                  [org.apache.lucene/lucene-core "7.3.1"]
8 |                  [org.apache.lucene/lucene-misc "7.3.1"]]
9 |   :plugins [[lein-marginalia "0.9.1"]] ; doc generator: a build-time plugin, not a runtime dependency
10 |   :source-paths ["src/clojure"]
11 |   :java-source-paths ["src/java"]
12 |   :target-path "target/%s"
13 |   :marginalia {:exclude ["utils.clj"]})
14 |
--------------------------------------------------------------------------------
/src/clojure/clj_fst/core.clj:
--------------------------------------------------------------------------------
1 | ;; # Clojure Finite State Transducer (FST)
2 | ;;
3 | ;; This Clojure FST implementation is a wrapper above the Lucene FST package which is part of Lucene core.
4 | ;;
5 | ;; Finite state transducers are finite state machines with two tapes: an input and an output tape. The automaton
6 | ;; maps an input string to an output. The output can be another string or an integer.
7 | ;;
8 | ;; The FSTs produced by this application are implemented as byte arrays, which makes them really effective
9 | ;; indexes in terms of speed and memory consumption. In fact, a 10 million term index will take roughly
10 | ;; 256 MB of memory (depending on the composition of the input strings, and on whether the outputs are
11 | ;; integers or strings).
12 | ;;
13 | ;; ## Limitations
14 | ;;
15 | ;; The main limitation is that a FST index cannot be updated once it is created. This means that it cannot
16 | ;; evolve over time. If you want to add or remove inputs/outputs, then you have to re-create the FST
17 | ;; entirely.
18 | ;;
19 |
20 | (ns clj-fst.core
21 | (:use [clj-fst.utils])
22 | (:refer-clojure :exclude [load])
23 | (:import (org.apache.lucene.util.fst PositiveIntOutputs CharSequenceOutputs ListOfOutputs Builder FST Util)
24 | (org.apache.lucene.util BytesRef BytesRefBuilder)
25 | (org.apache.lucene.util IntsRef IntsRefBuilder)
26 | (org.apache.lucene.util CharsRef CharsRefBuilder)))
27 | ;; Forward declarations: these helpers are defined further down in this file and are declared here so that earlier functions can refer to them.
28 | (declare int-outputs char-outputs chars-ref chars-ref-builder ints-ref ints-ref-builder bytes-ref bytes-ref-builder builder!)
29 |
30 | ;; ## Building and Populating a FST
31 | ;;
32 | ;; The creation of a new FST is a 3 steps process:
33 | ;;
34 | ;; 1. Create a new builder using `(create-builder!)`. The builder is used to populate the index with `<input, output>` tuples
35 | ;; 2. Populate the index using `(add!)`
36 | ;; 3. Create the FST using `(create-fst!)`
37 | ;;
38 | ;; ### Builder Creation
39 | ;;
40 | ;; The first step is to create the FST's builder by using the `(create-builder!)` function. There are two types of
41 | ;; outputs currently supported by this wrapper:
42 | ;;
43 | ;; 1. integers
44 | ;; 2. unicode strings
45 | ;;
46 | ;; Here is some code that shows you how to create the builder:
47 | ;;
48 | ;; ;; Create a new FST builder with `:char` outputs
49 | ;; (def builder (create-builder! :type :char))
50 | ;;
51 | ;; ;; Create a new FST builder with `:int` outputs
52 | ;; (def builder (create-builder! :type :int))
53 | ;;
54 |
55 | (defn create-builder!
56 |   "Return the builder map used by the rest of the API: `{:builder <Lucene Builder> :type <type>}`.
57 |
58 |   * `[type]` *(optional)*: kind of outputs stored in the FST, `:int` or `:char` (default)"
59 |   [& {:keys [type]
60 |       :or   {type :char}}]
61 |   (let [lucene-builder (builder! type)]
62 |     {:builder lucene-builder, :type type}))
63 |
64 | ;; ### Populating the FST
65 | ;;
66 | ;; Populating the FST with `<input, output>` tuples is quite simple. The only thing you have to do
67 | ;; once your builder is created, is to add the tuples iteratively by calling the
68 | ;; `(add!)` function multiple times.
69 | ;;
70 | ;; However, there is quite an important thing to keep in mind:
71 | ;;
72 | ;; **You have to sort the index you want to create by their input keys**
73 | ;;
74 | ;; If you skip this step, then you will end up with unexpected results.
75 | ;;
76 | ;; Populating a FST is quite simple. Here is a code example that will populate a FST
77 | ;; using a `sorted-map`:
78 | ;;
79 | ;; ;; The first thing to do is to create the Builder
80 | ;; (def builder (create-builder! :type :int))
81 | ;;
82 | ;; ;; This small `sorted-map` defines the things
83 | ;; ;; to add to the FST
84 | ;; (def values (into (sorted-map) {"cat" 1
85 | ;; "dog" 2
86 | ;; "mice" 3}))
87 | ;;
88 | ;; ;; Populate the FST using that `sorted-map`
89 | ;; (doseq [[input output] values]
90 | ;; (add! builder {input output}))
91 | ;;
92 | ;; What this code shows you is how you can iteratively
93 | ;; populate the FST. However, if you already have a single
94 | ;; `sorted-map` that does have all the `<input, output>`
95 | ;; tuples that you want in your FST, then you can simply
96 | ;; use `(add!)` this way:
97 | ;;
98 | ;; ;; Populate the FST using that `sorted-map`
99 | ;; (add! builder values)
100 | ;;
101 | ;;
102 |
103 | (defn add!
104 |   "Populate a FST with `<input, output>` tuples. This function can be called
105 |   multiple times before `(create-fst!)` is called to actually create the FST.
106 |
107 |   * `[builder]`: builder map returned by `(create-builder!)`
108 |   * `[values]`: map of the inputs->outputs. The keys are the input strings; the values
109 |     are the outputs (integers for an `:int` builder, strings for a `:char` builder).
110 |
111 |   **Note:** the inputs have to reach the builder sorted by key, both inside `values`
112 |   (use a `sorted-map`) and across successive calls to `(add!)`."
113 |   [builder values]
114 |   (let [scratch-bytes-builder (bytes-ref-builder)
115 |         scratch-ints (ints-ref-builder)]
116 |     (doseq [[word index] values]
117 |       (case (builder :type)
118 |         ;; `copyChars` replaces the scratch content with the UTF-8 bytes of `word`. The former
119 |         ;; `.append` kept every earlier word in the buffer, corrupting all inputs after the first.
120 |         :int (do (.copyChars scratch-bytes-builder word)
121 |                  (.add (builder :builder) (. Util toIntsRef (.get scratch-bytes-builder) scratch-ints) index))
122 |         :char (.add (builder :builder) (. Util toUTF16 word scratch-ints) (new CharsRef index))))))
123 |
124 | ;; ### FST Creation
125 | ;;
126 | ;; Once the builder is created and populated with the inputs and outputs, then the final
127 | ;; step is to create the actual FST. Once the FST is created, there is no way to add
128 | ;; or remove any inputs/outputs. If this becomes necessary, the FST needs to be re-created.
129 | ;;
130 | ;; Creating the FST is as simple as calling `(create-fst!)`:
131 | ;;
132 | ;; ;; Creating a new FST
133 | ;; (def fst (create-fst! builder))
134 |
135 | (defn create-fst!
136 |   "Finish a populated builder and return the FST it produces
137 |
138 |   * `[builder]`: builder map created by `(create-builder!)` and filled with `(add!)`"
139 |   [builder]
140 |   (-> (builder :builder) .finish))
141 |
142 | ;; ### Querying the FST
143 | ;;
144 | ;; Now that we have a fully operational FST, the end goal is to be able to use it.
145 | ;; What we want to do is to know if there exists an output for a given input, and if
146 | ;; there is one to return the output associated with the input. Finally, if there is
147 | ;; no output for a given input, we want to get a `nil` value.
148 | ;;
149 | ;; Querying the FST is as simple as:
150 | ;;
151 | ;; ;; Query the FST
152 | ;; (get-output "cat" fst)
153 | ;;
154 | ;; This will return the output of the input string `"cat"`
155 | ;;
156 |
157 | (defn get-output
158 |   "Look up `input` in `fst` and return its outputs.
159 |
160 |   * `[input]`: input string to look up
161 |   * `[fst]`: FST object to search
162 |   Returns the vector built by `process-output`, or `nil` when the lookup yields nothing."
163 |   [input fst]
164 |   (some-> (. Util get fst (. Util toUTF16 input (ints-ref-builder)))
165 |           process-output))
166 |
167 | ;; ### Loading and Saving FST
168 | ;;
169 | ;; It is possible to save FST on the file system. That way, it is
170 | ;; possible to reload a FST from the file system when your
171 | ;; application starts.
172 | ;;
173 | ;; ;; Save a FST on the file system
174 | ;; (save! "resources/fst.srz" fst)
175 | ;;
176 | ;; ;; Load a FST from the file system
177 | ;; (load! "resources/fst.srz")
178 |
179 | (defn save!
180 |   "Write a FST to a file on the file system
181 |
182 |   * `[file]`: path of the destination file
183 |   * `[fst]`: FST instance to write"
184 |   [file fst]
185 |   (->> (clojure.java.io/file file) .toPath (.save fst)))
186 |
187 | (defn load!
188 |   "Load a FST from a file on the file system
189 |
190 |   * `[file]`: path of the file to read
191 |   * `[output-type]` *(optional)*: `:char` (default) when the outputs stored in the
192 |     FST file are characters, `:int` when they are integers. It should match the
193 |     type the FST was built with.
194 |
195 |   Returns the loaded FST"
196 |   [file & {:keys [output-type]
197 |            :or {output-type :char}}]
198 |   (let [outputs (if (= output-type :int)
199 |                   (int-outputs)
200 |                   (char-outputs))]
201 |     (. FST read (.toPath (clojure.java.io/file file)) outputs)))
202 |
203 | ;; ## Utility functions
204 | ;;
205 | ;; This section lists a series of utility functions used by the
206 | ;; core `clj-fst` functions
207 |
208 | (defn builder!
209 |   "Create a Lucene FST `Builder` object.
210 |
211 |   You can directly use this function instead of the `(create-builder!)` function
212 |   if you require really specific settings.
213 |
214 |   * `[type]`: type of the output. `:int` selects integer outputs over `BYTE1` inputs; any
215 |     other value selects character outputs over `BYTE4` inputs.
216 |   * `[min-suffix-count-1]` (optional): If pruning the input graph during construction, this
217 |     threshold is used for telling if a node is kept or pruned. If
218 |     transition_count(node) >= minSuffixCount1, the node is kept.
219 |   * `[min-suffix-count-2]` (optional): (Note: only Mike McCandless knows what this one is
220 |     really doing...)
221 |   * `[share-suffix]` (optional): If true, the shared suffixes will be compacted into unique
222 |     paths. This requires an additional RAM-intensive hash map for lookups in memory.
223 |     Setting this parameter to false creates a single suffix path for all input sequences.
224 |     This will result in a larger FST, but requires substantially less memory and CPU
225 |     during building.
226 |   * `[share-non-singleton-nodes]` (optional): Only used if share-suffix is true. Set this to
227 |     true to ensure FST is fully minimal, at cost of more CPU and more RAM during building.
228 |   * `[share-max-tail-length]` (optional): Only used if share-suffix is true. Set this to
229 |     Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more CPU and more RAM
230 |     during building.
231 |   * `[allow-array-arcs]` (optional): Pass false to disable the array arc optimization while
232 |     building the FST; this will make the resulting FST smaller but slower to traverse.
233 |   * `[bytes-page-bits]` (optional): How many bits wide to make each byte[] block in the
234 |     BytesStore; if you know the FST will be large then make this larger. For example
235 |     15 bits = 32768 byte pages.
236 |
237 |   `:pack-fst` and `:acceptable-overhead-ratio` are still accepted for backward
238 |   compatibility but are ignored: the `Builder` constructor called here takes neither."
239 |   [type & {:keys [min-suffix-count-1
240 |                   min-suffix-count-2
241 |                   share-suffix
242 |                   share-non-singleton-nodes
243 |                   share-max-tail-length
244 |                   allow-array-arcs
245 |                   bytes-page-bits]
246 |            :or {min-suffix-count-1 0
247 |                 min-suffix-count-2 0
248 |                 share-suffix true
249 |                 share-non-singleton-nodes true
250 |                 share-max-tail-length Integer/MAX_VALUE
251 |                 allow-array-arcs true
252 |                 bytes-page-bits 15}}]
253 |   ;; Only the input type and the outputs differ between the two kinds of builder.
254 |   (let [int-type? (= type :int)
255 |         input-type (if int-type?
256 |                      org.apache.lucene.util.fst.FST$INPUT_TYPE/BYTE1
257 |                      org.apache.lucene.util.fst.FST$INPUT_TYPE/BYTE4)
258 |         outputs (if int-type?
259 |                   (int-outputs)
260 |                   (char-outputs))]
261 |     (Builder. input-type
262 |               min-suffix-count-1
263 |               min-suffix-count-2
264 |               share-suffix
265 |               share-non-singleton-nodes
266 |               share-max-tail-length
267 |               outputs
268 |               allow-array-arcs
269 |               bytes-page-bits)))
270 |
271 | (defn int-outputs
272 |   "Create a ListOfOutputs wrapping the PositiveIntOutputs singleton"
273 |   []
274 |   (ListOfOutputs. (PositiveIntOutputs/getSingleton)))
275 |
276 | (defn char-outputs
277 |   "Create a ListOfOutputs wrapping the CharSequenceOutputs singleton"
278 |   []
279 |   (ListOfOutputs. (CharSequenceOutputs/getSingleton)))
280 |
281 | (defn bytes-ref
282 |   "Return a new BytesRef built with its no-argument constructor"
283 |   []
284 |   (BytesRef.))
285 |
286 | (defn bytes-ref-builder
287 |   "Return a new BytesRefBuilder built with its no-argument constructor"
288 |   []
289 |   (BytesRefBuilder.))
290 |
291 | (defn ints-ref
292 |   "Return a new IntsRef built with its no-argument constructor"
293 |   []
294 |   (IntsRef.))
295 |
296 | (defn ints-ref-builder
297 |   "Return a new IntsRefBuilder built with its no-argument constructor"
298 |   []
299 |   (IntsRefBuilder.))
300 |
301 | (defn chars-ref
302 |   "Return a new CharsRef built with its no-argument constructor"
303 |   []
304 |   (CharsRef.))
305 |
306 | (defn chars-ref-builder
307 |   "Return a new CharsRefBuilder built with its no-argument constructor"
308 |   []
309 |   (CharsRefBuilder.))
310 |
--------------------------------------------------------------------------------
/src/clojure/clj_fst/enum.clj:
--------------------------------------------------------------------------------
1 | ;; # FST Enumerations
2 | ;;
3 | ;; FST enumerations are used to search for the existence of inputs in the FST.
4 | ;; The first thing you have to do is to create an enumeration from a FST using
5 | ;; the `create-enum!` function. Then you have a series of utility functions
6 | ;; that you can use to seek for a given input term.
7 |
8 | (ns clj-fst.enum
9 | (:use [clj-fst.core]
10 | [clj-fst.utils])
11 | (:import (org.apache.lucene.util.fst BytesRefFSTEnum
12 | IntsRefFSTEnum
13 | Util)
14 | (org.apache.lucene.util BytesRef BytesRefBuilder
15 | IntsRef IntsRefBuilder)
16 | (org.apache.lucene.util.clj CljUtils)))
17 |
18 | ;; ## FST Creation
19 | ;;
20 | ;; Before creating an enumeration, you have to have a FST. Once your FST
21 | ;; is created, you can create its enumeration instance. Creating an enum
22 | ;; is as simple as:
23 | ;;
24 | ;; (def enum (create-enum! fst))
25 | ;;
26 |
27 | (defn create-enum!
28 |   "Return an IntsRefFSTEnum over the tuples that compose the FST,
29 |   or nil when no FST is given"
30 |   [fst]
31 |   (if fst (IntsRefFSTEnum. fst)))
32 |
33 | ;; ## Iterating & Searching the Enumeration
34 | ;;
35 | ;; There are two things you can do with the Enumerations API:
36 | ;;
37 | ;; 1. Iterating over the content of the FST
38 | ;; 2. Searching over the content of the FST
39 | ;;
40 |
41 | ;; ### Iterating the content of a FST
42 | ;;
43 | ;; To iterate the content of the FST, you have to use the
44 | ;; `(next!)` function. What this function does is to return
45 | ;; the next `<input, output>` tuple of the FST:
46 | ;;
47 | ;; (next! enum)
48 | ;;
49 | ;; will return the tuple in the form of a Clojure map:
50 | ;;
51 | ;; {:input "some input", :output ["some output"]}
52 | ;;
53 | ;; then you can always use the `(current!)` function to
54 | ;; return the current tuple pointed by the internal enumeration
55 | ;; pointer without moving it to the next tuple:
56 | ;;
57 | ;; (current! enum)
58 | ;;
59 | ;;
60 |
61 | (defn current!
62 |   "Returns the tuple the enumeration currently points at as `{:input .. :output [..]}` (nil for a nil enum)"
63 |   [enum]
64 |   (when enum
65 |     (let [tuple (.current enum)
66 |           term  (CljUtils/inputToString (.input tuple) true)
67 |           outs  (process-output (.output tuple))]
68 |       {:input term, :output (cond-> outs (not (vector? outs)) vector)})))
69 |
70 | (defn next!
71 |   "Advances the enumeration and returns the next tuple as `{:input .. :output [..]}`.
72 |   Returns nil when `enum` is nil or when the enumeration is exhausted."
73 |   [enum]
74 |   ;; `.next` yields nil once every tuple has been visited; reading `.input` from it used to throw an NPE.
75 |   (when-let [input-output (and enum (.next enum))]
76 |     {:input  (CljUtils/inputToString (.input input-output) true)
77 |      :output (process-output (.output input-output))}))
78 |
79 | ;; ### Searching the content of a FST
80 | ;;
81 | ;; Three functions will let you search the content of a FST: `(get-ceil-term!)`,
82 | ;; `(get-floor-term!)`, `(get-exact-term!)`. These functions will search the
83 | ;; FST in different ways. They will move the internal pointer to whatever input
84 | ;; they find. This means that if you use one of these functions and then call
85 | ;; the `(current!)` function, the same result will be returned because the
86 | ;; internal pointer got moved.
87 |
88 | (defn get-ceil-term!
89 |   "Seeks the smallest term that is greater than or equal to `input`.
90 |   Returns it as an `{:input .. :output ..}` map, or nil when the seek finds
91 |   nothing or when `input` or `enum` is nil."
92 |   [input enum]
93 |   (when (and enum input)
94 |     (when-let [found (.seekCeil enum (. Util toUTF16 input (ints-ref-builder)))]
95 |       {:input  (CljUtils/inputToString (.input found) true)
96 |        :output (process-output (.output found))})))
97 |
98 | (defn get-floor-term!
99 |   "Seeks the biggest term that is smaller than or equal to `input`. Returns it as an
100 |   `{:input .. :output [..]}` map, or nil when nothing is found or an argument is nil."
101 |   [input enum]
102 |   (when (and enum input)
103 |     (when-let [found (.seekFloor enum (. Util toUTF16 input (ints-ref-builder)))]
104 |       (let [term (CljUtils/inputToString (.input found) true)
105 |             outs (process-output (.output found))]
106 |         {:input term, :output (cond-> outs (not (vector? outs)) vector)}))))
107 |
108 | (defn get-exact-term!
109 |   "Seeks the term that is exactly `input`. Returns it as an `{:input .. :output [..]}`
110 |   map, or nil when it does not exist or an argument is nil."
111 |   [input enum]
112 |   (when (and enum input)
113 |     (when-let [found (.seekExact enum (. Util toUTF16 input (ints-ref-builder)))]
114 |       (let [term (CljUtils/inputToString (.input found) true)
115 |             outs (process-output (.output found))]
116 |         {:input term, :output (cond-> outs (not (vector? outs)) vector)}))))
117 |
--------------------------------------------------------------------------------
/src/clojure/clj_fst/utils.clj:
--------------------------------------------------------------------------------
1 | (ns clj-fst.utils)
2 |
3 | (defn process-output
4 |   "Turn the output of an FST lookup or enumeration into a vector of strings.
5 |
6 |   A `java.util.ArrayList` contributes each of its elements; any other value is
7 |   treated as a single output. nil entries are dropped and every remaining entry
8 |   is converted with `.toString`."
9 |   [output]
10 |   (let [candidates (if (instance? java.util.ArrayList output)
11 |                      (vec output)
12 |                      (vector output))]
13 |     (mapv #(.toString %) (remove nil? candidates))))
14 |
--------------------------------------------------------------------------------
/src/java/clj_fst/CljUtils.java:
--------------------------------------------------------------------------------
1 | package org.apache.lucene.util.clj;
2 |
3 | import org.apache.lucene.util.BytesRef;
4 | import org.apache.lucene.util.IntsRef;
5 | import org.apache.lucene.util.UnicodeUtil;
6 |
7 | /** Helper class to test FSTs. */
8 | public class CljUtils {
9 |
10 | public static String inputToString(IntsRef term, boolean isValidUnicode) {
11 | if (!isValidUnicode) {
12 | return term.toString();
13 | } else {
14 | // utf8
15 | //return toBytesRef(term).utf8ToString() + " " + term;
16 | return UnicodeUtil.newString(term.ints, term.offset, term.length);
17 | }
18 | }
19 |
20 | private static BytesRef toBytesRef(IntsRef ir) {
21 | BytesRef br = new BytesRef(ir.length);
22 | for(int i=0;i= 0 && x <= 255;
25 | br.bytes[i] = (byte) x;
26 | }
27 | br.length = ir.length;
28 | return br;
29 | }
30 | }
--------------------------------------------------------------------------------
/test/clj_fst/core_test.clj:
--------------------------------------------------------------------------------
1 | (ns clj-fst.core-test
2 | (:require [clojure.test :refer :all]
3 | [clj-fst.core :refer :all]))
4 |
5 | (deftest a-test
6 |   (testing "a populated :int FST returns the stored outputs and nil for unknown inputs"
7 |     (let [builder (create-builder! :type :int)]
8 |       ;; Inputs have to be added in sorted order, hence the sorted-map.
9 |       (doseq [[input output] (sorted-map "cat" 1 "dog" 2 "mice" 3)]
10 |         (add! builder {input output}))
11 |       (let [fst (create-fst! builder)]
12 |         ;; `get-output` returns the outputs as a vector of strings (see `process-output`).
13 |         (is (= ["1"] (get-output "cat" fst)))
14 |         (is (= ["3"] (get-output "mice" fst)))
15 |         (is (nil? (get-output "bird" fst)))))))
16 |
--------------------------------------------------------------------------------