├── .gitignore
├── LICENSE
├── README.md
├── TODO.md
├── elm.json
├── examples
├── elm.json
└── src
│ ├── IndexNewAddSearch.elm
│ ├── IndexNewAddSearchListFields.elm
│ ├── IndexNewWithAddSearch.elm
│ └── MultipleAddSearch.elm
├── src
├── ElmTextSearch.elm
├── ElmTextSearch
│ └── Json
│ │ ├── Decoder.elm
│ │ └── Encoder.elm
├── ElmTextSearchErrors.elm
├── Index.elm
├── Index
│ ├── Defaults.elm
│ ├── Load.elm
│ ├── Model.elm
│ ├── Utils.elm
│ └── Vector.elm
├── StopWordFilter.elm
├── TokenProcessors.elm
└── Utils.elm
└── tests
├── DefaultTests.elm
├── ElmTextSearchTests.elm
├── IndexDecoderTests.elm
├── IndexEncoderTests.elm
├── IndexLoadTests.elm
├── IndexTests.elm
├── IndexUtilsTests.elm
├── SearchIndexTests.elm
├── StopWordFilterTests.elm
├── TestUtils.elm
└── TokenProcessorTests.elm
/.gitignore:
--------------------------------------------------------------------------------
1 | elm-stuff
2 | lunr.js
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016, Robin Luiten www.github.com/rluiten
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | * Neither the name of ElmTextSearch nor the names of its
15 | contributors may be used to endorse or promote products derived from
16 | this software without specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ElmTextSearch full text indexer
2 |
3 | Copyright (c) 2016 Robin Luiten
4 |
5 | This is a full text indexing engine inspired by lunr.js and written in Elm language.
6 | See http://lunrjs.com/ for lunr.js
7 |
8 | I am happy to hear about users of this package.
9 |
10 | I am happy to receive contributions be they bug reports, pull requests, documentation updates or examples.
11 |
12 | ### v4.0.0 will not load indexes saved with old version.
13 |
14 | If you do not use `storeToValue` `storeToString` `fromString` `fromValue` in ElmTextSearch this update is not likely to introduce issues.
15 |
16 | The way that filters and transforms are applied to the content of documents has changed.
17 | This is to properly fix a reported bug (see https://github.com/rluiten/elm-text-search/issues/10) where stop word filters were not correctly applied. This means saved indexes from previous versions of ElmTextSearch will not load in this version.
18 |
19 | * `Defaults.indexVersion` has changed value.
20 |
21 | The reason this is a Major version bump is some generalisation was done to enable future support
22 | for loading and saving of older version and types of default index configurations.
23 |
24 | ### v5.0.0 updates for Elm 0.19
25 |
26 | Result types from loading indexes are now Decode.Error not String.
27 |
28 | ### v5.0.2, v5.1.0
29 |
30 | New functions addT for add, searchT for search and removeT for remove.
31 | These return a typed error (see ElmTextSearchErrors) in the Result instead of a String.
32 | v5.0.2 was a goof on my part; I forgot to expose the new functions correctly.
33 |
34 | ### v5.1.1
35 |
36 | Fixed a bug in Trie that seriously affected this package when removing documents.
37 | New version of Trie, a few extra tests created.
38 |
39 | ### Packages
40 |
41 | Several packages were created for this project and published separately for this package to depend on.
42 |
43 | * trie
44 | * http://package.elm-lang.org/packages/rluiten/trie/latest
45 | * stemmer
46 | * http://package.elm-lang.org/packages/rluiten/stemmer/latest
47 | * sparsevector
48 | * http://package.elm-lang.org/packages/rluiten/sparsevector/latest
49 |
50 | ### Parts of lunr.js were left out
51 |
52 | * This does not have an event system.
53 | * Its internal data structure is not compatible.
54 |
55 | ### Notes captured along the way while writing this.
56 |
57 | * lunr.js
58 | * tokenStore.remove does not decrement length, but it doesn't use length really only save/load
59 | * stemmer "lay" -> "lay" "try" -> "tri" is opposite to porter stemmer
60 | * porter stemmer erlang implementation
61 | * step5b does not use endsWithDoubleCons which is required afaik to pass the voc.txt output.txt cases
62 |
63 |
64 | ### Example
65 |
66 | See examples folder for four examples.
67 | You can run any of the examples if you navigate to the examples folder and run `elm reactor` and select an example in the src folder.
68 |
69 | First example is included inline here.
70 |
71 | IndexNewAddSearch.elm
72 | ```elm
73 | module Main exposing (ExampleDocType, createNewIndexExample, main, resultSearchIndex, resultUpdatedMyIndexAfterAdd)
74 |
75 | {-| Create an index and add a document, search a document
76 |
77 | Copyright (c) 2016 Robin Luiten
78 |
79 | -}
80 |
81 | import Browser
82 | import ElmTextSearch
83 | import Html exposing (Html, button, div, text)
84 |
85 |
86 | {-| Example document type.
87 | -}
88 | type alias ExampleDocType =
89 | { cid : String
90 | , title : String
91 | , author : String
92 | , body : String
93 | }
94 |
95 |
96 | {-| Create an index with default configuration.
97 | See ElmTextSearch.SimpleConfig documentation for parameter information.
98 | -}
99 | createNewIndexExample : ElmTextSearch.Index ExampleDocType
100 | createNewIndexExample =
101 | ElmTextSearch.new
102 | { ref = .cid
103 | , fields =
104 | [ ( .title, 5.0 )
105 | , ( .body, 1.0 )
106 | ]
107 | , listFields = []
108 | }
109 |
110 |
111 | {-| Add a document to an index.
112 | -}
113 | resultUpdatedMyIndexAfterAdd : Result String (ElmTextSearch.Index ExampleDocType)
114 | resultUpdatedMyIndexAfterAdd =
115 | ElmTextSearch.add
116 | { cid = "id1"
117 | , title = "First Title"
118 | , author = "Some Author"
119 | , body = "Words in this example document with explanations."
120 | }
121 | createNewIndexExample
122 |
123 |
124 | {-| Search the index.
125 |
126 | The result includes an updated Index because a search causes internal
127 | caches to be updated to improve overall performance.
128 |
129 | -}
130 | resultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) )
131 | resultSearchIndex =
132 | resultUpdatedMyIndexAfterAdd
133 | |> Result.andThen
134 | (ElmTextSearch.search "explanations")
135 |
136 |
137 | {-| Display search result.
138 | -}
139 | main =
140 | Browser.sandbox { init = 0, update = update, view = view }
141 |
142 |
143 | type Msg
144 | = DoNothing
145 |
146 |
147 | update msg model =
148 | case msg of
149 | DoNothing ->
150 | model
151 |
152 |
153 | view model =
154 | let
155 | -- want only the search results not the returned index
156 | searchResults =
157 | Result.map Tuple.second resultSearchIndex
158 | in
159 | div []
160 | [ text
161 | ("Result of searching for \"explanations\" is "
162 | ++ Debug.toString searchResults
163 | )
164 | ]
165 | ```
166 |
--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | TODO - TokenProcessor: look at using elm/parser instead of regex; it looks better and is probably more efficient.
2 |
3 | TODO - Idea: for each document store indexes into the corpus tokens rather than the words themselves. Maybe worth it.
4 |
5 |
--------------------------------------------------------------------------------
/elm.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "package",
3 | "name": "rluiten/elm-text-search",
4 | "summary": "Full text index engine in Elm language inspired by lunr.js.",
5 | "license": "BSD-3-Clause",
6 | "version": "5.1.1",
7 | "exposed-modules": [
8 | "ElmTextSearch",
9 | "ElmTextSearchErrors",
10 | "Index.Defaults",
11 | "StopWordFilter",
12 | "ElmTextSearch.Json.Encoder",
13 | "ElmTextSearch.Json.Decoder"
14 | ],
15 | "elm-version": "0.19.0 <= v < 0.20.0",
16 | "dependencies": {
17 | "NoRedInk/elm-json-decode-pipeline": "1.0.0 <= v < 2.0.0",
18 | "elm/core": "1.0.0 <= v < 2.0.0",
19 | "elm/json": "1.0.0 <= v < 2.0.0",
20 | "elm/regex": "1.0.0 <= v < 2.0.0",
21 | "rluiten/sparsevector": "1.0.3 <= v < 2.0.0",
22 | "rluiten/stemmer": "1.0.4 <= v < 2.0.0",
23 | "rluiten/trie": "2.1.1 <= v < 3.0.0"
24 | },
25 | "test-dependencies": {
26 | "elm-explorations/test": "2.0.0 <= v <= 2.2.0"
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/examples/elm.json:
--------------------------------------------------------------------------------
1 | {
2 | "type": "application",
3 | "source-directories": [
4 | "src",
5 | "../src"
6 | ],
7 | "elm-version": "0.19.0",
8 | "dependencies": {
9 | "direct": {
10 | "NoRedInk/elm-json-decode-pipeline": "1.0.0",
11 | "elm/browser": "1.0.0",
12 | "elm/core": "1.0.0",
13 | "elm/html": "1.0.0",
14 | "elm/json": "1.0.0",
15 | "elm/regex": "1.0.0",
16 | "rluiten/sparsevector": "1.0.3",
17 | "rluiten/stemmer": "1.0.4",
18 | "rluiten/trie": "2.0.3"
19 | },
20 | "indirect": {
21 | "elm/time": "1.0.0",
22 | "elm/url": "1.0.0",
23 | "elm/virtual-dom": "1.0.0"
24 | }
25 | },
26 | "test-dependencies": {
27 | "direct": {},
28 | "indirect": {}
29 | }
30 | }
--------------------------------------------------------------------------------
/examples/src/IndexNewAddSearch.elm:
--------------------------------------------------------------------------------
1 | module Main exposing (ExampleDocType, createNewIndexExample, main, resultSearchIndex, resultUpdatedMyIndexAfterAdd)
2 |
3 | {-| Create an index and add a document, search a document
4 |
5 | Copyright (c) 2016 Robin Luiten
6 |
7 | -}
8 |
9 | import Browser
10 | import ElmTextSearch
11 | import Html exposing (Html, button, div, text)
12 |
13 |
14 | {-| Example document type.
15 | -}
16 | type alias ExampleDocType =
17 | { cid : String
18 | , title : String
19 | , author : String
20 | , body : String
21 | }
22 |
23 |
24 | {-| Create an index with default configuration.
25 | See ElmTextSearch.SimpleConfig documentation for parameter information.
26 | -}
27 | createNewIndexExample : ElmTextSearch.Index ExampleDocType
28 | createNewIndexExample =
29 | ElmTextSearch.new
30 | { ref = .cid
31 | , fields =
32 | [ ( .title, 5.0 )
33 | , ( .body, 1.0 )
34 | ]
35 | , listFields = []
36 | }
37 |
38 |
39 | {-| Add a document to an index.
40 | -}
41 | resultUpdatedMyIndexAfterAdd : Result String (ElmTextSearch.Index ExampleDocType)
42 | resultUpdatedMyIndexAfterAdd =
43 | ElmTextSearch.add
44 | { cid = "id1"
45 | , title = "First Title"
46 | , author = "Some Author"
47 | , body = "Words in this example document with explanations."
48 | }
49 | createNewIndexExample
50 |
51 |
52 | {-| Search the index.
53 |
54 | The result includes an updated Index because a search causes internal
55 | caches to be updated to improve overall performance.
56 |
57 | -}
58 | resultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) )
59 | resultSearchIndex =
60 | resultUpdatedMyIndexAfterAdd
61 | |> Result.andThen
62 | (ElmTextSearch.search "explanations")
63 |
64 |
65 | {-| Display search result.
66 | -}
67 | main =
68 | Browser.sandbox { init = 0, update = update, view = view }
69 |
70 |
71 | type Msg
72 | = DoNothing
73 |
74 |
75 | update msg model =
76 | case msg of
77 | DoNothing ->
78 | model
79 |
80 |
81 | view model =
82 | let
83 | -- want only the search results not the returned index
84 | searchResults =
85 | Result.map Tuple.second resultSearchIndex
86 | in
87 | div []
88 | [ text
89 | ("Result of searching for \"explanations\" is "
90 | ++ Debug.toString searchResults
91 | )
92 | ]
93 |
--------------------------------------------------------------------------------
/examples/src/IndexNewAddSearchListFields.elm:
--------------------------------------------------------------------------------
1 | module Main exposing (ExampleDocType, createNewIndexExample, main, resultSearchIndex, resultUpdatedMyIndexAfterAdd)
2 |
3 | {-| Create an index and add a document, search a document
4 | This variation indexes words from a field which is List String.
5 |
6 | Copyright (c) 2016 Robin Luiten
7 |
8 | -}
9 |
10 | import Browser
11 | import ElmTextSearch
12 | import Html exposing (Html, div, text)
13 |
14 |
15 | {-| Example document type.
16 | -}
17 | type alias ExampleDocType =
18 | { cid : String
19 | , title : String
20 | , author : String
21 | , body : List String
22 | }
23 |
24 |
25 | {-| Create an index with default configuration.
26 | See ElmTextSearch.SimpleConfig documentation for parameter information.
27 | -}
28 | createNewIndexExample : ElmTextSearch.Index ExampleDocType
29 | createNewIndexExample =
30 | ElmTextSearch.new
31 | { ref = .cid
32 | , fields =
33 | [ ( .title, 5.0 )
34 | ]
35 | , listFields =
36 | [ ( .body, 1.0 )
37 | ]
38 | }
39 |
40 |
41 | {-| Add a document to an index.
42 | -}
43 | resultUpdatedMyIndexAfterAdd : Result String (ElmTextSearch.Index ExampleDocType)
44 | resultUpdatedMyIndexAfterAdd =
45 | ElmTextSearch.add
46 | { cid = "id1"
47 | , title = "First Title"
48 | , author = "Some Author"
49 | , body =
50 | [ "Words in this example "
51 | , "document with explanations."
52 | ]
53 | }
54 | createNewIndexExample
55 |
56 |
57 | {-| Search the index.
58 |
59 | The result includes an updated Index because a search causes internal
60 | caches to be updated to improve overall performance.
61 |
62 | -}
63 | resultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) )
64 | resultSearchIndex =
65 | resultUpdatedMyIndexAfterAdd
66 | |> Result.andThen
67 | (ElmTextSearch.search "explanations")
68 |
69 |
70 | {-| Display search result.
71 | -}
72 | main =
73 | Browser.sandbox { init = 0, update = update, view = view }
74 |
75 |
76 | type Msg
77 | = DoNothing
78 |
79 |
80 | update msg model =
81 | case msg of
82 | DoNothing ->
83 | model
84 |
85 | {-| Display search result.
86 | -}
87 | view model =
88 | let
89 | -- want only the search results not the returned index
90 | searchResults =
91 | Result.map Tuple.second resultSearchIndex
92 | in
93 | div []
94 | [ text
95 | ("Result of searching for \"explanations\" is "
96 | ++ Debug.toString searchResults
97 | )
98 | ]
99 |
--------------------------------------------------------------------------------
/examples/src/IndexNewWithAddSearch.elm:
--------------------------------------------------------------------------------
1 | module Main exposing (ExampleDocType, addDocToIndexExample, createMyStopWordFilter, createNewWithIndexExample, firstResultSearchIndex, main, secondResultSearchIndex)
2 |
3 | {-| Create an index with customized stop word filter using
4 | ElmTextSearch.newWith.
5 |
6 | Copyright (c) 2016 Robin Luiten
7 |
8 | -}
9 |
10 | import Browser
11 | import ElmTextSearch
12 | import Html exposing (Html, div, text)
13 | import Index.Defaults
14 | import StopWordFilter
15 |
16 |
17 | {-| Example document type.
18 | -}
19 | type alias ExampleDocType =
20 | { cid : String
21 | , title : String
22 | , author : String
23 | , body : String
24 | }
25 |
26 |
27 | {-| Create an extended stop word filter.
28 |
29 | Be careful about adding words to your stop word list, as any stop word
30 | will not be indexed and you will not be able to search for the word in
31 | documents as it will not be found.
32 |
33 | It is possible to completely replace the stop word list and not
34 | just extend it.
35 |
36 | -}
37 | createMyStopWordFilter =
38 | StopWordFilter.createFilterFuncWith
39 | [ "explanations" ]
40 |
41 |
42 | {-| Create an index with extra options.
43 |
44 | - In this case a customized stop word filter is provided.
45 | - It is supplying the default transform factories.
46 | - It supplies an index type for the customized index config.
47 | This becomes important when loading back saved index.
48 | - It is a good idea to include a version in your index type string
49 | in case you update things and might still have old versions
50 | around that you need to work with.
51 |
52 | -}
53 | createNewWithIndexExample : ElmTextSearch.Index ExampleDocType
54 | createNewWithIndexExample =
55 | ElmTextSearch.newWith
56 | { indexType = "ElmTextSearch - Customized Stop Words v1"
57 | , ref = .cid
58 | , fields =
59 | [ ( .title, 5.0 )
60 | , ( .body, 1.0 )
61 | ]
62 | , listFields = []
63 | , initialTransformFactories = Index.Defaults.defaultInitialTransformFactories
64 | , transformFactories = Index.Defaults.defaultTransformFactories
65 | , filterFactories = [ createMyStopWordFilter ]
66 | }
67 |
68 |
69 | {-| Adding a document to the index.
70 | -}
71 | addDocToIndexExample : Result String (ElmTextSearch.Index ExampleDocType)
72 | addDocToIndexExample =
73 | ElmTextSearch.add
74 | { cid = "id1"
75 | , title = "First Title"
76 | , author = "Some Author"
77 | , body = "Words in this example document with explanations."
78 | }
79 | createNewWithIndexExample
80 |
81 |
82 | {-| Search the index for a word in our extended stop words.
83 | This will return no matches.
84 | -}
85 | firstResultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) )
86 | firstResultSearchIndex =
87 | addDocToIndexExample
88 | |> Result.andThen
89 | (ElmTextSearch.search "explanation")
90 |
91 |
92 | {-| Search the index for a word that is not a stop word.
93 | This is expected to match the document, as its body contains "example".
94 | -}
95 | secondResultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) )
96 | secondResultSearchIndex =
97 | addDocToIndexExample
98 | |> Result.andThen
99 | (ElmTextSearch.search "examples")
100 |
101 |
102 | {-| Display search result.
103 | -}
104 | main =
105 | Browser.sandbox { init = 0, update = update, view = view }
106 |
107 |
108 | type Msg
109 | = DoNothing
110 |
111 |
112 | update msg model =
113 | case msg of
114 | DoNothing ->
115 | model
116 |
117 |
118 | view model =
119 | let
120 | searchResults1 =
121 | Result.map Tuple.second firstResultSearchIndex
122 |
123 | searchResults2 =
124 | Result.map Tuple.second secondResultSearchIndex
125 | in
126 | div []
127 | [ div []
128 | [ text
129 | ("Expecting no matches (because explanation is in stop words). Result of first search for \"explanation\" is "
130 | ++ Debug.toString searchResults1
131 | )
132 | ]
133 | , div []
134 | [ text
135 | ("Result of second search for \"examples\" is "
136 | ++ Debug.toString searchResults2
137 | )
138 | ]
139 | ]
140 |
--------------------------------------------------------------------------------
/examples/src/MultipleAddSearch.elm:
--------------------------------------------------------------------------------
1 | module Main exposing (ExampleDocType, createNewIndexExample, documents, indexWithMulitpleDocumentsAdded, main, resultSearchIndex)
2 |
3 | {-| Create an index and add multiple documents.
4 |
5 | Copyright (c) 2016 Robin Luiten
6 |
7 | -}
8 |
9 | import Browser
10 | import ElmTextSearch
11 | import Html exposing (Html, div, text)
12 |
13 |
14 | {-| Example document type.
15 | -}
16 | type alias ExampleDocType =
17 | { cid : String
18 | , title : String
19 | , author : String
20 | , body : String
21 | }
22 |
23 |
24 | {-| Create an index with default configuration.
25 | See ElmTextSearch.SimpleConfig documentation for parameter information.
26 | -}
27 | createNewIndexExample : ElmTextSearch.Index ExampleDocType
28 | createNewIndexExample =
29 | ElmTextSearch.new
30 | { ref = .cid
31 | , fields =
32 | [ ( .title, 5.0 )
33 | , ( .body, 1.0 )
34 | ]
35 | , listFields = []
36 | }
37 |
38 |
39 | documents =
40 | [ { cid = "id1"
41 | , title = "First Title"
42 | , author = "Some Author"
43 | , body = "Words in this example document with explanations."
44 | }
45 | , { cid = "id2"
46 | , title = "Is a cactus as pretty as a tree ?"
47 | , author = "Joe Greeny"
48 | , body = "This title contains information about cactuses."
49 | }
50 | ]
51 |
52 |
53 | {-| Add multiple documents to the index.
54 |
55 | Documents that fail to add are reported in the returned list of errors.
56 |
57 | -}
58 | indexWithMulitpleDocumentsAdded : ( ElmTextSearch.Index ExampleDocType, List ( Int, String ) )
59 | indexWithMulitpleDocumentsAdded =
60 | ElmTextSearch.addDocs
61 | documents
62 | createNewIndexExample
63 |
64 |
65 | {-| Search the index.
66 |
67 | The result includes an updated Index because a search causes internal
68 | caches to be updated to improve overall performance.
69 |
70 | This is ignoring any errors from the call to addDocs
71 | in indexWithMulitpleDocumentsAdded.
72 |
73 | -}
74 | resultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) )
75 | resultSearchIndex =
76 | ElmTextSearch.search "title" (Tuple.first indexWithMulitpleDocumentsAdded)
77 |
78 |
79 | {-| Display search result.
80 | -}
81 | main =
82 | Browser.sandbox { init = 0, update = update, view = view }
83 |
84 |
85 | type Msg
86 | = DoNothing
87 |
88 |
89 | update msg model =
90 | case msg of
91 | DoNothing ->
92 | model
93 |
94 |
95 | view model =
96 | let
97 | -- want only the search results not the returned index
98 | searchResults =
99 | Result.map Tuple.second resultSearchIndex
100 | in
101 | div []
102 | [ text
103 | ("Result of searching for \"title\" is "
104 | ++ Debug.toString searchResults
105 | )
106 | ]
107 |
--------------------------------------------------------------------------------
/src/ElmTextSearch.elm:
--------------------------------------------------------------------------------
1 | module ElmTextSearch exposing
2 | ( new
3 | , newWith
4 | , add
5 | , addT
6 | , addDocs
7 | , remove
8 | , removeT
9 | , update
10 | , addOrUpdate
11 | , search
12 | , searchT
13 | , Index
14 | , Config
15 | , SimpleConfig
16 | , storeToValue
17 | , storeToString
18 | , fromString
19 | , fromValue
20 | , fromStringWith
21 | , fromValueWith
22 | )
23 |
24 | {-| A full text indexer written in Elm language inspired by lunr.js.
25 |
26 | A useful article about lunr.js
27 |
28 |
29 |
30 | ## Create Index
31 |
32 | @docs new
33 | @docs newWith
34 |
35 |
36 | ## Modify Index
37 |
38 | @docs add
39 | @docs addT
40 | @docs addDocs
41 | @docs remove
42 | @docs removeT
43 | @docs update
44 | @docs addOrUpdate
45 |
46 |
47 | ## Query Index
48 |
49 | @docs search
50 | @docs searchT
51 |
52 |
53 | ## Types
54 |
55 | @docs Index
56 | @docs Config
57 | @docs SimpleConfig
58 |
59 |
60 | ## Save and Load an Index
61 |
62 | - You can save an index using [`ElmTextSearch.Json.Encoder.encoder`](ElmTextSearch.Json.Encoder#encoder)
63 | - You can load a saved index using
64 | [`ElmTextSearch.Json.Decoder.decoder`](ElmTextSearch.Json.Decoder#decoder)
65 | to produce a [`Index.Model.CodecIndexRecord`](Index.Model#CodecIndexRecord).
66 | - You can save a [`Index.Model.CodecIndexRecord`](Index.Model#CodecIndexRecord)
67 | using [`ElmTextSearch.Json.Encoder.codecIndexRecordEncoder`](ElmTextSearch.Json.Encoder#codecIndexRecordEncoder)
68 | - \*\* Modifying an index outside of ElmTextSearch using the Decoder and Encoder directly
69 | may cause it to not work correctly loaded into ElmTextSearch. \*\*
70 |
71 | @docs storeToValue
72 | @docs storeToString
73 | @docs fromString
74 | @docs fromValue
75 | @docs fromStringWith
76 | @docs fromValueWith
77 |
78 | Copyright (c) 2016 Robin Luiten
79 |
80 | -}
81 |
82 | import ElmTextSearch.Json.Encoder as IndexEncoder
83 | import ElmTextSearchErrors
84 | import Index
85 | import Index.Defaults as Defaults
86 | import Index.Load
87 | import Index.Model as Model
88 | import Json.Decode as Decode
89 | import Json.Encode as Encode
90 |
91 |
92 | {-| An Index holds the data needed to search for added documents.
93 | -}
94 | type alias Index doc =
95 | Index.Index doc
96 |
97 |
98 | {-| A SimpleConfig is the least amount of configuration data
99 | required to create an Index.
100 |
101 | See [`ElmTextSearch.new`](ElmTextSearch#new) for fields.
102 |
103 | -}
104 | type alias SimpleConfig doc =
105 | Model.IndexSimpleConfig doc
106 |
107 |
108 | {-| A Config is required to create an Index.
109 | -}
110 | type alias Config doc =
111 | Model.Config doc
112 |
113 |
114 | {-| Create new index.
115 |
116 | Example
117 |
118 | import ElmTextSearch
119 |
120 | {-| Example document type.
121 | -}
122 | type alias ExampleDocType =
123 | { cid : String
124 | , title : String
125 | , author : String
126 | , body : String
127 | }
128 |
129 | {-| Create an index with default configuration.
130 | See ElmTextSearch.SimpleConfig documentation for parameter information.
131 | -}
132 | createNewIndexExample : ElmTextSearch.Index ExampleDocType
133 | createNewIndexExample =
134 | ElmTextSearch.new
135 | { ref = .cid
136 | , fields =
137 | [ ( .title, 5.0 )
138 | , ( .body, 1.0 )
139 | ]
140 | , listFields = []
141 | }
142 |
143 | The `SimpleConfig` parameter to new is
144 |
145 | - ref
146 | - The unique document reference will be extracted from each
147 | document using `.cid`.
148 | - fields
149 | - Define which fields contain strings to be indexed.
150 | - The following fields will be indexed from each document
151 | - `.title`
152 | - `.body`
153 | - When searching the index any word matches found in the
154 | `.title` field (boost value 5.0) raise the document match score
155 | more than if found in the `.body` field (boost value 1.0).
156 | - The document match score determines the order of the list
157 | of matching documents returned.
158 | - listFields
159 | - Define which fields contain lists of strings to be indexed.
160 |
161 | -}
162 | new : SimpleConfig doc -> Index doc
163 | new simpleConfig =
164 | Index.new (Defaults.getIndexSimpleConfig simpleConfig)
165 |
166 |
167 | {-| Create new index with additional configuration.
168 |
169 | Example.
170 |
171 | import ElmTextSearch
172 | import Index.Defaults
173 | import StopWordFilter
174 |
175 | type alias ExampleDocType =
176 | { cid : String
177 | , title : String
178 | , author : String
179 | , body : String
180 | }
181 |
182 | createMyStopWordFilter =
183 | StopWordFilter.createFilterFuncWith
184 | [ "explanations" ]
185 |
186 | createNewWithIndexExample : ElmTextSearch.Index ExampleDocType
187 | createNewWithIndexExample =
188 | ElmTextSearch.newWith
189 | { indexType = "ElmTextSearch - Customized Stop Words v1"
190 | , ref = .cid
191 | , fields =
192 | [ ( .title, 5.0 )
193 | , ( .body, 1.0 )
194 | ]
195 | , listFields = []
196 | , initialTransformFactories = Index.Defaults.defaultInitialTransformFactories
197 | , transformFactories = Index.Defaults.defaultTransformFactories
198 | , filterFactories = [ createMyStopWordFilter ]
199 | }
200 |
201 | -}
202 | newWith : Config doc -> Index doc
203 | newWith =
204 | Index.newWith
205 |
206 |
207 | {-| Add a document to an index.
208 |
209 | Starting with the ElmTextSearch.newWith example above this adds a document.
210 |
211 | addDocToIndexExample : Result String (ElmTextSearch.Index ExampleDocType)
212 | addDocToIndexExample =
213 | ElmTextSearch.add
214 | { cid = "id1"
215 | , title = "First Title"
216 | , author = "Some Author"
217 | , body = "Words in this example document with explanations."
218 | }
219 | createNewWithIndexExample
220 |
221 | Conditions that cause a result Err with message.
222 |
223 | - Error document ref is empty.
224 | - Error after tokenisation there are no terms to index.
225 | - Error adding document that already exists.
226 |
227 | Original function signature retained for backward compatibility.
228 |
229 | -}
230 | add : doc -> Index doc -> Result String (Index doc)
231 | add =
232 | Index.add
233 |
234 |
235 | {-| Add document to an Index if no error conditions found.
236 |
237 | Variant of `add` that provides AddError type for error Results.
238 |
239 | -}
240 | addT : doc -> Index doc -> Result ElmTextSearchErrors.AddError (Index doc)
241 | addT =
242 | Index.addT
243 |
244 |
245 | {-| Add multiple documents. Tries to add all docs and collects errors.
246 | It does not stop adding at first error encountered.
247 |
248 | The result part List (Int, String) is the list of document index
249 | and the error string message result of adding.
250 | Returns the index unchanged if all documents error when added.
251 | Returns the updated index after adding the documents.
252 |
253 | -}
254 | addDocs : List doc -> Index doc -> ( Index doc, List ( Int, String ) )
255 | addDocs =
256 | Index.addDocs
257 |
258 |
259 | {-| Remove a document from an index.
260 |
261 | Starting with the ElmTextSearch.new example above this removes a document.
262 |
263 | removeDocFromIndexExample =
264 | ElmTextSearch.remove
265 | { cid = "123"
266 | , title = "Examples of a Banana"
267 | , author = "Sally Apples"
268 | , body = "Sally writes words about a banana."
269 | }
270 | createNewIndexExample
271 |
272 | Conditions that cause a result Err with message.
273 |
274 | - Error document has an empty unique id (ref).
275 | - Error document is not in index.
276 |
277 | Original function signature retained for backward compatibility.
278 |
279 | -}
280 | remove : doc -> Index doc -> Result String (Index doc)
281 | remove =
282 | Index.remove
283 |
284 |
285 | {-| Remove document from an Index if no error conditions found.
286 |
287 | Variant of `remove` that provides RemoveError type for error Results.
288 |
289 | -}
290 | removeT : doc -> Index doc -> Result ElmTextSearchErrors.RemoveError (Index doc)
291 | removeT =
292 | Index.removeT
293 |
294 |
295 | {-| Update a document in an index.
296 |
297 | Starting with the ElmTextSearch.new example above this updates a document.
298 |
299 | updatedIndex =
300 | ElmTextSearch.update
301 | { cid = "123"
302 | , title = "Examples of a Bananas in every day life."
303 | , author = "Sally Apples"
304 | , body = "Sally writes more words about a banana."
305 | }
306 | createNewIndexExample
307 |
308 | Conditions that cause an error result are those for
309 | [`ElmTextSearch.remove`](ElmTextSearch#remove) and
310 | [`ElmTextSearch.add`](ElmTextSearch#add).
311 |
312 | -}
313 | update : doc -> Index doc -> Result String (Index doc)
314 | update =
315 | Index.update
316 |
317 |
318 | {-| Add or Update a document in an index.
319 | This removes the document first if it is already in index then adds it.
320 | -}
321 | addOrUpdate : doc -> Index doc -> Result String (Index doc)
322 | addOrUpdate =
323 | Index.addOrUpdate
324 |
325 |
326 | {-| Search an index with query.
327 |
328 | Tokens are extracted from the query string and passed through the
329 | same processing used when indexing documents.
330 |
331 | Each token is expanded, so that the term "he" might be expanded to "hello"
332 | and "help" if those terms were already included in the document index.
333 |
334 | Multiple tokens are allowed and will lead to an AND based query.
335 |
336 | The following example runs a search for documents containing both "apple" and "banana".
337 |
338 | searchResult =
339 | ElmTextSearch.search "Apple banana" createNewIndexExample
340 |
341 | Results are a list of matching document reference identifiers with
342 | their similarity to query score, ordered by score descending, so the
343 | best matches are earliest in the list.
344 |
345 | An index is returned from search as well. This is because the data model may
346 | be updated to improve performance for later searches.
347 |
348 | Adding or removing a new document will cause some of the internal caching
349 | to be reset.
350 |
351 | Conditions that cause a result Err with message.
352 |
353 | - Error there are no documents in index to search.
354 | - Error query is empty.
355 | - Error after tokenisation there are no terms to search for.
356 |
357 | Original function signature retained for backward compatibility.
358 |
359 | -}
360 | search :
361 | String
362 | -> Index doc
363 | -> Result String ( Index doc, List ( String, Float ) )
364 | search =
365 | Index.search
366 |
367 |
368 | {-| Search an index with query.
369 |
370 | Variant of `search` that provides SearchError type for error Results.
371 |
372 | -}
373 | searchT :
374 |     String
375 |     -> Index doc
376 |     -> Result ElmTextSearchErrors.SearchError ( Index doc, List ( String, Float ) )
377 | searchT query index =
378 |     Index.searchT query index
379 |
380 |
381 | {-| Encode an index as a JSON Value.
382 | You can also use [`ElmTextSearch.Json.Encoder`](ElmTextSearch.Json.Encoder).
383 | -}
384 | storeToValue : Index doc -> Encode.Value
385 | storeToValue index =
386 |     IndexEncoder.encoder index
387 |
388 |
389 | {-| Encode an index as a JSON String without extra indentation.
390 | You can also use [`ElmTextSearch.Json.Encoder`](ElmTextSearch.Json.Encoder).
391 | -}
392 | storeToString : Index doc -> String
393 | storeToString index =
394 |     index |> IndexEncoder.encoder |> Encode.encode 0
395 |
396 |
397 | {-| Create an Index from a String holding a stored Index, using the
398 | supplied basic configuration.
399 | See [`ElmTextSearch.fromStringWith`](ElmTextSearch#fromStringWith) for possible Err results.
400 | -}
401 | fromString : SimpleConfig doc -> String -> Result Decode.Error (Index doc)
402 | fromString simpleConfig inputString =
403 |     inputString
404 |         |> Index.Load.loadIndex
405 |             (Defaults.getIndexSimpleConfig simpleConfig)
406 |
407 |
408 | {-| Create an Index from a Value holding a stored Index, using the supplied basic configuration.
409 | See [`ElmTextSearch.fromStringWith`](ElmTextSearch#fromStringWith) for possible Err results.
410 | -}
411 | fromValue : SimpleConfig doc -> Decode.Value -> Result Decode.Error (Index doc)
412 | fromValue simpleConfig inputValue =
413 |     inputValue
414 |         |> Index.Load.loadIndexValue
415 |             (Defaults.getIndexSimpleConfig simpleConfig)
416 |
417 |
418 | {-| Create an Index from a String which has a stored Index in it.
419 |
420 | If none of the indexType values in the list of Config match the index
421 | type being loaded it will return an Err.
422 |
423 | The list of configurations will be searched for a matching indexType
424 | so you should provide configs for all types you may be trying to load.
425 | No more than the config that matches is required though.
426 |
427 | Configs are tried in list order and the first one whose indexType matches
428 | the loaded Index is used. The stored indexVersion must also equal the
429 | index version supported by this library, otherwise loading fails.
430 |
431 | The following Err results may be returned.
432 |
433 |   - "Error cannot load Index. Tried to load index of type "\_\_IndexTest Type -". It is not in supported index configurations."
434 |       - It contains the loaded version index type which comes from input.
435 |   - "Error cannot load Index. Version supported is 1.0.0. Version tried to load is 1.0.1."
436 |       - It includes both expected and loaded versions which may vary.
437 |
438 | -}
439 | fromStringWith : List (Config doc) -> String -> Result Decode.Error (Index doc)
440 | fromStringWith configs inputString =
441 |     Index.Load.loadIndexWith configs inputString
442 |
443 |
444 | {-| Create an Index from a Value which has a stored Index in it.
445 | If none of the indexType values in the list of Config match the index
446 | being decoded it will return an Err.
447 |
448 | See [`ElmTextSearch.fromStringWith`](ElmTextSearch#fromStringWith) for possible Err results.
449 |
450 | -}
451 | fromValueWith : List (Config doc) -> Decode.Value -> Result Decode.Error (Index doc)
452 | fromValueWith configs inputValue =
453 |     Index.Load.loadIndexValueWith configs inputValue
454 |
--------------------------------------------------------------------------------
/src/ElmTextSearch/Json/Decoder.elm:
--------------------------------------------------------------------------------
1 | module ElmTextSearch.Json.Decoder exposing (decoder)
2 |
3 | {-| Decoder for Index.
4 |
5 | It decodes to a CodecIndexRecord.
6 |
7 | @docs decoder
8 |
9 | Copyright (c) 2016 Robin Luiten
10 |
11 | -}
12 |
13 | import Dict exposing (Dict)
14 | import Index.Model as Model
15 | import Json.Decode as Decode exposing (..)
16 | import Json.Decode.Pipeline exposing (required)
17 | import Set exposing (Set)
18 | import Trie.Json.Decoder as TrieDecoder
19 |
20 |
21 | {-| Decoder producing a CodecIndexRecord from the stored index fields.
22 | -}
23 | decoder : Decoder Model.CodecIndexRecord
24 | decoder =
25 |     Decode.succeed Model.CodecIndexRecord
26 |         |> required "indexVersion" Decode.string
27 |         |> required "indexType" Decode.string
28 |         |> required "documentStore" documentStoreDecoder
29 |         |> required "corpusTokens" setDecoder
30 |         |> required "tokenStore" (TrieDecoder.decoder Decode.float)
31 |
32 | {-| Decode an object whose values are string lists into a Dict of Sets. -}
33 | documentStoreDecoder : Decoder (Dict String (Set String))
34 | documentStoreDecoder =
35 |     Decode.dict setDecoder
36 |
37 | {-| Decode a list of strings into a Set. -}
38 | setDecoder : Decoder (Set String)
39 | setDecoder =
40 |     Decode.list Decode.string |> Decode.map Set.fromList
41 |
--------------------------------------------------------------------------------
/src/ElmTextSearch/Json/Encoder.elm:
--------------------------------------------------------------------------------
1 | module ElmTextSearch.Json.Encoder exposing
2 | ( encoder
3 | , codecIndexRecordEncoder
4 | )
5 |
6 | {-| Encoder for Index.
7 |
8 | @docs encoder
9 | @docs codecIndexRecordEncoder
10 |
11 | Copyright (c) 2016 Robin Luiten
12 |
13 | -}
14 |
15 | import Dict exposing (Dict)
16 | import Index
17 | import Index.Model as Model exposing (Index(..))
18 | import Json.Encode as Encode
19 | import Set exposing (Set)
20 | import Trie exposing (Trie)
21 | import Trie.Json.Encoder as TrieEncoder
22 |
23 |
24 | {-| Encoder for `Index doc`.
25 |
26 | Only encoding fields required to recreate a working index.
27 |
28 | The following fields are not saved as they are restored via
29 | the provided Config on fromString.
30 |
31 |   - ref
32 |   - fields and listFields
33 |   - initialTransformFactories and transformFactories
34 |   - filterFactories
35 |
36 | The following fields are not saved because they are an
37 | acceleration model, decoder needs to set it on fromString.
38 |
39 |   - corpusTokensIndex
40 |
41 | The following fields are not saved because they are caches
42 | and are rebuilt as operation requires.
43 |
44 |   - initialTransforms and transforms
45 |   - filters
46 |   - idfCache
47 |
48 | Do not need a (doc -> Encode.Value) because doc is a document
49 | type and that is never encoded from an Index.
50 |
51 | -}
52 | encoder : Index doc -> Encode.Value
53 | encoder (Index indexRecord) =
54 |     { indexVersion = indexRecord.indexVersion
55 |     , indexType = indexRecord.indexType
56 |     , documentStore = indexRecord.documentStore
57 |     , corpusTokens = indexRecord.corpusTokens
58 |     , tokenStore = indexRecord.tokenStore
59 |     }
60 |         |> codecIndexRecordEncoder
61 |
62 |
63 | {-| Encode a CodecIndexRecord as a JSON object with one key per record field.
64 | -}
65 | codecIndexRecordEncoder : Model.CodecIndexRecord -> Encode.Value
66 | codecIndexRecordEncoder codecRecord =
67 |     [ ( "indexVersion", Encode.string codecRecord.indexVersion )
68 |     , ( "indexType", Encode.string codecRecord.indexType )
69 |     , ( "documentStore", documentStoreEncoder codecRecord.documentStore )
70 |     , ( "corpusTokens", corpusTokensEncoder codecRecord.corpusTokens )
71 |     , ( "tokenStore", tokenStore codecRecord.tokenStore )
72 |     ]
73 |         |> Encode.object
74 |
75 | {-| Encode the document store as an object mapping each key to its list of strings. -}
76 | documentStoreEncoder : Dict String (Set String) -> Encode.Value
77 | documentStoreEncoder dict =
78 |     let
79 |         encodeEntry ( ref, tokens ) =
80 |             ( ref, Encode.list Encode.string (Set.toList tokens) )
81 |     in
82 |     dict
83 |         |> Dict.toList
84 |         |> List.map encodeEntry
85 |         |> Encode.object
86 |
87 | {-| Encode the corpus tokens as a JSON list of strings. -}
88 | corpusTokensEncoder : Set String -> Encode.Value
89 | corpusTokensEncoder setVal =
90 |     setVal |> Set.toList |> Encode.list Encode.string
91 |
92 | tokenStore : Trie Float -> Encode.Value
93 | tokenStore trie =
94 |     TrieEncoder.encoder Encode.float trie
95 |
--------------------------------------------------------------------------------
/src/ElmTextSearchErrors.elm:
--------------------------------------------------------------------------------
1 | module ElmTextSearchErrors exposing
2 | ( AddError(..)
3 | , RemoveError(..)
4 | , SearchError(..)
5 | )
6 |
7 | {-| Error types used in ElmTextSearch results.
8 |
9 |
10 | ## Types
11 |
12 | @docs AddError
13 | @docs RemoveError
14 | @docs SearchError
15 |
16 | -}
17 |
18 |
19 | {-| Error cases returned in the Err Result of ElmTextSearch.addT
20 | -}
21 | type AddError
22 | = AddErrorUniqueRefIsEmpty -- the document's ref is an empty String
23 | | NoTermsToIndexAfterTokenisation -- no tokens were left to index for the document
24 | | DocAlreadyExists -- a document with the same ref is already in the index
25 |
26 |
27 | {-| Error cases returned in the Err Result of ElmTextSearch.removeT
28 | -}
29 | type RemoveError
30 | = RemoveErrorUniqueRefIsEmpty -- the document's ref is an empty String
31 | | DocIsNotInIndex -- no document with this ref is in the index
32 |
33 |
34 | {-| Error cases returned in the Err Result of ElmTextSearch.searchT
35 | -}
36 | type SearchError
37 | = IndexIsEmpty -- the index holds no documents
38 | | QueryIsEmpty -- the query is empty once trimmed
39 | | NoTermsToSearchAfterTokenisation -- tokenising the query produced no terms
40 |
--------------------------------------------------------------------------------
/src/Index.elm:
--------------------------------------------------------------------------------
1 | module Index exposing
2 | ( new
3 | , newWith
4 | , add
5 | , addT
6 | , addDocs
7 | , remove
8 | , removeT
9 | , update
10 | , addOrUpdate
11 | , search
12 | , searchT
13 | , Index
14 | )
15 |
16 | {-| Index module for full text indexer
17 |
18 | Added addT, removeT and searchT functions that provide
19 | a strong type for Error in the Result.
20 |
21 |
22 | ## Create Index
23 |
24 | @docs new
25 | @docs newWith
26 |
27 |
28 | ## Update Index
29 |
30 | @docs add
31 | @docs addT
32 | @docs addDocs
33 | @docs remove
34 | @docs removeT
35 | @docs update
36 | @docs addOrUpdate
37 |
38 |
39 | ## Query Index
40 |
41 | @docs search
42 | @docs searchT
43 |
44 |
45 | ## Types
46 |
47 | @docs Index
48 |
49 | Copyright (c) 2016 Robin Luiten
50 |
51 | -}
52 |
53 | import Dict
54 | import ElmTextSearchErrors exposing (AddError(..), RemoveError(..), SearchError(..))
55 | import Index.Defaults as Defaults
56 | import Index.Model as Model exposing (Index(..))
57 | import Index.Utils
58 | import Index.Vector exposing (..)
59 | import Maybe
60 | import Set exposing (Set)
61 | import String
62 | import Trie
63 | import Utils
64 |
65 | {-| Alias of Index.Model.Index, the full text index for a document type. -}
66 | type alias Index doc =
67 | Model.Index doc
68 |
69 | {-| Alias of Index.Model.Config, the configuration accepted by `newWith`. -}
70 | type alias Config doc =
71 | Model.Config doc
72 |
73 | {-| Alias of Index.Model.ModelSimpleConfig, the configuration accepted by `new`. -}
74 | type alias SimpleConfig doc =
75 | Model.ModelSimpleConfig doc
76 |
77 |
78 | {-| Create new index from a simple config, filling in the default factories.
79 | -}
80 | new : SimpleConfig doc -> Index doc
81 | new simpleConfig =
82 |     Defaults.getDefaultIndexConfig simpleConfig
83 |         |> newWith
84 |
85 |
86 | {-| Create new, empty index with control of transformers and filters.
87 | -}
88 | newWith : Config doc -> Index doc
89 | newWith config =
90 |     Index
91 |         { indexVersion = Defaults.indexVersion
92 |         , indexType = config.indexType
93 |         , ref = config.ref
94 |         , fields = config.fields
95 |         , listFields = config.listFields
96 |         , initialTransformFactories = config.initialTransformFactories
97 |         , transformFactories = config.transformFactories
98 |         , filterFactories = config.filterFactories
99 |         , initialTransforms = Nothing
100 |         , transforms = Nothing
101 |         , filters = Nothing
102 |         , corpusTokens = Set.empty
103 |         , corpusTokensIndex = Dict.empty
104 |         , documentStore = Dict.empty
105 |         , tokenStore = Trie.empty
106 |         , idfCache = Dict.empty
107 |         }
108 |
109 |
110 | {-| Add a document to an Index, reporting any failure as a String message.
111 | See ElmTextSearch documentation for `add` to see error conditions.
112 |
113 | Delegates to `addT`; this String-based signature is retained for backward compatibility.
114 |
115 | -}
116 | add : doc -> Index doc -> Result String (Index doc)
117 | add doc index =
118 |     let
119 |         describe error =
120 |             case error of
121 |                 AddErrorUniqueRefIsEmpty ->
122 |                     "Error document has an empty unique id (ref)."
123 |
124 |                 DocAlreadyExists ->
125 |                     "Error adding document that allready exists."
126 |
127 |                 NoTermsToIndexAfterTokenisation ->
128 |                     "Error after tokenisation there are no terms to index."
129 |     in
130 |     addT doc index
131 |         |> Result.mapError describe
132 |
133 |
134 | {-| Add document to an Index if no error conditions found.
135 |
136 | Variant of `add` whose Err carries an AddError value instead of a String.
137 |
138 | See ElmTextSearch documentation for `add` to see error conditions.
139 |
140 | -}
141 | addT : doc -> Index doc -> Result AddError (Index doc)
142 | addT doc ((Index irec) as index) =
143 |     let
144 |         docRef =
145 |             irec.ref doc
146 |     in
147 |     if String.isEmpty docRef then
148 |         Err AddErrorUniqueRefIsEmpty
149 |
150 |     else if Index.Utils.refExists docRef index then
151 |         Err DocAlreadyExists
152 |
153 |     else
154 |         let
155 |             -- tokenise every String field, threading the index through
156 |             ( indexAfterFields, fieldEntries ) =
157 |                 List.foldr (getWordsForField doc) ( index, [] ) irec.fields
158 |
159 |             -- then every List String field, prepending to the same list
160 |             ( indexAfterListFields, allEntries ) =
161 |                 List.foldr
162 |                     (getWordsForFieldList doc)
163 |                     ( indexAfterFields, fieldEntries )
164 |                     irec.listFields
165 |
166 |             -- union of the token sets of all fields
167 |             docTokens =
168 |                 allEntries
169 |                     |> List.foldr (\( tokenSet, _ ) acc -> Set.union tokenSet acc) Set.empty
170 |         in
171 |         if Set.isEmpty docTokens then
172 |             Err NoTermsToIndexAfterTokenisation
173 |
174 |         else
175 |             Ok (addDoc docRef allEntries docTokens indexAfterListFields)
176 |
177 |
178 | {-| Add multiple documents. Tries to add all docs and collects errors.
179 | It does not stop adding at the first error encountered.
180 |
181 | The List ( Int, String ) part of the result pairs the zero-based position
182 | of each failing document in `docs` with the error message from `add`.
183 | Returns the index unchanged if every document fails to add.
184 | Otherwise returns the index updated with the documents that were added.
185 |
186 | -}
187 | addDocs : List doc -> Index doc -> ( Index doc, List ( Int, String ) )
188 | addDocs docs index =
189 | addDocsCore 0 docs index []
190 |
191 | {-| Worker for addDocs: `errors` is built newest-first and put in document order at the end. -}
192 | addDocsCore :
193 |     Int
194 |     -> List doc
195 |     -> Index doc
196 |     -> List ( Int, String )
197 |     -> ( Index doc, List ( Int, String ) )
198 | addDocsCore docsI docs index errors =
199 |     case docs of
200 |         [] ->
201 |             ( index, List.reverse errors )
202 |
203 |         headDoc :: tailDocs ->
204 |             case add headDoc index of
205 |                 Ok u1index ->
206 |                     addDocsCore (docsI + 1) tailDocs u1index errors
207 |
208 |                 Err msg ->
209 |                     addDocsCore (docsI + 1) tailDocs index (( docsI, msg ) :: errors)
210 |
211 |
212 | {-| Fold step: tokenise one String field of doc and prepend its tokens and boost.
213 | -}
214 | getWordsForField :
215 |     doc
216 |     -> ( doc -> String, Float )
217 |     -> ( Index doc, List ( Set String, Float ) )
218 |     -> ( Index doc, List ( Set String, Float ) )
219 | getWordsForField doc ( getField, fieldBoost ) ( index, fieldsLists ) =
220 |     -- original author's note: fieldBoost does not really belong here, it is not part of the aggregate
221 |     let
222 |         prependField tokens =
223 |             ( Set.fromList tokens, fieldBoost ) :: fieldsLists
224 |     in
225 |     Tuple.mapSecond prependField (Index.Utils.getTokens index (getField doc))
226 |
227 |
228 | {-| Fold step: tokenise one List String field of doc and prepend its tokens and boost.
229 | -}
230 | getWordsForFieldList :
231 |     doc
232 |     -> ( doc -> List String, Float )
233 |     -> ( Index doc, List ( Set String, Float ) )
234 |     -> ( Index doc, List ( Set String, Float ) )
235 | getWordsForFieldList doc ( getFieldList, fieldBoost ) ( index, fieldsLists ) =
236 |     let
237 |         prependField tokens =
238 |             ( Set.fromList tokens, fieldBoost ) :: fieldsLists
239 |     in
240 |     Tuple.mapSecond prependField (Index.Utils.getTokensList index (getFieldList doc))
241 |
242 |
243 | {-| Record the document's tokens and per-token scores in the index.
244 | -}
245 | addDoc : String -> List ( Set String, Float ) -> Set String -> Index doc -> Index doc
246 | addDoc docRef fieldTokensAndBoosts docTokens (Index irec) =
247 |     let
248 |         -- every token known to the corpus once this document is included
249 |         allCorpusTokens =
250 |             Set.union irec.corpusTokens docTokens
251 |
252 |         -- can the cost of this be reduced ?
253 |         rebuiltTokensIndex =
254 |             Index.Utils.buildOrderIndex allCorpusTokens
255 |
256 |         -- file ( docRef, score ) under one token of the document
257 |         storeScoredToken token trie =
258 |             let
259 |                 ( _, score ) =
260 |                     scoreToken fieldTokensAndBoosts token
261 |             in
262 |             Trie.add ( docRef, score ) token trie
263 |
264 |         tokenStoreWithDoc =
265 |             Set.toList docTokens
266 |                 |> List.foldr storeScoredToken irec.tokenStore
267 |     in
268 |     -- the idf cache is emptied whenever a document is added
269 |     Index
270 |         { irec
271 |             | documentStore = Dict.insert docRef docTokens irec.documentStore
272 |             , corpusTokens = allCorpusTokens
273 |             , corpusTokensIndex = rebuiltTokensIndex
274 |             , tokenStore = tokenStoreWithDoc
275 |             , idfCache = Dict.empty
276 |         }
277 |
278 |
279 | {-| Return term frequency score for a token in document.
280 |
281 | Each field whose token set contains the token contributes its boost
282 | divided by the number of distinct tokens in that field; contributions are summed.
283 |
284 | -}
285 | scoreToken : List ( Set String, Float ) -> String -> ( String, Float )
286 | scoreToken fieldTokensAndBoost token =
287 |     let
288 |         -- fold step: add this field's share of the score to the running total
289 |         addFieldScore : ( Set String, Float ) -> Float -> Float
290 |         addFieldScore ( tokenSet, fieldBoost ) runningTotal =
291 |             if Set.isEmpty tokenSet then
292 |                 runningTotal
293 |
294 |             else if Set.member token tokenSet then
295 |                 -- boost is shared evenly between the field's distinct tokens
296 |                 runningTotal + fieldBoost / toFloat (Set.size tokenSet)
297 |
298 |             else
299 |                 runningTotal + 0
300 |
301 |         totalScore =
302 |             List.foldr addFieldScore 0 fieldTokensAndBoost
303 |     in
304 |     ( token, totalScore )
305 |
306 |
307 | {-| Remove document from an Index if no error result conditions encountered.
308 |
309 | Delegates to `removeT`; String error signature retained for backward compatibility.
310 |
311 | See ElmTextSearch documentation for `remove` to see error result conditions.
312 |
313 | This does the following things
314 |
315 |   - Remove the document tags from documentStore.
316 |   - Remove all the document references in tokenStore.
317 |   - It does not modify corpusTokens - as this requires
318 |     reprocessing tokens for all documents to recreate corpusTokens.
319 |   - This may skew the results over time after many removes but not badly.
320 |   - It appears lunr.js operates this way as well for remove.
321 |
322 | -}
323 | remove : doc -> Index doc -> Result String (Index doc)
324 | remove doc index =
325 |     let
326 |         describe err =
327 |             case err of
328 |                 DocIsNotInIndex ->
329 |                     "Error document is not in index."
330 |
331 |                 RemoveErrorUniqueRefIsEmpty ->
332 |                     "Error document has an empty unique id (ref)."
333 |     in
334 |     removeT doc index
335 |         |> Result.mapError describe
336 |
337 |
338 | {-| Remove document from an Index if no error result conditions encountered.
339 |
340 | Variant of `remove` whose Err carries a RemoveError value instead of a String.
341 |
342 | See ElmTextSearch documentation for `remove` to see error result conditions.
343 |
344 | This does the following things
345 |
346 |   - Remove the document tags from documentStore.
347 |   - Remove all the document references in tokenStore.
348 |   - It does not modify corpusTokens - as this requires
349 |     reprocessing tokens for all documents to recreate corpusTokens.
350 |   - This may skew the results over time after many removes but not badly.
351 |   - It appears lunr.js operates this way as well for remove.
352 |
353 | -}
354 | removeT : doc -> Index doc -> Result RemoveError (Index doc)
355 | removeT doc ((Index irec) as index) =
356 |     let
357 |         docRef =
358 |             irec.ref doc
359 |     in
360 |     if String.isEmpty docRef then
361 |         Err RemoveErrorUniqueRefIsEmpty
362 |
363 |     else if not (Index.Utils.refExists docRef index) then
364 |         Err DocIsNotInIndex
365 |
366 |     else
367 |         case Dict.get docRef irec.documentStore of
368 |             Just docTokens ->
369 |                 Ok (removeDoc docRef index docTokens)
370 |
371 |             Nothing ->
372 |                 Ok index
373 |
374 | {-| Text of the "document is not in index" error message. -}
375 | errorMessageNotIndex : String
376 | errorMessageNotIndex =
377 | "Error document is not in index."
378 |
379 |
380 | {-| Remove the doc identified by docRef, given the tokens stored for it.
381 | -}
382 | removeDoc : String -> Index doc -> Set String -> Index doc
383 | removeDoc docRef (Index irec) docTokens =
384 |     let
385 |         -- drop this document's entry filed under one token
386 |         dropDocFromToken token trie =
387 |             Trie.remove token docRef trie
388 |
389 |         -- visits the document's tokens in the same order as folding the sorted list
390 |         prunedTokenStore =
391 |             Set.foldr dropDocFromToken irec.tokenStore docTokens
392 |     in
393 |     -- corpusTokens and corpusTokensIndex are left as they are
394 |     Index
395 |         { irec
396 |             | documentStore = Dict.remove docRef irec.documentStore
397 |             , tokenStore = prunedTokenStore
398 |             , idfCache = Dict.empty
399 |         }
400 |
401 |
402 | {-| Update document in Index by removing it and then adding it again.
403 | See ElmTextSearch documentation for `add` and `remove` to see error result conditions.
404 | -}
405 | update : doc -> Index doc -> Result String (Index doc)
406 | update doc index =
407 |     Result.andThen (\withoutDoc -> add doc withoutDoc)
408 |         (remove doc index)
409 |
410 |
411 | {-| Add or Update document in Index.
412 | Removes the document first when present; when it is not in the index it is simply added.
413 | -}
414 | addOrUpdate : doc -> Index doc -> Result String (Index doc)
415 | addOrUpdate doc index =
416 |     case removeT doc index of
417 |         Ok u1index ->
418 |             add doc u1index
419 |
420 |         Err DocIsNotInIndex ->
421 |             add doc index
422 |
423 |         Err RemoveErrorUniqueRefIsEmpty ->
424 |             -- same text `remove` reports for this error
425 |             Err "Error document has an empty unique id (ref)."
426 |
427 |
428 | {-| Search index with query, reporting any failure as a String message.
429 | See ElmTextSearch documentation for `search` to see error result conditions.
430 |
431 | Delegates to `searchT`; String error signature retained for backward compatibility.
432 |
433 | -}
434 | search : String -> Index doc -> Result String ( Index doc, List ( String, Float ) )
435 | search query index =
436 |     let
437 |         -- message text for each typed search error
438 |         describe error =
439 |             case error of
440 |                 IndexIsEmpty ->
441 |                     "Error there are no documents in index to search."
442 |
443 |                 QueryIsEmpty ->
444 |                     "Error query is empty."
445 |
446 |                 NoTermsToSearchAfterTokenisation ->
447 |                     "Error after tokenisation there are no terms to search for."
448 |     in
449 |     searchT query index
450 |         |> Result.mapError describe
451 |
452 |
453 | {-| Search index with query, reporting failures as a typed SearchError.
454 | See ElmTextSearch documentation for `search` to see error result conditions.
455 |
456 | Returns `Ok` with no matches when none of the query tokens is found in the token store.
457 |
458 | -}
459 | searchT : String -> Index doc -> Result SearchError ( Index doc, List ( String, Float ) )
460 | searchT query index =
461 |     let
462 |         ( (Index i1irec) as i1index, tokens ) =
463 |             Index.Utils.getTokens index query
464 |
465 |         tokenInStore token =
466 |             Trie.getNode token i1irec.tokenStore /= Nothing
467 |     in
468 |     if Dict.isEmpty i1irec.documentStore then
469 |         Err IndexIsEmpty
470 |
471 |     else if String.isEmpty (String.trim query) then
472 |         Err QueryIsEmpty
473 |
474 |     else if List.isEmpty tokens then
475 |         Err NoTermsToSearchAfterTokenisation
476 |
477 |     else if not (List.any tokenInStore tokens) then
478 |         Ok ( i1index, [] )
479 |
480 |     else
481 |         Ok (searchTokens tokens i1index)
482 |
483 |
484 | {-| Return list of document ref's with score, ordered by score descending.
485 |
486 | History: an earlier version scaled the query vector by the "fields" boosts but
487 | ignored "listFields". An index built only from listFields then had zero boosts
488 | and every resulting score ended up NaN.
489 |
490 | Scaling the query vector by the average of all field boosts also makes little
491 | sense, as it does not change the relative score of document matches. Removing
492 | the boost on queries was therefore simpler than adding "listFields" to boosts.
493 |
494 | -}
495 | searchTokens :
496 |     List String
497 |     -> Index doc
498 |     -> ( Index doc, List ( String, Float ) )
499 | searchTokens tokens index =
500 |     let
501 |         ( tokenDocSets, queryVector, indexAfterQuery ) =
502 |             Index.Vector.getQueryVector tokens index
503 |
504 |         candidateDocs =
505 |             Utils.intersectSets tokenDocSets
506 |
507 |         ( indexAfterScoring, scoredDocs ) =
508 |             Set.foldr (scoreAndCompare queryVector) ( indexAfterQuery, [] ) candidateDocs
509 |
510 |         -- ascending sort then reverse puts the highest score first
511 |         rankedDocs =
512 |             scoredDocs |> List.sortBy Tuple.second |> List.reverse
513 |     in
514 |     ( indexAfterScoring, rankedDocs )
515 |
--------------------------------------------------------------------------------
/src/Index/Defaults.elm:
--------------------------------------------------------------------------------
1 | module Index.Defaults exposing
2 | ( indexVersion
3 | , elmTextSearchIndexType
4 | , defaultTransformFactories
5 | , defaultFilterFactories
6 | , defaultTokenTrimmerFuncCreator
7 | , defaultStemmerFuncCreator
8 | , defaultStopWordFilterFuncCreator
9 | , defaultInitialTransformFactories
10 | , getDefaultIndexConfig
11 | , getIndexSimpleConfig
12 | )
13 |
14 | {-| Defaults for indexes and configurations.
15 |
16 |
17 | ## Index Storage Engine Version and Type
18 |
19 | @docs indexVersion
20 | @docs elmTextSearchIndexType
21 |
22 |
23 | ## Built in Transforms and Filters
24 |
25 | @docs defaultTransformFactories
26 | @docs defaultFilterFactories
27 | @docs defaultTokenTrimmerFuncCreator
28 | @docs defaultStemmerFuncCreator
29 | @docs defaultStopWordFilterFuncCreator
30 | @docs defaultInitialTransformFactories
31 |
32 |
33 | ## Config type adapters
34 |
35 | @docs getDefaultIndexConfig
36 | @docs getIndexSimpleConfig
37 |
38 | Copyright (c) 2016 Robin Luiten
39 |
40 | -}
41 |
42 | import Index.Model as Model
43 | exposing
44 | ( FilterFactory
45 | , IndexSimpleConfig
46 | , TransformFactory
47 | )
48 | import Index.Utils
49 | import Stemmer
50 | import StopWordFilter
51 | import TokenProcessors
52 |
53 |
54 | {-| Version of the stored index format, checked when loading a saved index.
55 |
56 | This is not the same as package version.
57 |
58 | This needs to change if the encoded format changes. Be careful of updates to
59 | the Trie package: if the Trie encoding format changes, this version needs to
60 | change as well.
61 |
62 | -}
63 | indexVersion : String
64 | indexVersion =
65 | "1.1.0"
66 |
67 |
68 | {-| The index type used by default.
69 | It identifies the default set of token transforms and filters.
70 | -}
71 | elmTextSearchIndexType : String
72 | elmTextSearchIndexType =
73 | "-= ElmTextSearch Index Type 1 =-"
74 |
75 |
76 | {-| Transform factories of the default index configuration: the stemmer only.
77 | -}
78 | defaultTransformFactories : List (TransformFactory doc)
79 | defaultTransformFactories =
80 |     List.singleton
81 |         defaultStemmerFuncCreator
82 |
83 |
84 | {-| Default transform factories that apply before filters: the token trimmer only.
85 | -}
86 | defaultInitialTransformFactories : List (TransformFactory doc)
87 | defaultInitialTransformFactories =
88 |     List.singleton
89 |         defaultTokenTrimmerFuncCreator
90 |
91 |
92 | {-| Filter factories of the default index configuration: the stop word filter only.
93 | -}
94 | defaultFilterFactories : List (FilterFactory doc)
95 | defaultFilterFactories =
96 |     List.singleton
97 |         defaultStopWordFilterFuncCreator
98 |
99 |
100 | {-| The default token trimmer transform function creator.
101 | Normally applied first in transform functions.
102 | -}
103 | defaultTokenTrimmerFuncCreator : TransformFactory doc
104 | defaultTokenTrimmerFuncCreator index =
105 |     Index.Utils.createFuncCreator TokenProcessors.trimmer index
106 |
107 |
108 | {-| The default token stemmer transform function creator, built on Stemmer.stem.
109 | -}
110 | defaultStemmerFuncCreator : TransformFactory doc
111 | defaultStemmerFuncCreator index =
112 |     Index.Utils.createFuncCreator Stemmer.stem index
113 |
114 |
115 | {-| The default stop word filter function creator.
116 | -}
117 | defaultStopWordFilterFuncCreator : FilterFactory doc
118 | defaultStopWordFilterFuncCreator index =
119 |     StopWordFilter.createDefaultFilterFunc index
120 |
121 |
122 | {-| Convert Index.Model.ModelSimpleConfig to Index.Model.Config,
123 | filling in default factories for the fields not in the simple config.
124 | This is the definition of the default index configuration.
125 | -}
126 | getDefaultIndexConfig : Model.ModelSimpleConfig doc -> Model.Config doc
127 | getDefaultIndexConfig simpleConfig =
128 |     { indexType = simpleConfig.indexType
129 |     , ref = simpleConfig.ref
130 |     , fields = simpleConfig.fields
131 |     , listFields = simpleConfig.listFields
132 |     , initialTransformFactories = defaultInitialTransformFactories
133 |     , transformFactories = defaultTransformFactories
134 |     , filterFactories = defaultFilterFactories
135 |     }
136 |
137 |
138 | {-| Convert ElmTextSearch.IndexSimpleConfig to Index.Model.ModelSimpleConfig using the default index type.
139 | -}
140 | getIndexSimpleConfig : IndexSimpleConfig doc -> Model.ModelSimpleConfig doc
141 | getIndexSimpleConfig simpleConfig =
142 |     { indexType = elmTextSearchIndexType
143 |     , ref = simpleConfig.ref
144 |     , fields = simpleConfig.fields
145 |     , listFields = simpleConfig.listFields
146 |     }
147 |
--------------------------------------------------------------------------------
/src/Index/Load.elm:
--------------------------------------------------------------------------------
1 | module Index.Load exposing (errorPrefix, loadIndex, loadIndexValue, loadIndexValueWith, loadIndexWith)
2 |
3 | {-| Load an index from Value or String
4 |
5 | Copyright (c) 2016 Robin Luiten
6 |
7 | -}
8 |
9 | import Dict
10 | import ElmTextSearch.Json.Decoder as IndexDecoder
11 | import Index.Defaults as Defaults
12 | import Index.Model exposing (..)
13 | import Index.Utils
14 | import Json.Decode as Decode
15 |
16 | {-| Prefix shared by the index-load failure messages built in this module. -}
17 | errorPrefix : String
18 | errorPrefix =
19 | "Error cannot load Index."
20 |
21 |
22 | {-| Decode an index from a String with one of the provided configs.
23 |
24 | The configurations supplied will be used in the order provided in
25 | the list so the earliest one that matches indexType is used.
26 | Decoding fails when the stored indexVersion is not the supported one
27 | or when no supplied config matches the stored indexType.
28 |
29 | -}
30 | loadIndexWith : List (Config doc) -> String -> Result Decode.Error (Index doc)
31 | loadIndexWith supportedIndexConfigs inputString =
32 |     inputString
33 |         |> Decode.decodeString
34 |             (IndexDecoder.decoder
35 |                 |> Decode.andThen (mapIndexConfig supportedIndexConfigs)
36 |                 |> Decode.andThen createIndex
37 |             )
38 |
39 | {-| Pick the config for a decoded index, failing on a version or index type mismatch. -}
40 | mapIndexConfig : List (Config doc) -> CodecIndexRecord -> Decode.Decoder ( Config doc, CodecIndexRecord )
41 | mapIndexConfig supportedIndexConfigs index =
42 |     let
43 |         hasSameType cfg =
44 |             cfg.indexType == index.indexType
45 |     in
46 |     if Defaults.indexVersion /= index.indexVersion then
47 |         Decode.fail <|
48 |             String.concat
49 |                 [ errorPrefix
50 |                 , " Version supported is "
51 |                 , Defaults.indexVersion
52 |                 , ". Version tried to load is "
53 |                 , index.indexVersion
54 |                 , "."
55 |                 ]
56 |
57 |     else
58 |         case List.head (List.filter hasSameType supportedIndexConfigs) of
59 |             Just matchedConfig ->
60 |                 Decode.succeed ( matchedConfig, index )
61 |
62 |             Nothing ->
63 |                 Decode.fail <|
64 |                     String.concat
65 |                         [ errorPrefix
66 |                         , " Tried to load index of type \""
67 |                         , index.indexType
68 |                         , "\". It is not in supported index configurations."
69 |                         ]
70 |
71 | {-| Decode an index from a Value with the first provided config matching its indexType. -}
72 | loadIndexValueWith : List (Config doc) -> Decode.Value -> Result Decode.Error (Index doc)
73 | loadIndexValueWith supportedIndexConfigs inputValue =
74 |     inputValue
75 |         |> Decode.decodeValue
76 |             (IndexDecoder.decoder
77 |                 |> Decode.andThen (mapIndexConfig supportedIndexConfigs)
78 |                 |> Decode.andThen createIndex
79 |             )
80 |
81 | {-| Assemble an Index from the matched config and the decoded record; caches start empty. -}
82 | createIndex : ( Config doc, CodecIndexRecord ) -> Decode.Decoder (Index doc)
83 | createIndex ( config, decodedIndex ) =
84 |     Index
85 |         { indexVersion = decodedIndex.indexVersion
86 |         , indexType = decodedIndex.indexType
87 |         , documentStore = decodedIndex.documentStore
88 |         , corpusTokens = decodedIndex.corpusTokens
89 |         , tokenStore = decodedIndex.tokenStore
90 |         , corpusTokensIndex =
91 |             Index.Utils.buildOrderIndex decodedIndex.corpusTokens
92 |         , ref = config.ref
93 |         , fields = config.fields
94 |         , listFields = config.listFields
95 |         , initialTransformFactories = config.initialTransformFactories
96 |         , transformFactories = config.transformFactories
97 |         , filterFactories = config.filterFactories
98 |         , initialTransforms = Nothing
99 |         , transforms = Nothing
100 |         , filters = Nothing
101 |         , idfCache = Dict.empty
102 |         }
103 |         |> Decode.succeed
104 |
105 |
106 | loadIndex : ModelSimpleConfig doc -> String -> Result Decode.Error (Index doc)
107 | loadIndex simpleConfig inputString =
108 |     -- Only the default configuration derived from simpleConfig is offered to the loader.
109 |     Defaults.getDefaultIndexConfig simpleConfig
110 |         |> (\config -> loadIndexWith [ config ] inputString)
111 |
112 |
113 | loadIndexValue : ModelSimpleConfig doc -> Decode.Value -> Result Decode.Error (Index doc)
114 | loadIndexValue simpleConfig inputValue =
115 |     -- Only the default configuration derived from simpleConfig is offered to the loader.
116 |     Defaults.getDefaultIndexConfig simpleConfig
117 |         |> (\config -> loadIndexValueWith [ config ] inputValue)
118 |
--------------------------------------------------------------------------------
/src/Index/Model.elm:
--------------------------------------------------------------------------------
1 | module Index.Model exposing
2 | ( Index(..)
3 | , IndexSimpleConfig
4 | , CodecIndexRecord, Config, FilterFactory, FilterFunc, FuncFactory, ModelSimpleConfig, TransformFactory, TransformFunc, TransformFunc2
5 | )
6 |
7 | {-| Define the Index Model
8 |
9 | @docs Index
10 | @docs IndexSimpleConfig
11 | @docs Config
12 |
13 | Copyright (c) 2016 Robin Luiten
14 |
15 | -}
16 |
17 | import Dict exposing (Dict)
18 | import Set exposing (Set)
19 | import Trie exposing (Trie)
20 |
21 |
22 | {-| Factory type used with ElmTextSearch; takes an Index and returns an Index paired with a `func`.
23 | -}
24 | type alias FuncFactory doc func =
25 |     Index doc -> ( Index doc, func )
26 |
27 |
28 | type alias TransformFunc =
29 |     String -> String -- token in, token out; Index.Utils reads a "" result as "no token"
30 |
31 |
32 | {-| Variant of TransformFunc that makes composition nicer in code.
33 | The older TransformFunc uses "" to mean no token; this variant uses Maybe
34 | instead (Nothing means no token), which composes a lot better.
35 | -}
36 | type alias TransformFunc2 =
37 |     String -> Maybe String
38 |
39 |
40 | type alias TransformFactory doc =
41 |     FuncFactory doc TransformFunc -- same type as Index doc -> ( Index doc, String -> String )
42 |
43 |
44 | type alias FilterFunc =
45 |     String -> Bool -- True keeps the word, False drops it
46 |
47 |
48 | type alias FilterFactory doc =
49 |     FuncFactory doc FilterFunc -- same type as Index doc -> ( Index doc, String -> Bool )
50 |
51 |
52 | {-| Index is a full text index for a document type.
53 |
54 | The internal data model of Index
55 |
56 | - indexType
57 | - a string that can be used on load to provide the correct set
58 |
59 | - indexVersion
60 | - a version string
61 |
62 | - ref
63 | - how to get at unique id of documents added
64 |
65 | - fields
66 | - list of fields of type String to index from document
67 | - first field is function to get String content of field
68 | - second field Float is a boost to text frequency of tokens in this field
69 |
70 | - listFields
71 | - list of fields of type List String to index from document
72 | - first field is function to get List String content of field
73 | - second field Float is a boost to text frequency of tokens in this field
74 |
75 | - initialTransformFactories
76 | - list of factory functions to create transform functions
77 |       - this list of transforms is applied before filters
78 | - the ones in `transformFactories` are applied after filters
79 |
80 | - transformFactories
81 | - list of factory functions to create transform functions
82 |
83 | - filterFactories
84 | - list of factory functions to create filter functions
85 |
86 | - transforms
87 | - the transforms in index token processing
88 | - lazy populated from transformFactories
89 |
90 | - filters
91 |       - the filters in index token processing
92 | - lazy populated from filterFactories
93 |
94 | - documentStore
95 | - contains dictionary of document ref to Set of document tokens
96 |
97 | - corpusTokens
98 | - Set of all indexed tokens from all documentStore
99 |
100 | - corpusTokensIndex
101 | - to get the position of a token in the order list of corpusTokens
102 |
103 | - tokenStore
104 | - tokenStore is used for efficient storing and lookup of the
105 | reverse index of token to document ref and holding the
106 | token term frequency
107 |
108 | - idfCache
109 | - cached idf (inverse document frequency scores)
110 | - cache is reset (cleared) if any document is added removed or updated in index
111 |
112 | -}
113 | type Index doc
114 |     = Index (IndexRecord doc) -- single constructor wrapping the index record
115 |
116 |
117 | {-| The Record model in an Index.
118 | -}
119 | type alias IndexRecord doc =
120 |     { indexVersion : String
121 |     , indexType : String
122 |     , ref : doc -> String
123 |     , fields : List ( doc -> String, Float )
124 |     , listFields : List ( doc -> List String, Float )
125 |     , initialTransformFactories : List (TransformFactory doc)
126 |     , transformFactories : List (TransformFactory doc)
127 |     , filterFactories : List (FilterFactory doc)
128 |     , documentStore : Dict String (Set String)
129 |     , corpusTokens : Set String
130 |     , tokenStore : Trie Float
131 |     , corpusTokensIndex : Dict String Int
132 |     , initialTransforms : Maybe (List TransformFunc2) -- Nothing until built from initialTransformFactories
133 |     , transforms : Maybe (List TransformFunc2) -- Nothing until built from transformFactories
134 |     , filters : Maybe (List TransformFunc2) -- filters adapted to the Maybe form; Nothing until built from filterFactories
135 |     , idfCache : Dict String Float
136 |     }
137 |
138 |
139 | {-| Simple index config with default token processing.
140 |
141 | Simple still requires configuring the fields for your document type.
142 | See [`ElmTextSearch.SimpleConfig`](ElmTextSearch#SimpleConfig)
143 | for explanations of `ref`, `fields` and `listFields` fields.
144 |
145 | - ElmTextSearch.SimpleConfig does not include `indexType`.
146 | - In this case the user is getting the ElmTextSearch default token processing.
147 | - Index.SimpleConfig includes `indexType`.
148 |
149 | `indexType` is an identifier used to determine the transforms and filters the
150 | index uses for operation. It should be unique for all possible differently
151 | configured indexes you plan to use.
152 |
153 |
154 | ### The default transform factories.
155 |
156 | Index.Defaults.defaultTransformFactories
157 |
158 |
159 | ### The default filter factories.
160 |
161 | Index.Defaults.defaultFilterFactories
162 |
163 | -}
164 | type alias ModelSimpleConfig doc =
165 |     { indexType : String -- identifies the transforms and filters the index uses
166 |     , ref : doc -> String
167 |     , fields : List ( doc -> String, Float )
168 |     , listFields : List ( doc -> List String, Float )
169 |     }
170 |
171 |
172 | {-| Index config with customized token processing.
173 |
174 | If a configuration does not match an index being loaded
175 | you will get an Err Result returned.
176 |
177 | -}
178 | type alias Config doc =
179 |     { indexType : String -- compared with the indexType of an index being loaded
180 |     , ref : doc -> String
181 |     , fields : List ( doc -> String, Float )
182 |     , listFields : List ( doc -> List String, Float )
183 |     , initialTransformFactories : List (TransformFactory doc) -- build the transforms applied before filters
184 |     , transformFactories : List (TransformFactory doc) -- build the transforms applied after filters
185 |     , filterFactories : List (FilterFactory doc)
186 |     }
187 |
188 |
189 | {-| Just the fields encoded for an Index; the function fields and caches of an Index are not part of it.
190 | -}
191 | type alias CodecIndexRecord =
192 |     { indexVersion : String
193 |     , indexType : String
194 |     , documentStore : Dict String (Set String)
195 |     , corpusTokens : Set String
196 |     , tokenStore : Trie Float
197 |     }
198 |
199 |
200 | {-| A SimpleConfig is the least amount of configuration data
201 | required to create an Index; unlike ModelSimpleConfig it has no `indexType`.
202 | -}
203 | type alias IndexSimpleConfig doc =
204 |     { ref : doc -> String
205 |     , fields : List ( doc -> String, Float )
206 |     , listFields : List ( doc -> List String, Float )
207 |     }
208 |
--------------------------------------------------------------------------------
/src/Index/Utils.elm:
--------------------------------------------------------------------------------
1 | module Index.Utils exposing
2 | ( createFuncCreator
3 | , getTokens
4 | , getTokensList
5 | , processTokens
6 | , idf
7 | , refExists
8 | , buildOrderIndex
9 | )
10 |
11 | {-| Index Utilities
12 |
13 |
14 | ## Functions
15 |
16 | @docs createFuncCreator
17 | @docs getTokens
18 | @docs getTokensList
19 | @docs processTokens
20 | @docs idf
21 | @docs refExists
22 | @docs buildOrderIndex
23 |
24 | Copyright (c) 2016 Robin Luiten
25 |
26 | -}
27 |
28 | import Dict exposing (Dict)
29 | import Index.Model
30 | exposing
31 | ( FilterFactory
32 | , FuncFactory
33 | , Index(..)
34 | , TransformFunc
35 | , TransformFunc2
36 | )
37 | import Set exposing (Set)
38 | import TokenProcessors
39 | import Trie
40 |
41 |
42 | {-| Turn a plain function into a FuncFactory.
43 | The factory hands back the index it receives unchanged, paired with `func`.
44 | -}
45 | createFuncCreator : func -> FuncFactory doc func
46 | createFuncCreator func =
47 |     \index -> ( index, func )
48 |
49 |
50 | {-| Tokenize a string, then run the tokens through the index token pipeline.
51 | -}
52 | getTokens : Index doc -> String -> ( Index doc, List String )
53 | getTokens index string =
54 |     TokenProcessors.tokenizer string |> processTokens index
55 |
56 |
57 | getTokensList : Index doc -> List String -> ( Index doc, List String )
58 | getTokensList index listString =
59 |     TokenProcessors.tokenizerList listString |> processTokens index -- tokenize every list entry, then run the token pipeline
60 |
61 |
62 | {-| Transform list of words into tokens for index and search.
63 |
64 | Applies the token processing configured in the index in this order:
65 |
66 | 1.  initial transforms
67 | 2.  filters
68 | 3.  transforms
69 |
70 | Each stage receives the index returned by the stage before it.
71 | -}
72 | processTokens : Index doc -> List String -> ( Index doc, List String )
73 | processTokens index tokens =
74 |     -- every stage returns the index together with the tokens that survived it
75 |     applyInitialTransform index tokens
76 |         -- filters see the output of the initial transforms
77 |         |> (\( preparedIndex, prepared ) -> applyFilter preparedIndex prepared)
78 |         -- the remaining transforms run last
79 |         |> (\( filteredIndex, kept ) -> applyTransform filteredIndex kept)
80 |
81 |
82 | {-| Apply the transforms to tokens.
83 | The transform list is read from the index, being built first if needed.
84 | If any transform converts a token to an empty string no further transforms
85 | are applied and the empty string is removed from the set of tokens.
86 | -}
87 | applyTransform : Index doc -> List String -> ( Index doc, List String )
88 | applyTransform index strings =
89 |     -- the index comes back with the transform list stored on it
90 |     getOrSetTransformList index
91 |         |> Tuple.mapSecond
92 |             (\transformList ->
93 |                 -- tokens for which the list yields Nothing are dropped
94 |                 strings
95 |                     |> List.filterMap (applyTransformList transformList)
96 |             )
97 |
98 |
99 | {-| Get the transform list of the index, building it from
100 | `transformFactories` when it has not been built yet.
101 | -}
102 | getOrSetTransformList : Index doc -> ( Index doc, List TransformFunc2 )
103 | getOrSetTransformList =
104 |     -- lambdas stand in for plain field accessors because the fields sit inside the Index constructor
105 |     getOrSetIndexFuncListA
106 |         (\(Index { transforms }) -> transforms)
107 |         (\(Index { transformFactories }) -> transformFactories)
108 |         setIndexTransforms
109 |
110 |
111 | {-| Store an already built transform list in the `transforms` field of the Index.
112 |
113 | Every other field of the index record is left as it was.
114 |
115 | -}
116 | setIndexTransforms : Index doc -> List TransformFunc2 -> Index doc
117 | setIndexTransforms (Index record) transformList =
118 |     { record | transforms = Just transformList } |> Index
119 |
120 |
121 | applyInitialTransform : Index doc -> List String -> ( Index doc, List String )
122 | applyInitialTransform index strings =
123 |     let
124 |         ( readyIndex, initialTransformList ) =
125 |             getOrSetInitialTransformList index
126 |
127 |         -- Nothing from the list means the string is dropped
128 |         runAll =
129 |             applyTransformList initialTransformList
130 |     in
131 |     ( readyIndex, List.filterMap runAll strings )
132 |
133 |
134 | getOrSetInitialTransformList : Index doc -> ( Index doc, List TransformFunc2 )
135 | getOrSetInitialTransformList =
136 |     -- builds the list from initialTransformFactories when it is not stored yet
137 |     getOrSetIndexFuncListA
138 |         (\(Index { initialTransforms }) -> initialTransforms)
139 |         (\(Index { initialTransformFactories }) -> initialTransformFactories)
140 |         setIndexInitialTransforms
141 |
142 |
143 | setIndexInitialTransforms : Index doc -> List TransformFunc2 -> Index doc
144 | setIndexInitialTransforms (Index record) transformList =
145 |     { record | initialTransforms = Just transformList } |> Index -- only this field changes
146 |
147 |
148 | {-| Run a token through every transform in list order.
149 |
150 | Once a transform returns Nothing the result stays Nothing.
151 | Approach taken from the learn-maybe/src/Transforms.elm test project.
152 | -}
153 | applyTransformList : List TransformFunc2 -> String -> Maybe String
154 | applyTransformList transforms token =
155 |     List.foldl Maybe.andThen (Just token) transforms
156 |
157 |
158 | {-| Adapt a function `String -> a` into `String -> Maybe a`.
159 |
160 | A result equal to `aValue` maps to Nothing, any other result is wrapped in Just.
161 | This module passes "" as `aValue`, so an empty string result means "no token".
162 |
163 | -}
164 | adaptFuncStrA : a -> (String -> a) -> (String -> Maybe a)
165 | adaptFuncStrA aValue func =
166 |     let
167 |         -- Nothing for the sentinel value, otherwise the value itself
168 |         rejectSentinel candidate =
169 |             if candidate == aValue then
170 |                 Nothing
171 |
172 |             else
173 |                 Just candidate
174 |     in
175 |     func >> rejectSentinel
176 |
177 |
178 | adaptFuncStrB : (String -> Bool) -> (String -> Maybe String)
179 | adaptFuncStrB func =
180 |     -- Turn a keep/drop predicate into the Maybe based form used by the
181 |     -- token pipeline: Just the unchanged word when `func` accepts it,
182 |     -- Nothing when `func` rejects it.
183 |     \string ->
184 |         case func string of
185 |             True ->
186 |                 Just string
187 |
188 |             False ->
189 |                 Nothing
190 |
191 |
192 | {-| Apply index filters to tokens.
193 |
194 | A token is kept only when every filter returns True for it.
195 |
196 | -}
197 | applyFilter : Index doc -> List String -> ( Index doc, List String )
198 | applyFilter index strings =
199 |     let
200 |         ( readyIndex, filterList ) =
201 |             getOrSetFilterList index
202 |
203 |         passesAll word =
204 |             applyTransformList filterList word
205 |     in
206 |     -- filterMap drops every word for which a filter answered Nothing
207 |     ( readyIndex, List.filterMap passesAll strings )
208 |
209 |
210 | getOrSetFilterList : Index doc -> ( Index doc, List TransformFunc2 )
211 | getOrSetFilterList =
212 |     -- builds the list from filterFactories when it is not stored yet
213 |     getOrSetIndexFuncListB
214 |         (\(Index { filters }) -> filters)
215 |         (\(Index { filterFactories }) -> filterFactories)
216 |         setIndexFilters
217 |
218 |
219 | {-| Store an already built filter list in the `filters` field of the Index.
220 | -}
221 | setIndexFilters : Index doc -> List TransformFunc2 -> Index doc
222 | setIndexFilters (Index record) filterList =
223 |     { record | filters = Just filterList } |> Index
224 |
225 |
226 | {-| Get a function list from the index, building and storing it on first use.
227 |
228 | This is the variant for factories producing TransformFunc values.
229 | See getOrSetIndexFuncListB for the FilterFunc variant; a single version
230 | generic over the function type did not work out, hence the two copies.
231 |
232 | -}
233 | getOrSetIndexFuncListA :
234 |     (Index doc -> Maybe (List TransformFunc2))
235 |     -> (Index doc -> List (FuncFactory doc TransformFunc))
236 |     -> (Index doc -> List TransformFunc2 -> Index doc)
237 |     -> Index doc
238 |     -> ( Index doc, List TransformFunc2 )
239 | getOrSetIndexFuncListA getFuncs2 getFactoryFuncs setFuncs index =
240 |     let
241 |         -- Run the factories, adapt their functions ("" means no token) and store the result.
242 |         hydrate () =
243 |             let
244 |                 ( factoryIndex, rawFuncs ) =
245 |                     runFactories (getFactoryFuncs index) index
246 |
247 |                 adapted =
248 |                     List.map (adaptFuncStrA "") rawFuncs
249 |             in
250 |             ( setFuncs factoryIndex adapted, adapted )
251 |     in
252 |     case getFuncs2 index of
253 |         Nothing ->
254 |             hydrate ()
255 |
256 |         Just cached ->
257 |             ( index, cached )
258 |
259 |
260 | {-| Variant of getOrSetIndexFuncListA for factories producing FilterFunc values.
261 |
262 | If FilterFunc were switched to be a TransformFunc the two variants could share code.
263 |
264 | -}
265 | getOrSetIndexFuncListB :
266 |     (Index doc -> Maybe (List TransformFunc2))
267 |     -> (Index doc -> List (FilterFactory doc))
268 |     -> (Index doc -> List TransformFunc2 -> Index doc)
269 |     -> Index doc
270 |     -> ( Index doc, List TransformFunc2 )
271 | getOrSetIndexFuncListB getFuncs2 getFactoryFuncs setFuncs index =
272 |     case getFuncs2 index of
273 |         Nothing ->
274 |             -- first use: run the factories and keep the adapted list
275 |             let
276 |                 ( factoryIndex, predicates ) =
277 |                     runFactories (getFactoryFuncs index) index
278 |
279 |                 adapted =
280 |                     predicates |> List.map adaptFuncStrB
281 |
282 |                 storedIndex =
283 |                     setFuncs factoryIndex adapted
284 |             in
285 |             ( storedIndex, adapted )
286 |
287 |         Just cached ->
288 |             -- already built: the index is returned untouched
289 |             ( index, cached )
290 |
291 |
292 | {-| Run each of the function factories returning the list of functions.
293 |
294 | Factories run from the last list entry to the first, each receiving the index
295 | returned by the one before; the functions come back in `factoryList` order.
296 | -}
297 | runFactories : List (FuncFactory doc func) -> Index doc -> ( Index doc, List func )
298 | runFactories factoryList index =
299 |     let
300 |         step factory ( currentIndex, collected ) =
301 |             factory currentIndex
302 |                 |> Tuple.mapSecond (\made -> made :: collected)
303 |     in
304 |     -- consing inside a right fold keeps the functions in list order
305 |     List.foldr
306 |         step
307 |         ( index, [] )
308 |         factoryList
309 |
310 |
311 | {-| Inverse document frequency for a token in the Index.
312 |
313 | A cached value is returned as is; otherwise the value is calculated and the returned index carries it in its cache.
314 |
315 | -}
316 | idf : Index doc -> String -> ( Index doc, Float )
317 | idf ((Index { idfCache }) as index) token =
318 |     case Dict.get token idfCache of
319 |         Just cachedIdf ->
320 |             ( index, cachedIdf )
321 |
322 |         Nothing ->
323 |             calcIdf index token
324 |
325 |
326 | calcIdf : Index doc -> String -> ( Index doc, Float )
327 | calcIdf (Index irec) token =
328 |     let
329 |         -- NOTE(review): presumably the number of documents holding the token; confirm against Trie.valueCount
330 |         matchingDocs =
331 |             Trie.valueCount token irec.tokenStore |> toFloat
332 |
333 |         totalDocs =
334 |             Dict.size irec.documentStore |> toFloat
335 |
336 |         score =
337 |             if matchingDocs <= 0 then
338 |                 -- nothing to divide by, fall back to 1
339 |                 1
340 |
341 |             else
342 |                 1 + logBase 10 (totalDocs / matchingDocs)
343 |
344 |         updatedCache =
345 |             Dict.insert token score irec.idfCache
346 |     in
347 |     -- The score is stored in the cache that `idf` consults, so later
348 |     -- `idf` calls for this token return it without recalculating.
349 |     ( Index { irec | idfCache = updatedCache }
350 |     , score
351 |     )
352 |
353 |
354 | {-| Return True if the document reference is a key of the index document store.
355 | -}
356 | refExists : String -> Index doc -> Bool
357 | refExists docRef (Index { documentStore }) =
358 |     documentStore |> Dict.member docRef
359 |
360 |
361 | {-| Build a Dict from each word of the Set to its position in the
362 | list form of the Set (positions start at 0).
363 | -}
364 | buildOrderIndex : Set String -> Dict String Int
365 | buildOrderIndex tokenSet =
366 |     -- Set.toList returns the words sorted lowest to highest, so each
367 |     -- position is the rank of the word within the set.
368 |     Set.toList tokenSet
369 |         |> List.indexedMap (\position word -> ( word, position ))
370 |         |> Dict.fromList
371 |
--------------------------------------------------------------------------------
/src/Index/Vector.elm:
--------------------------------------------------------------------------------
1 | module Index.Vector exposing (buildDocVector, getDocVector, getQueryVector, scoreAndCompare, similarityBoost, updateDocVector, updateSetAndVec)
2 |
3 | {-| Index document vector support.
4 |
5 | Copyright (c) 2016 Robin Luiten
6 |
7 | -}
8 |
9 | import Dict
10 | import Index.Model exposing (Index(..))
11 | import Index.Utils
12 | import Maybe
13 | import Set exposing (Set)
14 | import SparseVector exposing (SparseVector)
15 | import String
16 | import Trie
17 |
18 |
19 | {-| Build a query vector and the sets of candidate document matches
20 | for each token in our query tokens.
21 |
22 | Each token in our query has a separate Set String entry in the
23 | returned List, as all query token document result sets are
24 | intersected together for the final list of documents matched (a logical
25 | and of all the query tokens).
26 |
27 | -}
28 | getQueryVector :
29 |     List String
30 |     -> Index doc
31 |     -> ( List (Set String), SparseVector, Index doc )
32 | getQueryVector tokens index =
33 |     tokens
34 |         |> List.foldr
35 |             (buildDocVector (List.length tokens))
36 |             ( [], SparseVector.empty, index )
37 |
38 |
39 | {-| Fold step for one query token (baseToken): updates the query vector
40 | and prepends the set of documents matching the token to the list.
41 | -}
42 | buildDocVector :
43 |     Int
44 |     -> String
45 |     -> ( List (Set String), SparseVector, Index doc )
46 |     -> ( List (Set String), SparseVector, Index doc )
47 | buildDocVector tokensLength baseToken ( docSets, vec, index ) =
48 |     let
49 |         (Index { tokenStore }) =
50 |             index
51 |
52 |         -- every query token gets an equal share of the term frequency
53 |         termFrequency =
54 |             1 / toFloat tokensLength
55 |
56 |         ( docs, updatedVec, updatedIndex ) =
57 |             Trie.expand baseToken tokenStore
58 |                 |> List.foldr
59 |                     (updateSetAndVec termFrequency baseToken)
60 |                     ( Set.empty, vec, index )
61 |     in
62 |     ( docs :: docSets, updatedVec, updatedIndex )
63 |
64 |
65 | {-| Calculate Term frequency-inverse document frequency (tf-idf) for one expanded token
66 | and union the documents of that expandedToken into the set for this (base)token.
67 | -}
68 | updateSetAndVec :
69 |     Float
70 |     -> String
71 |     -> String
72 |     -> ( Set String, SparseVector, Index doc )
73 |     -> ( Set String, SparseVector, Index doc )
74 | updateSetAndVec tf token expandedToken ( docSet, vec, index ) =
75 |     let
76 |         ( scoredIndex, keyIdf ) =
77 |             Index.Utils.idf index expandedToken
78 |
79 |         -- idf only updates the idf cache, so token data read here matches the input index
80 |         (Index irec) =
81 |             scoredIndex
82 |
83 |         tfidf =
84 |             tf * keyIdf * similarityBoost token expandedToken
85 |
86 |         -- a token without a corpus position leaves the vector untouched
87 |         scoredVec =
88 |             case Dict.get expandedToken irec.corpusTokensIndex of
89 |                 Just pos ->
90 |                     SparseVector.insert pos tfidf vec
91 |
92 |                 Nothing ->
93 |                     vec
94 |
95 |         -- keys of the token store entry for the expanded token
96 |         matchingDocs =
97 |             Trie.get expandedToken irec.tokenStore
98 |                 |> Maybe.map (Dict.keys >> Set.fromList)
99 |                 |> Maybe.withDefault Set.empty
100 |     in
101 |     ( Set.union matchingDocs docSet, scoredVec, scoredIndex )
102 |
103 |
104 | {-| if the expanded token is not an exact match to the token then
105 | penalise the score for this key by how different the key is
106 | to the token. An exact match scores 1, any other match scores below 1.
107 | -}
108 | similarityBoost : String -> String -> Float
109 | similarityBoost token expandedToken =
110 |     if expandedToken == token then
111 |         1
112 |
113 |     else
114 |         -- Natural log keeps the factor below 1 (1 / ln 3 is about 0.91); a base 10
115 |         -- log gives about 2.1 here, ranking partial matches above exact ones.
116 |         1
117 |             / logBase e
118 |                 (toFloat
119 |                     (max 3
120 |                         (String.length expandedToken - String.length token)
121 |                     )
122 |                 )
123 |
124 |
125 | {-| Fold step: score the document `ref` against the query vector and prepend ( ref, score ) to the list.
126 | -}
127 | scoreAndCompare :
128 |     SparseVector
129 |     -> String
130 |     -> ( Index doc, List ( String, Float ) )
131 |     -> ( Index doc, List ( String, Float ) )
132 | scoreAndCompare queryVector ref ( index, docs ) =
133 |     getDocVector index ref
134 |         |> Tuple.mapSecond
135 |             (\docVector ->
136 |                 -- the score is the cosine similarity of query and document vectors
137 |                 ( ref, SparseVector.cosineSimilarity queryVector docVector )
138 |                     :: docs
139 |             )
140 |
141 |
142 | {-| Build the vector for docRef from the tokens recorded for it in the document store.
143 | -}
144 | getDocVector : Index doc -> String -> ( Index doc, SparseVector )
145 | getDocVector ((Index irec) as index) docRef =
146 |     case Dict.get docRef irec.documentStore of
147 |         Nothing ->
148 |             -- unknown document reference: empty vector, index unchanged
149 |             ( index, SparseVector.empty )
150 |
151 |         Just tokenSet ->
152 |             Set.toList tokenSet
153 |                 |> List.foldr (updateDocVector docRef)
154 |                     ( index, SparseVector.empty )
155 |
156 |
157 | {-| Fold step adding the weight of `token` to the vector of `docRef`; a token lacking a corpus position or a term frequency for docRef leaves the input unchanged.
158 | -}
159 | updateDocVector : String -> String -> ( Index doc, SparseVector ) -> ( Index doc, SparseVector )
160 | updateDocVector docRef token (( (Index irec) as index, docVector ) as inputTuple) =
161 |     let
162 |         termFrequencyInDoc =
163 |             Trie.get token irec.tokenStore
164 |                 |> Maybe.andThen (Dict.get docRef)
165 |     in
166 |     case ( Dict.get token irec.corpusTokensIndex, termFrequencyInDoc ) of
167 |         ( Just position, Just termFrequency ) ->
168 |             Index.Utils.idf index token
169 |                 |> Tuple.mapSecond
170 |                     (\idfScore -> SparseVector.insert position (termFrequency * idfScore) docVector)
171 |
172 |         _ ->
173 |             inputTuple
174 |
--------------------------------------------------------------------------------
/src/StopWordFilter.elm:
--------------------------------------------------------------------------------
1 | module StopWordFilter exposing
2 | ( createDefaultFilterFunc
3 | , stopEnglishWordList
4 | , createFilterFuncWith
5 | , createFilterFunc
6 | )
7 |
8 | {-| StopWordFilter is an English language stop word list filter, any words
9 | contained in the list are not stored in the index.
10 |
11 | This is intended to be used in the ElmTextSearch token processing pipeline.
12 |
13 |
14 | ### Things to know about stop word lists.
15 |
16 | - Words in document are split on white space to create tokens.
17 |   - Tokens have non word characters removed from prefix and suffix to improve matching filters.
18 | - Input tokens to create stop word filters should be full words.
19 | - It is more efficient to merge all your stop words into a single
20 | stop word filter.
21 |
22 |
23 | ## create default stop word filter func
24 |
25 | @docs createDefaultFilterFunc
26 |
27 |
28 | ## A default stop word english filter list
29 |
30 | @docs stopEnglishWordList
31 |
32 |
33 | ## Create a custom stop word filter list
34 |
35 | @docs createFilterFuncWith
36 | @docs createFilterFunc
37 |
38 | Copyright (c) 2016 Robin Luiten
39 |
40 | -}
41 |
42 | import Index.Model exposing (FilterFactory)
43 | import Set
44 |
45 |
46 | {-| Default English stop word list used to create the default filter.
47 | -}
48 | stopEnglishWordList : List String
49 | stopEnglishWordList =
50 | [ "a"
51 | , "able"
52 | , "about"
53 | , "across"
54 | , "after"
55 | , "all"
56 | , "almost"
57 | , "also"
58 | , "am"
59 | , "among"
60 | , "an"
61 | , "and"
62 | , "any"
63 | , "are"
64 | , "as"
65 | , "at"
66 | , "be"
67 | , "because"
68 | , "been"
69 | , "but"
70 | , "by"
71 | , "can"
72 | , "cannot"
73 | , "could"
74 | , "dear"
75 | , "did"
76 | , "do"
77 | , "does"
78 | , "either"
79 | , "else"
80 | , "ever"
81 | , "every"
82 | , "for"
83 | , "from"
84 | , "get"
85 | , "got"
86 | , "had"
87 | , "has"
88 | , "have"
89 | , "he"
90 | , "her"
91 | , "hers"
92 | , "him"
93 | , "his"
94 | , "how"
95 | , "however"
96 | , "i"
97 | , "if"
98 | , "in"
99 | , "into"
100 | , "is"
101 | , "it"
102 | , "its"
103 | , "just"
104 | , "least"
105 | , "let"
106 | , "like"
107 | , "likely"
108 | , "may"
109 | , "me"
110 | , "might"
111 | , "most"
112 | , "must"
113 | , "my"
114 | , "neither"
115 | , "no"
116 | , "nor"
117 | , "not"
118 | , "of"
119 | , "off"
120 | , "often"
121 | , "on"
122 | , "only"
123 | , "or"
124 | , "other"
125 | , "our"
126 | , "own"
127 | , "rather"
128 | , "said"
129 | , "say"
130 | , "says"
131 | , "she"
132 | , "should"
133 | , "since"
134 | , "so"
135 | , "some"
136 | , "than"
137 | , "that"
138 | , "the"
139 | , "their"
140 | , "them"
141 | , "then"
142 | , "there"
143 | , "these"
144 | , "they"
145 | , "this"
146 | , "tis"
147 | , "to"
148 | , "too"
149 | , "twas"
150 | , "us"
151 | , "wants"
152 | , "was"
153 | , "we"
154 | , "were"
155 | , "what"
156 | , "when"
157 | , "where"
158 | , "which"
159 | , "while"
160 | , "who"
161 | , "whom"
162 | , "why"
163 | , "will"
164 | , "with"
165 | , "would"
166 | , "yet"
167 | , "you"
168 | , "your"
169 | ]
170 |
171 |
172 | {-| Filter factory built from `stopEnglishWordList`, suitable for ElmTextSearch.
173 | -}
174 | createDefaultFilterFunc : FilterFactory doc
175 | createDefaultFilterFunc =
176 |     createFilterFunc stopEnglishWordList
177 |
178 |
179 | {-| Create a stop word list filter suitable for ElmTextSearch; this version
180 | extends the default word list with the extra words provided.
181 | -}
182 | createFilterFuncWith : List String -> FilterFactory doc
183 | createFilterFuncWith extraWords =
184 |     createFilterFunc (extraWords ++ stopEnglishWordList)
185 |
186 |
187 | {-| Create stop word filter for provided list of tokens suitable for ElmTextSearch.
188 |
189 | \*\* This creates a stop word filter purely from your own word list, understand
190 | what you are doing and consequences if you use this. \*\*
191 |
192 | The FilterFunc created returns True to allow words into index,
193 | so words found in the stop word list return False.
194 |
195 | -}
196 | createFilterFunc : List String -> FilterFactory doc
197 | createFilterFunc tokens index =
198 |     Set.fromList tokens
199 |         |> (\stopWords ->
200 |                 -- the Set is built once and shared by every lookup
201 |                 ( index, \word -> not (Set.member word stopWords) )
202 |            )
203 |
--------------------------------------------------------------------------------
/src/TokenProcessors.elm:
--------------------------------------------------------------------------------
1 | module TokenProcessors exposing
2 | ( tokenizer
3 | , tokenizerList
4 | , tokenizerWith
5 | , tokenizerWithRegex
6 | , tokenizerWithRegexList
7 | , trimmer
8 | , tokenizerWithList
9 | )
10 |
11 | {-| TokenProcessors for strings.
12 |
13 |
14 | ## Create a tokenizer
15 |
16 | @docs tokenizer
17 | @docs tokenizerList
18 | @docs tokenizerWith
19 | @docs tokenizerWithRegex
20 | @docs tokenizerWithRegexList, tokenizerWithList
21 |
22 |
23 | ## Word transformer
24 |
25 | @docs trimmer
26 |
27 | Copyright (c) 2016 Robin Luiten
28 |
29 | -}
30 |
31 | import Regex
32 | exposing
33 | ( Regex
34 | -- , HowMany(..)
35 | , fromString
36 | , replace
37 | , split
38 | )
39 | import String exposing (toLower, trim)
40 |
41 |
42 | forceRegex : String -> Regex
43 | forceRegex pattern =
44 |     fromString pattern |> Maybe.withDefault Regex.never -- an invalid pattern falls back to Regex.never
45 |
46 |
47 | defaultSeparator : Regex
48 | defaultSeparator =
49 |     forceRegex "[\\s\\-]+" -- one or more whitespace or hyphen characters
50 |
51 |
52 | {-| Tokenize a String.
53 | Will not return any empty string tokens.
54 | Splits on runs of whitespace and hyphens.
55 | -}
56 | tokenizer : String -> List String
57 | tokenizer data =
58 |     tokenizerWithRegex defaultSeparator data
59 |
60 |
61 | {-| Tokenize a List String.
62 | Will not return any empty string tokens.
63 | Splits on runs of whitespace and hyphens.
64 | -}
65 | tokenizerList : List String -> List String
66 | tokenizerList listData =
67 |     tokenizerWithRegexList defaultSeparator listData
68 |
69 |
70 | {-| Tokenize a string.
71 | Will not return any empty string tokens.
72 | Supply your own regex for splitting the string.
73 | -}
74 | tokenizerWithRegex : Regex -> String -> List String
75 | tokenizerWithRegex seperatorRegex data =
76 |     -- trim and lower-case the whole input before splitting it
77 |     data
78 |         |> trim
79 |         |> toLower
80 |         |> split seperatorRegex
81 |         -- splitting can leave empty pieces behind
82 |         |> List.filter (not << String.isEmpty)
83 |
84 |
85 | {-| Tokenize a List String.
86 | Will not return any empty string tokens.
87 | Supply your own regex for splitting the strings.
88 |
89 | Tokens of later list entries come before tokens of earlier entries.
90 | -}
91 | tokenizerWithRegexList : Regex -> List String -> List String
92 | tokenizerWithRegexList seperatorRegex listData =
93 |     let
94 |         splitter =
95 |             split seperatorRegex << toLower << trim
96 |     in
97 |     -- Entries are reversed up front so the output keeps its established
98 |     -- order (tokens of the last entry first). concatMap then visits each
99 |     -- entry once instead of re-copying the accumulated tokens per entry.
100 |     listData
101 |         |> List.reverse
102 |         |> List.concatMap splitter
103 |         |> List.filter (\token -> String.length token > 0)
104 |
105 |
106 | {-| Tokenize a String.
107 | Will not return any empty string tokens.
108 | The supplied String is turned into the regex used for splitting the string.
109 | -}
110 | tokenizerWith : String -> String -> List String
111 | tokenizerWith seperatorPattern data =
112 |     tokenizerWithRegex (forceRegex seperatorPattern) data
113 |
114 |
115 | {-| Tokenize a List String.
116 | Will not return any empty string tokens.
117 | The supplied String is turned into the regex used for splitting the strings.
118 | -}
119 | tokenizerWithList : String -> List String -> List String
120 | tokenizerWithList seperatorPattern listData =
121 |     tokenizerWithRegexList (forceRegex seperatorPattern) listData
122 |
123 |
124 | trimmerRegex =
125 |     forceRegex "^\\W+|\\W+$" -- a run of non word characters at the start or at the end
126 |
127 |
128 | {-| Strip non word characters from the start and the end of a token.
129 | -}
130 | trimmer : String -> String
131 | trimmer token =
132 |     replace trimmerRegex (always "") token
133 |
--------------------------------------------------------------------------------
/src/Utils.elm:
--------------------------------------------------------------------------------
1 | module Utils exposing (intersectSets)
2 |
3 | {-| Some misc utils
4 |
5 | @docs intersectSets
6 |
7 | Copyright (c) 2016 Robin Luiten
8 |
9 | -}
10 |
11 | import Set exposing (Set)
12 |
13 |
14 | {-| Return the intersection of a list of sets; an empty list gives the empty set.
15 | -}
16 | intersectSets : List (Set String) -> Set String
17 | intersectSets sets =
18 |     case sets of
19 |         first :: rest ->
20 |             List.foldr Set.intersect first rest
21 |
22 |         [] ->
23 |             Set.empty
24 |
--------------------------------------------------------------------------------
/tests/DefaultTests.elm:
--------------------------------------------------------------------------------
1 | module DefaultTests exposing (testDefaultIndexType)
2 |
3 | import Expect
4 | import Index.Defaults
5 | import Test exposing (..)
6 |
7 |
8 | testDefaultIndexType : Test
9 | testDefaultIndexType =
10 |     let
11 |         -- minimal config; only the resulting indexType is inspected
12 |         simpleConfig =
13 |             { ref = .cid
14 |             , fields = [ ( .title, 5.0 ) ]
15 |             , listFields = [ ( .body, 1.0 ) ]
16 |             }
17 |     in
18 |     test "Check Index Type" <|
19 |         \() ->
20 |             Index.Defaults.getIndexSimpleConfig simpleConfig
21 |                 |> .indexType
22 |                 |> Expect.equal "-= ElmTextSearch Index Type 1 =-"
23 |
--------------------------------------------------------------------------------
/tests/ElmTextSearchTests.elm:
--------------------------------------------------------------------------------
1 | module ElmTextSearchTests exposing (..)
2 |
3 | import ElmTextSearch
4 | import ElmTextSearchErrors exposing (AddError(..), RemoveError(..), SearchError(..))
5 | import Expect
6 | import Test exposing (..)
7 |
8 |
9 | type alias MyDoc =
10 | { cid : String
11 | , title : String
12 | , author : String
13 | , body : String
14 | }
15 |
16 |
17 | doc1_ : MyDoc
18 | doc1_ =
19 | { cid = "doc1"
20 | , title = "Examples of a Banana"
21 | , author = "Sally Apples"
22 | , body = "Sally writes words about a grown banana."
23 | }
24 |
25 |
26 | getEmptyIndex : () -> ElmTextSearch.Index MyDoc
27 | getEmptyIndex _ =
28 | ElmTextSearch.new
29 | { ref = .cid
30 | , fields = [ ( .title, 5 ), ( .body, 1 ) ]
31 | , listFields = []
32 | }
33 |
34 |
35 | {-| Searching an empty index must fail with `IndexIsEmpty`.
36 | 
37 | Matching on that constructor also proves callers can case on the error type.
38 | 
39 | -}
40 | test_searchT_CanUseErrorResultConstructors : Test
41 | test_searchT_CanUseErrorResultConstructors =
42 |     test "If can case on error result" <|
43 |         \() ->
44 |             let
45 |                 failedWithIndexIsEmpty =
46 |                     case ElmTextSearch.searchT "hello" (getEmptyIndex ()) of
47 |                         Err IndexIsEmpty ->
48 |                             True
49 | 
50 |                         _ ->
51 |                             False
52 |             in
53 |             failedWithIndexIsEmpty
54 |                 |> Expect.equal True
55 |                 |> Expect.onFail "Result should be an error"
56 |
--------------------------------------------------------------------------------
/tests/IndexDecoderTests.elm:
--------------------------------------------------------------------------------
1 | module IndexDecoderTests exposing (decodeAndEncodeRoundTripSameTest)
2 |
3 | import ElmTextSearch.Json.Decoder as IndexDecoder
4 | import ElmTextSearch.Json.Encoder as IndexEncoder
5 | import Expect
6 | import Json.Decode as Decode
7 | import Json.Encode as Encode
8 | import Test exposing (..)
9 | import TestUtils
10 |
11 | -- Decode a JSON index string, then encode the decoded value straight back
12 | -- to JSON text with indent 0 (despite the name, decoding happens first).
13 | encodeAndDecodeHelper : String -> String
14 | encodeAndDecodeHelper =
15 |     Decode.decodeString IndexDecoder.decoder
16 |         >> TestUtils.getResultIgnoreError
17 |         >> IndexEncoder.codecIndexRecordEncoder
18 |         >> Encode.encode 0
19 |
20 |
21 | {-| From the Elm core Dict documentation (NOTE(review): the source link is missing here, confirm origin).
22 | QUOTE: "Dictionary equality with (==) is unreliable and should not be used."
23 |
24 | Therefore decode, then encode back to a string, and check the result is the same as the input.
25 |
26 | -}
27 | decodeAndEncodeRoundTripSameTest : Test
28 | decodeAndEncodeRoundTripSameTest =
29 | let
30 | -- adapted from the encoder tests; indexVersion here is "1.0.0" while the encoder tests fixture uses "1.1.0"
31 | encodedIndex =
32 | String.concat
33 | [ "{\"indexVersion\":\"1.0.0\",\"indexType\":\"- IndexTest Type -\","
34 | , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
35 | , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
36 | , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
37 | , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
38 | , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
39 | , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
40 | , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
41 | , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
42 | ]
43 | in
44 | test "decode then encode ensure same" <|
45 | \() ->
46 | encodedIndex
47 | |> encodeAndDecodeHelper
48 | |> Expect.equal encodedIndex
49 |
--------------------------------------------------------------------------------
/tests/IndexEncoderTests.elm:
--------------------------------------------------------------------------------
1 | module IndexEncoderTests exposing (testEncodeList, testEncoder)
2 |
3 | import ElmTextSearch.Json.Encoder as IndexEncoder
4 | import Expect
5 | import Index
6 | import Index.Model exposing (..)
7 | import Json.Encode as Encode
8 | import Test exposing (..)
9 | import TestUtils
10 |
11 |
12 | encodedIndex : String
13 | encodedIndex =
14 | String.concat
15 | [ "{\"indexVersion\":\"1.1.0\",\"indexType\":\"- IndexTest Type -\","
16 | , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
17 | , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
18 | , "\"tokenStore\":{"
19 | , "\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
20 | , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
21 | , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
22 | , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
23 | , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
24 | , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
25 | ]
26 |
27 |
28 | testEncoder : Test
29 | testEncoder =
30 | test "Encode index with doc matches encodedIndex" <|
31 | \() ->
32 | Index.new
33 | { indexType = "- IndexTest Type -"
34 | , ref = .cid
35 | , fields = [ ( .title, 5 ), ( .body, 1 ) ]
36 | , listFields = []
37 | }
38 | |> Index.add
39 | { cid = "doc1"
40 | , title = "Examples of a Banana"
41 | , author = "Sally Apples"
42 | , body = "Sally writes words about a grown banana."
43 | }
44 | |> TestUtils.getResultIgnoreError
45 | |> IndexEncoder.encoder
46 | |> Encode.encode 0
47 | |> Expect.equal
48 | encodedIndex
49 |
50 |
51 | testEncodeList : Test
52 | testEncodeList =
53 | test "Encode index with doc matches encodedIndex using listFields" <|
54 | \() ->
55 | Index.new
56 | { indexType = "- IndexTest Type -"
57 | , ref = .cid
58 | , fields = [ ( .title, 5 ) ]
59 | , listFields = [ ( .body, 1 ) ]
60 | }
61 | |> Index.add
62 | { cid = "doc1"
63 | , title = "Examples of a Banana"
64 | , author = "Sally Apples"
65 | , body =
66 | [ "Sally writes words "
67 | , "about a grown banana."
68 | ]
69 | }
70 | |> TestUtils.getResultIgnoreError
71 | |> IndexEncoder.encoder
72 | |> Encode.encode 0
73 | |> Expect.equal
74 | encodedIndex
75 |
--------------------------------------------------------------------------------
/tests/IndexLoadTests.elm:
--------------------------------------------------------------------------------
1 | module IndexLoadTests exposing
2 | ( indexfromString1Test
3 | , loadIndexWith1Test
4 | , loadIndexWithErr1Test
5 | , loadIndexWithErr2Test
6 | )
7 |
8 | import ElmTextSearch
9 | import Expect
10 | import Index.Load
11 | import Index.Model exposing (Index(..))
12 | import Json.Decode exposing (Error(..))
13 | import Test exposing (..)
14 | import TestUtils
15 |
16 |
17 | loadIndexWithErr1Test : Test
18 | loadIndexWithErr1Test =
19 | test "Fails to load an index with wrong index version" <|
20 | \() ->
21 | String.concat
22 | [ "{\"indexVersion\":\"1.0.1\",\"indexType\":\"- IndexTest Type -\","
23 | , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
24 | , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
25 | , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
26 | , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
27 | , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
28 | , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
29 | , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
30 | , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
31 | ]
32 | |> Index.Load.loadIndexWith
33 | [ { indexType = "_______some string"
34 | , ref = .cid
35 | , fields = [ ( .title, 5 ), ( .body, 1 ) ]
36 | , listFields = []
37 | , initialTransformFactories = []
38 | , transformFactories = []
39 | , filterFactories = []
40 | }
41 | ]
42 | |> TestUtils.getErrorIgnoreResult
43 | |> TestUtils.getDecodeErrorFailureMessage
44 | |> Expect.equal "Error cannot load Index. Version supported is 1.1.0. Version tried to load is 1.0.1."
45 |
46 |
47 | loadIndexWithErr2Test : Test
48 | loadIndexWithErr2Test =
49 | test "Fails to load an index with an indexType not in configuration provided." <|
50 | \() ->
51 | String.concat
52 | [ "{\"indexVersion\":\"1.1.0\",\"indexType\":\"__IndexTest Type -\","
53 | , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
54 | , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
55 | , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
56 | , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
57 | , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
58 | , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
59 | , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
60 | , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
61 | ]
62 | |> Index.Load.loadIndexWith
63 | [ { indexType = "_______some string not matching the encoded index type"
64 | , ref = .cid
65 | , fields = [ ( .title, 5 ), ( .body, 1 ) ]
66 | , listFields = []
67 | , initialTransformFactories = []
68 | , transformFactories = []
69 | , filterFactories = []
70 | }
71 | ]
72 | |> TestUtils.getErrorIgnoreResult
73 | |> TestUtils.getDecodeErrorFailureMessage
74 | |> Expect.equal "Error cannot load Index. Tried to load index of type \"__IndexTest Type -\". It is not in supported index configurations."
75 |
76 |
77 | loadIndexWith1Test : Test
78 | loadIndexWith1Test =
79 | let
80 | config =
81 | { indexType = "not set"
82 | , ref = .cid
83 | , fields = [ ( .title, 5 ), ( .body, 1 ) ]
84 | , listFields = []
85 | , initialTransformFactories = []
86 | , transformFactories = []
87 | , filterFactories = []
88 | }
89 | in
90 | test "Load an index. really dumb check" <|
91 | \() ->
92 | String.concat
93 | [ "{\"indexVersion\":\"1.1.0\",\"indexType\":\"_______some string\","
94 | , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
95 | , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
96 | , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
97 | , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
98 | , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
99 | , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
100 | , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
101 | , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
102 | ]
103 | |> Index.Load.loadIndexWith
104 | [ config
105 | , { config | indexType = "_______some string" }
106 | ]
107 | |> TestUtils.expectOkWithGoodFailMessage
108 |
109 |
110 | indexfromString1Test : Test
111 | indexfromString1Test =
112 | test "Can succesfully load index from string with ElmTextSearch.SimpleConfig." <|
113 | \() ->
114 | String.concat
115 | [ "{\"indexVersion\":\"1.1.0\",\"indexType\":\"-= ElmTextSearch Index Type 1 =-\","
116 | , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
117 | , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
118 | , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
119 | , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
120 | , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
121 | , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
122 | , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
123 | , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
124 | ]
125 | |> ElmTextSearch.fromString
126 | { ref = .cid
127 | , fields =
128 | [ ( .title, 5 )
129 | , ( .body, 1 )
130 | ]
131 | , listFields = []
132 | }
133 | |> TestUtils.expectOkWithGoodFailMessage
134 |
--------------------------------------------------------------------------------
/tests/IndexTests.elm:
--------------------------------------------------------------------------------
1 | module IndexTests exposing
2 | ( addDocAlreadyInIndexReturnsError
3 | , addDocWithEmptyIdFieldReturnsError
4 | , addDocWithIndexFieldsEmptyReturnsError
5 | , addDocumentWithSameIdAsExistingReturnsError
6 | , addMultipleDocsReturnsErrorListForProblems
7 | , addOrUpdateDocNotInIndexReturnsSuccess
8 | , addOrUpdateDocWithSameIdReturnsSuccess
9 | , idfCacheIsClearedAfterASuccessfulAdd
10 | , idfCacheIsClearedAfterSuccessfulRemove
11 | , removeDocRefNotIndexReturnsError
12 | , removeDocWithEmptyIdFieldReturnsError
13 | , removeDoesNotBreakSearchResults
14 | , removeOnlyDocIndexReturnsIsEmpty
15 | , searchCasesTest
16 | , searchEmptyIndexReturnsError
17 | , searchIndexAfter2DocRemovedErrors
18 | , searchIndexAfterDocRemovedErrors
19 | , searchListFieldsSingleLetterWithLetterInBody
20 | , searchSingleLetterWithLetterInTitles
21 | , searchUsingEmptyQueryReturnsError
22 | , searchUsingQueryWithOnlyStopWordsWhichMeansEmptyReturnsError
23 | , searchWithOnlyListFieldsIndexReturnsValidScores
24 | , updateDocNotInIndexReturnsError
25 | , updateDocUsesNewDocContent
26 | )
27 |
28 | import Dict
29 | import ElmTextSearch.Json.Encoder as IndexEncoder
30 | import Expect
31 | import Index
32 | import Index.Model exposing (Index(..))
33 | import Json.Encode as Encode
34 | import Test exposing (..)
35 | import TestUtils
36 | import Trie
37 |
38 |
39 | {-| example record type for tests
40 | -}
41 | type alias MyDoc =
42 | { cid : String
43 | , title : String
44 | , author : String
45 | , body : String
46 | }
47 |
48 |
49 | type alias MyDoc2 =
50 | { cid : String
51 | , title : String
52 | , author : String
53 | , body : List String
54 | }
55 |
56 |
57 | doc1_ : MyDoc
58 | doc1_ =
59 | { cid = "doc1"
60 | , title = "Examples of a Banana"
61 | , author = "Sally Apples"
62 | , body = "Sally writes words about a grown banana."
63 | }
64 |
65 |
66 | doc2_ : MyDoc
67 | doc2_ =
68 | { cid = "doc2"
69 | , title = "Grown Bananas and there appeal"
70 | , author = "John Banana"
71 | , body = "An example of apple engineering."
72 | }
73 |
74 |
75 | doc3_ : MyDoc
76 | doc3_ =
77 | { cid = "doc3"
78 | , title = "Kites and Trees a tail of misery"
79 | , author = "Adam Winddriven"
80 | , body = "When a flyer meets an Elm it maybe a problem."
81 | }
82 |
83 | -- Document whose title and body are both empty strings; only cid and author carry text.
84 | doc4_indexFieldsEmpty : MyDoc
85 | doc4_indexFieldsEmpty =
86 |     { cid = "doc4"
87 |     , title = ""
88 |     , author = "Some Author"
89 |     , body = ""
90 |     }
91 |
92 |
93 | doc5_idEmpty : MyDoc
94 | doc5_idEmpty =
95 | { cid = ""
96 | , title = "Empty Reference Title"
97 | , author = "Some Author"
98 | , body = "Empty Reference Body"
99 | }
100 |
101 |
102 | type alias SearchCaseRecord =
103 | { name : String
104 | , input : String
105 | , expect : List String
106 | , indexResult : Index MyDoc
107 | }
108 |
109 |
110 | searchCasesTest : Test
111 | searchCasesTest =
112 | describe "Index search tests"
113 | (List.map searchTestCase
114 | [ { name = "two docs one with term in title first and body second"
115 | , input = "example"
116 | , expect = [ "doc1", "doc2" ]
117 | , indexResult = getIndexDoc1Doc2 ()
118 | }
119 | , { name = "two docs one with term in title first"
120 | , input = "grown"
121 | , expect = [ "doc2", "doc1" ]
122 | , indexResult = getIndexDoc1Doc2 ()
123 | }
124 | , { name = "neither document contains both words so return nothing"
125 | , input = "-misery! .appeal,"
126 | , expect = []
127 | , indexResult = getIndexDoc1Doc2 ()
128 | }
129 | , { name = "with doc3 returns no docs with both words"
130 | , input = "-misery! .appeal,"
131 | , expect = []
132 | , indexResult = getIndexDoc1Doc2Doc3 ()
133 | }
134 | , { name = "returns doc1 and doc2 e expands to example and engineer which exist in both documents."
135 | , input = "e"
136 | , expect = [ "doc1", "doc2" ]
137 | , indexResult = getIndexDoc1Doc2 ()
138 | }
139 | , { name = "search \"ex\" returns doc1, doc2 as both contain example."
140 | , input = "ex"
141 | , expect = [ "doc1", "doc2" ]
142 | , indexResult = getIndexDoc1Doc2 ()
143 | }
144 | , { name = "search \"en\" returns doc2 as it contains engineering."
145 | , input = "en"
146 | , expect = [ "doc2" ]
147 | , indexResult = getIndexDoc1Doc2 ()
148 | }
149 | ]
150 | )
151 |
152 |
153 | searchTestCase : SearchCaseRecord -> Test
154 | searchTestCase testCase =
155 |     test (String.concat [ "search \"", testCase.input, "\" ", testCase.name ]) <|
156 |         \() ->
157 |             case Index.search testCase.input testCase.indexResult of
158 |                 Err message ->
159 |                     Expect.equal testCase.expect [ message ]
160 | 
161 |                 Ok ( _, hits ) ->
162 |                     List.map Tuple.first hits
163 |                         |> Expect.equal testCase.expect
164 |
165 |
166 | getEmptyIndexMyDoc2IndexOnlyListFields : () -> Index.Index MyDoc2
167 | getEmptyIndexMyDoc2IndexOnlyListFields _ =
168 | Index.new
169 | { indexType = "- IndexTest Type -"
170 | , ref = .cid
171 | , fields = []
172 | , listFields =
173 | [ ( .body, 1 )
174 | ]
175 | }
176 |
177 |
178 | getEmptyIndexMyDoc2 : () -> Index.Index MyDoc2
179 | getEmptyIndexMyDoc2 _ =
180 | Index.new
181 | { indexType = "- IndexTest Type -"
182 | , ref = .cid
183 | , fields = [ ( .title, 5 ) ]
184 | , listFields = [ ( .body, 1 ) ]
185 | }
186 |
187 |
188 | getEmptyIndex : () -> Index.Index MyDoc
189 | getEmptyIndex _ =
190 | Index.new
191 | { indexType = "- IndexTest Type -"
192 | , ref = .cid
193 | , fields = [ ( .title, 5 ), ( .body, 1 ) ]
194 | , listFields = []
195 | }
196 |
197 |
198 | getIndexDoc1 : () -> Index.Index MyDoc
199 | getIndexDoc1 _ =
200 | getEmptyIndex ()
201 | |> Index.add doc1_
202 | |> TestUtils.getResultIgnoreError
203 |
204 |
205 | getIndexDoc1Doc2 : () -> Index.Index MyDoc
206 | getIndexDoc1Doc2 _ =
207 | getIndexDoc1 ()
208 | |> Index.add doc2_
209 | |> TestUtils.getResultIgnoreError
210 |
211 |
212 | getIndexDoc1Doc2Doc3 : () -> Index.Index MyDoc
213 | getIndexDoc1Doc2Doc3 _ =
214 | getIndexDoc1Doc2 ()
215 | |> Index.add doc3_
216 | |> TestUtils.getResultIgnoreError
217 |
218 |
219 | searchUsingEmptyQueryReturnsError : Test
220 | searchUsingEmptyQueryReturnsError =
221 | test "empty query returns Err" <|
222 | \() ->
223 | getIndexDoc1Doc2 ()
224 | |> Index.search ""
225 | |> Expect.equal (Err "Error query is empty.")
226 |
227 |
228 | searchUsingQueryWithOnlyStopWordsWhichMeansEmptyReturnsError : Test
229 | searchUsingQueryWithOnlyStopWordsWhichMeansEmptyReturnsError =
230 | test "query full of stop words (filtered out words) returns Err" <|
231 | \() ->
232 | getIndexDoc1Doc2 ()
233 | |> Index.search "if and but "
234 | |> Expect.equal (Err "Error after tokenisation there are no terms to search for.")
235 |
236 |
237 | searchEmptyIndexReturnsError : Test
238 | searchEmptyIndexReturnsError =
239 | test "no document returns Err" <|
240 | \() ->
241 | Index.search "hello world"
242 | (getEmptyIndex ())
243 | |> Expect.equal (Err "Error there are no documents in index to search.")
244 |
245 |
246 | idfCacheIsClearedAfterSuccessfulRemove : Test
247 | idfCacheIsClearedAfterSuccessfulRemove =
248 | test "idfCache is cleared after a successful remove document." <|
249 | \() ->
250 | getIndexDoc1Doc2 ()
251 | |> Index.search "banana"
252 | |> TestUtils.getResultIgnoreError
253 | |> Tuple.first
254 | |> Index.remove doc1_
255 | |> TestUtils.getResultIgnoreError
256 | |> getIdfCache
257 | |> Dict.isEmpty
258 | |> Expect.equal True
259 | |> Expect.onFail "IdfCache should be cleared after document remove"
260 |
261 |
262 |
263 | -- |> Expect.pass |> Expect.onFail "IdfCache should be cleared after document remove"
264 |
265 | -- Search first, then add a document, and expect the resulting index to have an empty idfCache.
266 | idfCacheIsClearedAfterASuccessfulAdd : Test
267 | idfCacheIsClearedAfterASuccessfulAdd =
268 |     test "idfCache is cleared after a successful add document." <|
269 |         \() ->
270 |             getIndexDoc1Doc2 ()
271 |                 |> Index.search "banana"
272 |                 |> TestUtils.getResultIgnoreError
273 |                 |> Tuple.first
274 |                 |> Index.add doc3_
275 |                 |> TestUtils.getResultIgnoreError
276 |                 |> getIdfCache
277 |                 |> Dict.isEmpty
278 |                 |> Expect.equal True
279 |                 |> Expect.onFail "IdfCache should be cleared after document add"
280 |
281 |
282 | addDocWithIndexFieldsEmptyReturnsError : Test
283 | addDocWithIndexFieldsEmptyReturnsError =
284 | test "Add a doc which has all index fields empty returns Err" <|
285 | \() ->
286 | getEmptyIndex ()
287 | |> Index.add doc4_indexFieldsEmpty
288 | |> TestUtils.getErrorIgnoreResult
289 | |> Expect.equal "Error after tokenisation there are no terms to index."
290 |
291 |
292 | addDocWithEmptyIdFieldReturnsError : Test
293 | addDocWithEmptyIdFieldReturnsError =
294 | test "Add a doc empty ID field returns Err" <|
295 | \() ->
296 | getEmptyIndex ()
297 | |> Index.add doc5_idEmpty
298 | |> Expect.equal (Err "Error document has an empty unique id (ref).")
299 |
300 |
301 | addDocAlreadyInIndexReturnsError : Test
302 | addDocAlreadyInIndexReturnsError =
303 | test "Add a doc allready in index returns Err" <|
304 | \() ->
305 | getIndexDoc1Doc2Doc3 ()
306 | |> Index.add doc1_
307 | |> TestUtils.getErrorIgnoreResult
308 | |> Expect.equal "Error adding document that allready exists."
309 |
310 | -- Unwrap the Index constructor and return the idfCache field of its record.
311 | getIdfCache : Index doc -> Dict.Dict String Float
312 | getIdfCache (Index irec) =
313 | irec.idfCache
314 |
315 |
316 | removeDocRefNotIndexReturnsError : Test
317 | removeDocRefNotIndexReturnsError =
318 | test "Remove a doc ref not in index returns Err." <|
319 | \() ->
320 | getIndexDoc1Doc2 ()
321 | |> Index.remove doc3_
322 | |> TestUtils.getErrorIgnoreResult
323 | |> Expect.equal "Error document is not in index."
324 |
325 |
326 | removeDocWithEmptyIdFieldReturnsError : Test
327 | removeDocWithEmptyIdFieldReturnsError =
328 | test "Remove a doc with empty id field is an error." <|
329 | \() ->
330 | getEmptyIndex ()
331 | |> Index.remove doc5_idEmpty
332 | |> Expect.equal (Err "Error document has an empty unique id (ref).")
333 |
334 |
335 | searchIndexAfterDocRemovedErrors : Test
336 | searchIndexAfterDocRemovedErrors =
337 | test "Search index where 1 doc from index was removed fails" <|
338 | \() ->
339 | getIndexDoc1 ()
340 | |> Index.remove doc1_
341 | |> TestUtils.getResultIgnoreError
342 | |> Index.search "Sally"
343 | |> TestUtils.getErrorIgnoreResult
344 | |> Expect.equal "Error there are no documents in index to search."
345 |
346 |
347 | searchIndexAfter2DocRemovedErrors : Test
348 | searchIndexAfter2DocRemovedErrors =
349 | test "Search Index where 2 docs from index removed fails" <|
350 | \() ->
351 | getIndexDoc1Doc2 ()
352 | |> Index.remove doc1_
353 | |> TestUtils.getResultIgnoreError
354 | |> Index.remove doc2_
355 | |> TestUtils.getResultIgnoreError
356 | |> Index.search "Sally"
357 | |> TestUtils.getErrorIgnoreResult
358 | |> Expect.equal "Error there are no documents in index to search."
359 |
360 |
361 | removeDoesNotBreakSearchResults : Test
362 | removeDoesNotBreakSearchResults =
363 | test "Remove does not break searching" <|
364 | \() ->
365 | getIndexDoc1Doc2 ()
366 | |> Index.remove doc2_
367 | |> TestUtils.getResultIgnoreError
368 | |> Index.search "Sally"
369 | |> TestUtils.getResultIgnoreError
370 | |> Tuple.second
371 | |> List.map Tuple.first
372 | |> Expect.equal [ doc1_.cid ]
373 |
374 |
375 | {-| Verify that removing the only document drops it from the document store and leaves the token store trie empty.
376 | -}
377 | removeOnlyDocIndexReturnsIsEmpty : Test
378 | removeOnlyDocIndexReturnsIsEmpty =
379 | let
380 | testIndexU1 =
381 | getIndexDoc1 ()
382 | |> Index.remove doc1_
383 | |> TestUtils.getResultIgnoreError
384 |
385 | ( storeB, tokenStoreB ) =
386 | case testIndexU1 of
387 | Index { documentStore, tokenStore } ->
388 | ( documentStore, tokenStore )
389 | in
390 | describe "removing a doc"
391 | [ test "removes it from document store" <|
392 | \() ->
393 | Dict.member "doc1" storeB
394 | |> Expect.equal False
395 | |> Expect.onFail "oops its in document store"
396 | , test "removes trie nodes not leading to a reference. This is not testing trie, testing Index use of trie" <|
397 | \() ->
398 | Trie.isEmpty tokenStoreB
399 | |> Expect.equal True
400 | |> Expect.onFail "Trie model is not empty"
401 | ]
402 |
403 |
404 | addMultipleDocsReturnsErrorListForProblems : Test
405 | addMultipleDocsReturnsErrorListForProblems =
406 | describe "addAllDocs Tests" <|
407 | [ test "Add multiple docs returning list of docs with errors" <|
408 | \() ->
409 | getEmptyIndex ()
410 | |> Index.addDocs [ doc3_, doc4_indexFieldsEmpty ]
411 | |> Tuple.second
412 | |> Expect.equal [ ( 1, "Error after tokenisation there are no terms to index." ) ]
413 | , test "Add multiple docs returning list of errors swap order of documents." <|
414 | \() ->
415 | getEmptyIndex ()
416 | |> Index.addDocs [ doc4_indexFieldsEmpty, doc3_ ]
417 | |> Tuple.second
418 | |> Expect.equal [ ( 0, "Error after tokenisation there are no terms to index." ) ]
419 | ]
420 |
421 | -- Add docs to the index (the addDocs error list is discarded), search, and return the second element of the search result.
422 | helperAddDocsSearchIndexResults : String -> List doc -> Index doc -> List ( String, Float )
423 | helperAddDocsSearchIndexResults search docs index =
424 |     let
425 |         ( indexWithDocs, _ ) =
426 |             Index.addDocs docs index
427 |     in
428 |     Index.search search indexWithDocs
429 |         |> TestUtils.getResultIgnoreError
430 |         |> Tuple.second
431 |
432 |
433 | {-| Regression case (NOTE(review): the source link after "Case from" is missing; restore it if known).
434 | Two docs with titles Question1 and Question2:
435 | a "q" search was not returning both documents.
436 | -}
437 | searchSingleLetterWithLetterInTitles : Test
438 | searchSingleLetterWithLetterInTitles =
439 | test "search single letter reports both documents with word starting with that letter in title field" <|
440 | \() ->
441 | getEmptyIndex ()
442 | |> helperAddDocsSearchIndexResults "q"
443 | [ { cid = "qdoc1"
444 | , title = "Question1"
445 | , author = "Sally Apples"
446 | , body = "Sally writes words about a grown banana."
447 | }
448 | , { cid = "qdoc2"
449 | , title = "Question2"
450 | , author = "John Banana"
451 | , body = "An example of apple engineering."
452 | }
453 | ]
454 | |> List.map Tuple.first
455 | |> Expect.equal [ "qdoc1", "qdoc2" ]
456 |
457 |
458 | searchListFieldsSingleLetterWithLetterInBody : Test
459 | searchListFieldsSingleLetterWithLetterInBody =
460 | test "search finds words in list fields body of MyDoc2" <|
461 | \() ->
462 | getEmptyIndexMyDoc2 ()
463 | |> helperAddDocsSearchIndexResults "green"
464 | [ { cid = "qdoc1"
465 | , title = "Question1 Notgreen"
466 | , author = "Sally Apples"
467 | , body =
468 | [ "Sally writes words about "
469 | , "a grown green banana."
470 | ]
471 | }
472 | , { cid = "qdoc2"
473 | , title = "Question2 Purple"
474 | , author = "John Banana"
475 | , body =
476 | [ "An example of "
477 | , "green apple engineering."
478 | ]
479 | }
480 | ]
481 | |> List.map Tuple.first
482 | |> Expect.equal [ "qdoc2", "qdoc1" ]
483 |
484 |
485 | {-| Configure to have some data in listFields body, match in listFields body, index with fields set to []
486 | Reproduce a bug reported.
487 | -}
488 | searchWithOnlyListFieldsIndexReturnsValidScores : Test
489 | searchWithOnlyListFieldsIndexReturnsValidScores =
490 | test "search index with only List fields configured, check for NaN values in scores" <|
491 | \() ->
492 | getEmptyIndexMyDoc2IndexOnlyListFields ()
493 | |> helperAddDocsSearchIndexResults "green"
494 | [ { cid = "qdoc1"
495 | , title = "Question1 Notgreen"
496 | , author = "Sally Apples"
497 | , body =
498 | [ "Sally writes words about "
499 | , "a grown green banana."
500 | ]
501 | }
502 | , { cid = "qdoc2"
503 | , title = "Question2 Purple"
504 | , author = "John Banana"
505 | , body =
506 | [ "An example of "
507 | , "green apple engineering."
508 | ]
509 | }
510 | ]
511 | |> List.map Tuple.second
512 | |> List.any Basics.isNaN
513 | |> Expect.equal False
514 | |> Expect.onFail "Expect searchScores to not contain any NaN values"
515 |
516 |
517 | addDocumentWithSameIdAsExistingReturnsError : Test
518 | addDocumentWithSameIdAsExistingReturnsError =
519 | test "add same document to index produces error" <|
520 | \() ->
521 | getIndexDoc1 ()
522 | |> Index.add doc1_
523 | |> TestUtils.getErrorIgnoreResult
524 | |> Expect.equal "Error adding document that allready exists."
525 |
526 |
527 | addOrUpdateDocWithSameIdReturnsSuccess : Test
528 | addOrUpdateDocWithSameIdReturnsSuccess =
529 | test "addOrUpdate same document does not produce error" <|
530 | \() ->
531 | getIndexDoc1 ()
532 | |> Index.addOrUpdate doc1_
533 | |> TestUtils.isOk
534 | |> Expect.equal True
535 | |> Expect.onFail "Expect Ok result to addOrUpdate if doc in index"
536 |
537 |
538 | addOrUpdateDocNotInIndexReturnsSuccess : Test
539 | addOrUpdateDocNotInIndexReturnsSuccess =
540 | test "addOrUpdate document not in index updates index with new doc" <|
541 | \() ->
542 | getEmptyIndex ()
543 | |> Index.addOrUpdate doc1_
544 | |> TestUtils.isOk
545 | |> Expect.equal True
546 | |> Expect.onFail "Expect Ok result to addOrUpdate if doc is new"
547 |
548 |
549 | updateDocNotInIndexReturnsError : Test
550 | updateDocNotInIndexReturnsError =
551 | test "index update with a doc not in index fails" <|
552 | \() ->
553 | getEmptyIndex ()
554 | |> Index.update doc1_
555 | |> TestUtils.isOk
556 | |> Expect.equal False
557 | |> Expect.onFail "Updating a doc not in index fails"
558 |
559 |
560 | {-| Updating a document removes old doc version and adds new doc version.
561 |
562 | This was a bug I noticed in code, writing test to confirm before fixing it.
563 |
564 | -}
565 | updateDocUsesNewDocContent : Test
566 | updateDocUsesNewDocContent =
567 | let
568 | indexT1 =
569 | getEmptyIndex ()
570 | |> Index.addDocs
571 | [ { cid = "qdoc1"
572 | , title = "Question1"
573 | , author = "Sally Apples"
574 | , body = "Sally writes words about a grown banana."
575 | }
576 | , { cid = "qdoc2"
577 | , title = "Question2"
578 | , author = "John Banana"
579 | , body = "An example of apple engineering."
580 | }
581 | ]
582 | |> Tuple.first
583 |
584 | indexT2 =
585 | indexT1
586 | |> Index.update
587 | { cid = "qdoc1"
588 | , title = "Yesterday"
589 | , author = "New User"
590 | , body = "Completely different document really"
591 | }
592 | |> TestUtils.getResultIgnoreError
593 |
594 | encodedT1 =
595 | indexT1
596 | |> IndexEncoder.encoder
597 | |> Encode.encode 0
598 |
599 | encodedT2 =
600 | indexT2
601 | |> IndexEncoder.encoder
602 | |> Encode.encode 0
603 | in
604 | test "updateDoc removes old doc and replaces it so index changes" <|
605 | \() ->
606 | encodedT1
607 | |> Expect.notEqual
608 | encodedT2
609 |
--------------------------------------------------------------------------------
/tests/IndexUtilsTests.elm:
--------------------------------------------------------------------------------
1 | module IndexUtilsTests exposing
2 | ( testDefaultTransforms
3 | , testGetTokens
4 | , test_processTokens_filterFactories
5 | , test_processTokens_initialTransformFactories
6 | , test_processTokens_transformFactories
7 | )
8 |
9 | import Expect
10 | import Index exposing (Index)
11 | import Index.Model exposing (FilterFactory, TransformFactory)
12 | import Index.Utils
13 | import StopWordFilter exposing (createFilterFunc)
14 | import Test exposing (..)
15 |
16 | type alias MyDoc =
17 | { cid : String
18 | , title : String
19 | , author : String
20 | , body : String
21 | }
22 |
23 |
24 | testDefaultTransforms : Test
25 | testDefaultTransforms =
26 | describe "apply default transform tests"
27 | (List.map testGetTokens
28 | [ ( "words of only non word chars removed"
29 | , "engineering ???"
30 | , [ "engin" ]
31 | )
32 | , ( "stemmer and non word chars removed"
33 | , ".This was very large.-"
34 | , [ "veri", "larg" ]
35 | )
36 | , ( "stop words removed"
37 | , "however among the dear .- -"
38 | , []
39 | )
40 |
41 | -- Bug https://github.com/rluiten/elm-text-search/issues/10
42 | , ( "\"on\" in the stop word list should not filter \"one\""
43 | , "one two three"
44 | -- note that "one" is transformed to "on" by stemmer
45 | , [ "on", "two", "three" ]
46 | )
47 | ]
48 | )
49 |
50 |
51 | testGetTokens : ( String, String, List String ) -> Test
52 | testGetTokens ( name, input, expected ) =
53 |     let
54 |         indexConfig =
55 |             { indexType = "- IndexTest Type -"
56 |             , ref = .cid
57 |             , fields = [ ( .title, 5 ), ( .body, 1 ) ]
58 |             , listFields = []
59 |             }
60 | 
61 |         label =
62 |             String.concat [ "getTokens \"", input, "\" ", name ]
63 |     in
64 |     test label <|
65 |         \() ->
66 |             let
67 |                 -- getTokens returns a pair; only its second element is checked
68 |                 ( _, tokens ) =
69 |                     Index.Utils.getTokens (Index.new indexConfig) input
70 |             in
71 |             Expect.equal expected tokens
72 |
73 |
{-| Create a `MyDoc` index through `Index.newWith`, taking the three factory
lists from the caller: initial transforms, transforms, then filters.
The reference and indexed fields are fixed (`.cid`, `.title`, `.body`).
-}
createTestIndex1 :
    List (TransformFactory MyDoc)
    -> List (TransformFactory MyDoc)
    -> List (FilterFactory MyDoc)
    -> Index MyDoc
createTestIndex1 initialFactories transforms filters =
    let
        weightedFields =
            [ ( .title, 5 )
            , ( .body, 1 )
            ]
    in
    Index.newWith
        { indexType = "- IndexTest Type -"
        , ref = .cid
        , fields = weightedFields
        , listFields = []
        , initialTransformFactories = initialFactories
        , transformFactories = transforms
        , filterFactories = filters
        }
92 |
93 |
{-| `processTokens` with two entries in the transform factory list and nothing
else configured. Both transforms drop the last character of a token.
The expected output has no entry for "ca", which is empty after both drops.
-}
test_processTokens_transformFactories : Test
test_processTokens_transformFactories =
    test "test processTokens transformFactories list" <|
        \() ->
            let
                dropLastChar =
                    Index.Utils.createFuncCreator (String.dropRight 1)

                index =
                    createTestIndex1 [] [ dropLastChar, dropLastChar ] []
            in
            [ "awords", "btesting", "ca" ]
                |> Index.Utils.processTokens index
                |> Tuple.second
                |> Expect.equal [ "awor", "btesti" ]
107 |
108 |
{-| `processTokens` with two entries in the initial transform factory list and
nothing else configured. The first drops the leading character, the second
drops the trailing character. The expected output has no entry for "ra",
which is empty after both drops.
-}
test_processTokens_initialTransformFactories : Test
test_processTokens_initialTransformFactories =
    test "test processTokens initialTransformFactories list" <|
        \() ->
            let
                dropFirstChar =
                    Index.Utils.createFuncCreator (String.dropLeft 1)

                dropLastChar =
                    Index.Utils.createFuncCreator (String.dropRight 1)

                index =
                    createTestIndex1 [ dropFirstChar, dropLastChar ] [] []
            in
            [ "pwords", "qtesting", "ra" ]
                |> Index.Utils.processTokens index
                |> Tuple.second
                |> Expect.equal [ "word", "testin" ]
123 |
124 |
{-| `processTokens` with two stop word filters and no transforms. Each filter
is built by `createFilterFunc` from a single word; both words are expected to
be absent from the processed tokens.
-}
test_processTokens_filterFactories : Test
test_processTokens_filterFactories =
    test "test processTokens filterFactories list" <|
        \() ->
            let
                filters =
                    [ createFilterFunc [ "special" ]
                    , createFilterFunc [ "swimming" ]
                    ]

                index =
                    createTestIndex1 [] [] filters
            in
            [ "word", "special", "puzzle", "swimming" ]
                |> Index.Utils.processTokens index
                |> Tuple.second
                |> Expect.equal [ "word", "puzzle" ]
139 |
--------------------------------------------------------------------------------
/tests/SearchIndexTests.elm:
--------------------------------------------------------------------------------
1 | module SearchIndexTests exposing (saveAndLoadSameTest, searchReturnsEmptyResult, searchReturnsValidResult)
2 |
{- Check that search results are the same before and after saving and loading an index. -}
4 |
5 | import ElmTextSearch
6 | import Expect
7 | import Index.Model exposing (Index(..), IndexSimpleConfig)
8 | import Test exposing (..)
9 | import TestUtils
10 |
11 |
{-| Document record indexed by the search tests in this module.

`configElmTextSearchMyDoc` uses `.cid` as the document reference and indexes
the `.title` and `.body` fields.

-}
type alias MyDoc =
    { cid : String
    , title : String
    , author : String
    , body : String
    }
18 |
19 |
{-| Simple index configuration for `MyDoc`: documents are referenced by
`.cid`, `.title` is indexed with weight 5 and `.body` with weight 1, and no
list fields are indexed.
-}
configElmTextSearchMyDoc : IndexSimpleConfig MyDoc
configElmTextSearchMyDoc =
    let
        weightedFields =
            [ ( .title, 5 )
            , ( .body, 1 )
            ]
    in
    { ref = .cid
    , fields = weightedFields
    , listFields = []
    }
29 |
30 |
{-| First sample document. Built with the `MyDoc` record constructor, whose
arguments follow the alias field order: cid, title, author, body.
-}
doc1 : MyDoc
doc1 =
    MyDoc
        "doc1"
        "Examples of a Banana"
        "Sally Apples"
        "Sally writes words about a grown banana."
38 |
39 |
{-| Second sample document. Built with the `MyDoc` record constructor, whose
arguments follow the alias field order: cid, title, author, body.
-}
doc2 : MyDoc
doc2 =
    MyDoc
        "doc2"
        "Words about a vehicle"
        "John Barrel"
        "All about a vehicle in exile."
47 |
48 |
{-| Create a new index from `configElmTextSearchMyDoc` with no documents
added. Takes unit so the index is built on each call.
-}
getEmptyIndex : () -> Index MyDoc
getEmptyIndex () =
    configElmTextSearchMyDoc
        |> ElmTextSearch.new
54 |
55 |
{-| Empty index with `doc1` added. The result of `ElmTextSearch.add` is
unwrapped with `TestUtils.getResultIgnoreError`.
-}
getIndexDoc1 : () -> Index MyDoc
getIndexDoc1 () =
    TestUtils.getResultIgnoreError
        (ElmTextSearch.add doc1 (getEmptyIndex ()))
61 |
62 |
{-| Index containing `doc1` and then `doc2`. The result of
`ElmTextSearch.add` is unwrapped with `TestUtils.getResultIgnoreError`.
-}
getIndexDoc1Doc2 : () -> Index MyDoc
getIndexDoc1Doc2 () =
    TestUtils.getResultIgnoreError
        (ElmTextSearch.add doc2 (getIndexDoc1 ()))
68 |
69 |
{-| Searching the two document index for "foreign" is expected to produce an
empty list of hits.
-}
searchReturnsEmptyResult : Test
searchReturnsEmptyResult =
    test "Search returns empty result" <|
        \() ->
            let
                hits =
                    ElmTextSearch.search "foreign" (getIndexDoc1Doc2 ())
                        |> TestUtils.getResultIgnoreError
                        |> Tuple.second
            in
            Expect.equal [] hits
79 |
80 |
{-| Searching the two document index for "exile" is expected to return exactly
one hit, for "doc2".

The document reference is compared exactly. The score is a computed `Float`,
so it is compared with `Expect.within` and a small absolute tolerance rather
than by exact equality, which would make the test sensitive to harmless
changes in floating point evaluation order.

-}
searchReturnsValidResult : Test
searchReturnsValidResult =
    test "Search returns valid result" <|
        \() ->
            let
                hits =
                    getIndexDoc1Doc2 ()
                        |> ElmTextSearch.search "exile"
                        |> TestUtils.getResultIgnoreError
                        |> Tuple.second
            in
            case hits of
                [ ( ref, score ) ] ->
                    Expect.all
                        [ \_ -> Expect.equal "doc2" ref
                        , \_ -> Expect.within (Expect.Absolute 1.0e-9) 0.13898344497096093 score
                        ]
                        ()

                _ ->
                    -- Any other number of hits is a failure; show what came back.
                    Expect.fail
                        ("Expected exactly one search hit but got: " ++ Debug.toString hits)
90 |
91 |
{-| Run the same search against an index and against a copy of it that has
been stored to a string and loaded back with `configElmTextSearchMyDoc`.

Returns `( hits from the given index, hits from the reloaded index )`.

-}
searchIndexSearchSavedLoadedIndex : String -> Index MyDoc -> ( List ( String, Float ), List ( String, Float ) )
searchIndexSearchSavedLoadedIndex search index =
    let
        runSearch target =
            Tuple.second
                (TestUtils.getResultIgnoreError
                    (ElmTextSearch.search search target)
                )

        reloadedIndex =
            index
                |> ElmTextSearch.storeToString
                |> ElmTextSearch.fromString configElmTextSearchMyDoc
                |> TestUtils.getResultIgnoreError
    in
    ( runSearch index, runSearch reloadedIndex )
109 |
110 |
{-| Check that a search gives equal results on the two document index and on
the same index after a store and load round trip, once for a query with no
hits ("foreign") and once for a query with a hit ("exile").
-}
saveAndLoadSameTest : Test
saveAndLoadSameTest =
    let
        sameAfterRoundTrip label query =
            test label <|
                \() ->
                    let
                        ( fromOriginal, fromReloaded ) =
                            searchIndexSearchSavedLoadedIndex query (getIndexDoc1Doc2 ())
                    in
                    Expect.equal fromOriginal fromReloaded
    in
    describe "results same before and after save and load index"
        [ sameAfterRoundTrip
            "x Search result of nothing for Index same as for Save and Loaded Index."
            "foreign"
        , sameAfterRoundTrip
            "x Search result of something for Index same as for Save and Loaded Index."
            "exile"
        ]
131 |
--------------------------------------------------------------------------------
/tests/StopWordFilterTests.elm:
--------------------------------------------------------------------------------
1 | module StopWordFilterTests exposing (newIndex, stopWordFilterTest, tests)
2 |
3 | import ElmTextSearch
4 | import Expect
5 | import StopWordFilter
6 | import Test exposing (..)
7 |
8 |
{-| Document record used as the type parameter of the index created by
`newIndex`, which references documents by `.cid` and indexes `.title` and
`.body`.
-}
type alias ExampleDocType =
    { cid : String
    , title : String
    , author : String
    , body : String
    }
15 |
16 |
{-| Index over `ExampleDocType` created with `ElmTextSearch.new`; it is passed
to `StopWordFilter.createDefaultFilterFunc` to obtain the default stop word
filter.
-}
newIndex : ElmTextSearch.Index ExampleDocType
newIndex =
    let
        simpleConfig =
            { ref = .cid
            , fields =
                [ ( .title, 5 )
                , ( .body, 1 )
                ]
            , listFields = []
            }
    in
    ElmTextSearch.new simpleConfig
27 |
28 |
{-| One test per entry of `StopWordFilter.stopEnglishWordList`, each built by
`stopWordFilterTest`.
-}
tests : Test
tests =
    StopWordFilter.stopEnglishWordList
        |> List.map stopWordFilterTest
        |> describe "check stopEnglishWordList against default token processing"
33 |
34 |
{-| Build a test asserting that the default stop word filter returns `False`
for `word`. The test name is phrased as the failure that is reported when the
word is not stopped.
-}
stopWordFilterTest : String -> Test
stopWordFilterTest word =
    let
        -- createDefaultFilterFunc returns a pair; only the filter is needed here.
        stopWordFilter =
            Tuple.second (StopWordFilter.createDefaultFilterFunc newIndex)

        label =
            "This word \"" ++ word ++ "\" got past default stop word filter in error."
    in
    test label <|
        \() ->
            Expect.equal False (stopWordFilter word)
                |> Expect.onFail "These should all be stopped"
46 |
--------------------------------------------------------------------------------
/tests/TestUtils.elm:
--------------------------------------------------------------------------------
1 | module TestUtils exposing
2 | ( expectOkWithGoodFailMessage
3 | , getDecodeErrorFailureMessage
4 | , getErrorIgnoreResult
5 | , getResultIgnoreError
6 | , isErr
7 | , isOk
8 | )
9 |
10 | {-| Utilities to make test cases simpler.
11 | -}
12 |
13 | import Expect
14 | import Index
15 | import Index.Model exposing (Index(..))
16 | import Json.Decode exposing (Error(..))
17 | import Test exposing (..)
18 |
19 |
{-| Expectation that a decode `Result` is `Ok`.

On `Err` the expectation fails with a message made of a fixed prefix followed
by the text from `getDecodeErrorFailureMessage`.

-}
expectOkWithGoodFailMessage : Result Error a -> Expect.Expectation
expectOkWithGoodFailMessage result =
    case result of
        Err decodeError ->
            Expect.fail
                ("Result Err not expected: "
                    ++ getDecodeErrorFailureMessage decodeError
                )

        Ok _ ->
            Expect.onFail "Result OK as expected" Expect.pass
33 |
34 |
{-| Unwrap the `Ok` value of a `Result` for use in tests.

An `Err` reaches `Debug.todo`, so no value is produced in that case; only use
this where the result is expected to be `Ok`.

-}
getResultIgnoreError : Result error a -> a
getResultIgnoreError result =
    case result of
        Ok value ->
            value

        Err _ ->
            Debug.todo "Ignoring failure for testing"
43 |
44 |
{-| Unwrap the `Err` value of a `Result` for use in tests.

An `Ok` reaches `Debug.todo`, so no value is produced in that case; only use
this where the result is expected to be `Err`.

-}
getErrorIgnoreResult : Result error a -> error
getErrorIgnoreResult result =
    case result of
        Ok _ ->
            Debug.todo "Ignoring value for testing"

        Err error ->
            error
53 |
54 |
{-| Turn a JSON decode `Error` into a message string for test output.

A top level `Failure` yields exactly its message, so tests that compare
against a decoder's own failure text keep working. Every other error shape
(`Field`, `Index`, `OneOf`) is rendered with `Json.Decode.errorToString`
rather than crashing, so a failing test still reports what went wrong.

-}
getDecodeErrorFailureMessage : Error -> String
getDecodeErrorFailureMessage error =
    case error of
        Failure message _ ->
            message

        _ ->
            Json.Decode.errorToString error
63 |
64 |
{-| `True` when the result is `Ok`, `False` when it is `Err`.
-}
isOk : Result e a -> Bool
isOk x =
    x
        |> Result.map (always True)
        |> Result.withDefault False
73 |
74 |
{-| `True` when the result is `Err`, `False` when it is `Ok`.
-}
isErr : Result e a -> Bool
isErr x =
    case Result.toMaybe x of
        Nothing ->
            True

        Just _ ->
            False
83 |
--------------------------------------------------------------------------------
/tests/TokenProcessorTests.elm:
--------------------------------------------------------------------------------
1 | module TokenProcessorTests exposing (tokenizerTest, tokenizerTests, trimmerTest, trimmerTests)
2 |
3 | import Expect
4 | import Test exposing (..)
5 | import TokenProcessors
6 |
7 |
{-| Table driven tests for `TokenProcessors.tokenizer`. Each case is
`( description, input string, expected tokens )` and is turned into a test by
`tokenizerTest`.
-}
tokenizerTests : Test
tokenizerTests =
    let
        cases =
            [ ( "splitting simple strings into tokens"
              , "this is a simple string"
              , [ "this", "is", "a", "simple", "string" ]
              )
            , ( "downcasing tokens"
              , "FOO BAR"
              , [ "foo", "bar" ]
              )
            , ( "splitting strings with hyphens"
              , "take the New York-San Francisco flight"
              , [ "take", "the", "new", "york", "san", "francisco", "flight" ]
              )
            , ( "splitting strings with hyphens and spaces"
              , "Solve for A - B"
              , [ "solve", "for", "a", "b" ]
              )
            , ( "leading - in query should not cause extra token ?"
              , "-misery! .appeal,"
              , [ "misery!", ".appeal," ]
              )
            ]
    in
    describe "Lunr TokenProcessors tokenizer tests"
        (List.map tokenizerTest cases)
33 |
34 |
{-| Build one test that tokenizes `testString` with
`TokenProcessors.tokenizer` and compares the result with `expectedTokens`.
-}
tokenizerTest : ( String, String, List String ) -> Test
tokenizerTest ( name, testString, expectedTokens ) =
    test name <|
        \() ->
            testString
                |> TokenProcessors.tokenizer
                |> Expect.equal expectedTokens
42 |
43 |
{-| Table driven tests for `TokenProcessors.trimmer`. Each case is
`( input, expected output )` and is turned into a test by `trimmerTest`.
-}
trimmerTests : Test
trimmerTests =
    let
        cases =
            [ ( "023hello", "023hello" )
            , ( "=hello", "hello" )
            , ( "hello.", "hello" )
            , ( ",hello,", "hello" )
            , ( ",hello_,", "hello_" )
            , ( "40%", "40" )
            ]
    in
    describe "Lunr TokenProcessors trimmer tests"
        (List.map trimmerTest cases)
55 |
56 |
{-| Build one test that runs `TokenProcessors.trimmer` on `testString` and
compares the result with `expectedString`. The test name shows both strings.
-}
trimmerTest : ( String, String ) -> Test
trimmerTest ( testString, expectedString ) =
    let
        label =
            "trimmer " ++ testString ++ " -> " ++ expectedString
    in
    test label <|
        \() ->
            testString
                |> TokenProcessors.trimmer
                |> Expect.equal expectedString
64 |
--------------------------------------------------------------------------------