├── .gitignore
├── LICENSE
├── README.md
├── TODO.md
├── elm.json
├── examples
    ├── elm.json
    └── src
    │   ├── IndexNewAddSearch.elm
    │   ├── IndexNewAddSearchListFields.elm
    │   ├── IndexNewWithAddSearch.elm
    │   └── MultipleAddSearch.elm
├── src
    ├── ElmTextSearch.elm
    ├── ElmTextSearch
    │   └── Json
    │   │   ├── Decoder.elm
    │   │   └── Encoder.elm
    ├── ElmTextSearchErrors.elm
    ├── Index.elm
    ├── Index
    │   ├── Defaults.elm
    │   ├── Load.elm
    │   ├── Model.elm
    │   ├── Utils.elm
    │   └── Vector.elm
    ├── StopWordFilter.elm
    ├── TokenProcessors.elm
    └── Utils.elm
└── tests
    ├── DefaultTests.elm
    ├── ElmTextSearchTests.elm
    ├── IndexDecoderTests.elm
    ├── IndexEncoderTests.elm
    ├── IndexLoadTests.elm
    ├── IndexTests.elm
    ├── IndexUtilsTests.elm
    ├── SearchIndexTests.elm
    ├── StopWordFilterTests.elm
    ├── TestUtils.elm
    └── TokenProcessorTests.elm


/.gitignore:
--------------------------------------------------------------------------------
1 | elm-stuff
2 | lunr.js
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2016, Robin Luiten www.github.com/rluiten
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | * Neither the name of ElmTextSearch nor the names of its
15 |   contributors may be used to endorse or promote products derived from
16 |   this software without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # ElmTextSearch full text indexer
  2 | 
  3 | Copyright (c) 2025 Robin Luiten
  4 | 
  5 | This is a full text indexing engine inspired by lunr.js and written in Elm language.
  6 | See http://lunrjs.com/ for lunr.js
  7 | 
  8 | I am happy to hear about users of this package.
  9 | 
 10 | I am happy to receive contributions be they bug reports, pull requests, documentation updates or examples.
 11 | 
 12 | ### v4.0.0 will not load indexes saved with old version.
 13 | 
 14 | If you do not use `storeToValue` `storeToString` `fromString` `fromValue` in ElmTextSearch this update is not likely to introduce issues.
 15 | 
 16 | The way that filters and transforms are applied to the content of documents has changed.
 17 | This properly fixes a reported bug, see https://github.com/rluiten/elm-text-search/issues/10, where stop word filters were not correctly applied. This means saved indexes from previous versions of ElmTextSearch will not load in this version.
 18 | 
 19 | * `Defaults.indexVersion` has changed value.
 20 | 
 21 | The reason this is a Major version bump is some generalisation was done to enable future support
 22 | for loading and saving of older version and types of default index configurations.
 23 | 
 24 | ### v5.0.0 updates for Elm 0.19
 25 | 
 26 | Result types from loading indexes are now Decode.Error not String.
 27 | 
 28 | ### v5.0.2, v5.1.0 
 29 | 
 30 | New functions addT for add, searchT for search and removeT for remove.
 31 | These replace the error type of result with a type.
 32 | v5.0.2 was a goof on my part, I forgot to expose the new functions correctly.
 33 | 
 34 | ### v5.1.1
 35 | 
 36 | Fixed a bug in Trie that quite seriously affected this package when removing documents.
 37 | New version of Trie, a few extra tests created.
 38 | 
 39 | ### Packages
 40 | 
 41 | Several packages were created for this project and published separately for this package to depend on.
 42 | 
 43 | * trie
 44 |  * http://package.elm-lang.org/packages/rluiten/trie/latest
 45 | * stemmer
 46 |  * http://package.elm-lang.org/packages/rluiten/stemmer/latest
 47 | * sparsevector
 48 |  * http://package.elm-lang.org/packages/rluiten/sparsevector/latest
 49 | 
 50 | ### Parts of lunr.js were left out
 51 | 
 52 |  * This does not have an event system.
 53 |  * Its internal data structure is not compatible.
 54 | 
 55 | ### Notes captured along the way while writing this.
 56 | 
 57 | * lunr.js
 58 |  * tokenStore.remove does not decrement length, but it doesn't use length really only save/load
 59 |  * stemmer "lay" -> "lay" "try" -> "tri" is opposite to porter stemmer
 60 | * porter stemmer erlang implementation
 61 |  * step5b does not use endsWithDoubleCons which is required afaik to pass the voc.txt output.txt cases
 62 | 
 63 | 
 64 | ### Example
 65 | 
 66 | See examples folder for four examples.
 67 | You can run any of the examples if you navigate to the examples folder and run `elm reactor` and select an example in the src folder.
 68 | 
 69 | First example is included inline here.
 70 | 
 71 | IndexNewAddSearch.elm
 72 | ```elm
 73 | module Main exposing (ExampleDocType, createNewIndexExample, main, resultSearchIndex, resultUpdatedMyIndexAfterAdd)
 74 | 
 75 | {-| Create an index and add a document, search a document
 76 | 
 77 | Copyright (c) 2025 Robin Luiten
 78 | 
 79 | -}
 80 | 
 81 | import Browser
 82 | import ElmTextSearch
 83 | import Html exposing (Html, button, div, text)
 84 | 
 85 | 
 86 | {-| Example document type.
 87 | -}
 88 | type alias ExampleDocType =
 89 |     { cid : String
 90 |     , title : String
 91 |     , author : String
 92 |     , body : String
 93 |     }
 94 | 
 95 | 
 96 | {-| Create an index with default configuration.
 97 | See ElmTextSearch.SimpleConfig documentation for parameter information.
 98 | -}
 99 | createNewIndexExample : ElmTextSearch.Index ExampleDocType
100 | createNewIndexExample =
101 |     ElmTextSearch.new
102 |         { ref = .cid
103 |         , fields =
104 |             [ ( .title, 5.0 )
105 |             , ( .body, 1.0 )
106 |             ]
107 |         , listFields = []
108 |         }
109 | 
110 | 
111 | {-| Add a document to an index.
112 | -}
113 | resultUpdatedMyIndexAfterAdd : Result String (ElmTextSearch.Index ExampleDocType)
114 | resultUpdatedMyIndexAfterAdd =
115 |     ElmTextSearch.add
116 |         { cid = "id1"
117 |         , title = "First Title"
118 |         , author = "Some Author"
119 |         , body = "Words in this example document with explanations."
120 |         }
121 |         createNewIndexExample
122 | 
123 | 
124 | {-| Search the index.
125 | 
126 | The result includes an updated Index because a search causes internal
127 | caches to be updated to improve overall performance.
128 | 
129 | -}
130 | resultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) )
131 | resultSearchIndex =
132 |     resultUpdatedMyIndexAfterAdd
133 |         |> Result.andThen
134 |             (ElmTextSearch.search "explanations")
135 | 
136 | 
137 | {-| Display search result.
138 | -}
139 | main =
140 |     Browser.sandbox { init = 0, update = update, view = view }
141 | 
142 | 
143 | type Msg
144 |     = DoNothing
145 | 
146 | 
147 | update msg model =
148 |     case msg of
149 |         DoNothing ->
150 |             model
151 | 
152 | 
153 | view model =
154 |     let
155 |         -- want only the search results not the returned index
156 |         searchResults =
157 |             Result.map Tuple.second resultSearchIndex
158 |     in
159 |     div []
160 |         [ text
161 |             ("Result of searching for \"explanations\" is "
162 |                 ++ Debug.toString searchResults
163 |             )
164 |         ]
165 | ```
166 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | TODO - TokenProcessor look at elm/parser instead of regex looks better and probably more efficient than regex.
2 | 
3 | TODO - Idea store index to corpus tokens for each document not the words. maybe worth it.
4 | 
5 | 


--------------------------------------------------------------------------------
/elm.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "package",
 3 |     "name": "rluiten/elm-text-search",
 4 |     "summary": "Full text index engine in Elm language inspired by lunr.js.",
 5 |     "license": "BSD-3-Clause",
 6 |     "version": "5.1.1",
 7 |     "exposed-modules": [
 8 |         "ElmTextSearch",
 9 |         "ElmTextSearchErrors",
10 |         "Index.Defaults",
11 |         "StopWordFilter",
12 |         "ElmTextSearch.Json.Encoder",
13 |         "ElmTextSearch.Json.Decoder"
14 |     ],
15 |     "elm-version": "0.19.0 <= v < 0.20.0",
16 |     "dependencies": {
17 |         "NoRedInk/elm-json-decode-pipeline": "1.0.0 <= v < 2.0.0",
18 |         "elm/core": "1.0.0 <= v < 2.0.0",
19 |         "elm/json": "1.0.0 <= v < 2.0.0",
20 |         "elm/regex": "1.0.0 <= v < 2.0.0",
21 |         "rluiten/sparsevector": "1.0.3 <= v < 2.0.0",
22 |         "rluiten/stemmer": "1.0.4 <= v < 2.0.0",
23 |         "rluiten/trie": "2.1.1 <= v < 3.0.0"
24 |     },
25 |     "test-dependencies": {
26 |         "elm-explorations/test": "2.0.0 <= v <= 2.2.0"
27 |     }
28 | }
29 | 


--------------------------------------------------------------------------------
/examples/elm.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "type": "application",
 3 |     "source-directories": [
 4 |         "src",
 5 |         "../src"
 6 |     ],
 7 |     "elm-version": "0.19.0",
 8 |     "dependencies": {
 9 |         "direct": {
10 |             "NoRedInk/elm-json-decode-pipeline": "1.0.0",
11 |             "elm/browser": "1.0.0",
12 |             "elm/core": "1.0.0",
13 |             "elm/html": "1.0.0",
14 |             "elm/json": "1.0.0",
15 |             "elm/regex": "1.0.0",
16 |             "rluiten/sparsevector": "1.0.3",
17 |             "rluiten/stemmer": "1.0.4",
18 |             "rluiten/trie": "2.1.1"
19 |         },
20 |         "indirect": {
21 |             "elm/time": "1.0.0",
22 |             "elm/url": "1.0.0",
23 |             "elm/virtual-dom": "1.0.0"
24 |         }
25 |     },
26 |     "test-dependencies": {
27 |         "direct": {},
28 |         "indirect": {}
29 |     }
30 | }


--------------------------------------------------------------------------------
/examples/src/IndexNewAddSearch.elm:
--------------------------------------------------------------------------------
 1 | module Main exposing (ExampleDocType, createNewIndexExample, main, resultSearchIndex, resultUpdatedMyIndexAfterAdd)
 2 | 
 3 | {-| Create an index and add a document, search a document
 4 | 
 5 | Copyright (c) 2016 Robin Luiten
 6 | 
 7 | -}
 8 | 
 9 | import Browser
10 | import ElmTextSearch
11 | import Html exposing (Html, button, div, text)
12 | 
13 | 
14 | {-| Example document type.
15 | -}
16 | type alias ExampleDocType =
17 |     { cid : String
18 |     , title : String
19 |     , author : String
20 |     , body : String
21 |     }
22 | 
23 | 
24 | {-| Create an index with default configuration.
25 | See ElmTextSearch.SimpleConfig documentation for parameter information.
26 | -}
27 | createNewIndexExample : ElmTextSearch.Index ExampleDocType
28 | createNewIndexExample =
29 |     ElmTextSearch.new
30 |         { ref = .cid
31 |         , fields =
32 |             [ ( .title, 5.0 )
33 |             , ( .body, 1.0 )
34 |             ]
35 |         , listFields = []
36 |         }
37 | 
38 | 
39 | {-| Add a document to an index.
40 | -}
41 | resultUpdatedMyIndexAfterAdd : Result String (ElmTextSearch.Index ExampleDocType)
42 | resultUpdatedMyIndexAfterAdd =
43 |     ElmTextSearch.add
44 |         { cid = "id1"
45 |         , title = "First Title"
46 |         , author = "Some Author"
47 |         , body = "Words in this example document with explanations."
48 |         }
49 |         createNewIndexExample
50 | 
51 | 
52 | {-| Search the index.
53 | 
54 | The result includes an updated Index because a search causes internal
55 | caches to be updated to improve overall performance.
56 | 
57 | -}
58 | resultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) )
59 | resultSearchIndex =
60 |     resultUpdatedMyIndexAfterAdd
61 |         |> Result.andThen
62 |             (ElmTextSearch.search "explanations")
63 | 
64 | 
65 | {-| Display search result.
66 | -}
67 | main =
68 |     Browser.sandbox { init = 0, update = update, view = view }
69 | 
70 | 
71 | type Msg
72 |     = DoNothing
73 | 
74 | 
75 | update msg model =
76 |     case msg of
77 |         DoNothing ->
78 |             model
79 | 
80 | 
81 | view model =
82 |     let
83 |         -- want only the search results not the returned index
84 |         searchResults =
85 |             Result.map Tuple.second resultSearchIndex
86 |     in
87 |     div []
88 |         [ text
89 |             ("Result of searching for \"explanations\" is "
90 |                 ++ Debug.toString searchResults
91 |             )
92 |         ]
93 | 


--------------------------------------------------------------------------------
/examples/src/IndexNewAddSearchListFields.elm:
--------------------------------------------------------------------------------
 1 | module Main exposing (ExampleDocType, createNewIndexExample, main, resultSearchIndex, resultUpdatedMyIndexAfterAdd)
 2 | 
 3 | {-| Create an index and add a document, search a document
 4 | This variation indexes words from a field which is List String.
 5 | 
 6 | Copyright (c) 2016 Robin Luiten
 7 | 
 8 | -}
 9 | 
10 | import Browser
11 | import ElmTextSearch
12 | import Html exposing (Html, div, text)
13 | 
14 | 
15 | {-| Example document type.
16 | -}
17 | type alias ExampleDocType =
18 |     { cid : String
19 |     , title : String
20 |     , author : String
21 |     , body : List String
22 |     }
23 | 
24 | 
25 | {-| Create an index with default configuration.
26 | See ElmTextSearch.SimpleConfig documentation for parameter information.
27 | -}
28 | createNewIndexExample : ElmTextSearch.Index ExampleDocType
29 | createNewIndexExample =
30 |     ElmTextSearch.new
31 |         { ref = .cid
32 |         , fields =
33 |             [ ( .title, 5.0 )
34 |             ]
35 |         , listFields =
36 |             [ ( .body, 1.0 )
37 |             ]
38 |         }
39 | 
40 | 
41 | {-| Add a document to an index.
42 | -}
43 | resultUpdatedMyIndexAfterAdd : Result String (ElmTextSearch.Index ExampleDocType)
44 | resultUpdatedMyIndexAfterAdd =
45 |     ElmTextSearch.add
46 |         { cid = "id1"
47 |         , title = "First Title"
48 |         , author = "Some Author"
49 |         , body =
50 |             [ "Words in this example "
51 |             , "document with explanations."
52 |             ]
53 |         }
54 |         createNewIndexExample
55 | 
56 | 
57 | {-| Search the index.
58 | 
59 | The result includes an updated Index because a search causes internal
60 | caches to be updated to improve overall performance.
61 | 
62 | -}
63 | resultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) )
64 | resultSearchIndex =
65 |     resultUpdatedMyIndexAfterAdd
66 |         |> Result.andThen
67 |             (ElmTextSearch.search "explanations")
68 | 
69 | 
70 | {-| Display search result.
71 | -}
72 | main =
73 |     Browser.sandbox { init = 0, update = update, view = view }
74 | 
75 | 
76 | type Msg
77 |     = DoNothing
78 | 
79 | 
80 | update msg model =
81 |     case msg of
82 |         DoNothing ->
83 |             model
84 | 
85 | {-| Display search result.
86 | -}
87 | view model =
88 |     let
89 |         -- want only the search results not the returned index
90 |         searchResults =
91 |             Result.map Tuple.second resultSearchIndex
92 |     in
93 |     div []
94 |         [ text
95 |             ("Result of searching for \"explanations\" is "
96 |                 ++ Debug.toString searchResults
97 |             )
98 |         ]
99 | 


--------------------------------------------------------------------------------
/examples/src/IndexNewWithAddSearch.elm:
--------------------------------------------------------------------------------
  1 | module Main exposing (ExampleDocType, addDocToIndexExample, createMyStopWordFilter, createNewWithIndexExample, firstResultSearchIndex, main, secondResultSearchIndex)
  2 | 
  3 | {-| Create an index with customized stop word filter using
  4 | ElmTextSearch.newWith.
  5 | 
  6 | Copyright (c) 2016 Robin Luiten
  7 | 
  8 | -}
  9 | 
 10 | import Browser
 11 | import ElmTextSearch
 12 | import Html exposing (Html, div, text)
 13 | import Index.Defaults
 14 | import StopWordFilter
 15 | 
 16 | 
 17 | {-| Example document type.
 18 | -}
 19 | type alias ExampleDocType =
 20 |     { cid : String
 21 |     , title : String
 22 |     , author : String
 23 |     , body : String
 24 |     }
 25 | 
 26 | 
 27 | {-| Create an extended stop word filter.
 28 | 
 29 | Be careful about adding words to your stop word list, as any stop word
 30 | will not be indexed and you will not be able to search for the word in
 31 | documents as it will not be found.
 32 | 
 33 | It is possible to completely replace the stop word list and not
 34 | just extend it.
 35 | 
 36 | -}
 37 | createMyStopWordFilter =
 38 |     StopWordFilter.createFilterFuncWith
 39 |         [ "explanations" ]
 40 | 
 41 | 
 42 | {-| Create an index with extra options.
 43 | 
 44 |   - In this case a customized stop word filter is provided.
 45 |   - It is supplying the default transform factories.
 46 |   - It supplies an index type for the customized index config.
 47 |     This becomes important when loading back saved index.
 48 |   - It is a good idea to include a version in your index type string
 49 |     in case you update things and might still have old versions
 50 |     around that you need to work with.
 51 | 
 52 | -}
 53 | createNewWithIndexExample : ElmTextSearch.Index ExampleDocType
 54 | createNewWithIndexExample =
 55 |     ElmTextSearch.newWith
 56 |         { indexType = "ElmTextSearch - Customized Stop Words v1"
 57 |         , ref = .cid
 58 |         , fields =
 59 |             [ ( .title, 5.0 )
 60 |             , ( .body, 1.0 )
 61 |             ]
 62 |         , listFields = []
 63 |         , initialTransformFactories = Index.Defaults.defaultInitialTransformFactories
 64 |         , transformFactories = Index.Defaults.defaultTransformFactories
 65 |         , filterFactories = [ createMyStopWordFilter ]
 66 |         }
 67 | 
 68 | 
 69 | {-| Adding a document to the index.
 70 | -}
 71 | addDocToIndexExample : Result String (ElmTextSearch.Index ExampleDocType)
 72 | addDocToIndexExample =
 73 |     ElmTextSearch.add
 74 |         { cid = "id1"
 75 |         , title = "First Title"
 76 |         , author = "Some Author"
 77 |         , body = "Words in this example document with explanations."
 78 |         }
 79 |         createNewWithIndexExample
 80 | 
 81 | 
 82 | {-| Search the index for a word in our extended stop words.
 83 | This will return no matches.
 84 | -}
 85 | firstResultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) )
 86 | firstResultSearchIndex =
 87 |     addDocToIndexExample
 88 |         |> Result.andThen
 89 |             (ElmTextSearch.search "explanation")
 90 | 
 91 | 
 92 | {-| Search the index for a word that is not a stop word.
 93 | It will return an Err about no search terms.
 94 | -}
 95 | secondResultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) )
 96 | secondResultSearchIndex =
 97 |     addDocToIndexExample
 98 |         |> Result.andThen
 99 |             (ElmTextSearch.search "examples")
100 | 
101 | 
102 | {-| Display search result.
103 | -}
104 | main =
105 |     Browser.sandbox { init = 0, update = update, view = view }
106 | 
107 | 
108 | type Msg
109 |     = DoNothing
110 | 
111 | 
112 | update msg model =
113 |     case msg of
114 |         DoNothing ->
115 |             model
116 | 
117 | 
118 | view model =
119 |     let
120 |         searchResults1 =
121 |             Result.map Tuple.second firstResultSearchIndex
122 | 
123 |         searchResults2 =
124 |             Result.map Tuple.second secondResultSearchIndex
125 |     in
126 |     div []
127 |         [ div []
128 |             [ text
129 |                 ("Expecting no matches (because explanation is in stop words). Result of first search for \"explanation\" is "
130 |                     ++ Debug.toString searchResults1
131 |                 )
132 |             ]
133 |         , div []
134 |             [ text
135 |                 ("Result of second search for \"examples\" is "
136 |                     ++ Debug.toString searchResults2
137 |                 )
138 |             ]
139 |         ]
140 | 


--------------------------------------------------------------------------------
/examples/src/MultipleAddSearch.elm:
--------------------------------------------------------------------------------
  1 | module Main exposing (ExampleDocType, createNewIndexExample, documents, indexWithMulitpleDocumentsAdded, main, resultSearchIndex)
  2 | 
  3 | {-| Create an index and add multiple documents.
  4 | 
  5 | Copyright (c) 2016 Robin Luiten
  6 | 
  7 | -}
  8 | 
  9 | import Browser
 10 | import ElmTextSearch
 11 | import Html exposing (Html, div, text)
 12 | 
 13 | 
 14 | {-| Example document type.
 15 | -}
 16 | type alias ExampleDocType =
 17 |     { cid : String
 18 |     , title : String
 19 |     , author : String
 20 |     , body : String
 21 |     }
 22 | 
 23 | 
 24 | {-| Create an index with default configuration.
 25 | See ElmTextSearch.SimpleConfig documentation for parameter information.
 26 | -}
 27 | createNewIndexExample : ElmTextSearch.Index ExampleDocType
 28 | createNewIndexExample =
 29 |     ElmTextSearch.new
 30 |         { ref = .cid
 31 |         , fields =
 32 |             [ ( .title, 5.0 )
 33 |             , ( .body, 1.0 )
 34 |             ]
 35 |         , listFields = []
 36 |         }
 37 | 
 38 | 
 39 | documents =
 40 |     [ { cid = "id1"
 41 |       , title = "First Title"
 42 |       , author = "Some Author"
 43 |       , body = "Words in this example document with explanations."
 44 |       }
 45 |     , { cid = "id2"
 46 |       , title = "Is a cactus as pretty as a tree ?"
 47 |       , author = "Joe Greeny"
 48 |       , body = "This title contains information about cactuses."
 49 |       }
 50 |     ]
 51 | 
 52 | 
 53 | {-| Add multiple documents to the index.
 54 | 
 55 | The result pairs the index with a list of ( document index, error message ) entries reported by ElmTextSearch.addDocs.
 56 | 
 57 | -}
 58 | indexWithMulitpleDocumentsAdded : ( ElmTextSearch.Index ExampleDocType, List ( Int, String ) )
 59 | indexWithMulitpleDocumentsAdded =
 60 |     ElmTextSearch.addDocs
 61 |         documents
 62 |         createNewIndexExample
 63 | 
 64 | 
 65 | {-| Search the index.
 66 | 
 67 | The result includes an updated Index because a search causes internal
 68 | caches to be updated to improve overall performance.
 69 | 
 70 | This ignores any errors from the call to ElmTextSearch.addDocs
 71 | in indexWithMulitpleDocumentsAdded.
 72 | 
 73 | -}
 74 | resultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) )
 75 | resultSearchIndex =
 76 |     ElmTextSearch.search "title" (Tuple.first indexWithMulitpleDocumentsAdded)
 77 | 
 78 | 
 79 | {-| Display search result.
 80 | -}
 81 | main =
 82 |     Browser.sandbox { init = 0, update = update, view = view }
 83 | 
 84 | 
 85 | type Msg
 86 |     = DoNothing
 87 | 
 88 | 
 89 | update msg model =
 90 |     case msg of
 91 |         DoNothing ->
 92 |             model
 93 | 
 94 | 
 95 | view model =
 96 |     let
 97 |         -- want only the search results not the returned index
 98 |         searchResults =
 99 |             Result.map Tuple.second resultSearchIndex
100 |     in
101 |     div []
102 |         [ text
103 |             ("Result of searching for \"title\" is "
104 |                 ++ Debug.toString searchResults
105 |             )
106 |         ]
107 | 


--------------------------------------------------------------------------------
/src/ElmTextSearch.elm:
--------------------------------------------------------------------------------
  1 | module ElmTextSearch exposing
  2 |     ( new
  3 |     , newWith
  4 |     , add
  5 |     , addT
  6 |     , addDocs
  7 |     , remove
  8 |     , removeT
  9 |     , update
 10 |     , addOrUpdate
 11 |     , search
 12 |     , searchT
 13 |     , Index
 14 |     , Config
 15 |     , SimpleConfig
 16 |     , storeToValue
 17 |     , storeToString
 18 |     , fromString
 19 |     , fromValue
 20 |     , fromStringWith
 21 |     , fromValueWith
 22 |     )
 23 | 
 24 | {-| A full text indexer written in Elm language inspired by lunr.js.
 25 | 
 26 | A useful article about lunr.js
 27 | <https://www.new-bamboo.co.uk/blog/2013/02/26/full-text-search-in-your-browser/>
 28 | 
 29 | 
 30 | ## Create Index
 31 | 
 32 | @docs new
 33 | @docs newWith
 34 | 
 35 | 
 36 | ## Modify Index
 37 | 
 38 | @docs add
 39 | @docs addT
 40 | @docs addDocs
 41 | @docs remove
 42 | @docs removeT
 43 | @docs update
 44 | @docs addOrUpdate
 45 | 
 46 | 
 47 | ## Query Index
 48 | 
 49 | @docs search
 50 | @docs searchT
 51 | 
 52 | 
 53 | ## Types
 54 | 
 55 | @docs Index
 56 | @docs Config
 57 | @docs SimpleConfig
 58 | 
 59 | 
 60 | ## Save and Load an Index
 61 | 
 62 |   - You can save an index using [`ElmTextSearch.Json.Encoder.encoder`](ElmTextSearch.Json.Encoder#encoder)
 63 |   - You can load a saved index using
 64 |     [`ElmTextSearch.Json.Decoder.decoder`](ElmTextSearch.Json.Decoder#decoder)
 65 |     to produce a [`Index.Model.CodecIndexRecord`](Index.Model#CodecIndexRecord).
 66 |   - You can save a [`Index.Model.CodecIndexRecord`](Index.Model#CodecIndexRecord)
 67 |     using [`ElmTextSearch.Json.Encoder.codecIndexRecordEncoder`](ElmTextSearch.Json.Encoder#codecIndexRecordEncoder)
 68 |   - \*\* Modifying an index outside of ElmTextSearch using the Decoder and Encoder directly
 69 |     may cause it to not work correctly loaded into ElmTextSearch. \*\*
 70 | 
 71 | @docs storeToValue
 72 | @docs storeToString
 73 | @docs fromString
 74 | @docs fromValue
 75 | @docs fromStringWith
 76 | @docs fromValueWith
 77 | 
 78 | Copyright (c) 2016 Robin Luiten
 79 | 
 80 | -}
 81 | 
 82 | import ElmTextSearch.Json.Encoder as IndexEncoder
 83 | import ElmTextSearchErrors
 84 | import Index
 85 | import Index.Defaults as Defaults
 86 | import Index.Load
 87 | import Index.Model as Model
 88 | import Json.Decode as Decode
 89 | import Json.Encode as Encode
 90 | 
 91 | 
 92 | {-| An Index holds the data needed to search for added documents.
 93 | -}
 94 | type alias Index doc =
 95 |     Index.Index doc
 96 | 
 97 | 
 98 | {-| A SimpleConfig is the least amount of configuration data
 99 | required to create an Index.
100 | 
101 | See [`ElmTextSearch.new`](ElmTextSearch#new) for fields.
102 | 
103 | -}
104 | type alias SimpleConfig doc =
105 |     Model.IndexSimpleConfig doc
106 | 
107 | 
108 | {-| A Config is required to create an Index.
109 | -}
110 | type alias Config doc =
111 |     Model.Config doc
112 | 
113 | 
114 | {-| Create new index.
115 | 
116 | Example
117 | 
118 |     import ElmTextSearch
119 | 
120 |     {-| Example document type.
121 |     -}
122 |     type alias ExampleDocType =
123 |         { cid : String
124 |         , title : String
125 |         , author : String
126 |         , body : String
127 |         }
128 | 
129 |     {-| Create an index with default configuration.
130 |     See ElmTextSearch.SimpleConfig documentation for parameter information.
131 |     -}
132 |     createNewIndexExample : ElmTextSearch.Index ExampleDocType
133 |     createNewIndexExample =
134 |         ElmTextSearch.new
135 |             { ref = .cid
136 |             , fields =
137 |                 [ ( .title, 5.0 )
138 |                 , ( .body, 1.0 )
139 |                 ]
140 |             , listFields = []
141 |             }
142 | 
143 | The `SimpleConfig` parameter to new is
144 | 
145 |   - ref
146 |       - The unique document reference will be extracted from each
147 |         document using `.cid`.
148 |   - fields
149 |       - Define which fields contain strings to be indexed.
150 |       - The following fields will be indexed from each document
151 |           - `.title`
152 |           - `.body`
153 |       - When searching the index any word matches found in the
154 |         `.title` field (boost value 5.0) raise the document match score
155 |         more than if found in the `.body` field (boost value 1.0).
156 |           - The document match score determines the order of the list
157 |             of matching documents returned.
158 |   - listFields
159 |       - Define which fields contain list of strings to be indexed.
160 | 
161 | -}
162 | new : SimpleConfig doc -> Index doc
163 | new simpleConfig =
164 |     Index.new (Defaults.getIndexSimpleConfig simpleConfig)
165 | 
166 | 
167 | {-| Create new index with additional configuration.
168 | 
169 | Example.
170 | 
171 |     import ElmTextSearch
172 |     import Index.Defaults
173 |     import StopWordFilter
174 | 
175 |     type alias ExampleDocType =
176 |         { cid : String
177 |         , title : String
178 |         , author : String
179 |         , body : String
180 |         }
181 | 
182 |     createMyStopWordFilter =
183 |         StopWordFilter.createFilterFuncWith
184 |             [ "explanations" ]
185 | 
186 |     createNewWithIndexExample : ElmTextSearch.Index ExampleDocType
187 |     createNewWithIndexExample =
188 |         ElmTextSearch.newWith
189 |             { indexType = "ElmTextSearch - Customized Stop Words v1"
190 |             , ref = .cid
191 |             , fields =
192 |                 [ ( .title, 5.0 )
193 |                 , ( .body, 1.0 )
194 |                 ]
195 |             , listFields = []
196 |             , initialTransformFactories = Index.Defaults.defaultInitialTransformFactories
197 |             , transformFactories = Index.Defaults.defaultTransformFactories
198 |             , filterFactories = [ createMyStopWordFilter ]
199 |             }
200 | 
201 | -}
202 | newWith : Config doc -> Index doc
203 | newWith =
204 |     Index.newWith
205 | 
206 | 
207 | {-| Add a document to an index.
208 | 
209 | Starting with the ElmTextSearch.newWith example above this adds a document.
210 | 
211 |     addDocToIndexExample : Result String (ElmTextSearch.Index ExampleDocType)
212 |     addDocToIndexExample =
213 |         ElmTextSearch.add
214 |             { cid = "id1"
215 |             , title = "First Title"
216 |             , author = "Some Author"
217 |             , body = "Words in this example document with explanations."
218 |             }
219 |             createNewWithIndexExample
220 | 
221 | Conditions that cause a result Err with message.
222 | 
223 |   - Error document ref is empty.
224 |   - Error after tokenisation there are no terms to index.
225 |   - Error adding document that already exists.
226 | 
227 | Original function signature retained for backward compatibility.
228 | 
229 | -}
230 | add : doc -> Index doc -> Result String (Index doc)
231 | add doc index =
232 |     Index.add doc index
233 | 
234 | 
235 | {-| Add document to an Index if no error conditions found.
236 | 
237 | Variant of `add` whose Err value is an AddError instead of a String.
238 | 
239 | -}
240 | addT : doc -> Index doc -> Result ElmTextSearchErrors.AddError (Index doc)
241 | addT doc index =
242 |     Index.addT doc index
243 | 
244 | 
245 | {-| Add multiple documents. Tries to add all docs and collects errors.
246 | It does not stop adding at first error encountered.
247 | 
248 | The result part List (Int, String) pairs the position of each failed
249 | document in the input list (starting at 0) with its error message.
250 | Returns the index unchanged if all documents error when added.
251 | Otherwise returns the updated index after adding the documents.
252 | 
253 | -}
254 | addDocs : List doc -> Index doc -> ( Index doc, List ( Int, String ) )
255 | addDocs docs index =
256 |     Index.addDocs docs index
257 | 
258 | 
259 | {-| Remove a document from an index.
260 | 
261 | Starting with the ElmTextSearch.new example above this removes a document.
262 | 
263 |     removeDocFromIndexExample =
264 |         ElmTextSearch.remove
265 |             { cid = "123"
266 |             , title = "Examples of a Banana"
267 |             , author = "Sally Apples"
268 |             , body = "Sally writes words about a banana."
269 |             }
270 |             createNewIndexExample
271 | 
272 | Conditions that cause a result Err with message.
273 | 
274 |   - Error document has an empty unique id (ref).
275 |   - Error document is not in index.
276 | 
277 | Original function signature retained for backward compatibility.
278 | 
279 | -}
280 | remove : doc -> Index doc -> Result String (Index doc)
281 | remove doc index =
282 |     Index.remove doc index
283 | 
284 | 
285 | {-| Remove a document from an Index if no error conditions found.
286 | 
287 | Variant of `remove` that provides RemoveError type for error Results.
288 | 
289 | -}
290 | removeT : doc -> Index doc -> Result ElmTextSearchErrors.RemoveError (Index doc)
291 | removeT doc index =
292 |     Index.removeT doc index
293 | 
294 | 
295 | {-| Update a document in an index.
296 | 
297 | Starting with the ElmTextSearch.new example above this updates a document.
298 | 
299 |       updatedIndex =
300 |         ElmTextSearch.update
301 |           { cid = "123"
302 |           , title = "Examples of a Bananas in every day life."
303 |           , author = "Sally Apples"
304 |           , body = "Sally writes more words about a banana."
305 |           }
306 |           createNewIndexExample
307 | 
308 | Conditions that cause an error result are those for
309 | [`ElmTextSearch.remove`](ElmTextSearch#remove) and
310 | [`ElmTextSearch.add`](ElmTextSearch#add).
311 | 
312 | -}
313 | update : doc -> Index doc -> Result String (Index doc)
314 | update doc index =
315 |     Index.update doc index
316 | 
317 | 
318 | {-| Add or Update a document in an index.
319 | This removes the document first if it is already in index then adds it.
320 | -}
321 | addOrUpdate : doc -> Index doc -> Result String (Index doc)
322 | addOrUpdate doc index =
323 |     Index.addOrUpdate doc index
324 | 
325 | 
326 | {-| Search an index with query.
327 | 
328 | Tokens are extracted from the query string and passed through the
329 | same processing used when indexing documents.
330 | 
331 | Each token is expanded, so that the term "he" might be expanded to "hello"
332 | and "help" if those terms were already included in the document index.
333 | 
334 | Multiple tokens are allowed and will lead to an AND based query.
335 | 
336 | The following example runs a search for documents containing both "apple" and "banana".
337 | 
338 |     searchResult =
339 |         ElmTextSearch.search "Apple banana" createNewIndexExample
340 | 
341 | Results are a list of matching document reference identifiers with
342 | their similarity to query score, ordered by score descending, so the
343 | best matches are earliest in the list.
344 | 
345 | An index is returned from search as well. This is because the data model may
346 | be updated to improve performance for later searches.
347 | 
348 | Adding or removing a new document will cause some of the internal caching
349 | to be reset.
350 | 
351 | Conditions that cause a result Err with message.
352 | 
353 |   - Error there are no documents in index to search.
354 |   - Error query is empty.
355 |   - Error after tokenisation there are no terms to search for.
356 | 
357 | Original function signature retained for backward compatibility.
358 | 
359 | -}
360 | search :
361 |     String
362 |     -> Index doc
363 |     -> Result String ( Index doc, List ( String, Float ) )
364 | search query index =
365 |     Index.search query index
366 | 
367 | 
368 | {-| Search an index with query.
369 | 
370 | Variant of `search` that provides SearchError type for error Results.
371 | 
372 | -}
373 | searchT :
374 |     String
375 |     -> Index doc
376 |     -> Result ElmTextSearchErrors.SearchError ( Index doc, List ( String, Float ) )
377 | searchT query index =
378 |     Index.searchT query index
379 | 
380 | 
381 | {-| Encode an index as a JSON Value.
382 | You can also use [`ElmTextSearch.Json.Encoder`](ElmTextSearch.Json.Encoder).
383 | -}
384 | storeToValue : Index doc -> Encode.Value
385 | storeToValue index =
386 |     IndexEncoder.encoder index
387 | 
388 | 
389 | {-| Encode an index as a JSON String, written without indentation.
390 | You can also use [`ElmTextSearch.Json.Encoder`](ElmTextSearch.Json.Encoder).
391 | -}
392 | storeToString : Index doc -> String
393 | storeToString index =
394 |     index |> IndexEncoder.encoder |> Encode.encode 0
395 | 
396 | 
397 | {-| Create an Index from a String which has a stored Index in it, using the
398 | supplied basic configuration.
399 | See [`ElmTextSearch.fromStringWith`](ElmTextSearch#fromStringWith) for possible Err results.
400 | -}
401 | fromString : SimpleConfig doc -> String -> Result Decode.Error (Index doc)
402 | fromString simpleConfig inputString =
403 |     inputString
404 |         |> Index.Load.loadIndex
405 |             (Defaults.getIndexSimpleConfig simpleConfig)
406 | 
407 | 
408 | {-| Create an Index from a Value which has a stored Index in it, using the supplied basic configuration.
409 | See [`ElmTextSearch.fromStringWith`](ElmTextSearch#fromStringWith) for possible Err results.
410 | -}
411 | fromValue : SimpleConfig doc -> Decode.Value -> Result Decode.Error (Index doc)
412 | fromValue simpleConfig inputValue =
413 |     inputValue
414 |         |> Index.Load.loadIndexValue
415 |             (Defaults.getIndexSimpleConfig simpleConfig)
416 | 
417 | 
418 | {-| Create an Index from a String which has a stored Index in it.
419 | 
420 | If none of the indexType values in the list of Config match the index
421 | type being loaded it will return an Err.
422 | 
423 | The list of configurations will be searched for a matching indexType
424 | so you should provide configs for all types you may be trying to load.
425 | No more than the config that matches is required though.
426 | 
427 | If none of the supplied Config match the loaded Index then it will
428 | check whether the index being loaded matches the default version, and
429 | if so it will still load the index.
430 | 
431 | The following Err results may be returned.
432 | 
433 |   - "Error cannot load Index. Tried to load index of type "\_\_IndexTest Type -". It is not in supported index configurations."
434 |       - It contains the loaded version index type which comes from input.
435 |   - "Error cannot load Index. Version supported is 1.0.0. Version tried to load is 1.0.1."
436 |       - It includes both expected and loaded versions which may vary.
437 | 
438 | -}
439 | fromStringWith : List (Config doc) -> String -> Result Decode.Error (Index doc)
440 | fromStringWith configs inputString =
441 |     Index.Load.loadIndexWith configs inputString
442 | 
443 | 
444 | {-| Create an Index from a Value which has a stored Index in it.
445 | If none of the indexType values in the list of Config match the index
446 | being decoded it will return an Err.
447 | 
448 | See [`ElmTextSearch.fromStringWith`](ElmTextSearch#fromStringWith) for possible Err results.
449 | 
450 | -}
451 | fromValueWith : List (Config doc) -> Decode.Value -> Result Decode.Error (Index doc)
452 | fromValueWith configs inputValue =
453 |     Index.Load.loadIndexValueWith configs inputValue
454 | 


--------------------------------------------------------------------------------
/src/ElmTextSearch/Json/Decoder.elm:
--------------------------------------------------------------------------------
 1 | module ElmTextSearch.Json.Decoder exposing (decoder)
 2 | 
 3 | {-| Decoder for Index.
 4 | 
 5 | It decodes to a CodecIndexRecord.
 6 | 
 7 | @docs decoder
 8 | 
 9 | Copyright (c) 2016 Robin Luiten
10 | 
11 | -}
12 | 
13 | import Dict exposing (Dict)
14 | import Index.Model as Model
15 | import Json.Decode as Decode exposing (..)
16 | import Json.Decode.Pipeline exposing (required)
17 | import Set exposing (Set)
18 | import Trie.Json.Decoder as TrieDecoder
19 | 
20 | 
21 | {-| Decode the stored JSON form of an index into a CodecIndexRecord.
22 | -}
23 | decoder : Decoder Model.CodecIndexRecord
24 | decoder =
25 |     Decode.succeed Model.CodecIndexRecord
26 |         |> required "indexVersion" Decode.string
27 |         |> required "indexType" Decode.string
28 |         |> required "documentStore" documentStoreDecoder
29 |         |> required "corpusTokens" setDecoder
30 |         |> required "tokenStore" (TrieDecoder.decoder Decode.float)
31 | 
32 | 
33 | documentStoreDecoder : Decoder (Dict String (Set String))
34 | documentStoreDecoder =
35 |     Decode.dict setDecoder
36 | 
37 | 
38 | setDecoder : Decoder (Set String)
39 | setDecoder =
40 |     Decode.list Decode.string |> Decode.map Set.fromList
41 | 


--------------------------------------------------------------------------------
/src/ElmTextSearch/Json/Encoder.elm:
--------------------------------------------------------------------------------
 1 | module ElmTextSearch.Json.Encoder exposing
 2 |     ( encoder
 3 |     , codecIndexRecordEncoder
 4 |     )
 5 | 
 6 | {-| Encoder for Index.
 7 | 
 8 | @docs encoder
 9 | @docs codecIndexRecordEncoder
10 | 
11 | Copyright (c) 2016 Robin Luiten
12 | 
13 | -}
14 | 
15 | import Dict exposing (Dict)
16 | import Index
17 | import Index.Model as Model exposing (Index(..))
18 | import Json.Encode as Encode
19 | import Set exposing (Set)
20 | import Trie exposing (Trie)
21 | import Trie.Json.Encoder as TrieEncoder
22 | 
23 | 
24 | {-| Encoder for Index a.
25 | 
26 | Only encoding fields required to recreate a working index.
27 | 
28 | The following fields are not saved as they are restored via
29 | the provided Config on fromString.
30 | 
31 |   - ref
32 |   - fields
33 |   - transformFactories
34 |   - filterFactories
35 | 
36 | The following fields are not saved because they are an
37 | acceleration model, decoder needs to set it on fromString.
38 | 
39 |   - corpusTokensIndex
40 | 
41 | The following fields are not saved because they are caches
42 | and are populated as operation requires.
43 | 
44 |   - transforms
45 |   - filters
46 |   - idfCache
47 | 
48 | Do not need an (a -> Encode.Value) because a is a document
49 | type and that is never encoded from an Index.
50 | 
51 | -}
52 | encoder : Index doc -> Encode.Value
53 | encoder (Index indexRecord) =
54 |     { indexVersion = indexRecord.indexVersion
55 |     , indexType = indexRecord.indexType
56 |     , documentStore = indexRecord.documentStore
57 |     , corpusTokens = indexRecord.corpusTokens
58 |     , tokenStore = indexRecord.tokenStore
59 |     }
60 |         |> codecIndexRecordEncoder
61 | 
62 | 
63 | {-| Encode a CodecIndexRecord as a JSON object with one key per record field.
64 | -}
65 | codecIndexRecordEncoder : Model.CodecIndexRecord -> Encode.Value
66 | codecIndexRecordEncoder record =
67 |     Encode.object
68 |         [ ( "indexVersion", record.indexVersion |> Encode.string )
69 |         , ( "indexType", record.indexType |> Encode.string )
70 |         , ( "documentStore", record.documentStore |> documentStoreEncoder )
71 |         , ( "corpusTokens", record.corpusTokens |> corpusTokensEncoder )
72 |         , ( "tokenStore", record.tokenStore |> tokenStore )
73 |         ]
74 | 
75 | 
76 | documentStoreEncoder : Dict String (Set String) -> Encode.Value
77 | documentStoreEncoder dict =
78 |     let
79 |         encodeEntry ( docRef, tokens ) =
80 |             ( docRef, Encode.list Encode.string (Set.toList tokens) )
81 |     in
82 |     dict
83 |         |> Dict.toList
84 |         |> List.map encodeEntry
85 |         |> Encode.object
86 | 
87 | 
88 | corpusTokensEncoder : Set String -> Encode.Value
89 | corpusTokensEncoder setVal =
90 |     setVal |> Set.toList |> Encode.list Encode.string
91 | 
92 | tokenStore : Trie Float -> Encode.Value
93 | tokenStore trie =
94 |     TrieEncoder.encoder Encode.float trie
95 | 


--------------------------------------------------------------------------------
/src/ElmTextSearchErrors.elm:
--------------------------------------------------------------------------------
 1 | module ElmTextSearchErrors exposing
 2 |     ( AddError(..)
 3 |     , RemoveError(..)
 4 |     , SearchError(..)
 5 |     )
 6 | 
 7 | {-| Error types used in ElmTextSearch results.
 8 | 
 9 | 
10 | ## Types
11 | 
12 | @docs AddError
13 | @docs RemoveError
14 | @docs SearchError
15 | 
16 | -}
17 | 
18 | 
19 | {-| Used in error Result case of ElmTextSearch.addT
20 | -}
21 | type AddError
22 |     = AddErrorUniqueRefIsEmpty
23 |     | NoTermsToIndexAfterTokenisation
24 |     | DocAlreadyExists
25 | 
26 | 
27 | {-| Used in error Result case of ElmTextSearch.removeT
28 | -}
29 | type RemoveError
30 |     = RemoveErrorUniqueRefIsEmpty
31 |     | DocIsNotInIndex
32 | 
33 | 
34 | {-| Used in error Result case of ElmTextSearch.searchT
35 | -}
36 | type SearchError
37 |     = IndexIsEmpty
38 |     | QueryIsEmpty
39 |     | NoTermsToSearchAfterTokenisation
40 | 


--------------------------------------------------------------------------------
/src/Index.elm:
--------------------------------------------------------------------------------
  1 | module Index exposing
  2 |     ( new
  3 |     , newWith
  4 |     , add
  5 |     , addT
  6 |     , addDocs
  7 |     , remove
  8 |     , removeT
  9 |     , update
 10 |     , addOrUpdate
 11 |     , search
 12 |     , searchT
 13 |     , Index
 14 |     )
 15 | 
 16 | {-| Index module for full text indexer
 17 | 
 18 | Added addT, removeT and searchT functions that provide
 19 | a strong type for Error in the Result.
 20 | 
 21 | 
 22 | ## Create Index
 23 | 
 24 | @docs new
 25 | @docs newWith
 26 | 
 27 | 
 28 | ## Update Index
 29 | 
 30 | @docs add
 31 | @docs addT
 32 | @docs addDocs
 33 | @docs remove
 34 | @docs removeT
 35 | @docs update
 36 | @docs addOrUpdate
 37 | 
 38 | 
 39 | ## Query Index
 40 | 
 41 | @docs search
 42 | @docs searchT
 43 | 
 44 | 
 45 | ## Types
 46 | 
 47 | @docs Index
 48 | 
 49 | Copyright (c) 2016 Robin Luiten
 50 | 
 51 | -}
 52 | 
 53 | import Dict
 54 | import ElmTextSearchErrors exposing (AddError(..), RemoveError(..), SearchError(..))
 55 | import Index.Defaults as Defaults
 56 | import Index.Model as Model exposing (Index(..))
 57 | import Index.Utils
 58 | import Index.Vector exposing (..)
 59 | import Maybe
 60 | import Set exposing (Set)
 61 | import String
 62 | import Trie
 63 | import Utils
 64 | 
 65 | 
 66 | type alias Index doc =
 67 |     Model.Index doc
 68 | 
 69 | -- Module-local shorthand for `Model.Config`.
 70 | type alias Config doc =
 71 |     Model.Config doc
 72 | 
 73 | -- Module-local shorthand for `Model.ModelSimpleConfig`.
 74 | type alias SimpleConfig doc =
 75 |     Model.ModelSimpleConfig doc
 76 | 
 77 | 
 78 | {-| Create new index from a simple config, expanded via `Defaults.getDefaultIndexConfig`.
 79 | -}
 80 | new : SimpleConfig doc -> Index doc
 81 | new simpleConfig =
 82 |     Defaults.getDefaultIndexConfig simpleConfig
 83 |         |> newWith
 84 | 
 85 | 
 86 | {-| Create an empty index from a full Config, giving control of transforms and filters.
 87 | -}
 88 | newWith : Config doc -> Index doc
 89 | newWith config =
 90 |     Index
 91 |         { indexVersion = Defaults.indexVersion
 92 |         , indexType = config.indexType
 93 |         , ref = config.ref
 94 |         , fields = config.fields
 95 |         , listFields = config.listFields
 96 |         , initialTransformFactories = config.initialTransformFactories
 97 |         , transformFactories = config.transformFactories
 98 |         , filterFactories = config.filterFactories
 99 |         , initialTransforms = Nothing
100 |         , transforms = Nothing
101 |         , filters = Nothing
102 |         , corpusTokens = Set.empty
103 |         , corpusTokensIndex = Dict.empty
104 |         , documentStore = Dict.empty
105 |         , tokenStore = Trie.empty
106 |         , idfCache = Dict.empty
107 |         }
108 | 
109 | 
110 | {-| Add document to an Index if no error conditions found.
111 | See ElmTextSearch documentation for `add` to see error conditions.
112 | 
113 | Original function signature retained for backward compatibility;
114 | it reports each `addT` error as a String message.
115 | -}
116 | add : doc -> Index doc -> Result String (Index doc)
117 | add doc index =
118 |     let
119 |         describe error =
120 |             case error of
121 |                 AddErrorUniqueRefIsEmpty ->
122 |                     "Error document has an empty unique id (ref)."
123 | 
124 |                 DocAlreadyExists ->
125 |                     "Error adding document that allready exists."
126 | 
127 |                 NoTermsToIndexAfterTokenisation ->
128 |                     "Error after tokenisation there are no terms to index."
129 |     in
130 |     addT doc index
131 |         |> Result.mapError describe
132 | 
133 | 
134 | {-| Add document to an Index if no error conditions found.
135 | 
136 | Variant of `add` that reports failures as an AddError value.
137 | 
138 | See ElmTextSearch documentation for `add` to see error conditions.
139 | 
140 | -}
141 | addT : doc -> Index doc -> Result AddError (Index doc)
142 | addT doc ((Index irec) as index) =
143 |     let
144 |         docRef =
145 |             irec.ref doc
146 |     in
147 |     if String.isEmpty docRef then
148 |         Err AddErrorUniqueRefIsEmpty
149 | 
150 |     else if Index.Utils.refExists docRef index then
151 |         Err DocAlreadyExists
152 | 
153 |     else
154 |         let
155 |             -- Tokens of the String fields, threading the index through.
156 |             ( fieldsIndex, fieldTokens ) =
157 |                 irec.fields
158 |                     |> List.foldr (getWordsForField doc) ( index, [] )
159 | 
160 |             -- Tokens of the List String fields, added to the result above.
161 |             ( tokenisedIndex, allFieldTokens ) =
162 |                 irec.listFields
163 |                     |> List.foldr (getWordsForFieldList doc) ( fieldsIndex, fieldTokens )
164 | 
165 |             -- Union of the token sets of all fields.
166 |             docTokens =
167 |                 allFieldTokens
168 |                     |> List.map Tuple.first
169 |                     |> List.foldr Set.union Set.empty
170 |         in
171 |         if Set.isEmpty docTokens then
172 |             Err NoTermsToIndexAfterTokenisation
173 | 
174 |         else
175 |             Ok (addDoc docRef allFieldTokens docTokens tokenisedIndex)
176 | 
177 | 
178 | {-| Add multiple documents. Tries to add all docs and collects errors.
179 | It does not stop adding at first error encountered.
180 | 
181 | The result part List (Int, String) pairs the position of each failed
182 | document in `docs` (counted from 0) with the error message from `add`.
183 | Returns the index unchanged if all documents error when added.
184 | Otherwise returns the updated index after adding the documents.
185 | 
186 | -}
187 | addDocs : List doc -> Index doc -> ( Index doc, List ( Int, String ) )
188 | addDocs docs index =
189 |     addDocsCore 0 docs index []
190 | 
191 | 
192 | {-| Add docs in order, numbering them from docsI, and collect the failures.
193 | New ( position, message ) pairs are appended, in order, after `errors`.
194 | -}
195 | addDocsCore : Int -> List doc -> Index doc -> List ( Int, String ) -> ( Index doc, List ( Int, String ) )
196 | addDocsCore docsI docs index errors =
197 |     let
198 |         step doc ( docI, accIndex, newErrors ) =
199 |             case add doc accIndex of
200 |                 Ok u1index ->
201 |                     ( docI + 1, u1index, newErrors )
202 | 
203 |                 Err msg ->
204 |                     ( docI + 1, accIndex, ( docI, msg ) :: newErrors )
205 | 
206 |         ( _, finalIndex, reversedErrors ) =
207 |             List.foldl step ( docsI, index, [] ) docs
208 |     in
209 |     ( finalIndex, errors ++ List.reverse reversedErrors )
210 | 
211 | 
212 | {-| Fold step that tokenises one String field of doc and adds its token set to the accumulator.
213 | -}
214 | getWordsForField :
215 |     doc
216 |     -> ( doc -> String, Float )
217 |     -> ( Index doc, List ( Set String, Float ) )
218 |     -> ( Index doc, List ( Set String, Float ) )
219 | getWordsForField doc ( getField, fieldBoost ) ( index, fieldsLists ) =
220 |     let
221 |         -- Pair the field's token set with its boost and prepend it.
222 |         prependField tokens =
223 |             ( Set.fromList tokens, fieldBoost ) :: fieldsLists
224 |     in
225 |     Tuple.mapSecond prependField (Index.Utils.getTokens index (getField doc))
226 | 
227 | 
228 | {-| Fold step that tokenises one List String field of doc and adds its token set to the accumulator.
229 | -}
230 | getWordsForFieldList :
231 |     doc
232 |     -> ( doc -> List String, Float )
233 |     -> ( Index doc, List ( Set String, Float ) )
234 |     -> ( Index doc, List ( Set String, Float ) )
235 | getWordsForFieldList doc ( getFieldList, fieldBoost ) ( index, fieldsLists ) =
236 |     let
237 |         prependField tokens =
238 |             ( Set.fromList tokens, fieldBoost ) :: fieldsLists
239 |     in
240 |     Tuple.mapSecond prependField (Index.Utils.getTokensList index (getFieldList doc))
241 | 
242 | 
243 | {-| Add the document's tokens and scores to the index stores and reset idfCache.
244 | -}
245 | addDoc : String -> List ( Set String, Float ) -> Set String -> Index doc -> Index doc
246 | addDoc docRef fieldTokensAndBoosts docTokens (Index irec) =
247 |     let
248 |         -- Store docRef and the token's score under that token in the trie.
249 |         insertScored token trie =
250 |             let
251 |                 ( _, score ) =
252 |                     scoreToken fieldTokensAndBoosts token
253 |             in
254 |             Trie.add ( docRef, score ) token trie
255 | 
256 |         newTokenStore =
257 |             Set.foldr insertScored irec.tokenStore docTokens
258 | 
259 |         newCorpusTokens =
260 |             Set.union irec.corpusTokens docTokens
261 | 
262 |         -- Rebuilt from the whole corpus token set on every add.
263 |         newCorpusTokensIndex =
264 |             Index.Utils.buildOrderIndex newCorpusTokens
265 | 
266 |         newDocumentStore =
267 |             Dict.insert docRef docTokens irec.documentStore
268 |     in
269 |     Index
270 |         { irec
271 |             | documentStore = newDocumentStore
272 |             , corpusTokens = newCorpusTokens
273 |             , corpusTokensIndex = newCorpusTokensIndex
274 |             , tokenStore = newTokenStore
275 |             , idfCache = Dict.empty
276 |         }
277 | 
278 | 
279 | {-| Return term frequency score for a token in document.
280 | 
281 | The score is the sum, over every field whose token set contains the token,
282 | of that field's boost divided by the number of distinct tokens in the field.
283 | Fields that do not contain the token add nothing to the score.
284 | The token is returned paired with that score.
285 | 
286 | -}
287 | scoreToken : List ( Set String, Float ) -> String -> ( String, Float )
288 | scoreToken fieldTokensAndBoost token =
289 |     let
290 |         -- Fold step adding one field's contribution to the running total.
291 |         addFieldScore : ( Set String, Float ) -> Float -> Float
292 |         addFieldScore ( tokenSet, fieldBoost ) total =
293 |             if Set.member token tokenSet then
294 |                 let
295 |                     share =
296 |                         fieldBoost / toFloat (Set.size tokenSet)
297 |                 in
298 |                 total + share
299 | 
300 |             else
301 |                 -- Covers the empty token set too: it has no members.
302 |                 total
303 |     in
304 |     ( token, List.foldr addFieldScore 0 fieldTokensAndBoost )
305 | 
306 | 
307 | {-| Remove document from an Index if no error result conditions encountered.
308 | 
309 | Original function signature retained for backward compatibility.
310 | 
311 | See ElmTextSearch documentation for `remove` to see error result conditions.
312 | 
313 | This does the following things
314 | 
315 |   - Remove the document tags from documentStore.
316 |   - Remove all the document references in tokenStore.
317 |   - It does not modify corpusTokens - as this requires
318 |     reprocessing tokens for all documents to recreate corpusTokens.
319 |       - This may skew the results over time after many removes but not badly.
320 |       - It appears lunr.js operates this way as well for remove.
321 | 
322 | -}
323 | remove : doc -> Index doc -> Result String (Index doc)
324 | remove doc index =
325 |     let
326 |         describe err =
327 |             case err of
328 |                 DocIsNotInIndex ->
329 |                     -- Use the shared constant so this text is defined once.
330 |                     errorMessageNotIndex
331 | 
332 |                 RemoveErrorUniqueRefIsEmpty ->
333 |                     "Error document has an empty unique id (ref)."
334 |     in
335 |     Result.mapError describe (removeT doc index)
336 | 
337 | 
338 | {-| Remove document from an Index if no error result conditions encountered.
339 | 
340 | Variant of `remove` that reports failures as a RemoveError value.
341 | 
342 | See ElmTextSearch documentation for `remove` to see error result conditions.
343 | 
344 | This does the following things
345 | 
346 |   - Remove the document tags from documentStore.
347 |   - Remove all the document references in tokenStore.
348 |   - It does not modify corpusTokens - as this requires
349 |     reprocessing tokens for all documents to recreate corpusTokens.
350 |       - This may skew the results over time after many removes but not badly.
351 |       - It appears lunr.js operates this way as well for remove.
352 | 
353 | -}
354 | removeT : doc -> Index doc -> Result RemoveError (Index doc)
355 | removeT doc ((Index irec) as index) =
356 |     let
357 |         docRef =
358 |             irec.ref doc
359 |     in
360 |     if String.isEmpty docRef then
361 |         Err RemoveErrorUniqueRefIsEmpty
362 | 
363 |     else if Index.Utils.refExists docRef index then
364 |         case Dict.get docRef irec.documentStore of
365 |             Just docTokens ->
366 |                 Ok (removeDoc docRef index docTokens)
367 | 
368 |             Nothing ->
369 |                 Ok index
370 | 
371 |     else
372 |         Err DocIsNotInIndex
373 | 
374 | -- Error text reported when a document is not in the index.
375 | errorMessageNotIndex : String
376 | errorMessageNotIndex =
377 |     "Error document is not in index."
378 | 
379 | 
380 | {-| Remove the doc by docRef id from the index.
381 | The document's entry in documentStore and its references in tokenStore are
382 | removed and idfCache is reset; corpusTokens is left unchanged.
383 | -}
384 | removeDoc : String -> Index doc -> Set String -> Index doc
385 | removeDoc docRef (Index irec) docTokens =
386 |     let
387 |         -- Drop docRef from the trie entry of a single token.
388 |         dropRef token trie =
389 |             Trie.remove token docRef trie
390 | 
391 |         prunedTokenStore =
392 |             Set.foldr dropRef irec.tokenStore docTokens
393 |     in
394 |     Index
395 |         { irec
396 |             | documentStore = Dict.remove docRef irec.documentStore
397 |             , tokenStore = prunedTokenStore
398 |             , idfCache = Dict.empty
399 |         }
400 | 
401 | 
402 | {-| Update document in Index by removing it and then adding it again.
403 | If the remove fails its error is returned and the add is not attempted.
404 | See ElmTextSearch documentation for `add` and `remove` to see error result conditions.
405 | -}
406 | update : doc -> Index doc -> Result String (Index doc)
407 | update doc index =
408 |     Result.andThen (add doc) (remove doc index)
409 | 
410 | 
411 | {-| Add or Update document in Index.
412 | This does an add if document is not in index, otherwise a remove then an add.
413 | -}
414 | addOrUpdate : doc -> Index doc -> Result String (Index doc)
415 | addOrUpdate doc index =
416 |     -- Branch on the typed RemoveError instead of comparing message text.
417 |     case removeT doc index of
418 |         Ok u1index ->
419 |             add doc u1index
420 | 
421 |         Err DocIsNotInIndex ->
422 |             add doc index
423 | 
424 |         Err RemoveErrorUniqueRefIsEmpty ->
425 |             Err "Error document has an empty unique id (ref)."
426 | 
427 | 
428 | {-| Search index with query.
429 | See ElmTextSearch documentation for `search` to see error result conditions.
430 | 
431 | Original function signature retained for backward compatibility.
432 | 
433 | -}
434 | search : String -> Index doc -> Result String ( Index doc, List ( String, Float ) )
435 | search query index =
436 |     let
437 |         -- Message text for each SearchError case.
438 |         describe error =
439 |             case error of
440 |                 IndexIsEmpty ->
441 |                     "Error there are no documents in index to search."
442 | 
443 |                 QueryIsEmpty ->
444 |                     "Error query is empty."
445 | 
446 |                 NoTermsToSearchAfterTokenisation ->
447 |                     "Error after tokenisation there are no terms to search for."
448 |     in
449 |     searchT query index
450 |         |> Result.mapError describe
451 | 
452 | 
453 | {-| Search index with query.
454 | See ElmTextSearch documentation for `search` to see error result conditions.
455 | 
456 | Variant that supports SearchError type for Result. Returns an empty result
457 | list, not an error, when no query token is found in the token store.
458 | -}
459 | searchT : String -> Index doc -> Result SearchError ( Index doc, List ( String, Float ) )
460 | searchT query index =
461 |     let
462 |         ( (Index i1irec) as i1index, tokens ) =
463 |             Index.Utils.getTokens index query
464 | 
465 |         tokenInStore token =
466 |             Trie.getNode token i1irec.tokenStore /= Nothing
467 |     in
468 |     if Dict.isEmpty i1irec.documentStore then
469 |         Err IndexIsEmpty
470 | 
471 |     else if String.isEmpty (String.trim query) then
472 |         Err QueryIsEmpty
473 | 
474 |     else if List.isEmpty tokens then
475 |         Err NoTermsToSearchAfterTokenisation
476 | 
477 |     else if not (List.any tokenInStore tokens) then
478 |         Ok ( i1index, [] )
479 | 
480 |     else
481 |         Ok (searchTokens tokens i1index)
482 | 
483 | 
484 | {-| Return list of document ref's with score, ordered by score descending.
485 | 
486 | This had a bug: it used "fields" boosts but did not use "listFields" for all fields indexed.
487 | This meant that if you only indexed with listFields that fieldsBoosts would be zero and
488 | resultant score would end up NaN.
489 | 
490 | In addition a second problem was that it makes little to no sense to scale query vector
491 | by average of all fields boost as it does not change the relative score document matches.
492 | So removing boost on queries is a simpler solution than including "listFields" into boosts.
493 | 
494 | -}
495 | searchTokens :
496 |     List String
497 |     -> Index doc
498 |     -> ( Index doc, List ( String, Float ) )
499 | searchTokens tokens index =
500 |     let
501 |         ( tokenDocSets, queryVector, vectorIndex ) =
502 |             Index.Vector.getQueryVector tokens index
503 | 
504 |         -- Candidates are the intersection of the per-token document sets.
505 |         candidateRefs =
506 |             Utils.intersectSets tokenDocSets
507 | 
508 |         ( scoredIndex, matchedDocs ) =
509 |             Set.foldr (scoreAndCompare queryVector) ( vectorIndex, [] ) candidateRefs
510 |     in
511 |     matchedDocs
512 |         |> List.sortBy Tuple.second
513 |         |> List.reverse
514 |         |> Tuple.pair scoredIndex
515 | 


--------------------------------------------------------------------------------
/src/Index/Defaults.elm:
--------------------------------------------------------------------------------
  1 | module Index.Defaults exposing
  2 |     ( indexVersion
  3 |     , elmTextSearchIndexType
  4 |     , defaultTransformFactories
  5 |     , defaultFilterFactories
  6 |     , defaultTokenTrimmerFuncCreator
  7 |     , defaultStemmerFuncCreator
  8 |     , defaultStopWordFilterFuncCreator
  9 |     , defaultInitialTransformFactories
 10 |     , getDefaultIndexConfig
 11 |     , getIndexSimpleConfig
 12 |     )
 13 | 
 14 | {-| Defaults for indexes and configurations.
 15 | 
 16 | 
 17 | ## Index Storage Engine Version and Type
 18 | 
 19 | @docs indexVersion
 20 | @docs elmTextSearchIndexType
 21 | 
 22 | 
 23 | ## Built in Transforms and Filters
 24 | 
 25 | @docs defaultTransformFactories
 26 | @docs defaultFilterFactories
 27 | @docs defaultTokenTrimmerFuncCreator
 28 | @docs defaultStemmerFuncCreator
 29 | @docs defaultStopWordFilterFuncCreator
 30 | @docs defaultInitialTransformFactories
 31 | 
 32 | 
 33 | ## Config type adapters
 34 | 
 35 | @docs getDefaultIndexConfig
 36 | @docs getIndexSimpleConfig
 37 | 
 38 | Copyright (c) 2016 Robin Luiten
 39 | 
 40 | -}
 41 | 
 42 | import Index.Model as Model
 43 |     exposing
 44 |         ( FilterFactory
 45 |         , IndexSimpleConfig
 46 |         , TransformFactory
 47 |         )
 48 | import Index.Utils
 49 | import Stemmer
 50 | import StopWordFilter
 51 | import TokenProcessors
 52 | 
 53 | 
 54 | {-| The version of the encoded index format, checked when loading a saved index.
 55 | 
 56 | This is not the same as the package version.
 57 | 
 58 | This needs to change if the encoded format changes. Be careful of updates to
 59 | the Trie package: if the Trie encoding format changes this version needs to
 60 | change as well.
 61 | 
 62 | -}
 63 | indexVersion : String
 64 | indexVersion =
 65 |     "1.1.0"
 66 | 
 67 | 
 68 | {-| The index type that `getIndexSimpleConfig` gives to indexes using the defaults.
 69 | It identifies the default token transforms and filters.
 70 | -}
 71 | elmTextSearchIndexType : String
 72 | elmTextSearchIndexType =
 73 |     "-= ElmTextSearch Index Type 1 =-"
 74 | 
 75 | 
 76 | {-| Index default transform factories, currently only the stemmer.
 77 | -}
 78 | defaultTransformFactories : List (TransformFactory doc)
 79 | defaultTransformFactories =
 80 |     [ defaultStemmerFuncCreator
 81 |     ]
 82 | 
 83 | 
 84 | {-| Index default transform factories that apply before filters, currently only the token trimmer.
 85 | -}
 86 | defaultInitialTransformFactories : List (TransformFactory doc)
 87 | defaultInitialTransformFactories =
 88 |     [ defaultTokenTrimmerFuncCreator
 89 |     ]
 90 | 
 91 | 
 92 | {-| Index default filter factories, currently only the stop word filter.
 93 | -}
 94 | defaultFilterFactories : List (FilterFactory doc)
 95 | defaultFilterFactories =
 96 |     [ defaultStopWordFilterFuncCreator
 97 |     ]
 98 | 
 99 | 
100 | {-| The default token trimmer transform function creator, built from
101 | `TokenProcessors.trimmer`. Normally applied first in transform functions.
102 | -}
103 | defaultTokenTrimmerFuncCreator : TransformFactory doc
104 | defaultTokenTrimmerFuncCreator =
105 |     Index.Utils.createFuncCreator TokenProcessors.trimmer
106 | 
107 | 
108 | {-| The default token stemmer transform function creator, built from `Stemmer.stem`.
109 | -}
110 | defaultStemmerFuncCreator : TransformFactory doc
111 | defaultStemmerFuncCreator =
112 |     Index.Utils.createFuncCreator Stemmer.stem
113 | 
114 | 
115 | {-| The default stop word filter function creator, `StopWordFilter.createDefaultFilterFunc`.
116 | -}
117 | defaultStopWordFilterFuncCreator : FilterFactory doc
118 | defaultStopWordFilterFuncCreator =
119 |     StopWordFilter.createDefaultFilterFunc
120 | 
121 | 
122 | {-| Expand an `Index.Model.ModelSimpleConfig` into a full `Index.Model.Config`.
123 | The factory lists missing from the simple config are filled in with the
124 | defaults of this module. This defines the default index configuration.
125 | -}
126 | getDefaultIndexConfig : Model.ModelSimpleConfig doc -> Model.Config doc
127 | getDefaultIndexConfig simpleConfig =
128 |     { indexType = simpleConfig.indexType
129 |     , ref = simpleConfig.ref
130 |     , fields = simpleConfig.fields
131 |     , listFields = simpleConfig.listFields
132 |     , initialTransformFactories = defaultInitialTransformFactories
133 |     , transformFactories = defaultTransformFactories
134 |     , filterFactories = defaultFilterFactories
135 |     }
136 | 
137 | 
138 | {-| Convert an `IndexSimpleConfig` to `Index.Model.ModelSimpleConfig` by adding the default index type.
139 | -}
140 | getIndexSimpleConfig : IndexSimpleConfig doc -> Model.ModelSimpleConfig doc
141 | getIndexSimpleConfig simpleConfig =
142 |     { indexType = elmTextSearchIndexType
143 |     , ref = simpleConfig.ref
144 |     , fields = simpleConfig.fields
145 |     , listFields = simpleConfig.listFields
146 |     }
147 | 


--------------------------------------------------------------------------------
/src/Index/Load.elm:
--------------------------------------------------------------------------------
  1 | module Index.Load exposing (errorPrefix, loadIndex, loadIndexValue, loadIndexValueWith, loadIndexWith)
  2 | 
  3 | {-| Load an index from Value or String
  4 | 
  5 | Copyright (c) 2016 Robin Luiten
  6 | 
  7 | -}
  8 | 
  9 | import Dict
 10 | import ElmTextSearch.Json.Decoder as IndexDecoder
 11 | import Index.Defaults as Defaults
 12 | import Index.Model exposing (..)
 13 | import Index.Utils
 14 | import Json.Decode as Decode
 15 | 
 16 | {-| Common prefix of the load failure messages built in this module. -}
 17 | errorPrefix : String
 18 | errorPrefix =
 19 |     "Error cannot load Index."
 20 | 
 21 | 
 22 | {-| Decode an index from a JSON string using one of the provided configs.
 23 | 
 24 | The configurations supplied are checked in the order provided in the
 25 | list, so the earliest one that matches the encoded indexType is used.
 26 | List the index configs you support first, then the default just in case.
 27 | Decoding fails if the index version differs or no config matches.
 28 | 
 29 | -}
 30 | loadIndexWith : List (Config doc) -> String -> Result Decode.Error (Index doc)
 31 | loadIndexWith supportedIndexConfigs inputString =
 32 |     let
 33 |         indexDecoder =
 34 |             Decode.andThen (mapIndexConfig supportedIndexConfigs) IndexDecoder.decoder
 35 |                 |> Decode.andThen createIndex
 36 |     in
 37 |     Decode.decodeString indexDecoder inputString
 38 | 
 39 | 
 40 | mapIndexConfig : List (Config doc) -> CodecIndexRecord -> Decode.Decoder ( Config doc, CodecIndexRecord )
 41 | mapIndexConfig supportedIndexConfigs index =
 42 |     let
 43 |         firstMatch =
 44 |             List.head (List.filter (\cfg -> cfg.indexType == index.indexType) supportedIndexConfigs)
 45 | 
 46 |         versionError =
 47 |             errorPrefix
 48 |                 ++ " Version supported is "
 49 |                 ++ Defaults.indexVersion
 50 |                 ++ ". Version tried to load is "
 51 |                 ++ index.indexVersion
 52 |                 ++ "."
 53 | 
 54 |         typeError =
 55 |             errorPrefix
 56 |                 ++ " Tried to load index of type \""
 57 |                 ++ index.indexType
 58 |                 ++ "\". It is not in supported index configurations."
 59 |     in
 60 |     if Defaults.indexVersion /= index.indexVersion then
 61 |         Decode.fail versionError
 62 | 
 63 |     else
 64 |         case firstMatch of
 65 |             Just matchedConfig ->
 66 |                 Decode.succeed ( matchedConfig, index )
 67 | 
 68 |             Nothing ->
 69 |                 Decode.fail typeError
 70 | 
 71 | 
 72 | loadIndexValueWith : List (Config doc) -> Decode.Value -> Result Decode.Error (Index doc)
 73 | loadIndexValueWith supportedIndexConfigs inputValue =
 74 |     let
 75 |         indexDecoder =
 76 |             Decode.andThen (mapIndexConfig supportedIndexConfigs) IndexDecoder.decoder
 77 |                 |> Decode.andThen createIndex
 78 |     in
 79 |     Decode.decodeValue indexDecoder inputValue
 80 | 
 81 | 
 82 | createIndex : ( Config doc, CodecIndexRecord ) -> Decode.Decoder (Index doc)
 83 | createIndex ( config, decodedIndex ) =
 84 |     Decode.succeed
 85 |         (Index
 86 |             { indexVersion = decodedIndex.indexVersion
 87 |             , indexType = decodedIndex.indexType
 88 |             , documentStore = decodedIndex.documentStore
 89 |             , corpusTokens = decodedIndex.corpusTokens
 90 |             , tokenStore = decodedIndex.tokenStore
 91 |             , corpusTokensIndex = Index.Utils.buildOrderIndex decodedIndex.corpusTokens
 92 |             , ref = config.ref
 93 |             , fields = config.fields
 94 |             , listFields = config.listFields
 95 |             , initialTransformFactories = config.initialTransformFactories
 96 |             , transformFactories = config.transformFactories
 97 |             , filterFactories = config.filterFactories
 98 |             , initialTransforms = Nothing
 99 |             , transforms = Nothing
100 |             , filters = Nothing
101 |             , idfCache = Dict.empty
102 |             }
103 |         )
104 | 
105 | 
106 | loadIndex : ModelSimpleConfig doc -> String -> Result Decode.Error (Index doc)
107 | loadIndex simpleConfig inputString =
108 |     inputString
109 |         |> loadIndexWith
110 |             [ Defaults.getDefaultIndexConfig simpleConfig ]
111 | 
112 | 
113 | loadIndexValue : ModelSimpleConfig doc -> Decode.Value -> Result Decode.Error (Index doc)
114 | loadIndexValue simpleConfig inputValue =
115 |     inputValue
116 |         |> loadIndexValueWith
117 |             [ Defaults.getDefaultIndexConfig simpleConfig ]
118 | 


--------------------------------------------------------------------------------
/src/Index/Model.elm:
--------------------------------------------------------------------------------
  1 | module Index.Model exposing
  2 |     ( Index(..)
  3 |     , IndexSimpleConfig
  4 |     , CodecIndexRecord, Config, FilterFactory, FilterFunc, FuncFactory, ModelSimpleConfig, TransformFactory, TransformFunc, TransformFunc2
  5 |     )
  6 | 
  7 | {-| Define the Index Model
  8 | 
  9 | @docs Index
 10 | @docs IndexSimpleConfig
 11 | @docs Config
 12 | 
 13 | Copyright (c) 2016 Robin Luiten
 14 | 
 15 | -}
 16 | 
 17 | import Dict exposing (Dict)
 18 | import Set exposing (Set)
 19 | import Trie exposing (Trie)
 20 | 
 21 | 
 22 | {-| A factory that takes an index and returns it, possibly updated, with the func it creates.
 23 | -}
 24 | type alias FuncFactory doc func =
 25 |     Index doc -> ( Index doc, func )
 26 | 
 27 | {-| A token transform; Index.Utils treats an empty string result as no token. -}
 28 | type alias TransformFunc =
 29 |     String -> String
 30 | 
 31 | 
 32 | {-| Variant of TransformFunc that makes composition nicer in code.
 33 | The old version used "" to mean no token, which is not right;
 34 | this one uses Maybe instead, which makes composition a lot better.
 35 | -}
 36 | type alias TransformFunc2 =
 37 |     String -> Maybe String
 38 | 
 39 | {-| Factory of a `String -> String` transform; same shape as `FuncFactory doc TransformFunc`. -}
 40 | type alias TransformFactory doc =
 41 |     Index doc -> ( Index doc, String -> String )
 42 | 
 43 | {-| A token filter predicate; Index.Utils drops a token when the result is False. -}
 44 | type alias FilterFunc =
 45 |     String -> Bool
 46 | 
 47 | {-| Factory of a `String -> Bool` filter; same shape as `FuncFactory doc FilterFunc`. -}
 48 | type alias FilterFactory doc =
 49 |     Index doc -> ( Index doc, String -> Bool )
 50 | 
 51 | 
 52 | {-| Index is a full text index for a document type.
 53 | 
 54 | The internal data model of Index
 55 | 
 56 |   - indexType
 57 |       - a string that can be used on load to provide the correct set
 58 | 
 59 |   - indexVersion
 60 |       - a version string
 61 | 
 62 |   - ref
 63 |       - how to get at unique id of documents added
 64 | 
 65 |   - fields
 66 |       - list of fields of type String to index from document
 67 |           - first field is function to get String content of field
 68 |           - second field Float is a boost to text frequency of tokens in this field
 69 | 
 70 |   - listFields
 71 |       - list of fields of type List String to index from document
 72 |           - first field is function to get List String content of field
 73 |           - second field Float is a boost to text frequency of tokens in this field
 74 | 
 75 |   - initialTransformFactories
 76 |       - list of factory functions to create transform functions
 77 |       - this list of transforms is applied before filters
 78 |       - the ones in `transformFactories` are applied after filters
 79 | 
 80 |   - transformFactories
 81 |       - list of factory functions to create transform functions
 82 | 
 83 |   - filterFactories
 84 |       - list of factory functions to create filter functions
 85 | 
 86 |   - initialTransforms, transforms
 87 |       - the initial transforms and the transforms in index token processing
 88 |       - lazy populated from initialTransformFactories and transformFactories
 89 | 
 90 |   - filters
 91 |       - the filters in index token processing
 92 |       - lazy populated from filterFactories
 93 | 
 94 |   - documentStore
 95 |       - contains dictionary of document ref to Set of document tokens
 96 | 
 97 |   - corpusTokens
 98 |       - Set of all indexed tokens from all documentStore
 99 | 
100 |   - corpusTokensIndex
101 |       - to get the position of a token in the order list of corpusTokens
102 | 
103 |   - tokenStore
104 |       - tokenStore is used for efficient storing and lookup of the
105 |         reverse index of token to document ref and holding the
106 |         token term frequency
107 | 
108 |   - idfCache
109 |       - cached idf (inverse document frequency scores)
110 |       - cache is reset (cleared) if any document is added removed or updated in index
111 | 
112 | -}
113 | type Index doc
114 |     = Index (IndexRecord doc) -- the single constructor, wrapping the index record
115 | 
116 | 
117 | {-| The record wrapped by `Index`; the fields are described on `Index`.
118 | -}
119 | type alias IndexRecord doc =
120 |     { indexVersion : String
121 |     , indexType : String
122 |     , ref : doc -> String
123 |     , fields : List ( doc -> String, Float )
124 |     , listFields : List ( doc -> List String, Float )
125 |     , initialTransformFactories : List (TransformFactory doc)
126 |     , transformFactories : List (TransformFactory doc)
127 |     , filterFactories : List (FilterFactory doc)
128 |     , documentStore : Dict String (Set String)
129 |     , corpusTokens : Set String
130 |     , tokenStore : Trie Float
131 |     , corpusTokensIndex : Dict String Int
132 |     , initialTransforms : Maybe (List TransformFunc2) -- Nothing until built from the factories
133 |     , transforms : Maybe (List TransformFunc2) -- Nothing until built from the factories
134 |     , filters : Maybe (List TransformFunc2) -- filters are stored adapted to TransformFunc2
135 |     , idfCache : Dict String Float
136 |     }
137 | 
138 | 
139 | {-| Simple index config with default token processing.
140 | 
141 | Simple still requires configuring the fields for your document type.
142 | See [`ElmTextSearch.SimpleConfig`](ElmTextSearch#SimpleConfig)
143 | for explanations of `ref`, `fields` and `listFields` fields.
144 | 
145 |   - ElmTextSearch.SimpleConfig does not include `indexType`.
146 |       - In this case the user is getting the ElmTextSearch default token processing.
147 |   - Index.SimpleConfig includes `indexType`.
148 | 
149 | `indexType` is an identifier used to determine the transforms and filters the
150 | index uses for operation. It should be unique for all possible differently
151 | configured indexes you plan to use.
152 | 
153 | 
154 | ### The default transform factories.
155 | 
156 |     Index.Defaults.defaultTransformFactories
157 | 
158 | 
159 | ### The default filter factories.
160 | 
161 |     Index.Defaults.defaultFilterFactories
162 | 
163 | -}
164 | type alias ModelSimpleConfig doc =
165 |     { indexType : String -- identifies the transforms and filters the index uses
166 |     , ref : doc -> String -- how to get at the unique id of a document
167 |     , fields : List ( doc -> String, Float ) -- String field getters, each with a boost
168 |     , listFields : List ( doc -> List String, Float ) -- List String field getters, each with a boost
169 |     }
170 | 
171 | 
172 | {-| Index config with customized token processing.
173 | 
174 | If no supplied configuration matches the `indexType` of an index being
175 | loaded you will get an Err Result returned.
176 | 
177 | -}
178 | type alias Config doc =
179 |     { indexType : String
180 |     , ref : doc -> String
181 |     , fields : List ( doc -> String, Float )
182 |     , listFields : List ( doc -> List String, Float )
183 |     , initialTransformFactories : List (TransformFactory doc)
184 |     , transformFactories : List (TransformFactory doc)
185 |     , filterFactories : List (FilterFactory doc)
186 |     }
187 | 
188 | 
189 | {-| Just the fields encoded for an Index; on load the remaining fields come from a Config.
190 | -}
191 | type alias CodecIndexRecord =
192 |     { indexVersion : String
193 |     , indexType : String
194 |     , documentStore : Dict String (Set String)
195 |     , corpusTokens : Set String
196 |     , tokenStore : Trie Float
197 |     }
198 | 
199 | 
200 | {-| An IndexSimpleConfig is the least amount of configuration data
201 | required to create an Index; it has no `indexType` field.
202 | -}
203 | type alias IndexSimpleConfig doc =
204 |     { ref : doc -> String
205 |     , fields : List ( doc -> String, Float )
206 |     , listFields : List ( doc -> List String, Float )
207 |     }
208 | 


--------------------------------------------------------------------------------
/src/Index/Utils.elm:
--------------------------------------------------------------------------------
  1 | module Index.Utils exposing
  2 |     ( createFuncCreator
  3 |     , getTokens
  4 |     , getTokensList
  5 |     , processTokens
  6 |     , idf
  7 |     , refExists
  8 |     , buildOrderIndex
  9 |     )
 10 | 
 11 | {-| Index Utilities
 12 | 
 13 | 
 14 | ## Functions
 15 | 
 16 | @docs createFuncCreator
 17 | @docs getTokens
 18 | @docs getTokensList
 19 | @docs processTokens
 20 | @docs idf
 21 | @docs refExists
 22 | @docs buildOrderIndex
 23 | 
 24 | Copyright (c) 2016 Robin Luiten
 25 | 
 26 | -}
 27 | 
 28 | import Dict exposing (Dict)
 29 | import Index.Model
 30 |     exposing
 31 |         ( FilterFactory
 32 |         , FuncFactory
 33 |         , Index(..)
 34 |         , TransformFunc
 35 |         , TransformFunc2
 36 |         )
 37 | import Set exposing (Set)
 38 | import TokenProcessors
 39 | import Trie
 40 | 
 41 | 
 42 | {-| Lift a plain function into a `FuncFactory` that hands back the index
 43 | unchanged together with that function.
 44 | -}
 45 | createFuncCreator : func -> FuncFactory doc func
 46 | createFuncCreator func =
 47 |     \index -> ( index, func )
 48 | 
 49 | 
 50 | {-| Tokenize `string` and run the resulting words through `processTokens`.
 51 | -}
 52 | getTokens : Index doc -> String -> ( Index doc, List String )
 53 | getTokens index string =
 54 |     TokenProcessors.tokenizer string |> processTokens index
 55 | 
 56 | 
 57 | getTokensList : Index doc -> List String -> ( Index doc, List String )
 58 | getTokensList index listString =
 59 |     TokenProcessors.tokenizerList listString |> processTokens index
 60 | 
 61 | 
 62 | {-| Transform list of words into tokens for index and search.
 63 | 
 64 | Applies the initial transforms, filters and transforms configured in index.
 65 | 
 66 | Initial transforms run first, then filters, then the remaining transforms.
 67 | So filters see words after the initial transforms but before the transforms.
 68 | 
 69 | -}
 70 | processTokens : Index doc -> List String -> ( Index doc, List String )
 71 | processTokens index tokens =
 72 |     let
 73 |         -- feed the index and tokens produced by one stage into the next stage
 74 |         stage step ( currentIndex, currentTokens ) =
 75 |             step currentIndex currentTokens
 76 |     in
 77 |     applyInitialTransform index tokens
 78 |         |> stage applyFilter
 79 |         |> stage applyTransform
 80 | 
 81 | 
 82 | {-| Apply the transforms to tokens.
 83 | A token is dropped as soon as a transform turns it into an empty string;
 84 | the transforms after that one are not applied to it.
 85 | -}
 86 | applyTransform : Index doc -> List String -> ( Index doc, List String )
 87 | applyTransform index strings =
 88 |     let
 89 |         -- run the whole transform list over every token, dropping
 90 |         -- the tokens for which the list yields Nothing
 91 |         transformAll transformList2 =
 92 |             List.filterMap (applyTransformList transformList2) strings
 93 |     in
 94 |     Tuple.mapSecond
 95 |         transformAll
 96 |         (getOrSetTransformList index)
 97 | 
 98 | 
 99 | {-| Return the index's transform functions, building them from
100 | `transformFactories` and storing them in `transforms` when not yet set.
101 | -}
102 | getOrSetTransformList : Index doc -> ( Index doc, List TransformFunc2 )
103 | getOrSetTransformList index =
104 |     getOrSetIndexFuncListA
105 |         (\(Index { transforms }) -> transforms)
106 |         (\(Index { transformFactories }) -> transformFactories)
107 |         setIndexTransforms
108 |         index
109 | 
110 | 
111 | {-| Store the given functions in the index's `transforms` field.
112 | 
113 | `listFuncs2` holds the `TransformFunc2` form of the transform functions.
114 | 
115 | -}
116 | setIndexTransforms : Index doc -> List TransformFunc2 -> Index doc
117 | setIndexTransforms (Index indexRecord) listFuncs2 =
118 |     { indexRecord | transforms = Just listFuncs2 } |> Index
119 | 
120 | 
121 | applyInitialTransform : Index doc -> List String -> ( Index doc, List String )
122 | applyInitialTransform index strings =
123 |     let
124 |         -- run the initial transform list over every token, dropping
125 |         -- the tokens for which the list yields Nothing
126 |         transformAll initialTransformList2 =
127 |             List.filterMap (applyTransformList initialTransformList2) strings
128 |     in
129 |     Tuple.mapSecond
130 |         transformAll
131 |         (getOrSetInitialTransformList index)
132 | 
133 | 
134 | getOrSetInitialTransformList : Index doc -> ( Index doc, List TransformFunc2 )
135 | getOrSetInitialTransformList index =
136 |     getOrSetIndexFuncListA
137 |         (\(Index { initialTransforms }) -> initialTransforms)
138 |         (\(Index { initialTransformFactories }) -> initialTransformFactories)
139 |         setIndexInitialTransforms
140 |         index
141 | 
142 | 
143 | setIndexInitialTransforms : Index doc -> List TransformFunc2 -> Index doc
144 | setIndexInitialTransforms (Index indexRecord) listFuncs2 =
145 |     { indexRecord | initialTransforms = Just listFuncs2 } |> Index
146 | 
147 | 
148 | {-| Apply all transforms in sequence to input token.
149 | 
150 | Once a transform returns Nothing the result of the whole list is Nothing.
151 | 
152 | -}
153 | applyTransformList : List TransformFunc2 -> String -> Maybe String
154 | applyTransformList transforms token =
155 |     List.foldl Maybe.andThen (Just token) transforms
156 | 
157 | 
158 | {-| Adapt a function `String -> a` into `String -> Maybe a`,
159 | where a result equal to `aValue` maps to Nothing.
160 | 
161 | Used with `""` as `aValue` so that an empty string result maps to Nothing.
162 | This function is not in the module's exposing list.
163 | -}
164 | adaptFuncStrA : a -> (String -> a) -> (String -> Maybe a)
165 | adaptFuncStrA aValue func string =
166 |     let
167 |         -- Nothing when the candidate equals the rejected marker value
168 |         keepUnlessMarker candidate =
169 |             if candidate == aValue then
170 |                 Nothing
171 | 
172 |             else
173 |                 Just candidate
174 |     in
175 |     keepUnlessMarker (func string)
176 | 
177 | 
178 | {-| Adapt a predicate `String -> Bool` into `String -> Maybe String`,
179 | where a rejected string maps to Nothing and an accepted one to itself.
180 | -}
181 | adaptFuncStrB : (String -> Bool) -> (String -> Maybe String)
182 | adaptFuncStrB func string =
183 |     -- the predicate decides whether the string survives
184 |     case func string of
185 |         True ->
186 |             Just string
187 | 
188 |         False ->
189 |             Nothing
190 | 
191 | 
192 | {-| Apply index filters to tokens.
193 | 
194 | A token is kept only if every filter of the index accepts it.
195 | 
196 | -}
197 | applyFilter : Index doc -> List String -> ( Index doc, List String )
198 | applyFilter index strings =
199 |     let
200 |         -- run the adapted filter list over every token, dropping
201 |         -- the tokens for which the list yields Nothing
202 |         filterAll filterList2 =
203 |             List.filterMap (applyTransformList filterList2) strings
204 |     in
205 |     Tuple.mapSecond
206 |         filterAll
207 |         (getOrSetFilterList index)
208 | 
209 | 
210 | getOrSetFilterList : Index doc -> ( Index doc, List TransformFunc2 )
211 | getOrSetFilterList index =
212 |     getOrSetIndexFuncListB
213 |         (\(Index { filters }) -> filters)
214 |         (\(Index { filterFactories }) -> filterFactories)
215 |         setIndexFilters
216 |         index
217 | 
218 | 
219 | {-| Store the given functions in the index's `filters` field.
220 | -}
221 | setIndexFilters : Index doc -> List TransformFunc2 -> Index doc
222 | setIndexFilters (Index indexRecord) listFuncs2 =
223 |     { indexRecord | filters = Just listFuncs2 } |> Index
224 | 
225 | 
226 | {-| Get the cached `TransformFunc2` list of the index or build and cache it.
227 | 
228 | This is the variant for factories of `String -> String` transform functions.
229 | See getOrSetIndexFuncListB for the FilterFunc variant; a generic type `a`
230 | did not work out here so there is one function per source function type.
231 | 
232 | -}
233 | getOrSetIndexFuncListA :
234 |     (Index doc -> Maybe (List TransformFunc2))
235 |     -> (Index doc -> List (FuncFactory doc TransformFunc))
236 |     -> (Index doc -> List TransformFunc2 -> Index doc)
237 |     -> Index doc
238 |     -> ( Index doc, List TransformFunc2 )
239 | getOrSetIndexFuncListA getFuncs2 getFactoryFuncs setFuncs index =
240 |     let
241 |         buildAndStore () =
242 |             let
243 |                 ( factoryIndex, plainFuncs ) =
244 |                     runFactories (getFactoryFuncs index) index
245 | 
246 |                 maybeFuncs =
247 |                     List.map (adaptFuncStrA "") plainFuncs
248 |             in
249 |             ( setFuncs factoryIndex maybeFuncs, maybeFuncs )
250 |     in
251 |     case getFuncs2 index of
252 |         Just storedFuncs ->
253 |             ( index, storedFuncs )
254 | 
255 |         Nothing ->
256 |             -- first use: run the factories and store the adapted functions
257 |             buildAndStore ()
258 | 
259 | 
260 | {-| Variant of getOrSetIndexFuncListA for FilterFunc hydration.
261 | 
262 | If FilterFunc were a TransformFunc the two variants could share their code.
263 | 
264 | -}
265 | getOrSetIndexFuncListB :
266 |     (Index doc -> Maybe (List TransformFunc2))
267 |     -> (Index doc -> List (FilterFactory doc))
268 |     -> (Index doc -> List TransformFunc2 -> Index doc)
269 |     -> Index doc
270 |     -> ( Index doc, List TransformFunc2 )
271 | getOrSetIndexFuncListB getFuncs2 getFactoryFuncs setFuncs index =
272 |     let
273 |         buildAndStore () =
274 |             let
275 |                 ( factoryIndex, predicates ) =
276 |                     runFactories (getFactoryFuncs index) index
277 | 
278 |                 maybeFuncs =
279 |                     List.map adaptFuncStrB predicates
280 |             in
281 |             ( setFuncs factoryIndex maybeFuncs, maybeFuncs )
282 |     in
283 |     case getFuncs2 index of
284 |         Just storedFuncs ->
285 |             ( index, storedFuncs )
286 | 
287 |         Nothing ->
288 |             -- first use: run the factories and store the adapted filters
289 |             buildAndStore ()
290 | 
291 | 
292 | {-| Run each of the function factories returning the list of functions.
293 | 
294 | Factories run from last to first (foldr); functions keep the factory order.
295 | 
296 | -}
297 | runFactories : List (FuncFactory doc func) -> Index doc -> ( Index doc, List func )
298 | runFactories factoryList index =
299 |     let
300 |         -- thread the index through one factory and put the function it
301 |         -- creates in front of the functions collected so far
302 |         runOne factory ( indexSoFar, funcs ) =
303 |             Tuple.mapSecond
304 |                 (\createdFunc -> createdFunc :: funcs)
305 |                 (factory indexSoFar)
306 |     in
307 |     factoryList
308 |         |> List.foldr runOne ( index, [] )
309 | 
310 | 
311 | {-| Inverse document frequency of a token in the Index.
312 | 
313 | A value missing from the index's `idfCache` is calculated by `calcIdf`, which
314 | also caches it, so the returned index may differ from the one passed in.
315 | -}
316 | idf : Index doc -> String -> ( Index doc, Float )
317 | idf ((Index { idfCache }) as index) token =
318 |     case Dict.get token idfCache of
319 |         Just cachedIdf ->
320 |             ( index, cachedIdf )
321 | 
322 |         Nothing ->
323 |             calcIdf index token
324 | 
325 | 
326 | {-| Compute the idf of `token`, cache it in `idfCache` and return it with the updated index.
327 | -}
328 | calcIdf : Index doc -> String -> ( Index doc, Float )
329 | calcIdf (Index irec) token =
330 |     let
331 |         -- count reported by the token store for this token
332 |         matchingDocs =
333 |             Trie.valueCount token irec.tokenStore
334 | 
335 |         totalDocs =
336 |             Dict.size irec.documentStore
337 | 
338 |         -- 1 + log10 (total / matching); 1 when the count is zero
339 |         tokenIdf =
340 |             if matchingDocs > 0 then
341 |                 1 + logBase 10 (toFloat totalDocs / toFloat matchingDocs)
342 | 
343 |             else
344 |                 1
345 | 
346 |         updatedCache =
347 |             Dict.insert token tokenIdf irec.idfCache
348 |     in
349 |     ( Index { irec | idfCache = updatedCache }
350 |     , tokenIdf
351 |     )
352 | 
353 | 
354 | {-| True when `docRef` is a key of the index's document store.
355 | -}
356 | refExists : String -> Index doc -> Bool
357 | refExists docRef (Index { documentStore }) =
358 |     Dict.member docRef documentStore
359 | 
360 | 
361 | {-| Map every word of the Set to its position in the ordered list of the
362 | Set's words, positions starting at 0.
363 | -}
364 | buildOrderIndex : Set String -> Dict String Int
365 | buildOrderIndex tokenSet =
366 |     -- Set.toList yields the words in order, so the list index is the rank
367 |     tokenSet
368 |         |> Set.toList
369 |         |> List.indexedMap (\position word -> ( word, position ))
370 |         |> Dict.fromList
371 | 


--------------------------------------------------------------------------------
/src/Index/Vector.elm:
--------------------------------------------------------------------------------
  1 | module Index.Vector exposing (buildDocVector, getDocVector, getQueryVector, scoreAndCompare, similarityBoost, updateDocVector, updateSetAndVec)
  2 | 
  3 | {-| Index document vector support.
  4 | 
  5 | Copyright (c) 2016 Robin Luiten
  6 | 
  7 | -}
  8 | 
  9 | import Dict
 10 | import Index.Model exposing (Index(..))
 11 | import Index.Utils
 12 | import Maybe
 13 | import Set exposing (Set)
 14 | import SparseVector exposing (SparseVector)
 15 | import String
 16 | import Trie
 17 | 
 18 | 
 19 | {-| Build a query vector and the sets of candidate document matches
 20 | for each token in our query tokens.
 21 | 
 22 | Each token in our query will have a separate Set String entry in
 23 | the returned List, in the same order as the query tokens. All these sets
 24 | are intersected together for the final list of documents matched (a logical
 25 | and of all the query tokens).
 26 | 
 27 | -}
 28 | getQueryVector :
 29 |     List String
 30 |     -> Index doc
 31 |     -> ( List (Set String), SparseVector, Index doc )
 32 | getQueryVector tokens index =
 33 |     let
 34 |         tokenCount = List.length tokens
 35 |     in
 36 |     List.foldr (buildDocVector tokenCount) ( [], SparseVector.empty, index ) tokens
 37 | 
 38 | 
 39 | {-| Add one query token (baseToken) to the query vector being built.
 40 | Prepends the set of documents found for the expansions of the token to docSets.
 41 | -}
 42 | buildDocVector :
 43 |     Int
 44 |     -> String
 45 |     -> ( List (Set String), SparseVector, Index doc )
 46 |     -> ( List (Set String), SparseVector, Index doc )
 47 | buildDocVector tokensLength baseToken ( docSets, vec, index ) =
 48 |     let
 49 |         (Index irec) =
 50 |             index
 51 | 
 52 |         -- every query token gets the same term frequency,
 53 |         -- 1 / number of query tokens
 54 |         accumulate =
 55 |             updateSetAndVec (1 / toFloat tokensLength) baseToken
 56 | 
 57 |         -- fold over every token the token store expands baseToken to
 58 |         ( tokenDocs, updatedVec, updatedIndex ) =
 59 |             Trie.expand baseToken irec.tokenStore
 60 |                 |> List.foldr accumulate ( Set.empty, vec, index )
 61 |     in
 62 |     ( tokenDocs :: docSets, updatedVec, updatedIndex )
 63 | 
 64 | 
 65 | {-| Calculate Term frequency-inverse document frequency (tf-idf) for one
 66 | expandedToken and union its documents into the set for this (base)token.
 67 | -}
 68 | updateSetAndVec :
 69 |     Float
 70 |     -> String
 71 |     -> String
 72 |     -> ( Set String, SparseVector, Index doc )
 73 |     -> ( Set String, SparseVector, Index doc )
 74 | updateSetAndVec tf token expandedToken ( docSets, vec, (Index irec) as index ) =
 75 |     let
 76 |         ( (Index idfRec) as idfIndex, expandedIdf ) =
 77 |             Index.Utils.idf index expandedToken
 78 | 
 79 |         weight =
 80 |             tf * expandedIdf * similarityBoost token expandedToken
 81 | 
 82 |         -- the vector is left untouched when the expanded token has no
 83 |         -- position in corpusTokensIndex
 84 |         vecWithToken =
 85 |             case Dict.get expandedToken irec.corpusTokensIndex of
 86 |                 Just position ->
 87 |                     SparseVector.insert position weight vec
 88 | 
 89 |                 Nothing ->
 90 |                     vec
 91 | 
 92 |         -- keys of the dict the token store holds for the expanded token
 93 |         docsWithToken =
 94 |             case Trie.get expandedToken idfRec.tokenStore of
 95 |                 Just refDict ->
 96 |                     Set.fromList (Dict.keys refDict)
 97 | 
 98 |                 Nothing ->
 99 |                     Set.empty
100 |     in
101 |     ( Set.union docsWithToken docSets, vecWithToken, idfIndex )
102 | 
103 | 
104 | {-| If the expanded token is not an exact match to the token then
105 | penalise the score for this key by how different the key is to the token.
106 | Uses the natural log, so the factor is at most 1 / ln 3 (about 0.91), never above 1.
107 | -}
108 | similarityBoost : String -> String -> Float
109 | similarityBoost token expandedToken =
110 |     if expandedToken == token then
111 |         1
112 | 
113 |     else
114 |         1
115 |             / logBase e
116 |                 (toFloat
117 |                     (max 3
118 |                         (String.length expandedToken
119 |                             - String.length token
120 |                         )
121 |                     )
122 |                 )
123 | 
124 | 
125 | {-| Score document `ref` against queryVector (cosine similarity) and prepend ( ref, score ) to docs.
126 | -}
127 | scoreAndCompare :
128 |     SparseVector
129 |     -> String
130 |     -> ( Index doc, List ( String, Float ) )
131 |     -> ( Index doc, List ( String, Float ) )
132 | scoreAndCompare queryVector ref ( index, docs ) =
133 |     let
134 |         ( nextIndex, refVector ) =
135 |             getDocVector index ref
136 |         score =
137 |             SparseVector.cosineSimilarity queryVector refVector
138 |     in
139 |     ( nextIndex, ( ref, score ) :: docs )
140 | 
141 | 
142 | {-| Build the vector for docRef; an unknown docRef yields an empty vector.
143 | -}
144 | getDocVector : Index doc -> String -> ( Index doc, SparseVector )
145 | getDocVector ((Index irec) as index) docRef =
146 |     case Dict.get docRef irec.documentStore of
147 |         Nothing ->
148 |             ( index, SparseVector.empty )
149 | 
150 |         Just tokenSet ->
151 |             List.foldr
152 |                 (updateDocVector docRef)
153 |                 ( index, SparseVector.empty )
154 |                 (Set.toList tokenSet)
155 | 
156 | 
157 | {-| Reducer: add this token's weight (stored term frequency * idf) for docRef to docVector.
158 | -}
159 | updateDocVector : String -> String -> ( Index doc, SparseVector ) -> ( Index doc, SparseVector )
160 | updateDocVector docRef token (( (Index irec) as index, docVector ) as inputTuple) =
161 |     let
162 |         maybePosition =
163 |             Dict.get token irec.corpusTokensIndex
164 |         maybeFrequency =
165 |             Trie.get token irec.tokenStore |> Maybe.andThen (Dict.get docRef)
166 |     in
167 |     case ( maybePosition, maybeFrequency ) of
168 |         ( Just position, Just termFrequency ) ->
169 |             Index.Utils.idf index token
170 |                 |> Tuple.mapSecond
171 |                     (\idfScore -> SparseVector.insert position (termFrequency * idfScore) docVector)
172 |         _ ->
173 |             inputTuple
174 | 


--------------------------------------------------------------------------------
/src/StopWordFilter.elm:
--------------------------------------------------------------------------------
  1 | module StopWordFilter exposing
  2 |     ( createDefaultFilterFunc
  3 |     , stopEnglishWordList
  4 |     , createFilterFuncWith
  5 |     , createFilterFunc
  6 |     )
  7 | 
  8 | {-| StopWordFilter is an English language stop word list filter, any words
  9 | contained in the list are not stored in the index.
 10 | 
 11 | This is intended to be used in the ElmTextSearch token processing pipeline.
 12 | 
 13 | 
 14 | ### Things to know about stop word lists.
 15 | 
 16 |   - Words in document are split on white space to create tokens.
 17 |   - Tokens have non word characters removed from prefix and suffix to improve matching filters.
 18 |   - Input tokens to create stop word filters should be full words.
 19 |   - It is more efficient to merge all your stop words into a single
 20 |     stop word filter.
 21 | 
 22 | 
 23 | ## create default stop word filter func
 24 | 
 25 | @docs createDefaultFilterFunc
 26 | 
 27 | 
 28 | ## A default stop word english filter list
 29 | 
 30 | @docs stopEnglishWordList
 31 | 
 32 | 
 33 | ## Create a custom stop word filter list
 34 | 
 35 | @docs createFilterFuncWith
 36 | @docs createFilterFunc
 37 | 
 38 | Copyright (c) 2016 Robin Luiten
 39 | 
 40 | -}
 41 | 
 42 | import Index.Model exposing (FilterFactory)
 43 | import Set
 44 | 
 45 | 
 46 | {-| Default English stop word list (all lower case) used to create the default filter.
 47 | -}
 48 | stopEnglishWordList : List String
 49 | stopEnglishWordList =
 50 |     [ "a"
 51 |     , "able"
 52 |     , "about"
 53 |     , "across"
 54 |     , "after"
 55 |     , "all"
 56 |     , "almost"
 57 |     , "also"
 58 |     , "am"
 59 |     , "among"
 60 |     , "an"
 61 |     , "and"
 62 |     , "any"
 63 |     , "are"
 64 |     , "as"
 65 |     , "at"
 66 |     , "be"
 67 |     , "because"
 68 |     , "been"
 69 |     , "but"
 70 |     , "by"
 71 |     , "can"
 72 |     , "cannot"
 73 |     , "could"
 74 |     , "dear"
 75 |     , "did"
 76 |     , "do"
 77 |     , "does"
 78 |     , "either"
 79 |     , "else"
 80 |     , "ever"
 81 |     , "every"
 82 |     , "for"
 83 |     , "from"
 84 |     , "get"
 85 |     , "got"
 86 |     , "had"
 87 |     , "has"
 88 |     , "have"
 89 |     , "he"
 90 |     , "her"
 91 |     , "hers"
 92 |     , "him"
 93 |     , "his"
 94 |     , "how"
 95 |     , "however"
 96 |     , "i"
 97 |     , "if"
 98 |     , "in"
 99 |     , "into"
100 |     , "is"
101 |     , "it"
102 |     , "its"
103 |     , "just"
104 |     , "least"
105 |     , "let"
106 |     , "like"
107 |     , "likely"
108 |     , "may"
109 |     , "me"
110 |     , "might"
111 |     , "most"
112 |     , "must"
113 |     , "my"
114 |     , "neither"
115 |     , "no"
116 |     , "nor"
117 |     , "not"
118 |     , "of"
119 |     , "off"
120 |     , "often"
121 |     , "on"
122 |     , "only"
123 |     , "or"
124 |     , "other"
125 |     , "our"
126 |     , "own"
127 |     , "rather"
128 |     , "said"
129 |     , "say"
130 |     , "says"
131 |     , "she"
132 |     , "should"
133 |     , "since"
134 |     , "so"
135 |     , "some"
136 |     , "than"
137 |     , "that"
138 |     , "the"
139 |     , "their"
140 |     , "them"
141 |     , "then"
142 |     , "there"
143 |     , "these"
144 |     , "they"
145 |     , "this"
146 |     , "tis"
147 |     , "to"
148 |     , "too"
149 |     , "twas"
150 |     , "us"
151 |     , "wants"
152 |     , "was"
153 |     , "we"
154 |     , "were"
155 |     , "what"
156 |     , "when"
157 |     , "where"
158 |     , "which"
159 |     , "while"
160 |     , "who"
161 |     , "whom"
162 |     , "why"
163 |     , "will"
164 |     , "with"
165 |     , "would"
166 |     , "yet"
167 |     , "you"
168 |     , "your"
169 |     ]
170 | 
171 | 
172 | {-| Stop word filter for ElmTextSearch built from the default English word list.
173 | -}
174 | createDefaultFilterFunc : FilterFactory doc
175 | createDefaultFilterFunc =
176 |     createFilterFunc stopEnglishWordList
177 | 
178 | 
179 | {-| Create stop word list filter suitable for ElmTextSearch, this version
180 | extends the default word list with the extra words provided.
181 | -}
182 | createFilterFuncWith : List String -> FilterFactory doc
183 | createFilterFuncWith extraWords index =
184 |     createFilterFunc (extraWords ++ stopEnglishWordList) index
185 | 
186 | 
187 | {-| Create stop word filter for provided list of tokens suitable for ElmTextSearch.
188 | 
189 | \*\* This creates a stop word filter purely from your own word list, understand
190 | what you are doing and consequences if you use this. \*\*
191 | 
192 | The FilterFunc created returns True to allow words into index.
193 | So words found in the provided token list return False.
194 | 
195 | -}
196 | createFilterFunc : List String -> FilterFactory doc
197 | createFilterFunc tokens index =
198 |     let
199 |         stopWords =
200 |             Set.fromList tokens
201 |     in
202 |     ( index, \candidate -> Set.member candidate stopWords |> not )
203 | 


--------------------------------------------------------------------------------
/src/TokenProcessors.elm:
--------------------------------------------------------------------------------
  1 | module TokenProcessors exposing
  2 |     ( tokenizer
  3 |     , tokenizerList
  4 |     , tokenizerWith
  5 |     , tokenizerWithRegex
  6 |     , tokenizerWithRegexList
  7 |     , trimmer
  8 |     , tokenizerWithList
  9 |     )
 10 | 
 11 | {-| TokenProcessors for strings.
 12 | 
 13 | 
 14 | ## Create a tokenizer
 15 | 
 16 | @docs tokenizer
 17 | @docs tokenizerList
 18 | @docs tokenizerWith
 19 | @docs tokenizerWithRegex
 20 | @docs tokenizerWithRegexList, tokenizerWithList
 21 | 
 22 | 
 23 | ## Word transformer
 24 | 
 25 | @docs trimmer
 26 | 
 27 | Copyright (c) 2016 Robin Luiten
 28 | 
 29 | -}
 30 | 
 31 | import Regex
 32 |     exposing
 33 |         ( Regex
 34 |           -- , HowMany(..)
 35 |         , fromString
 36 |         , replace
 37 |         , split
 38 |         )
 39 | import String exposing (toLower, trim)
 40 | 
 41 | -- Compile a regex pattern; a pattern that fails to compile becomes Regex.never.
 42 | forceRegex : String -> Regex
 43 | forceRegex pattern =
 44 |     fromString pattern |> Maybe.withDefault Regex.never
 45 | 
 46 | -- Default token separator: one or more whitespace and/or hyphen characters.
 47 | defaultSeparator : Regex
 48 | defaultSeparator =
 49 |     forceRegex "[\\s\\-]+"
 50 | 
 51 | 
 52 | {-| Split a String into lower cased tokens.
 53 | Separators are runs of whitespace and hyphens.
 54 | Empty string tokens are never returned.
 55 | -}
 56 | tokenizer : String -> List String
 57 | tokenizer data =
 58 |     tokenizerWithRegex defaultSeparator data
 59 | 
 60 | 
 61 | {-| Split every String of a List into lower cased tokens.
 62 | Separators are runs of whitespace and hyphens.
 63 | Empty string tokens are never returned.
 64 | -}
 65 | tokenizerList : List String -> List String
 66 | tokenizerList listData =
 67 |     tokenizerWithRegexList defaultSeparator listData
 68 | 
 69 | 
 70 | {-| Tokenize a string using your own separator regex.
 71 | The input is trimmed and lower cased before it is split.
 72 | Empty string tokens are never returned.
 73 | -}
 74 | tokenizerWithRegex : Regex -> String -> List String
 75 | tokenizerWithRegex seperatorRegex data =
 76 |     let
 77 |         isNonEmpty token =
 78 |             not (String.isEmpty token)
 79 |     in
 80 |     toLower (trim data)
 81 |         |> split seperatorRegex
 82 |         |> List.filter isNonEmpty
 83 | 
 84 | {-| Tokenize a List String.
 85 | Will not return any empty string tokens.
 86 | Supply your own regex for splitting the strings.
 87 | -}
 88 | tokenizerWithRegexList : Regex -> List String -> List String
 89 | tokenizerWithRegexList seperatorRegex listData =
 90 |     let
 91 |         splitOne =
 92 |             split seperatorRegex << toLower << trim
 93 | 
 94 |         -- Tokens of later strings come first (input strings are visited last to first).
 95 |         allTokens =
 96 |             listData
 97 |                 |> List.reverse
 98 |                 |> List.concatMap splitOne
 99 | 
100 |         isNonEmpty token =
101 |             String.length token > 0
102 |     in
103 |     List.filter isNonEmpty allTokens
104 | 
105 | 
106 | {-| Tokenize a String.
107 | The separator pattern you supply is turned into the regex used for splitting.
108 | Empty string tokens are never returned.
109 | -}
110 | tokenizerWith : String -> String -> List String
111 | tokenizerWith seperatorPattern data =
112 |     tokenizerWithRegex (forceRegex seperatorPattern) data
113 | 
114 | 
115 | {-| Tokenize a List String.
116 | The separator pattern you supply is turned into the regex used for splitting.
117 | Empty string tokens are never returned.
118 | -}
119 | tokenizerWithList : String -> List String -> List String
120 | tokenizerWithList seperatorPattern listData =
121 |     tokenizerWithRegexList (forceRegex seperatorPattern) listData
122 | 
123 | trimmerRegex : Regex
124 | trimmerRegex =
125 |     forceRegex "^\\W+|\\W+$"
126 | 
127 | 
128 | {-| Strip non word characters from the start and the end of a token.
129 | -}
130 | trimmer : String -> String
131 | trimmer token =
132 |     replace trimmerRegex (always "") token
133 | 


--------------------------------------------------------------------------------
/src/Utils.elm:
--------------------------------------------------------------------------------
 1 | module Utils exposing (intersectSets)
 2 | 
 3 | {-| Some misc utils
 4 | 
 5 | @docs intersectSets
 6 | 
 7 | Copyright (c) 2016 Robin Luiten
 8 | 
 9 | -}
10 | 
11 | import Set exposing (Set)
12 | 
13 | 
14 | {-| Intersection of all sets in the list; an empty list gives the empty set.
15 | -}
16 | intersectSets : List (Set String) -> Set String
17 | intersectSets sets =
18 |     case sets of
19 |         first :: rest ->
20 |             List.foldr Set.intersect first rest
21 | 
22 |         [] ->
23 |             Set.empty
24 | 


--------------------------------------------------------------------------------
/tests/DefaultTests.elm:
--------------------------------------------------------------------------------
 1 | module DefaultTests exposing (testDefaultIndexType)
 2 | 
 3 | import Expect
 4 | import Index.Defaults
 5 | import Test exposing (..)
 6 | 
 7 | 
 8 | testDefaultIndexType : Test
 9 | testDefaultIndexType =
10 |     test "Check Index Type" <|
11 |         \() ->
12 |             Index.Defaults.getIndexSimpleConfig
13 |                 { ref = .cid
14 |                 , fields =
15 |                     [ ( .title, 5.0 )
16 |                     ]
17 |                 , listFields =
18 |                     [ ( .body, 1.0 )
19 |                     ]
20 |                 }
21 |                 |> .indexType
22 |                 |> Expect.equal "-= ElmTextSearch Index Type 1 =-"
23 | 


--------------------------------------------------------------------------------
/tests/ElmTextSearchTests.elm:
--------------------------------------------------------------------------------
 1 | module ElmTextSearchTests exposing (..)
 2 | 
 3 | import ElmTextSearch
 4 | import ElmTextSearchErrors exposing (AddError(..), RemoveError(..), SearchError(..))
 5 | import Expect
 6 | import Test exposing (..)
 7 | 
 8 | 
 9 | type alias MyDoc =
10 |     { cid : String
11 |     , title : String
12 |     , author : String
13 |     , body : String
14 |     }
15 | 
16 | 
17 | doc1_ : MyDoc
18 | doc1_ =
19 |     { cid = "doc1"
20 |     , title = "Examples of a Banana"
21 |     , author = "Sally Apples"
22 |     , body = "Sally writes words about a grown banana."
23 |     }
24 | 
25 | 
26 | getEmptyIndex : () -> ElmTextSearch.Index MyDoc
27 | getEmptyIndex _ =
28 |     ElmTextSearch.new
29 |         { ref = .cid
30 |         , fields = [ ( .title, 5 ), ( .body, 1 ) ]
31 |         , listFields = []
32 |         }
33 | 
34 | 
35 | test_searchT_CanUseErrorResultConstructors : Test
36 | test_searchT_CanUseErrorResultConstructors =
37 |     test "If can case on error result" <|
38 |         \() ->
39 |             getEmptyIndex ()
40 |                 |> ElmTextSearch.searchT "hello"
41 |                 |> (\result ->
42 |                         case result of
43 |                             Ok _ ->
44 |                                 False
45 | 
46 |                             Err error ->
47 |                                 case error of
48 |                                     IndexIsEmpty ->
49 |                                         True
50 | 
51 |                                     _ ->
52 |                                         False
53 |                    )
54 |                 |> Expect.equal True
55 |                 >> Expect.onFail "Result should be an error"
56 | 


--------------------------------------------------------------------------------
/tests/IndexDecoderTests.elm:
--------------------------------------------------------------------------------
 1 | module IndexDecoderTests exposing (decodeAndEncodeRoundTripSameTest)
 2 | 
 3 | import ElmTextSearch.Json.Decoder as IndexDecoder
 4 | import ElmTextSearch.Json.Encoder as IndexEncoder
 5 | import Expect
 6 | import Json.Decode as Decode
 7 | import Json.Encode as Encode
 8 | import Test exposing (..)
 9 | import TestUtils
10 | 
11 | 
12 | encodeAndDecodeHelper : String -> String
13 | encodeAndDecodeHelper string =
14 |     string
15 |         |> Decode.decodeString IndexDecoder.decoder
16 |         |> TestUtils.getResultIgnoreError
17 |         |> IndexEncoder.codecIndexRecordEncoder
18 |         |> Encode.encode 0
19 | 
20 | 
21 | {-| From <http://package.elm-lang.org/packages/elm-lang/core/3.0.0/Dict>
22 | QUOTE: "Dictionary equality with (==) is unreliable and should not be used."
23 | 
24 | Therefore decode, then encode back to a string, and compare the two strings.
25 | 
26 | -}
27 | decodeAndEncodeRoundTripSameTest : Test
28 | decodeAndEncodeRoundTripSameTest =
29 |     let
30 |         -- same fixture as the encoder tests, except indexVersion is "1.0.0" here
31 |         encodedIndex =
32 |             String.concat
33 |                 [ "{\"indexVersion\":\"1.0.0\",\"indexType\":\"- IndexTest Type -\","
34 |                 , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
35 |                 , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
36 |                 , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
37 |                 , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
38 |                 , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
39 |                 , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
40 |                 , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
41 |                 , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
42 |                 ]
43 |     in
44 |     test "decode then encode ensure same" <|
45 |         \() ->
46 |             encodedIndex
47 |                 |> encodeAndDecodeHelper
48 |                 |> Expect.equal encodedIndex
49 | 


--------------------------------------------------------------------------------
/tests/IndexEncoderTests.elm:
--------------------------------------------------------------------------------
 1 | module IndexEncoderTests exposing (testEncodeList, testEncoder)
 2 | 
 3 | import ElmTextSearch.Json.Encoder as IndexEncoder
 4 | import Expect
 5 | import Index
 6 | import Index.Model exposing (..)
 7 | import Json.Encode as Encode
 8 | import Test exposing (..)
 9 | import TestUtils
10 | 
11 | 
12 | encodedIndex : String
13 | encodedIndex =
14 |     String.concat
15 |         [ "{\"indexVersion\":\"1.1.0\",\"indexType\":\"- IndexTest Type -\","
16 |         , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
17 |         , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
18 |         , "\"tokenStore\":{"
19 |         , "\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
20 |         , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
21 |         , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
22 |         , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
23 |         , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
24 |         , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
25 |         ]
26 | 
27 | 
28 | testEncoder : Test
29 | testEncoder =
30 |     test "Encode index with doc matches encodedIndex" <|
31 |         \() ->
32 |             Index.new
33 |                 { indexType = "- IndexTest Type -"
34 |                 , ref = .cid
35 |                 , fields = [ ( .title, 5 ), ( .body, 1 ) ]
36 |                 , listFields = []
37 |                 }
38 |                 |> Index.add
39 |                     { cid = "doc1"
40 |                     , title = "Examples of a Banana"
41 |                     , author = "Sally Apples"
42 |                     , body = "Sally writes words about a grown banana."
43 |                     }
44 |                 |> TestUtils.getResultIgnoreError
45 |                 |> IndexEncoder.encoder
46 |                 |> Encode.encode 0
47 |                 |> Expect.equal
48 |                     encodedIndex
49 | 
50 | 
51 | testEncodeList : Test
52 | testEncodeList =
53 |     test "Encode index with doc matches encodedIndex using listFields" <|
54 |         \() ->
55 |             Index.new
56 |                 { indexType = "- IndexTest Type -"
57 |                 , ref = .cid
58 |                 , fields = [ ( .title, 5 ) ]
59 |                 , listFields = [ ( .body, 1 ) ]
60 |                 }
61 |                 |> Index.add
62 |                     { cid = "doc1"
63 |                     , title = "Examples of a Banana"
64 |                     , author = "Sally Apples"
65 |                     , body =
66 |                         [ "Sally writes words "
67 |                         , "about a grown banana."
68 |                         ]
69 |                     }
70 |                 |> TestUtils.getResultIgnoreError
71 |                 |> IndexEncoder.encoder
72 |                 |> Encode.encode 0
73 |                 |> Expect.equal
74 |                     encodedIndex
75 | 


--------------------------------------------------------------------------------
/tests/IndexLoadTests.elm:
--------------------------------------------------------------------------------
  1 | module IndexLoadTests exposing
  2 |     ( indexfromString1Test
  3 |     , loadIndexWith1Test
  4 |     , loadIndexWithErr1Test
  5 |     , loadIndexWithErr2Test
  6 |     )
  7 | 
  8 | import ElmTextSearch
  9 | import Expect
 10 | import Index.Load
 11 | import Index.Model exposing (Index(..))
 12 | import Json.Decode exposing (Error(..))
 13 | import Test exposing (..)
 14 | import TestUtils
 15 | 
 16 | 
 17 | loadIndexWithErr1Test : Test
 18 | loadIndexWithErr1Test =
 19 |     test "Fails to load an index with wrong index version" <|
 20 |         \() ->
 21 |             String.concat
 22 |                 [ "{\"indexVersion\":\"1.0.1\",\"indexType\":\"- IndexTest Type -\","
 23 |                 , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
 24 |                 , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
 25 |                 , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
 26 |                 , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
 27 |                 , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
 28 |                 , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
 29 |                 , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
 30 |                 , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
 31 |                 ]
 32 |                 |> Index.Load.loadIndexWith
 33 |                     [ { indexType = "_______some string"
 34 |                       , ref = .cid
 35 |                       , fields = [ ( .title, 5 ), ( .body, 1 ) ]
 36 |                       , listFields = []
 37 |                       , initialTransformFactories = []
 38 |                       , transformFactories = []
 39 |                       , filterFactories = []
 40 |                       }
 41 |                     ]
 42 |                 |> TestUtils.getErrorIgnoreResult
 43 |                 |> TestUtils.getDecodeErrorFailureMessage
 44 |                 |> Expect.equal "Error cannot load Index. Version supported is 1.1.0. Version tried to load is 1.0.1."
 45 | 
 46 | 
 47 | loadIndexWithErr2Test : Test
 48 | loadIndexWithErr2Test =
 49 |     test "Fails to load an index with an indexType not in configuration provided." <|
 50 |         \() ->
 51 |             String.concat
 52 |                 [ "{\"indexVersion\":\"1.1.0\",\"indexType\":\"__IndexTest Type -\","
 53 |                 , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
 54 |                 , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
 55 |                 , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
 56 |                 , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
 57 |                 , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
 58 |                 , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
 59 |                 , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
 60 |                 , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
 61 |                 ]
 62 |                 |> Index.Load.loadIndexWith
 63 |                     [ { indexType = "_______some string not matching the encoded index type"
 64 |                       , ref = .cid
 65 |                       , fields = [ ( .title, 5 ), ( .body, 1 ) ]
 66 |                       , listFields = []
 67 |                       , initialTransformFactories = []
 68 |                       , transformFactories = []
 69 |                       , filterFactories = []
 70 |                       }
 71 |                     ]
 72 |                 |> TestUtils.getErrorIgnoreResult
 73 |                 |> TestUtils.getDecodeErrorFailureMessage
 74 |                 |> Expect.equal "Error cannot load Index. Tried to load index of type \"__IndexTest Type -\". It is not in supported index configurations."
 75 | 
 76 | 
 77 | loadIndexWith1Test : Test
 78 | loadIndexWith1Test =
 79 |     let
 80 |         config =
 81 |             { indexType = "not set"
 82 |             , ref = .cid
 83 |             , fields = [ ( .title, 5 ), ( .body, 1 ) ]
 84 |             , listFields = []
 85 |             , initialTransformFactories = []
 86 |             , transformFactories = []
 87 |             , filterFactories = []
 88 |             }
 89 |     in
 90 |     test "Load an index. really dumb check" <|
 91 |         \() ->
 92 |             String.concat
 93 |                 [ "{\"indexVersion\":\"1.1.0\",\"indexType\":\"_______some string\","
 94 |                 , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
 95 |                 , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
 96 |                 , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
 97 |                 , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
 98 |                 , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
 99 |                 , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
100 |                 , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
101 |                 , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
102 |                 ]
103 |                 |> Index.Load.loadIndexWith
104 |                     [ config
105 |                     , { config | indexType = "_______some string" }
106 |                     ]
107 |                 |> TestUtils.expectOkWithGoodFailMessage
108 | 
109 | 
110 | indexfromString1Test : Test
111 | indexfromString1Test =
112 |     test "Can succesfully load index from string with ElmTextSearch.SimpleConfig." <|
113 |         \() ->
114 |             String.concat
115 |                 [ "{\"indexVersion\":\"1.1.0\",\"indexType\":\"-= ElmTextSearch Index Type 1 =-\","
116 |                 , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
117 |                 , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
118 |                 , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
119 |                 , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
120 |                 , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
121 |                 , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
122 |                 , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
123 |                 , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
124 |                 ]
125 |                 |> ElmTextSearch.fromString
126 |                     { ref = .cid
127 |                     , fields =
128 |                         [ ( .title, 5 )
129 |                         , ( .body, 1 )
130 |                         ]
131 |                     , listFields = []
132 |                     }
133 |                 |> TestUtils.expectOkWithGoodFailMessage
134 | 


--------------------------------------------------------------------------------
/tests/IndexTests.elm:
--------------------------------------------------------------------------------
  1 | module IndexTests exposing
  2 |     ( addDocAlreadyInIndexReturnsError
  3 |     , addDocWithEmptyIdFieldReturnsError
  4 |     , addDocWithIndexFieldsEmptyReturnsError
  5 |     , addDocumentWithSameIdAsExistingReturnsError
  6 |     , addMultipleDocsReturnsErrorListForProblems
  7 |     , addOrUpdateDocNotInIndexReturnsSuccess
  8 |     , addOrUpdateDocWithSameIdReturnsSuccess
  9 |     , idfCacheIsClearedAfterASuccessfulAdd
 10 |     , idfCacheIsClearedAfterSuccessfulRemove
 11 |     , removeDocRefNotIndexReturnsError
 12 |     , removeDocWithEmptyIdFieldReturnsError
 13 |     , removeDoesNotBreakSearchResults
 14 |     , removeOnlyDocIndexReturnsIsEmpty
 15 |     , searchCasesTest
 16 |     , searchEmptyIndexReturnsError
 17 |     , searchIndexAfter2DocRemovedErrors
 18 |     , searchIndexAfterDocRemovedErrors
 19 |     , searchListFieldsSingleLetterWithLetterInBody
 20 |     , searchSingleLetterWithLetterInTitles
 21 |     , searchUsingEmptyQueryReturnsError
 22 |     , searchUsingQueryWithOnlyStopWordsWhichMeansEmptyReturnsError
 23 |     , searchWithOnlyListFieldsIndexReturnsValidScores
 24 |     , updateDocNotInIndexReturnsError
 25 |     , updateDocUsesNewDocContent
 26 |     )
 27 | 
 28 | import Dict
 29 | import ElmTextSearch.Json.Encoder as IndexEncoder
 30 | import Expect
 31 | import Index
 32 | import Index.Model exposing (Index(..))
 33 | import Json.Encode as Encode
 34 | import Test exposing (..)
 35 | import TestUtils
 36 | import Trie
 37 | 
 38 | 
 39 | {-| example record type for tests
 40 | -}
 41 | type alias MyDoc =
 42 |     { cid : String
 43 |     , title : String
 44 |     , author : String
 45 |     , body : String
 46 |     }
 47 | 
 48 | 
 49 | type alias MyDoc2 =
 50 |     { cid : String
 51 |     , title : String
 52 |     , author : String
 53 |     , body : List String
 54 |     }
 55 | 
 56 | 
 57 | doc1_ : MyDoc
 58 | doc1_ =
 59 |     { cid = "doc1"
 60 |     , title = "Examples of a Banana"
 61 |     , author = "Sally Apples"
 62 |     , body = "Sally writes words about a grown banana."
 63 |     }
 64 | 
 65 | 
 66 | doc2_ : MyDoc
 67 | doc2_ =
 68 |     { cid = "doc2"
 69 |     , title = "Grown Bananas and there appeal"
 70 |     , author = "John Banana"
 71 |     , body = "An example of apple engineering."
 72 |     }
 73 | 
 74 | 
 75 | doc3_ : MyDoc
 76 | doc3_ =
 77 |     { cid = "doc3"
 78 |     , title = "Kites and Trees a tail of misery"
 79 |     , author = "Adam Winddriven"
 80 |     , body = "When a flyer meets an Elm it maybe a problem."
 81 |     }
 82 | 
 83 | 
 84 | doc4_indexFieldsEmpty : { cid : String, title : String, author : String, body : String }
 85 | doc4_indexFieldsEmpty =
 86 |     { cid = "doc4"
 87 |     , title = ""
 88 |     , author = "Some Author"
 89 |     , body = ""
 90 |     }
 91 | 
 92 | 
 93 | doc5_idEmpty : MyDoc
 94 | doc5_idEmpty =
 95 |     { cid = ""
 96 |     , title = "Empty Reference Title"
 97 |     , author = "Some Author"
 98 |     , body = "Empty Reference Body"
 99 |     }
100 | 
101 | 
102 | type alias SearchCaseRecord =
103 |     { name : String
104 |     , input : String
105 |     , expect : List String
106 |     , indexResult : Index MyDoc
107 |     }
108 | 
109 | 
110 | searchCasesTest : Test
111 | searchCasesTest =
112 |     describe "Index search tests"
113 |         (List.map searchTestCase -- one generated test per case record below
114 |             [ { name = "two docs one with term in title first and body second"
115 |               , input = "example"
116 |               , expect = [ "doc1", "doc2" ] -- compared as an ordered list
117 |               , indexResult = getIndexDoc1Doc2 ()
118 |               }
119 |             , { name = "two docs one with term in title first"
120 |               , input = "grown"
121 |               , expect = [ "doc2", "doc1" ]
122 |               , indexResult = getIndexDoc1Doc2 ()
123 |               }
124 |             , { name = "neither document contains both words so return nothing"
125 |               , input = "-misery! .appeal," -- query deliberately padded with punctuation
126 |               , expect = []
127 |               , indexResult = getIndexDoc1Doc2 ()
128 |               }
129 |             , { name = "with doc3 returns no docs with both words"
130 |               , input = "-misery! .appeal,"
131 |               , expect = []
132 |               , indexResult = getIndexDoc1Doc2Doc3 ()
133 |               }
134 |             , { name = "returns doc1 and doc2 e expands to example and engineer which exist in both documents."
135 |               , input = "e" -- single-letter query
136 |               , expect = [ "doc1", "doc2" ]
137 |               , indexResult = getIndexDoc1Doc2 ()
138 |               }
139 |             , { name = "search \"ex\" returns doc1, doc2 as both contain example."
140 |               , input = "ex"
141 |               , expect = [ "doc1", "doc2" ]
142 |               , indexResult = getIndexDoc1Doc2 ()
143 |               }
144 |             , { name = "search \"en\" returns doc2 as it contains engineering."
145 |               , input = "en"
146 |               , expect = [ "doc2" ]
147 |               , indexResult = getIndexDoc1Doc2 ()
148 |               }
149 |             ]
150 |         )
151 | 
152 | 
153 | searchTestCase : SearchCaseRecord -> Test
154 | searchTestCase { name, input, expect, indexResult } =
155 |     -- A failed search is compared as a one-element list holding the error text.
156 |     test (String.concat [ "search \"", input, "\" ", name ]) <|
157 |         \_ ->
158 |             case Index.search input indexResult of
159 |                 Ok ( _, hits ) ->
160 |                     Expect.equal expect (List.map Tuple.first hits)
161 | 
162 |                 Err message ->
163 |                     Expect.equal expect [ message ]
164 | 
165 | 
166 | getEmptyIndexMyDoc2IndexOnlyListFields : () -> Index.Index MyDoc2
167 | getEmptyIndexMyDoc2IndexOnlyListFields () =
168 |     -- Empty MyDoc2 index configured without any plain string fields:
169 |     -- the list-valued body is the only thing that gets indexed.
170 |     Index.new
171 |         { ref = .cid
172 |         , indexType = "- IndexTest Type -"
173 |         , listFields = [ ( .body, 1 ) ]
174 |         , fields = []
175 |         }
176 | 
177 | 
178 | getEmptyIndexMyDoc2 : () -> Index.Index MyDoc2
179 | getEmptyIndexMyDoc2 () =
180 |     Index.new
181 |         { ref = .cid -- documents are identified by cid
182 |         , fields = [ ( .title, 5 ) ]
183 |         , listFields = [ ( .body, 1 ) ]
184 |         , indexType = "- IndexTest Type -"
185 |         }
186 | 
187 | 
188 | getEmptyIndex : () -> Index.Index MyDoc
189 | getEmptyIndex () =
190 |     Index.new
191 |         { ref = .cid -- documents are identified by cid
192 |         , listFields = []
193 |         , fields = [ ( .title, 5 ), ( .body, 1 ) ]
194 |         , indexType = "- IndexTest Type -"
195 |         }
196 | 
197 | 
198 | getIndexDoc1 : () -> Index.Index MyDoc
199 | getIndexDoc1 () =
200 |     -- Start from the empty index, add doc1_, and unwrap the Ok result.
201 |     TestUtils.getResultIgnoreError <|
202 |         Index.add doc1_ (getEmptyIndex ())
203 | 
204 | 
205 | getIndexDoc1Doc2 : () -> Index.Index MyDoc
206 | getIndexDoc1Doc2 () =
207 |     -- The doc1_ index with doc2_ added on top.
208 |     TestUtils.getResultIgnoreError <|
209 |         Index.add doc2_ (getIndexDoc1 ())
210 | 
211 | 
212 | getIndexDoc1Doc2Doc3 : () -> Index.Index MyDoc
213 | getIndexDoc1Doc2Doc3 () =
214 |     -- The doc1_/doc2_ index with doc3_ added on top.
215 |     TestUtils.getResultIgnoreError <|
216 |         Index.add doc3_ (getIndexDoc1Doc2 ())
217 | 
218 | 
219 | searchUsingEmptyQueryReturnsError : Test
220 | searchUsingEmptyQueryReturnsError =
221 |     test "empty query returns Err" <|
222 |         \_ ->
223 |             -- The index holds documents, so only the empty query can be at fault.
224 |             Index.search "" (getIndexDoc1Doc2 ())
225 |                 |> Expect.equal (Err "Error query is empty.")
226 | 
227 | 
228 | searchUsingQueryWithOnlyStopWordsWhichMeansEmptyReturnsError : Test
229 | searchUsingQueryWithOnlyStopWordsWhichMeansEmptyReturnsError =
230 |     test "query full of stop words (filtered out words) returns Err" <|
231 |         \_ ->
232 |             -- Every word of this query is expected to be filtered out as a stop word.
233 |             Index.search "if and but " (getIndexDoc1Doc2 ())
234 |                 |> Expect.equal (Err "Error after tokenisation there are no terms to search for.")
235 | 
236 | 
237 | searchEmptyIndexReturnsError : Test
238 | searchEmptyIndexReturnsError =
239 |     test "no document returns Err" <|
240 |         \_ ->
241 |             getEmptyIndex ()
242 |                 |> Index.search "hello world"
243 |                 |> Expect.equal (Err "Error there are no documents in index to search.")
244 | 
245 | 
246 | idfCacheIsClearedAfterSuccessfulRemove : Test
247 | idfCacheIsClearedAfterSuccessfulRemove =
248 |     test "idfCache is cleared after a successful remove document." <|
249 |         \_ ->
250 |             let
251 |                 -- Search first and keep the index that the search hands back.
252 |                 searched =
253 |                     Tuple.first (TestUtils.getResultIgnoreError (Index.search "banana" (getIndexDoc1Doc2 ())))
254 |             in
255 |             Index.remove doc1_ searched
256 |                 |> TestUtils.getResultIgnoreError
257 |                 |> (getIdfCache >> Dict.isEmpty)
258 |                 |> Expect.equal True
259 |                 |> Expect.onFail "IdfCache should be cleared after document remove"
260 | 
261 | 
262 | 
263 | -- |> Expect.pass |> Expect.onFail "IdfCache should be cleared after document remove"
264 | 
265 | 
266 | idfCacheIsClearedAfterASuccessfulAdd : Test
267 | idfCacheIsClearedAfterASuccessfulAdd =
268 |     test "idfCache is cleared after a successful add document." <|
269 |         \() ->
270 |             getIndexDoc1Doc2 ()
271 |                 |> Index.search "banana" -- search before adding; presumably this fills the idf cache
272 |                 |> TestUtils.getResultIgnoreError
273 |                 |> Tuple.first -- keep the index returned by search, drop the hits
274 |                 |> Index.add doc3_
275 |                 |> TestUtils.getResultIgnoreError
276 |                 |> getIdfCache
277 |                 |> Dict.isEmpty
278 |                 |> Expect.equal True
279 |                 |> Expect.onFail "IdfCache should be cleared after document add"
280 | 
281 | 
282 | addDocWithIndexFieldsEmptyReturnsError : Test
283 | addDocWithIndexFieldsEmptyReturnsError =
284 |     test "Add a doc which has all index fields empty returns Err" <|
285 |         \_ ->
286 |             -- doc4 has an empty title and an empty body, the two indexed fields.
287 |             Index.add doc4_indexFieldsEmpty (getEmptyIndex ())
288 |                 |> TestUtils.getErrorIgnoreResult
289 |                 |> Expect.equal "Error after tokenisation there are no terms to index."
290 | 
291 | 
292 | addDocWithEmptyIdFieldReturnsError : Test
293 | addDocWithEmptyIdFieldReturnsError =
294 |     test "Add a doc empty ID field returns Err" <|
295 |         \_ ->
296 |             -- doc5_idEmpty has cid = "", which is the field the index uses as ref.
297 |             Index.add doc5_idEmpty (getEmptyIndex ())
298 |                 |> Expect.equal (Err "Error document has an empty unique id (ref).")
299 | 
300 | 
301 | addDocAlreadyInIndexReturnsError : Test
302 | addDocAlreadyInIndexReturnsError =
303 |     test "Add a doc already in index returns Err" <|
304 |         \() ->
305 |             getIndexDoc1Doc2Doc3 ()
306 |                 |> Index.add doc1_ -- doc1_ is already one of the three indexed docs
307 |                 |> TestUtils.getErrorIgnoreResult -- the asserted message keeps its "allready" spelling verbatim
308 |                 |> Expect.equal "Error adding document that allready exists."
309 | 
310 | 
311 | getIdfCache : Index doc -> Dict.Dict String Float
312 | getIdfCache (Index { idfCache }) =
313 |     idfCache -- unwrap the Index constructor to expose its idf cache for assertions
314 | 
315 | 
316 | removeDocRefNotIndexReturnsError : Test
317 | removeDocRefNotIndexReturnsError =
318 |     test "Remove a doc ref not in index returns Err." <|
319 |         \_ ->
320 |             -- Only doc1_ and doc2_ were added, so doc3_ is unknown to the index.
321 |             Index.remove doc3_ (getIndexDoc1Doc2 ())
322 |                 |> TestUtils.getErrorIgnoreResult
323 |                 |> Expect.equal "Error document is not in index."
324 | 
325 | 
326 | removeDocWithEmptyIdFieldReturnsError : Test
327 | removeDocWithEmptyIdFieldReturnsError =
328 |     test "Remove a doc with empty id field is an error." <|
329 |         \_ ->
330 |             -- The document's cid is "", and the expected error names the empty ref.
331 |             Index.remove doc5_idEmpty (getEmptyIndex ())
332 |                 |> Expect.equal (Err "Error document has an empty unique id (ref).")
333 | 
334 | 
335 | searchIndexAfterDocRemovedErrors : Test
336 | searchIndexAfterDocRemovedErrors =
337 |     test "Search index where 1 doc from index was removed fails" <|
338 |         \_ ->
339 |             let
340 |                 -- doc1_ was the only document added, and it is removed again here.
341 |                 emptied =
342 |                     TestUtils.getResultIgnoreError (Index.remove doc1_ (getIndexDoc1 ()))
343 |             in
344 |             Expect.equal "Error there are no documents in index to search." (TestUtils.getErrorIgnoreResult (Index.search "Sally" emptied))
345 | 
346 | 
347 | searchIndexAfter2DocRemovedErrors : Test
348 | searchIndexAfter2DocRemovedErrors =
349 |     test "Search Index where 2 docs from index removed fails" <|
350 |         \_ ->
351 |             let
352 |                 -- Remove a document and unwrap the Ok result in one step.
353 |                 removeOk doc =
354 |                     Index.remove doc >> TestUtils.getResultIgnoreError
355 |             in
356 |             Index.search "Sally" (getIndexDoc1Doc2 () |> removeOk doc1_ |> removeOk doc2_)
357 |                 |> TestUtils.getErrorIgnoreResult
358 |                 |> Expect.equal "Error there are no documents in index to search."
359 | 
360 | 
361 | removeDoesNotBreakSearchResults : Test
362 | removeDoesNotBreakSearchResults =
363 |     test "Remove does not break searching" <|
364 |         \_ ->
365 |             let
366 |                 -- Index that held doc1_ and doc2_, with doc2_ removed again.
367 |                 onlyDoc1 =
368 |                     TestUtils.getResultIgnoreError (Index.remove doc2_ (getIndexDoc1Doc2 ()))
369 |             in
370 |             TestUtils.getResultIgnoreError (Index.search "Sally" onlyDoc1)
371 |                 |> (Tuple.second >> List.map Tuple.first)
372 |                 |> Expect.equal [ doc1_.cid ]
373 | 
374 | 
375 | {-| Removing the only document drops it from the document store and empties the token trie.
376 | -}
377 | removeOnlyDocIndexReturnsIsEmpty : Test
378 | removeOnlyDocIndexReturnsIsEmpty =
379 |     let
380 |         -- Index that contained just doc1_, after doc1_ has been removed again.
381 |         emptiedIndex =
382 |             getIndexDoc1 ()
383 |                 |> Index.remove doc1_
384 |                 |> TestUtils.getResultIgnoreError
385 | 
386 |         -- Index has a single constructor, so its record can be destructured directly.
387 |         (Index { documentStore, tokenStore }) =
388 |             emptiedIndex
389 |     in
390 |     describe "removing a doc"
391 |         [ test "removes it from document store" <|
392 |             \_ ->
393 |                 -- doc1 must no longer be a key of the document store.
394 |                 Expect.equal False (Dict.member "doc1" documentStore)
395 |                     |> Expect.onFail "oops its in document store"
396 |         , test "removes trie nodes not leading to a reference. This is not testing trie, testing Index use of trie" <|
397 |             \_ ->
398 |                 -- With its only document gone, no token should remain in the trie.
399 |                 Expect.equal True (Trie.isEmpty tokenStore)
400 |                     |> Expect.onFail "Trie model is not empty"
401 |         ]
402 | 
403 | 
404 | addMultipleDocsReturnsErrorListForProblems : Test
405 | addMultipleDocsReturnsErrorListForProblems =
406 |     describe "addAllDocs Tests"
407 |         [ test "Add multiple docs returning list of docs with errors" <|
408 |             \_ ->
409 |                 -- The failing document sits at position 1 of the input list.
410 |                 Index.addDocs [ doc3_, doc4_indexFieldsEmpty ] (getEmptyIndex ())
411 |                     |> Tuple.second
412 |                     |> Expect.equal [ ( 1, "Error after tokenisation there are no terms to index." ) ]
413 |         , test "Add multiple docs returning list of errors swap order of documents." <|
414 |             \_ ->
415 |                 -- With the order swapped the same failure is reported at position 0.
416 |                 Index.addDocs [ doc4_indexFieldsEmpty, doc3_ ] (getEmptyIndex ())
417 |                     |> Tuple.second
418 |                     |> Expect.equal [ ( 0, "Error after tokenisation there are no terms to index." ) ]
419 |         ]
420 | 
421 | 
422 | helperAddDocsSearchIndexResults : String -> List doc -> Index doc -> List ( String, Float )
423 | helperAddDocsSearchIndexResults search docs index =
424 |     -- Add all docs (per-document errors are discarded), run the search, and
425 |     -- return only the hit list; getResultIgnoreError crashes if the search fails.
426 |     Index.addDocs docs index
427 |         |> Tuple.first
428 |         |> Index.search search
429 |         |> TestUtils.getResultIgnoreError
430 |         |> Tuple.second
431 | 
432 | 
433 | {-| Case from <https://github.com/rluiten/elm-text-search/issues/4>:
434 | two docs titled Question1 and Question2, where a search for "q"
435 | was not returning both documents.
436 | -}
437 | searchSingleLetterWithLetterInTitles : Test
438 | searchSingleLetterWithLetterInTitles =
439 |     test "search single letter reports both documents with word starting with that letter in title field" <|
440 |         \_ ->
441 |             helperAddDocsSearchIndexResults "q"
442 |                 [ { cid = "qdoc1"
443 |                   , title = "Question1"
444 |                   , author = "Sally Apples"
445 |                   , body = "Sally writes words about a grown banana."
446 |                   }
447 |                 , { cid = "qdoc2"
448 |                   , title = "Question2"
449 |                   , author = "John Banana"
450 |                   , body = "An example of apple engineering."
451 |                   }
452 |                 ]
453 |                 (getEmptyIndex ())
454 |                 |> List.map Tuple.first
455 |                 |> Expect.equal [ "qdoc1", "qdoc2" ]
456 | 
457 | 
458 | searchListFieldsSingleLetterWithLetterInBody : Test
459 | searchListFieldsSingleLetterWithLetterInBody =
460 |     test "search finds words in list fields body of MyDoc2" <|
461 |         \_ ->
462 |             helperAddDocsSearchIndexResults "green"
463 |                 [ { cid = "qdoc1"
464 |                   , title = "Question1 Notgreen"
465 |                   , author = "Sally Apples"
466 |                   , body =
467 |                         [ "Sally writes words about "
468 |                         , "a grown green banana."
469 |                         ]
470 |                   }
471 |                 , { cid = "qdoc2"
472 |                   , title = "Question2 Purple"
473 |                   , author = "John Banana"
474 |                   , body =
475 |                         [ "An example of "
476 |                         , "green apple engineering."
477 |                         ]
478 |                   }
479 |                 ]
480 |                 (getEmptyIndexMyDoc2 ())
481 |                 |> List.map Tuple.first
482 |                 |> Expect.equal [ "qdoc2", "qdoc1" ] -- both match via body; qdoc2 is expected first
483 | 
484 | 
485 | {-| Reproduces a reported bug: an index whose `fields` is [] and which only indexes the
486 | list-valued body must still produce scores that are real numbers rather than NaN.
487 | -}
488 | searchWithOnlyListFieldsIndexReturnsValidScores : Test
489 | searchWithOnlyListFieldsIndexReturnsValidScores =
490 |     test "search index with only List fields configured, check for NaN values in scores" <|
491 |         \_ ->
492 |             -- Only the score half of each hit matters for this check.
493 |             helperAddDocsSearchIndexResults "green"
494 |                 [ { cid = "qdoc1"
495 |                   , title = "Question1 Notgreen"
496 |                   , author = "Sally Apples"
497 |                   , body =
498 |                         [ "Sally writes words about "
499 |                         , "a grown green banana."
500 |                         ]
501 |                   }
502 |                 , { cid = "qdoc2"
503 |                   , title = "Question2 Purple"
504 |                   , author = "John Banana"
505 |                   , body =
506 |                         [ "An example of "
507 |                         , "green apple engineering."
508 |                         ]
509 |                   }
510 |                 ]
511 |                 (getEmptyIndexMyDoc2IndexOnlyListFields ())
512 |                 |> List.any (Tuple.second >> Basics.isNaN)
513 |                 |> Expect.equal False
514 |                 |> Expect.onFail "Expect searchScores to not contain any NaN values"
515 | 
516 | 
517 | addDocumentWithSameIdAsExistingReturnsError : Test
518 | addDocumentWithSameIdAsExistingReturnsError =
519 |     test "add same document to index produces error" <|
520 |         \_ ->
521 |             -- getIndexDoc1 already contains doc1_, so this is a second add of the same ref.
522 |             Index.add doc1_ (getIndexDoc1 ())
523 |                 |> TestUtils.getErrorIgnoreResult
524 |                 |> Expect.equal "Error adding document that allready exists."
525 | 
526 | 
527 | addOrUpdateDocWithSameIdReturnsSuccess : Test
528 | addOrUpdateDocWithSameIdReturnsSuccess =
529 |     test "addOrUpdate same document does not produce error" <|
530 |         \_ ->
531 |             -- doc1_ is already in the index; addOrUpdate is expected to accept it anyway.
532 |             Index.addOrUpdate doc1_ (getIndexDoc1 ())
533 |                 |> TestUtils.isOk
534 |                 |> Expect.equal True
535 |                 |> Expect.onFail "Expect Ok result to addOrUpdate if doc in index"
536 | 
537 | 
538 | addOrUpdateDocNotInIndexReturnsSuccess : Test
539 | addOrUpdateDocNotInIndexReturnsSuccess =
540 |     test "addOrUpdate document not in index updates index with new doc" <|
541 |         \_ ->
542 |             -- The index starts empty, so this exercises the "add" half of addOrUpdate.
543 |             Index.addOrUpdate doc1_ (getEmptyIndex ())
544 |                 |> TestUtils.isOk
545 |                 |> Expect.equal True
546 |                 |> Expect.onFail "Expect Ok result to addOrUpdate if doc is new"
547 | 
548 | 
549 | updateDocNotInIndexReturnsError : Test
550 | updateDocNotInIndexReturnsError =
551 |     test "index update with a doc not in index fails" <|
552 |         \_ ->
553 |             -- The index is empty, so there is no existing doc1_ to update.
554 |             Index.update doc1_ (getEmptyIndex ())
555 |                 |> TestUtils.isOk
556 |                 |> Expect.equal False
557 |                 |> Expect.onFail "Updating a doc not in index fails"
558 | 
559 | 
560 | {-| Updating a document must replace the old version of the document with the new one.
561 | 
562 | Written to confirm a bug noticed in the code before fixing it.
563 | 
564 | -}
565 | updateDocUsesNewDocContent : Test
566 | updateDocUsesNewDocContent =
567 |     let
568 |         -- Serialise an index to a compact JSON string so two indexes can be compared.
569 |         toJson index =
570 |             Encode.encode 0 (IndexEncoder.encoder index)
571 | 
572 |         -- Index holding qdoc1 and qdoc2 in their original form.
573 |         beforeUpdate =
574 |             Tuple.first <|
575 |                 Index.addDocs
576 |                     [ { cid = "qdoc1"
577 |                       , title = "Question1"
578 |                       , author = "Sally Apples"
579 |                       , body = "Sally writes words about a grown banana."
580 |                       }
581 |                     , { cid = "qdoc2"
582 |                       , title = "Question2"
583 |                       , author = "John Banana"
584 |                       , body = "An example of apple engineering."
585 |                       }
586 |                     ]
587 |                     (getEmptyIndex ())
588 | 
589 |         -- Same index after qdoc1 has been replaced with entirely different content.
590 |         afterUpdate =
591 |             TestUtils.getResultIgnoreError <|
592 |                 Index.update
593 |                     { cid = "qdoc1"
594 |                     , title = "Yesterday"
595 |                     , author = "New User"
596 |                     , body = "Completely different document really"
597 |                     }
598 |                     beforeUpdate
599 | 
600 |         encodedBefore =
601 |             toJson beforeUpdate
602 | 
603 |         encodedAfter =
604 |             toJson afterUpdate
605 |     in
606 |     test "updateDoc removes old doc and replaces it so index changes" <|
607 |         \_ ->
608 |             Expect.notEqual encodedAfter encodedBefore
609 | 


--------------------------------------------------------------------------------
/tests/IndexUtilsTests.elm:
--------------------------------------------------------------------------------
  1 | module IndexUtilsTests exposing
  2 |     ( testDefaultTransforms
  3 |     , testGetTokens
  4 |     , test_processTokens_filterFactories
  5 |     , test_processTokens_initialTransformFactories
  6 |     , test_processTokens_transformFactories
  7 |     )
  8 | 
  9 | import Expect
 10 | import Index exposing (Index)
 11 | import Index.Model exposing (FilterFactory, TransformFactory)
 12 | import Index.Utils
 13 | import StopWordFilter exposing (createFilterFunc)
 14 | import Test exposing (..)
 15 | 
 16 | type alias MyDoc = -- document shape used by the test indexes in this module
 17 |     { cid : String -- used as the index `ref`
 18 |     , title : String
 19 |     , author : String
 20 |     , body : String
 21 |     }
 22 | 
 23 | 
 24 | testDefaultTransforms : Test
 25 | testDefaultTransforms =
 26 |     describe "apply default transform tests"
 27 |         (List.map testGetTokens -- each tuple is ( test name, input text, expected tokens )
 28 |             [ ( "words of only non word chars removed"
 29 |               , "engineering ???"
 30 |               , [ "engin" ]
 31 |               )
 32 |             , ( "stemmer and non word chars removed"
 33 |               , ".This was very large.-"
 34 |               , [ "veri", "larg" ]
 35 |               )
 36 |             , ( "stop words removed"
 37 |               , "however among the dear .- -"
 38 |               , []
 39 |               )
 40 | 
 41 |             -- Regression case for bug https://github.com/rluiten/elm-text-search/issues/10
 42 |             , ( "\"on\" in the stop word list should not filter \"one\""
 43 |               , "one two three"
 44 |                 -- Note: the stemmer transforms "one" into "on", which must still be kept.
 45 |               , [ "on", "two", "three" ]
 46 |               )
 47 |             ]
 48 |         )
 48 |         )
 49 | 
 50 | 
 51 | testGetTokens : ( String, String, List String ) -> Test
 52 | testGetTokens ( name, input, expected ) =
 53 |     let
 54 |         -- Index built with Index.new; only its token processing is exercised here.
 55 |         defaultIndex =
 56 |             Index.new
 57 |                 { ref = .cid
 58 |                 , indexType = "- IndexTest Type -"
 59 |                 , fields = [ ( .title, 5 ), ( .body, 1 ) ]
 60 |                 , listFields = []
 61 |                 }
 62 | 
 63 |         label =
 64 |             String.concat [ "getTokens \"", input, "\" ", name ]
 65 |     in
 66 |     test label <|
 67 |         \_ ->
 68 |             -- getTokens returns a pair; the token list is its second element.
 69 |             Index.Utils.getTokens defaultIndex input
 70 |                 |> Tuple.second
 71 |                 |> Expect.equal expected
 72 | 
 73 | 
 74 | createTestIndex1 :
 75 |     List (TransformFactory MyDoc)
 76 |     -> List (TransformFactory MyDoc)
 77 |     -> List (FilterFactory MyDoc)
 78 |     -> Index MyDoc
 79 | createTestIndex1 initialTransformFactories transformFactories filterFactories =
 80 |     -- Index over title and body whose token processing stages are
 81 |     -- supplied by the caller and handed to Index.newWith unchanged.
 82 |     Index.newWith
 83 |         { ref = .cid
 84 |         , indexType = "- IndexTest Type -"
 85 |         , fields = [ ( .title, 5 ), ( .body, 1 ) ]
 86 |         , listFields = []
 87 |         -- the three processing stages come straight from the arguments
 88 |         , initialTransformFactories = initialTransformFactories
 89 |         , transformFactories = transformFactories
 90 |         , filterFactories = filterFactories
 91 |         }
 92 | 
 93 | 
 94 | test_processTokens_transformFactories : Test
 95 | test_processTokens_transformFactories =
 96 |     let
 97 |         -- Used twice below, so every token loses its last two characters.
 98 |         dropLast =
 99 |             Index.Utils.createFuncCreator (String.dropRight 1)
100 |     in
101 |     test "test processTokens transformFactories list" <|
102 |         \_ ->
103 |             -- "ca" has only two characters and is absent from the expected output.
104 |             Index.Utils.processTokens (createTestIndex1 [] [ dropLast, dropLast ] []) [ "awords", "btesting", "ca" ]
105 |                 |> Tuple.second
106 |                 |> Expect.equal [ "awor", "btesti" ]
107 | 
108 | 
109 | test_processTokens_initialTransformFactories : Test
110 | test_processTokens_initialTransformFactories =
111 |     let
112 |         -- Together these strip the first and the last character of every token.
113 |         initialStages =
114 |             [ Index.Utils.createFuncCreator (String.dropLeft 1)
115 |             , Index.Utils.createFuncCreator (String.dropRight 1)
116 |             ]
117 |     in
118 |     test "test processTokens initialTransformFactories list" <|
119 |         \_ ->
120 |             Index.Utils.processTokens (createTestIndex1 initialStages [] []) [ "pwords", "qtesting", "ra" ]
121 |                 |> Tuple.second
122 |                 |> Expect.equal [ "word", "testin" ]
123 | 
124 | 
125 | test_processTokens_filterFactories : Test
126 | test_processTokens_filterFactories =
127 |     let
128 |         -- Two filters, each created from a one-word list.
129 |         filters =
130 |             [ createFilterFunc [ "special" ]
131 |             , createFilterFunc [ "swimming" ]
132 |             ]
133 |     in
134 |     test "test processTokens filterFactories list" <|
135 |         \_ ->
136 |             Index.Utils.processTokens (createTestIndex1 [] [] filters) [ "word", "special", "puzzle", "swimming" ]
137 |                 |> Tuple.second
138 |                 |> Expect.equal [ "word", "puzzle" ]
139 | 


--------------------------------------------------------------------------------
/tests/SearchIndexTests.elm:
--------------------------------------------------------------------------------
  1 | module SearchIndexTests exposing (saveAndLoadSameTest, searchReturnsEmptyResult, searchReturnsValidResult)
  2 | 
  3 | {- Save and Load index check search results same -}
  4 | 
  5 | import ElmTextSearch
  6 | import Expect
  7 | import Index.Model exposing (Index(..), IndexSimpleConfig)
  8 | import Test exposing (..)
  9 | import TestUtils
 10 | 
 11 | 
 12 | type alias MyDoc = -- document shape indexed by the save/load tests in this module
 13 |     { cid : String -- used as `ref` in configElmTextSearchMyDoc
 14 |     , title : String
 15 |     , author : String
 16 |     , body : String
 17 |     }
 18 | 
 19 | 
 20 | configElmTextSearchMyDoc : IndexSimpleConfig MyDoc
 21 | configElmTextSearchMyDoc =
 22 |     { listFields = []
 23 |     , fields =
 24 |         [ ( .title, 5 )
 25 |         , ( .body, 1 )
 26 |         ]
 27 |     , ref = .cid -- documents are identified by cid
 28 |     }
 29 | 
 30 | 
 31 | doc1 : MyDoc
 32 | doc1 =
 33 |     { cid = "doc1" -- first fixture; contains neither "exile" nor "foreign"
 34 |     , title = "Examples of a Banana"
 35 |     , author = "Sally Apples"
 36 |     , body = "Sally writes words about a grown banana."
 37 |     }
 38 | 
 39 | 
 40 | doc2 : MyDoc
 41 | doc2 =
 42 |     { cid = "doc2"
 43 |     , title = "Words about a vehicle"
 44 |     , author = "John Barrel"
 45 |     , body = "All about a vehicle in exile." -- the only place "exile" occurs in the fixtures
 46 |     }
 47 | 
 48 | 
 49 | {-| Empty index built from the shared simple config.
 50 | -}
 51 | getEmptyIndex : () -> Index MyDoc
 52 | getEmptyIndex () =
 53 |     configElmTextSearchMyDoc |> ElmTextSearch.new
 54 | 
 55 | 
 56 | getIndexDoc1 : () -> Index MyDoc
 57 | getIndexDoc1 () =
 58 |     -- Empty index plus doc1; the Ok result is unwrapped.
 59 |     TestUtils.getResultIgnoreError <|
 60 |         ElmTextSearch.add doc1 (getEmptyIndex ())
 61 | 
 62 | 
 63 | getIndexDoc1Doc2 : () -> Index MyDoc
 64 | getIndexDoc1Doc2 () =
 65 |     -- The doc1 index with doc2 added on top.
 66 |     TestUtils.getResultIgnoreError <|
 67 |         ElmTextSearch.add doc2 (getIndexDoc1 ())
 68 | 
 69 | 
 70 | searchReturnsEmptyResult : Test
 71 | searchReturnsEmptyResult =
 72 |     test "Search returns empty result" <|
 73 |         \_ ->
 74 |             -- A term found in no document is still an Ok result, just with no hits.
 75 |             ElmTextSearch.search "foreign" (getIndexDoc1Doc2 ())
 76 |                 |> TestUtils.getResultIgnoreError
 77 |                 |> Tuple.second
 78 |                 |> Expect.equal []
 79 | 
 80 | 
 81 | searchReturnsValidResult : Test
 82 | searchReturnsValidResult =
 83 |     test "Search returns valid result" <|
 84 |         \() ->
 85 |             -- Scores are Floats: match the ref exactly but compare the score within a tolerance.
 86 |             case ElmTextSearch.search "exile" (getIndexDoc1Doc2 ()) |> TestUtils.getResultIgnoreError |> Tuple.second of
 87 |                 [ ( "doc2", score ) ] ->
 88 |                     Expect.within (Expect.Absolute 1.0e-9) 0.13898344497096093 score
 89 |                 hits -> Expect.fail ("Expected a single hit for doc2, got: " ++ Debug.toString hits)
 90 | 
 91 | 
 92 | {-| Run `search` against the given index and against a copy of it that has been
 93 | saved to a string and loaded back; returns both hit lists (original first). -}
 94 | searchIndexSearchSavedLoadedIndex : String -> Index MyDoc -> ( List ( String, Float ), List ( String, Float ) )
 95 | searchIndexSearchSavedLoadedIndex search index =
 96 |     let
 97 |         hitsFor anyIndex =
 98 |             Tuple.second (TestUtils.getResultIgnoreError (ElmTextSearch.search search anyIndex))
 99 | 
100 |         -- Round trip: serialise with storeToString, then rebuild with fromString.
101 |         reloaded =
102 |             index
103 |                 |> ElmTextSearch.storeToString
104 |                 |> ElmTextSearch.fromString configElmTextSearchMyDoc
105 |                 |> TestUtils.getResultIgnoreError
106 |     in
107 |     -- Original index first, reloaded index second.
108 |     ( hitsFor index, hitsFor reloaded )
109 | 
110 | 
111 | saveAndLoadSameTest : Test
112 | saveAndLoadSameTest =
113 |     let
114 |         -- Both hit lists for one query: ( original index, saved-and-reloaded index ).
115 |         roundTrip query =
116 |             searchIndexSearchSavedLoadedIndex query (getIndexDoc1Doc2 ())
117 | 
118 |         sameBeforeAndAfter query =
119 |             let
120 |                 ( original, reloaded ) =
121 |                     roundTrip query
122 |             in
123 |             Expect.equal original reloaded
124 |     in
125 |     describe "results same before and after save and load index"
126 |         [ test "x Search result of nothing for Index same as for Save and Loaded Index." <|
127 |             \_ -> sameBeforeAndAfter "foreign"
128 |         , test "x Search result of something for Index same as for Save and Loaded Index." <|
129 |             \_ -> sameBeforeAndAfter "exile"
130 |         ]
131 | 


--------------------------------------------------------------------------------
/tests/StopWordFilterTests.elm:
--------------------------------------------------------------------------------
 1 | module StopWordFilterTests exposing (newIndex, stopWordFilterTest, tests)
 2 | 
 3 | import ElmTextSearch
 4 | import Expect
 5 | import StopWordFilter
 6 | import Test exposing (..)
 7 | 
 8 | 
 9 | type alias ExampleDocType = -- document shape for the index built in newIndex
10 |     { cid : String -- used as the index `ref`
11 |     , title : String
12 |     , author : String
13 |     , body : String
14 |     }
15 | 
16 | 
17 | newIndex : ElmTextSearch.Index ExampleDocType
18 | newIndex =
19 |     -- Index created with ElmTextSearch.new; stopWordFilterTest passes it to
20 |     -- createDefaultFilterFunc to obtain the stop word filter.
21 |     ElmTextSearch.new
22 |         { ref = .cid
23 |         , listFields = []
24 |         , fields =
25 |             [ ( .title, 5 ), ( .body, 1 ) ]
26 |         }
27 | 
28 | 
29 | tests : Test
30 | tests =
31 |     StopWordFilter.stopEnglishWordList -- one generated test per word in this list
32 |         |> (List.map stopWordFilterTest >> describe "check stopEnglishWordList against default token processing")
33 | 
34 | 
35 | stopWordFilterTest : String -> Test
36 | stopWordFilterTest word =
37 |     let
38 |         -- createDefaultFilterFunc returns a pair; the filter predicate is its second element.
39 |         stopWordFilter =
40 |             Tuple.second (StopWordFilter.createDefaultFilterFunc newIndex)
41 |     in
42 |     test ("This word \"" ++ word ++ "\" got past default stop word filter in error.") <|
43 |         \_ ->
44 |             Expect.equal False (stopWordFilter word)
45 |                 |> Expect.onFail "These should all be stopped"
46 | 


--------------------------------------------------------------------------------
/tests/TestUtils.elm:
--------------------------------------------------------------------------------
 1 | module TestUtils exposing
 2 |     ( expectOkWithGoodFailMessage
 3 |     , getDecodeErrorFailureMessage
 4 |     , getErrorIgnoreResult
 5 |     , getResultIgnoreError
 6 |     , isErr
 7 |     , isOk
 8 |     )
 9 | 
10 | {-| Utilities to make test cases simpler.
11 | -}
12 | 
13 | import Expect
14 | import Index
15 | import Index.Model exposing (Index(..))
16 | import Json.Decode exposing (Error(..))
17 | import Test exposing (..)
18 | 
19 | 
{-| Pass when the decode result is `Ok`; otherwise fail with a message that
includes the text produced by `getDecodeErrorFailureMessage`.
-}
expectOkWithGoodFailMessage : Result Error a -> Expect.Expectation
expectOkWithGoodFailMessage result =
    case result of
        Err decodeError ->
            Expect.fail
                ("Result Err not expected: "
                    ++ getDecodeErrorFailureMessage decodeError
                )

        Ok _ ->
            Expect.onFail "Result OK as expected" Expect.pass
33 | 
34 | 
{-| Unwrap the `Ok` value of a result.

Crashes via `Debug.todo` when given an `Err`; intended for tests only.

-}
getResultIgnoreError : Result error a -> a
getResultIgnoreError result =
    case result of
        Err _ ->
            Debug.todo "Ignoring failure for testing"

        Ok okValue ->
            okValue
43 | 
44 | 
{-| Unwrap the `Err` payload of a result.

Crashes via `Debug.todo` when given an `Ok`; intended for tests only.

-}
getErrorIgnoreResult : Result error a -> error
getErrorIgnoreResult result =
    case result of
        Err errValue ->
            errValue

        Ok _ ->
            Debug.todo "Ignoring value for testing"
53 | 
54 | 
{-| Turn a JSON decode `Error` into a message suitable for test output.

A top level `Failure` yields exactly its message, so tests that compare
against a decoder's own failure text keep working unchanged.

Every other error shape (`Field`, `Index`, `OneOf`) is rendered with
`Json.Decode.errorToString`. Previously these crashed the test run through
`Debug.todo`, hiding the actual decode problem.

-}
getDecodeErrorFailureMessage : Error -> String
getDecodeErrorFailureMessage error =
    case error of
        Failure message _ ->
            message

        _ ->
            -- Nested or combined errors: let elm/json describe the path and cause.
            Json.Decode.errorToString error
63 | 
64 | 
{-| `True` when the result is `Ok`, `False` when it is `Err`.
-}
isOk : Result e a -> Bool
isOk x =
    x
        |> Result.map (always True)
        |> Result.withDefault False
73 | 
74 | 
{-| `True` when the result is `Err`, `False` when it is `Ok`.
-}
isErr : Result e a -> Bool
isErr x =
    x
        |> Result.map (always False)
        |> Result.withDefault True
83 | 


--------------------------------------------------------------------------------
/tests/TokenProcessorTests.elm:
--------------------------------------------------------------------------------
 1 | module TokenProcessorTests exposing (tokenizerTest, tokenizerTests, trimmerTest, trimmerTests)
 2 | 
 3 | import Expect
 4 | import Test exposing (..)
 5 | import TokenProcessors
 6 | 
 7 | 
 8 | tokenizerTests : Test
 9 | tokenizerTests =
10 |     describe "Lunr TokenProcessors tokenizer tests" <|
11 |         List.map tokenizerTest
12 |             [ ( "splitting simple strings into tokens"
13 |               , "this is a simple string"
14 |               , [ "this", "is", "a", "simple", "string" ]
15 |               )
16 |             , ( "downcasing tokens"
17 |               , "FOO BAR"
18 |               , [ "foo", "bar" ]
19 |               )
20 |             , ( "splitting strings with hyphens"
21 |               , "take the New York-San Francisco flight"
22 |               , [ "take", "the", "new", "york", "san", "francisco", "flight" ]
23 |               )
24 |             , ( "splitting strings with hyphens and spaces"
25 |               , "Solve for A - B"
26 |               , [ "solve", "for", "a", "b" ]
27 |               )
28 |             , ( "leading - in query should not cause extra token ?"
29 |               , "-misery! .appeal,"
30 |               , [ "misery!", ".appeal," ]
31 |               )
32 |             ]
33 | 
34 | 
{-| Build a named test checking that `TokenProcessors.tokenizer` applied to
`testString` produces exactly `expectedTokens`.
-}
tokenizerTest : ( String, String, List String ) -> Test
tokenizerTest ( name, testString, expectedTokens ) =
    test name
        (\_ ->
            TokenProcessors.tokenizer testString
                |> Expect.equal expectedTokens
        )
42 | 
43 | 
{-| Table driven tests for `TokenProcessors.trimmer`.

Each case is `( input, expected output )` and is turned into a test by
`trimmerTest`.

-}
trimmerTests : Test
trimmerTests =
    let
        cases =
            [ ( "023hello", "023hello" )
            , ( "=hello", "hello" )
            , ( "hello.", "hello" )
            , ( ",hello,", "hello" )
            , ( ",hello_,", "hello_" )
            , ( "40%", "40" )
            ]
    in
    cases
        |> List.map trimmerTest
        |> describe "Lunr TokenProcessors trimmer tests"
55 | 
56 | 
{-| Build a test checking that `TokenProcessors.trimmer` applied to
`testString` yields `expectedString`; the test name shows both values.
-}
trimmerTest : ( String, String ) -> Test
trimmerTest ( testString, expectedString ) =
    let
        testName =
            String.concat [ "trimmer ", testString, " -> ", expectedString ]
    in
    test testName
        (\_ ->
            TokenProcessors.trimmer testString
                |> Expect.equal expectedString
        )
64 | 


--------------------------------------------------------------------------------