├── .gitignore ├── LICENSE ├── README.md ├── TODO.md ├── elm.json ├── examples ├── elm.json └── src │ ├── IndexNewAddSearch.elm │ ├── IndexNewAddSearchListFields.elm │ ├── IndexNewWithAddSearch.elm │ └── MultipleAddSearch.elm ├── src ├── ElmTextSearch.elm ├── ElmTextSearch │ └── Json │ │ ├── Decoder.elm │ │ └── Encoder.elm ├── ElmTextSearchErrors.elm ├── Index.elm ├── Index │ ├── Defaults.elm │ ├── Load.elm │ ├── Model.elm │ ├── Utils.elm │ └── Vector.elm ├── StopWordFilter.elm ├── TokenProcessors.elm └── Utils.elm └── tests ├── DefaultTests.elm ├── ElmTextSearchTests.elm ├── IndexDecoderTests.elm ├── IndexEncoderTests.elm ├── IndexLoadTests.elm ├── IndexTests.elm ├── IndexUtilsTests.elm ├── SearchIndexTests.elm ├── StopWordFilterTests.elm ├── TestUtils.elm └── TokenProcessorTests.elm /.gitignore: -------------------------------------------------------------------------------- 1 | elm-stuff 2 | lunr.js 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Robin Luiten www.github.com/rluiten 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of ElmTextSearch nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ElmTextSearch full text indexer 2 | 3 | Copyright (c) 2025 Robin Luiten 4 | 5 | This is a full text indexing engine inspired by lunr.js and written in Elm language. 6 | See http://lunrjs.com/ for lunr.js 7 | 8 | I am happy to hear about users of this package. 9 | 10 | I am happy to receive contributions be they bug reports, pull requests, documentation updates or examples. 11 | 12 | ### v4.0.0 will not load indexes saved with old version. 13 | 14 | If you do not use `storeToValue` `storeToString` `fromString` `fromValue` in ElmTextSearch this update is not likely to introduce issues. 15 | 16 | The way that filters and transforms are applied to the content of documents has changed. 17 | This is to properly fix a bug reported see https://github.com/rluiten/elm-text-search/issues/10 where stop word filters were not correctly applied. This means saved indexes from previous version of ElmTextSearch will not load in this version. 18 | 19 | * `Defaults.indexVersion` has changed value. 
20 | 21 | The reason this is a Major version bump is some generalisation was done to enable future support 22 | for loading and saving of older version and types of default index configurations. 23 | 24 | ### v5.0.0 updates for Elm 0.19 25 | 26 | Result types from loading indexes are now Decode.Error not String. 27 | 28 | ### v5.0.2, v5.1.0 29 | 30 | New functions addT for add, searchT for search and removeT for remove. 31 | These replace the error type of result with a type. 32 | v5.0.2 was a goof on my part i forgot to expose new functions correctly. 33 | 34 | ### v5.1.1 35 | 36 | Fixed a bug in Trie that affect this when removing documents quite seriously. 37 | New version of Trie, a few extra tests created. 38 | 39 | ### Packages 40 | 41 | Several packages were created for this project and published separately for this package to depend on. 42 | 43 | * trie 44 | * http://package.elm-lang.org/packages/rluiten/trie/latest 45 | * stemmer 46 | * http://package.elm-lang.org/packages/rluiten/stemmer/latest 47 | * sparsevector 48 | * http://package.elm-lang.org/packages/rluiten/sparsevector/latest 49 | 50 | ### Parts of lunr.js were left out 51 | 52 | * This does not have an event system. 53 | * Its internal data structure is not compatible. 54 | 55 | ### Notes captured along way writing this. 56 | 57 | * lunr.js 58 | * tokenStore.remove does not decrement length, but it doesn't use length really only save/load 59 | * stemmer "lay" -> "lay" "try" -> "tri" is opposite to porter stemmer 60 | * porter stemmer erlang implementation 61 | * step5b does not use endsWithDoubleCons which is required afaik to pass the voc.txt output.txt cases 62 | 63 | 64 | ### Example 65 | 66 | See examples folder for four examples. 67 | You can run any of the examples if you navigate to the examples folder and run `elm reactor` and select an example in the src folder. 68 | 69 | First example is included inline here. 
70 | 71 | IndexNewAddSearch.elm 72 | ```elm 73 | module Main exposing (ExampleDocType, createNewIndexExample, main, resultSearchIndex, resultUpdatedMyIndexAfterAdd) 74 | 75 | {-| Create an index and add a document, search a document 76 | 77 | Copyright (c) 2025 Robin Luiten 78 | 79 | -} 80 | 81 | import Browser 82 | import ElmTextSearch 83 | import Html exposing (Html, button, div, text) 84 | 85 | 86 | {-| Example document type. 87 | -} 88 | type alias ExampleDocType = 89 | { cid : String 90 | , title : String 91 | , author : String 92 | , body : String 93 | } 94 | 95 | 96 | {-| Create an index with default configuration. 97 | See ElmTextSearch.SimpleConfig documentation for parameter information. 98 | -} 99 | createNewIndexExample : ElmTextSearch.Index ExampleDocType 100 | createNewIndexExample = 101 | ElmTextSearch.new 102 | { ref = .cid 103 | , fields = 104 | [ ( .title, 5.0 ) 105 | , ( .body, 1.0 ) 106 | ] 107 | , listFields = [] 108 | } 109 | 110 | 111 | {-| Add a document to an index. 112 | -} 113 | resultUpdatedMyIndexAfterAdd : Result String (ElmTextSearch.Index ExampleDocType) 114 | resultUpdatedMyIndexAfterAdd = 115 | ElmTextSearch.add 116 | { cid = "id1" 117 | , title = "First Title" 118 | , author = "Some Author" 119 | , body = "Words in this example document with explanations." 120 | } 121 | createNewIndexExample 122 | 123 | 124 | {-| Search the index. 125 | 126 | The result includes an updated Index because a search causes internal 127 | caches to be updated to improve overall performance. 128 | 129 | -} 130 | resultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) ) 131 | resultSearchIndex = 132 | resultUpdatedMyIndexAfterAdd 133 | |> Result.andThen 134 | (ElmTextSearch.search "explanations") 135 | 136 | 137 | {-| Display search result. 
138 | -} 139 | main = 140 | Browser.sandbox { init = 0, update = update, view = view } 141 | 142 | 143 | type Msg 144 | = DoNothing 145 | 146 | 147 | update msg model = 148 | case msg of 149 | DoNothing -> 150 | model 151 | 152 | 153 | view model = 154 | let 155 | -- want only the search results not the returned index 156 | searchResults = 157 | Result.map Tuple.second resultSearchIndex 158 | in 159 | div [] 160 | [ text 161 | ("Result of searching for \"explanations\" is " 162 | ++ Debug.toString searchResults 163 | ) 164 | ] 165 | ``` 166 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | TODO - TokenProcessor look at elm/parser instead of regex looks better and probably more efficient than regex. 2 | 3 | TODO - Idea store index to corpus tokens for each document not the words. maybe worth it. 4 | 5 | -------------------------------------------------------------------------------- /elm.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "package", 3 | "name": "rluiten/elm-text-search", 4 | "summary": "Full text index engine in Elm language inspired by lunr.js.", 5 | "license": "BSD-3-Clause", 6 | "version": "5.1.1", 7 | "exposed-modules": [ 8 | "ElmTextSearch", 9 | "ElmTextSearchErrors", 10 | "Index.Defaults", 11 | "StopWordFilter", 12 | "ElmTextSearch.Json.Encoder", 13 | "ElmTextSearch.Json.Decoder" 14 | ], 15 | "elm-version": "0.19.0 <= v < 0.20.0", 16 | "dependencies": { 17 | "NoRedInk/elm-json-decode-pipeline": "1.0.0 <= v < 2.0.0", 18 | "elm/core": "1.0.0 <= v < 2.0.0", 19 | "elm/json": "1.0.0 <= v < 2.0.0", 20 | "elm/regex": "1.0.0 <= v < 2.0.0", 21 | "rluiten/sparsevector": "1.0.3 <= v < 2.0.0", 22 | "rluiten/stemmer": "1.0.4 <= v < 2.0.0", 23 | "rluiten/trie": "2.1.1 <= v < 3.0.0" 24 | }, 25 | "test-dependencies": { 26 | "elm-explorations/test": "2.0.0 <= v <= 2.2.0" 27 | } 28 | 
} 29 | -------------------------------------------------------------------------------- /examples/elm.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "application", 3 | "source-directories": [ 4 | "src", 5 | "../src" 6 | ], 7 | "elm-version": "0.19.0", 8 | "dependencies": { 9 | "direct": { 10 | "NoRedInk/elm-json-decode-pipeline": "1.0.0", 11 | "elm/browser": "1.0.0", 12 | "elm/core": "1.0.0", 13 | "elm/html": "1.0.0", 14 | "elm/json": "1.0.0", 15 | "elm/regex": "1.0.0", 16 | "rluiten/sparsevector": "1.0.3", 17 | "rluiten/stemmer": "1.0.4", 18 | "rluiten/trie": "2.0.3" 19 | }, 20 | "indirect": { 21 | "elm/time": "1.0.0", 22 | "elm/url": "1.0.0", 23 | "elm/virtual-dom": "1.0.0" 24 | } 25 | }, 26 | "test-dependencies": { 27 | "direct": {}, 28 | "indirect": {} 29 | } 30 | } -------------------------------------------------------------------------------- /examples/src/IndexNewAddSearch.elm: -------------------------------------------------------------------------------- 1 | module Main exposing (ExampleDocType, createNewIndexExample, main, resultSearchIndex, resultUpdatedMyIndexAfterAdd) 2 | 3 | {-| Create an index and add a document, search a document 4 | 5 | Copyright (c) 2016 Robin Luiten 6 | 7 | -} 8 | 9 | import Browser 10 | import ElmTextSearch 11 | import Html exposing (Html, button, div, text) 12 | 13 | 14 | {-| Example document type. 15 | -} 16 | type alias ExampleDocType = 17 | { cid : String 18 | , title : String 19 | , author : String 20 | , body : String 21 | } 22 | 23 | 24 | {-| Create an index with default configuration. 25 | See ElmTextSearch.SimpleConfig documentation for parameter information. 26 | -} 27 | createNewIndexExample : ElmTextSearch.Index ExampleDocType 28 | createNewIndexExample = 29 | ElmTextSearch.new 30 | { ref = .cid 31 | , fields = 32 | [ ( .title, 5.0 ) 33 | , ( .body, 1.0 ) 34 | ] 35 | , listFields = [] 36 | } 37 | 38 | 39 | {-| Add a document to an index. 
40 | -} 41 | resultUpdatedMyIndexAfterAdd : Result String (ElmTextSearch.Index ExampleDocType) 42 | resultUpdatedMyIndexAfterAdd = 43 | ElmTextSearch.add 44 | { cid = "id1" 45 | , title = "First Title" 46 | , author = "Some Author" 47 | , body = "Words in this example document with explanations." 48 | } 49 | createNewIndexExample 50 | 51 | 52 | {-| Search the index. 53 | 54 | The result includes an updated Index because a search causes internal 55 | caches to be updated to improve overall performance. 56 | 57 | -} 58 | resultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) ) 59 | resultSearchIndex = 60 | resultUpdatedMyIndexAfterAdd 61 | |> Result.andThen 62 | (ElmTextSearch.search "explanations") 63 | 64 | 65 | {-| Display search result. 66 | -} 67 | main = 68 | Browser.sandbox { init = 0, update = update, view = view } 69 | 70 | 71 | type Msg 72 | = DoNothing 73 | 74 | 75 | update msg model = 76 | case msg of 77 | DoNothing -> 78 | model 79 | 80 | 81 | view model = 82 | let 83 | -- want only the search results not the returned index 84 | searchResults = 85 | Result.map Tuple.second resultSearchIndex 86 | in 87 | div [] 88 | [ text 89 | ("Result of searching for \"explanations\" is " 90 | ++ Debug.toString searchResults 91 | ) 92 | ] 93 | -------------------------------------------------------------------------------- /examples/src/IndexNewAddSearchListFields.elm: -------------------------------------------------------------------------------- 1 | module Main exposing (ExampleDocType, createNewIndexExample, main, resultSearchIndex, resultUpdatedMyIndexAfterAdd) 2 | 3 | {-| Create an index and add a document, search a document 4 | This variation indexes words from a field which is List String. 5 | 6 | Copyright (c) 2016 Robin Luiten 7 | 8 | -} 9 | 10 | import Browser 11 | import ElmTextSearch 12 | import Html exposing (Html, div, text) 13 | 14 | 15 | {-| Example document type. 
16 | -} 17 | type alias ExampleDocType = 18 | { cid : String 19 | , title : String 20 | , author : String 21 | , body : List String 22 | } 23 | 24 | 25 | {-| Create an index with default configuration. 26 | See ElmTextSearch.SimpleConfig documentation for parameter information. 27 | -} 28 | createNewIndexExample : ElmTextSearch.Index ExampleDocType 29 | createNewIndexExample = 30 | ElmTextSearch.new 31 | { ref = .cid 32 | , fields = 33 | [ ( .title, 5.0 ) 34 | ] 35 | , listFields = 36 | [ ( .body, 1.0 ) 37 | ] 38 | } 39 | 40 | 41 | {-| Add a document to an index. 42 | -} 43 | resultUpdatedMyIndexAfterAdd : Result String (ElmTextSearch.Index ExampleDocType) 44 | resultUpdatedMyIndexAfterAdd = 45 | ElmTextSearch.add 46 | { cid = "id1" 47 | , title = "First Title" 48 | , author = "Some Author" 49 | , body = 50 | [ "Words in this example " 51 | , "document with explanations." 52 | ] 53 | } 54 | createNewIndexExample 55 | 56 | 57 | {-| Search the index. 58 | 59 | The result includes an updated Index because a search causes internal 60 | caches to be updated to improve overall performance. 61 | 62 | -} 63 | resultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) ) 64 | resultSearchIndex = 65 | resultUpdatedMyIndexAfterAdd 66 | |> Result.andThen 67 | (ElmTextSearch.search "explanations") 68 | 69 | 70 | {-| Display search result. 71 | -} 72 | main = 73 | Browser.sandbox { init = 0, update = update, view = view } 74 | 75 | 76 | type Msg 77 | = DoNothing 78 | 79 | 80 | update msg model = 81 | case msg of 82 | DoNothing -> 83 | model 84 | 85 | {-| Display search result. 
86 | -} 87 | view model = 88 | let 89 | -- want only the search results not the returned index 90 | searchResults = 91 | Result.map Tuple.second resultSearchIndex 92 | in 93 | div [] 94 | [ text 95 | ("Result of searching for \"explanations\" is " 96 | ++ Debug.toString searchResults 97 | ) 98 | ] 99 | -------------------------------------------------------------------------------- /examples/src/IndexNewWithAddSearch.elm: -------------------------------------------------------------------------------- 1 | module Main exposing (ExampleDocType, addDocToIndexExample, createMyStopWordFilter, createNewWithIndexExample, firstResultSearchIndex, main, secondResultSearchIndex) 2 | 3 | {-| Create an index with customized stop word filter using 4 | ElmTextSearch.newWith. 5 | 6 | Copyright (c) 2016 Robin Luiten 7 | 8 | -} 9 | 10 | import Browser 11 | import ElmTextSearch 12 | import Html exposing (Html, div, text) 13 | import Index.Defaults 14 | import StopWordFilter 15 | 16 | 17 | {-| Example document type. 18 | -} 19 | type alias ExampleDocType = 20 | { cid : String 21 | , title : String 22 | , author : String 23 | , body : String 24 | } 25 | 26 | 27 | {-| Create an extended stop word filter. 28 | 29 | Be careful about adding words to your stop word list, as any stop word 30 | will not be indexed and you will not be able to search for the word in 31 | documents as it will not be found. 32 | 33 | It is possible to completely replace the stop word list and not 34 | just extend it. 35 | 36 | -} 37 | createMyStopWordFilter = 38 | StopWordFilter.createFilterFuncWith 39 | [ "explanations" ] 40 | 41 | 42 | {-| Create an index with extra options. 43 | 44 | - In this case a customized stop word filter is provided. 45 | - It is supplying the default transform factories. 46 | - It supplies an index type for the customized index config. 47 | This becomes important when loading back saved index. 
48 | - It is a good idea to include a version in your index type string 49 | in case you update things and might still have old versions 50 | around that you need to work with. 51 | 52 | -} 53 | createNewWithIndexExample : ElmTextSearch.Index ExampleDocType 54 | createNewWithIndexExample = 55 | ElmTextSearch.newWith 56 | { indexType = "ElmTextSearch - Customized Stop Words v1" 57 | , ref = .cid 58 | , fields = 59 | [ ( .title, 5.0 ) 60 | , ( .body, 1.0 ) 61 | ] 62 | , listFields = [] 63 | , initialTransformFactories = Index.Defaults.defaultInitialTransformFactories 64 | , transformFactories = Index.Defaults.defaultTransformFactories 65 | , filterFactories = [ createMyStopWordFilter ] 66 | } 67 | 68 | 69 | {-| Adding a document to the index. 70 | -} 71 | addDocToIndexExample : Result String (ElmTextSearch.Index ExampleDocType) 72 | addDocToIndexExample = 73 | ElmTextSearch.add 74 | { cid = "id1" 75 | , title = "First Title" 76 | , author = "Some Author" 77 | , body = "Words in this example document with explanations." 78 | } 79 | createNewWithIndexExample 80 | 81 | 82 | {-| Search the index for a word in our extended stop words. 83 | This will return no matches. 84 | -} 85 | firstResultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) ) 86 | firstResultSearchIndex = 87 | addDocToIndexExample 88 | |> Result.andThen 89 | (ElmTextSearch.search "explanation") 90 | 91 | 92 | {-| Search the index for a word that is not a stop word. 93 | It will return an Err about no search terms. 94 | -} 95 | secondResultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) ) 96 | secondResultSearchIndex = 97 | addDocToIndexExample 98 | |> Result.andThen 99 | (ElmTextSearch.search "examples") 100 | 101 | 102 | {-| Display search result. 
103 | -} 104 | main = 105 | Browser.sandbox { init = 0, update = update, view = view } 106 | 107 | 108 | type Msg 109 | = DoNothing 110 | 111 | 112 | update msg model = 113 | case msg of 114 | DoNothing -> 115 | model 116 | 117 | 118 | view model = 119 | let 120 | searchResults1 = 121 | Result.map Tuple.second firstResultSearchIndex 122 | 123 | searchResults2 = 124 | Result.map Tuple.second secondResultSearchIndex 125 | in 126 | div [] 127 | [ div [] 128 | [ text 129 | ("Expecting no matches (because explanation is in stop words). Result of first search for \"explanation\" is " 130 | ++ Debug.toString searchResults1 131 | ) 132 | ] 133 | , div [] 134 | [ text 135 | ("Result of second search for \"examples\" is " 136 | ++ Debug.toString searchResults2 137 | ) 138 | ] 139 | ] 140 | -------------------------------------------------------------------------------- /examples/src/MultipleAddSearch.elm: -------------------------------------------------------------------------------- 1 | module Main exposing (ExampleDocType, createNewIndexExample, documents, indexWithMulitpleDocumentsAdded, main, resultSearchIndex) 2 | 3 | {-| Create an index and add multiple documents. 4 | 5 | Copyright (c) 2016 Robin Luiten 6 | 7 | -} 8 | 9 | import Browser 10 | import ElmTextSearch 11 | import Html exposing (Html, div, text) 12 | 13 | 14 | {-| Example document type. 15 | -} 16 | type alias ExampleDocType = 17 | { cid : String 18 | , title : String 19 | , author : String 20 | , body : String 21 | } 22 | 23 | 24 | {-| Create an index with default configuration. 25 | See ElmTextSearch.SimpleConfig documentation for parameter information. 
26 | -} 27 | createNewIndexExample : ElmTextSearch.Index ExampleDocType 28 | createNewIndexExample = 29 | ElmTextSearch.new 30 | { ref = .cid 31 | , fields = 32 | [ ( .title, 5.0 ) 33 | , ( .body, 1.0 ) 34 | ] 35 | , listFields = [] 36 | } 37 | 38 | 39 | documents = 40 | [ { cid = "id1" 41 | , title = "First Title" 42 | , author = "Some Author" 43 | , body = "Words in this example document with explanations." 44 | } 45 | , { cid = "id2" 46 | , title = "Is a cactus as pretty as a tree ?" 47 | , author = "Joe Greeny" 48 | , body = "This title contains information about cactuses." 49 | } 50 | ] 51 | 52 | 53 | {-| Add multiple documents to index. 54 | 55 | Adding does not stop at the first Err; each failure is returned in the list as a document index and error message pair. 56 | 57 | -} 58 | indexWithMulitpleDocumentsAdded : ( ElmTextSearch.Index ExampleDocType, List ( Int, String ) ) 59 | indexWithMulitpleDocumentsAdded = 60 | ElmTextSearch.addDocs 61 | documents 62 | createNewIndexExample 63 | 64 | 65 | {-| Search the index. 66 | 67 | The result includes an updated Index because a search causes internal 68 | caches to be updated to improve overall performance. 69 | 70 | This is ignoring any errors from call to addDocs 71 | in indexWithMulitpleDocumentsAdded. 72 | 73 | -} 74 | resultSearchIndex : Result String ( ElmTextSearch.Index ExampleDocType, List ( String, Float ) ) 75 | resultSearchIndex = 76 | ElmTextSearch.search "title" (Tuple.first indexWithMulitpleDocumentsAdded) 77 | 78 | 79 | {-| Display search result. 
80 | -} 81 | main = 82 | Browser.sandbox { init = 0, update = update, view = view } 83 | 84 | 85 | type Msg 86 | = DoNothing 87 | 88 | 89 | update msg model = 90 | case msg of 91 | DoNothing -> 92 | model 93 | 94 | 95 | view model = 96 | let 97 | -- want only the search results not the returned index 98 | searchResults = 99 | Result.map Tuple.second resultSearchIndex 100 | in 101 | div [] 102 | [ text 103 | ("Result of searching for \"title\" is " 104 | ++ Debug.toString searchResults 105 | ) 106 | ] 107 | -------------------------------------------------------------------------------- /src/ElmTextSearch.elm: -------------------------------------------------------------------------------- 1 | module ElmTextSearch exposing 2 | ( new 3 | , newWith 4 | , add 5 | , addT 6 | , addDocs 7 | , remove 8 | , removeT 9 | , update 10 | , addOrUpdate 11 | , search 12 | , searchT 13 | , Index 14 | , Config 15 | , SimpleConfig 16 | , storeToValue 17 | , storeToString 18 | , fromString 19 | , fromValue 20 | , fromStringWith 21 | , fromValueWith 22 | ) 23 | 24 | {-| A full text indexer written in Elm language inspired by lunr.js. 25 | 26 | A useful article about lunr.js 27 | 28 | 29 | 30 | ## Create Index 31 | 32 | @docs new 33 | @docs newWith 34 | 35 | 36 | ## Modify Index 37 | 38 | @docs add 39 | @docs addT 40 | @docs addDocs 41 | @docs remove 42 | @docs removeT 43 | @docs update 44 | @docs addOrUpdate 45 | 46 | 47 | ## Query Index 48 | 49 | @docs search 50 | @docs searchT 51 | 52 | 53 | ## Types 54 | 55 | @docs Index 56 | @docs Config 57 | @docs SimpleConfig 58 | 59 | 60 | ## Save and Load an Index 61 | 62 | - You can save an index using [`ElmTextSearch.Json.Encoder.encoder`](ElmTextSearch.Json.Encoder#encoder) 63 | - You can load a saved index using 64 | [`ElmTextSearch.Json.Decoder.decoder`](ElmTextSearch.Json.Decoder#decoder) 65 | to produce a [`Index.Model.CodecIndexRecord`](Index.Model#CodecIndexRecord). 
66 | - You can save a [`Index.Model.CodecIndexRecord`](Index.Model#CodecIndexRecord) 67 | using [`ElmTextSearch.Json.Encoder.codecIndexRecordEncoder`](ElmTextSearch.Json.Encoder#codecIndexRecordEncoder) 68 | - \*\* Modifying an index outside of ElmTextSearch using the Decoder and Encoder directly 69 | may cause it to not work correctly loaded into ElmTextSearch. \*\* 70 | 71 | @docs storeToValue 72 | @docs storeToString 73 | @docs fromString 74 | @docs fromValue 75 | @docs fromStringWith 76 | @docs fromValueWith 77 | 78 | Copyright (c) 2016 Robin Luiten 79 | 80 | -} 81 | 82 | import ElmTextSearch.Json.Encoder as IndexEncoder 83 | import ElmTextSearchErrors 84 | import Index 85 | import Index.Defaults as Defaults 86 | import Index.Load 87 | import Index.Model as Model 88 | import Json.Decode as Decode 89 | import Json.Encode as Encode 90 | 91 | 92 | {-| An Index holds the data to be able search for added documents. 93 | -} 94 | type alias Index doc = 95 | Index.Index doc 96 | 97 | 98 | {-| A SimpleConfig is the least amount of configuration data 99 | required to create an Index. 100 | 101 | See [`ElmTextSearch.new`](ElmTextSearch#new) for fields. 102 | 103 | -} 104 | type alias SimpleConfig doc = 105 | Model.IndexSimpleConfig doc 106 | 107 | 108 | {-| A Config is required to create an Index. 109 | -} 110 | type alias Config doc = 111 | Model.Config doc 112 | 113 | 114 | {-| Create new index. 115 | 116 | Example 117 | 118 | import ElmTextSearch 119 | 120 | {-| Example document type. 121 | -} 122 | type alias ExampleDocType = 123 | { cid : String 124 | , title : String 125 | , author : String 126 | , body : String 127 | } 128 | 129 | {-| Create an index with default configuration. 130 | See ElmTextSearch.SimpleConfig documentation for parameter information. 
131 | -} 132 | createNewIndexExample : ElmTextSearch.Index ExampleDocType 133 | createNewIndexExample = 134 | ElmTextSearch.new 135 | { ref = .cid 136 | , fields = 137 | [ ( .title, 5.0 ) 138 | , ( .body, 1.0 ) 139 | ] 140 | , listFields = [] 141 | } 142 | 143 | The `SimpleConfig` parameter to new is 144 | 145 | - ref 146 | - The unique document reference will be extracted from each 147 | document using `.cid`. 148 | - fields 149 | - Define which fields contain a strings to be indexed. 150 | - The following fields will be indexed from each document 151 | - `.title` 152 | - `.body` 153 | - When searching the index any word matches found in the 154 | `.title` field (boost value 5.0) raise the document match score 155 | more than if found in the `.body` field (boost value 1.0). 156 | - The document match score determines the order of the list 157 | of matching documents returned. 158 | - listFields 159 | - Define which fields contain list of strings to be indexed. 160 | 161 | -} 162 | new : SimpleConfig doc -> Index doc 163 | new simpleConfig = 164 | Index.new (Defaults.getIndexSimpleConfig simpleConfig) 165 | 166 | 167 | {-| Create new index with additional configuration. 168 | 169 | Example. 
170 | 171 | import ElmTextSearch 172 | import Index.Defaults 173 | import StopWordFilter 174 | 175 | type alias ExampleDocType = 176 | { cid : String 177 | , title : String 178 | , author : String 179 | , body : String 180 | } 181 | 182 | createMyStopWordFilter = 183 | StopWordFilter.createFilterFuncWith 184 | [ "explanations" ] 185 | 186 | createNewWithIndexExample : ElmTextSearch.Index ExampleDocType 187 | createNewWithIndexExample = 188 | ElmTextSearch.newWith 189 | { indexType = "ElmTextSearch - Customized Stop Words v1" 190 | , ref = .cid 191 | , fields = 192 | [ ( .title, 5.0 ) 193 | , ( .body, 1.0 ) 194 | ] 195 | , listFields = [] 196 | , initialTransformFactories = Index.Defaults.defaultInitialTransformFactories 197 | , transformFactories = Index.Defaults.defaultTransformFactories 198 | , filterFactories = [ createMyStopWordFilter ] 199 | } 200 | 201 | -} 202 | newWith : Config doc -> Index doc 203 | newWith = 204 | Index.newWith 205 | 206 | 207 | {-| Add a document to an index. 208 | 209 | Starting with the ElmTextSearch.newWith example above this adds a document. 210 | 211 | addDocToIndexExample : Result String (ElmTextSearch.Index ExampleDocType) 212 | addDocToIndexExample = 213 | ElmTextSearch.add 214 | { cid = "id1" 215 | , title = "First Title" 216 | , author = "Some Author" 217 | , body = "Words in this example document with explanations." 218 | } 219 | createNewWithIndexExample 220 | 221 | Conditions that cause a result Err with message. 222 | 223 | - Error document ref is empty. 224 | - Error after tokenisation there are no terms to index. 225 | - Error adding document that already exists. 226 | 227 | Original function signature retained for backward compatibility. 228 | 229 | -} 230 | add : doc -> Index doc -> Result String (Index doc) 231 | add = 232 | Index.add 233 | 234 | 235 | {-| Add document to an Index if no error conditions found. 236 | 237 | Variant of `add` that provides AddError type for error Results. 
238 | 239 | -} 240 | addT : doc -> Index doc -> Result ElmTextSearchErrors.AddError (Index doc) 241 | addT = 242 | Index.addT 243 | 244 | 245 | {-| Add multiple documents. Tries to add all docs and collects errors. 246 | It does not stop adding at first error encountered. 247 | 248 | The result part List (Int, String) is the list of document index 249 | and the error string message result of adding. 250 | Returns the index unchanged if all documents error when added. 251 | Returns the updated index after adding the documents. 252 | 253 | -} 254 | addDocs : List doc -> Index doc -> ( Index doc, List ( Int, String ) ) 255 | addDocs = 256 | Index.addDocs 257 | 258 | 259 | {-| Remove a document from an index. 260 | 261 | Starting with the ElmTextSearch.new example above this removes a document. 262 | 263 | removeDocFromIndexExample = 264 | ElmTextSearch.remove 265 | { cid = "123" 266 | , title = "Examples of a Banana" 267 | , author = "Sally Apples" 268 | , body = "Sally writes words about a banana." 269 | } 270 | createNewIndexExample 271 | 272 | Conditions that cause a result Err with message. 273 | 274 | - Error document has an empty unique id (ref). 275 | - Error document is not in index. 276 | 277 | Original function signature retained for backward compatibility. 278 | 279 | -} 280 | remove : doc -> Index doc -> Result String (Index doc) 281 | remove = 282 | Index.remove 283 | 284 | 285 | {-| Remove document from an Index if no error conditions found. 286 | 287 | Variant of `remove` that provides RemoveError type for error Results. 288 | 289 | -} 290 | removeT : doc -> Index doc -> Result ElmTextSearchErrors.RemoveError (Index doc) 291 | removeT = 292 | Index.removeT 293 | 294 | 295 | {-| Update a document in an index. 296 | 297 | Starting with the ElmTextSearch.new example above this updates a document. 298 | 299 | updatedIndex = 300 | ElmTextSearch.update 301 | { cid = "123" 302 | , title = "Examples of a Bananas in every day life." 
303 | , author = "Sally Apples" 304 | , body = "Sally writes more words about a banana." 305 | } 306 | createNewIndexExample 307 | 308 | Conditions that cause an error result are those for 309 | [`ElmTextSearch.remove`](ElmTextSearch#remove) and 310 | [`ElmTextSearch.add`](ElmTextSearch#add). 311 | 312 | -} 313 | update : doc -> Index doc -> Result String (Index doc) 314 | update = 315 | Index.update 316 | 317 | 318 | {-| Add or Update a document in an index. 319 | This removes the document first if it is already in index then adds it. 320 | -} 321 | addOrUpdate : doc -> Index doc -> Result String (Index doc) 322 | addOrUpdate = 323 | Index.addOrUpdate 324 | 325 | 326 | {-| Search an index with query. 327 | 328 | Tokens are extracted from the query string and passed through the 329 | same processing used when indexing documents. 330 | 331 | Each token is expanded, so that the term "he" might be expanded to "hello" 332 | and "help" if those terms were already included in the document index. 333 | 334 | Multiple tokens are allowed and will lead to an AND based query. 335 | 336 | The following example runs a search for documents containing both "apple" and "banana". 337 | 338 | searchResult = 339 | ElmTextSearch.search "Apple banana" createNewIndexExample 340 | 341 | Results are a list of matching document reference identifiers with 342 | their similarity to query score, ordered by score descending, so the 343 | best matches are earliest in the list. 344 | 345 | An index is returned from search as well. This is because the data model may 346 | be updated to improve performance for later searches. 347 | 348 | Adding or removing a new document will cause some of the internal caching 349 | to be reset. 350 | 351 | Conditions that cause a result Err with message. 352 | 353 | - Error there are no documents in index to search. 354 | - Error query is empty. 355 | - Error after tokenisation there are no terms to search for. 
356 | 357 | Original function signature retained for backward compatibility. 358 | 359 | -} 360 | search : 361 | String 362 | -> Index doc 363 | -> Result String ( Index doc, List ( String, Float ) ) 364 | search = 365 | Index.search 366 | 367 | 368 | {-| Search an index with query if no error conditions found. 369 | 370 | Variant of `search` that provides SearchError type for error Results. 371 | 372 | -} 373 | searchT : 374 | String 375 | -> Index doc 376 | -> Result ElmTextSearchErrors.SearchError ( Index doc, List ( String, Float ) ) 377 | searchT = 378 | Index.searchT 379 | 380 | 381 | {-| Store an index to a Value. 382 | You can also use [`ElmTextSearch.Json.Encoder`](ElmTextSearch.Json.Encoder). 383 | -} 384 | storeToValue : Index doc -> Encode.Value 385 | storeToValue = 386 | IndexEncoder.encoder 387 | 388 | 389 | {-| Store an index to a String. 390 | You can also use [`ElmTextSearch.Json.Encoder`](ElmTextSearch.Json.Encoder). 391 | -} 392 | storeToString : Index doc -> String 393 | storeToString index = 394 | Encode.encode 0 (IndexEncoder.encoder index) 395 | 396 | 397 | {-| Create an Index from a String which has a stored Index in it and the 398 | supplied basic configurations. 399 | See [`ElmTextSearch.fromStringWith`](ElmTextSearch#fromStringWith) for possible Err results. 400 | -} 401 | fromString : SimpleConfig doc -> String -> Result Decode.Error (Index doc) 402 | fromString simpleConfig inputString = 403 | Index.Load.loadIndex 404 | (Defaults.getIndexSimpleConfig simpleConfig) 405 | inputString 406 | 407 | 408 | {-| Create an Index from a Value which has a stored Index in it. 409 | See [`ElmTextSearch.fromStringWith`](ElmTextSearch#fromStringWith) for possible Err results.
410 | -} 411 | fromValue : SimpleConfig doc -> Decode.Value -> Result Decode.Error (Index doc) 412 | fromValue simpleConfig inputValue = 413 | Index.Load.loadIndexValue 414 | (Defaults.getIndexSimpleConfig simpleConfig) 415 | inputValue 416 | 417 | 418 | {-| Create an Index from a String which has a stored Index in it. 419 | 420 | If none of the indexType values in the list of Config match the index 421 | type being loaded it will return an Err. 422 | 423 | The list of configurations will be searched for a matching indexType 424 | so you should provide configs for all types you may be trying to load. 425 | No more than the config that matches is required though. 426 | 427 | The indexVersion stored in the input must also equal the index version 428 | this package writes, that check happens before any Config is matched 429 | and a mismatch returns an Err. 430 | 431 | The following Err results may be returned. 432 | 433 | - "Error cannot load Index. Tried to load index of type "\_\_IndexTest Type -". It is not in supported index configurations." 434 | - It contains the loaded version index type which comes from input. 435 | - "Error cannot load Index. Version supported is 1.0.0. Version tried to load is 1.0.1." 436 | - It includes both expected and loaded versions which may vary. 437 | 438 | -} 439 | fromStringWith : List (Config doc) -> String -> Result Decode.Error (Index doc) 440 | fromStringWith = 441 | Index.Load.loadIndexWith 442 | 443 | 444 | {-| Create an Index from a Value which has a stored Index in it. 445 | If none of the indexType values in the list of Config match the index 446 | being decoded it will return an Err. 447 | 448 | See [`ElmTextSearch.fromStringWith`](ElmTextSearch#fromStringWith) for possible Err results.
449 | 450 | -} 451 | fromValueWith : List (Config doc) -> Decode.Value -> Result Decode.Error (Index doc) 452 | fromValueWith = 453 | Index.Load.loadIndexValueWith 454 | -------------------------------------------------------------------------------- /src/ElmTextSearch/Json/Decoder.elm: -------------------------------------------------------------------------------- 1 | module ElmTextSearch.Json.Decoder exposing (decoder) 2 | 3 | {-| Decoder for Index. 4 | 5 | It decodes to a CodecIndexRecord. 6 | 7 | @docs decoder 8 | 9 | Copyright (c) 2016 Robin Luiten 10 | 11 | -} 12 | 13 | import Dict exposing (Dict) 14 | import Index.Model as Model 15 | import Json.Decode as Decode exposing (..) 16 | import Json.Decode.Pipeline exposing (required) 17 | import Set exposing (Set) 18 | import Trie.Json.Decoder as TrieDecoder 19 | 20 | 21 | {-| CodecIndexRecord decoder. 22 | -} 23 | decoder : Decoder Model.CodecIndexRecord 24 | decoder = 25 | Decode.succeed Model.CodecIndexRecord 26 | |> required "indexVersion" string 27 | |> required "indexType" string 28 | |> required "documentStore" documentStoreDecoder 29 | |> required "corpusTokens" setDecoder 30 | |> required "tokenStore" (TrieDecoder.decoder float) 31 | 32 | 33 | documentStoreDecoder : Decoder (Dict String (Set String)) 34 | documentStoreDecoder = 35 | dict setDecoder 36 | 37 | 38 | setDecoder : Decoder (Set String) 39 | setDecoder = 40 | map Set.fromList (list string) 41 | -------------------------------------------------------------------------------- /src/ElmTextSearch/Json/Encoder.elm: -------------------------------------------------------------------------------- 1 | module ElmTextSearch.Json.Encoder exposing 2 | ( encoder 3 | , codecIndexRecordEncoder 4 | ) 5 | 6 | {-| Encoder for Index. 
7 | 8 | @docs encoder 9 | @docs codecIndexRecordEncoder 10 | 11 | Copyright (c) 2016 Robin Luiten 12 | 13 | -} 14 | 15 | import Dict exposing (Dict) 16 | import Index 17 | import Index.Model as Model exposing (Index(..)) 18 | import Json.Encode as Encode 19 | import Set exposing (Set) 20 | import Trie exposing (Trie) 21 | import Trie.Json.Encoder as TrieEncoder 22 | 23 | 24 | {-| Encoder for Index doc. 25 | 26 | Only encoding fields required to recreate a working index. 27 | 28 | The following fields are not saved as they are restored via 29 | the provided Config on fromString. 30 | 31 | - ref 32 | - fields, listFields 33 | - initialTransformFactories, transformFactories 34 | - filterFactories 35 | 36 | The following fields are not saved because they are an 37 | acceleration model, decoder needs to set it on fromString. 38 | 39 | - corpusTokensIndex 40 | 41 | The following fields are not saved because they are caches 42 | and are rebuilt as operation requires 43 | 44 | - initialTransforms, transforms 45 | - filters 46 | - idfCache 47 | 48 | Do not need a (doc -> Encode.Value) because doc is a document 49 | type and that is never encoded from an Index. 50 | 51 | -} 52 | encoder : Index doc -> Encode.Value 53 | encoder (Index irec) = 54 | codecIndexRecordEncoder 55 | { indexVersion = irec.indexVersion 56 | , indexType = irec.indexType 57 | , documentStore = irec.documentStore 58 | , corpusTokens = irec.corpusTokens 59 | , tokenStore = irec.tokenStore 60 | } 61 | 62 | 63 | {-| Encode CodecIndexRecord.
64 | -} 65 | codecIndexRecordEncoder : Model.CodecIndexRecord -> Encode.Value 66 | codecIndexRecordEncoder rec = 67 | Encode.object 68 | [ ( "indexVersion", Encode.string rec.indexVersion ) 69 | , ( "indexType", Encode.string rec.indexType ) 70 | , ( "documentStore", documentStoreEncoder rec.documentStore ) 71 | , ( "corpusTokens", corpusTokensEncoder rec.corpusTokens ) 72 | , ( "tokenStore", tokenStore rec.tokenStore ) 73 | ] 74 | 75 | 76 | documentStoreEncoder : Dict String (Set String) -> Encode.Value 77 | documentStoreEncoder dict = 78 | Encode.object <| 79 | List.map 80 | (\( key, val ) -> 81 | ( key 82 | , Encode.list Encode.string (Set.toList val) 83 | ) 84 | ) 85 | (Dict.toList dict) 86 | 87 | 88 | corpusTokensEncoder : Set String -> Encode.Value 89 | corpusTokensEncoder setVal = 90 | Encode.list Encode.string (Set.toList setVal) 91 | 92 | tokenStore : Trie Float -> Encode.Value 93 | tokenStore = 94 | TrieEncoder.encoder Encode.float 95 | -------------------------------------------------------------------------------- /src/ElmTextSearchErrors.elm: -------------------------------------------------------------------------------- 1 | module ElmTextSearchErrors exposing 2 | ( AddError(..) 3 | , RemoveError(..) 4 | , SearchError(..) 5 | ) 6 | 7 | {-| Error types used in ElmTextSearch results. 
8 | 9 | 10 | ## Types 11 | 12 | @docs AddError 13 | @docs RemoveError 14 | @docs SearchError 15 | 16 | -} 17 | 18 | 19 | {-| Used in error Result case of ElmTextSearch.addT 20 | -} 21 | type AddError 22 | = AddErrorUniqueRefIsEmpty 23 | | NoTermsToIndexAfterTokenisation 24 | | DocAlreadyExists 25 | 26 | 27 | {-| Used in error Result case of ElmTextSearch.removeT 28 | -} 29 | type RemoveError 30 | = RemoveErrorUniqueRefIsEmpty 31 | | DocIsNotInIndex 32 | 33 | 34 | {-| Used in error Result case of ElmTextSearch.searchT 35 | -} 36 | type SearchError 37 | = IndexIsEmpty 38 | | QueryIsEmpty 39 | | NoTermsToSearchAfterTokenisation 40 | -------------------------------------------------------------------------------- /src/Index.elm: -------------------------------------------------------------------------------- 1 | module Index exposing 2 | ( new 3 | , newWith 4 | , add 5 | , addT 6 | , addDocs 7 | , remove 8 | , removeT 9 | , update 10 | , addOrUpdate 11 | , search 12 | , searchT 13 | , Index 14 | ) 15 | 16 | {-| Index module for full text indexer 17 | 18 | Added addT, removeT and searchT functions that provide 19 | a strong type for Error in the Result. 20 | 21 | 22 | ## Create Index 23 | 24 | @docs new 25 | @docs newWith 26 | 27 | 28 | ## Update Index 29 | 30 | @docs add 31 | @docs addT 32 | @docs addDocs 33 | @docs remove 34 | @docs removeT 35 | @docs update 36 | @docs addOrUpdate 37 | 38 | 39 | ## Query Index 40 | 41 | @docs search 42 | @docs searchT 43 | 44 | 45 | ## Types 46 | 47 | @docs Index 48 | 49 | Copyright (c) 2016 Robin Luiten 50 | 51 | -} 52 | 53 | import Dict 54 | import ElmTextSearchErrors exposing (AddError(..), RemoveError(..), SearchError(..)) 55 | import Index.Defaults as Defaults 56 | import Index.Model as Model exposing (Index(..)) 57 | import Index.Utils 58 | import Index.Vector exposing (..)
59 | import Maybe 60 | import Set exposing (Set) 61 | import String 62 | import Trie 63 | import Utils 64 | 65 | 66 | type alias Index doc = 67 | Model.Index doc 68 | 69 | 70 | type alias Config doc = 71 | Model.Config doc 72 | 73 | 74 | type alias SimpleConfig doc = 75 | Model.ModelSimpleConfig doc 76 | 77 | 78 | {-| Create new index. 79 | -} 80 | new : SimpleConfig doc -> Index doc 81 | new simpleConfig = 82 | newWith 83 | (Defaults.getDefaultIndexConfig simpleConfig) 84 | 85 | 86 | {-| Create new index with control of transformers and filters. 87 | -} 88 | newWith : Config doc -> Index doc 89 | newWith { indexType, ref, fields, listFields, initialTransformFactories, transformFactories, filterFactories } = 90 | Index 91 | { indexVersion = Defaults.indexVersion 92 | , indexType = indexType 93 | , ref = ref 94 | , fields = fields 95 | , listFields = listFields 96 | , initialTransformFactories = initialTransformFactories 97 | , transformFactories = transformFactories 98 | , filterFactories = filterFactories 99 | , initialTransforms = Nothing 100 | , transforms = Nothing 101 | , filters = Nothing 102 | , corpusTokens = Set.empty 103 | , corpusTokensIndex = Dict.empty 104 | , documentStore = Dict.empty 105 | , tokenStore = Trie.empty 106 | , idfCache = Dict.empty 107 | } 108 | 109 | 110 | {-| Add document to an Index if no error conditions found. 111 | See ElmTextSearch documentation for `add` to see error conditions. 112 | 113 | Original function signature retained for backward compatible. 114 | 115 | -} 116 | add : doc -> Index doc -> Result String (Index doc) 117 | add doc index = 118 | case addT doc index of 119 | Ok resultValue -> 120 | Ok resultValue 121 | 122 | Err error -> 123 | case error of 124 | AddErrorUniqueRefIsEmpty -> 125 | Err "Error document has an empty unique id (ref)." 126 | 127 | DocAlreadyExists -> 128 | Err "Error adding document that allready exists." 
129 | 130 | NoTermsToIndexAfterTokenisation -> 131 | Err "Error after tokenisation there are no terms to index." 132 | 133 | 134 | {-| Add document to an Index if no error conditions found. 135 | 136 | Variant that supports AddError type for Result 137 | 138 | See ElmTextSearch documentation for `add` to see error conditions. 139 | 140 | -} 141 | addT : doc -> Index doc -> Result AddError (Index doc) 142 | addT doc ((Index irec) as index) = 143 | let 144 | docRef = 145 | irec.ref doc 146 | in 147 | if String.isEmpty docRef then 148 | Err AddErrorUniqueRefIsEmpty 149 | 150 | else if Index.Utils.refExists docRef index then 151 | Err DocAlreadyExists 152 | 153 | else 154 | let 155 | ( u1index, fieldsWordListAndBoost ) = 156 | List.foldr 157 | (getWordsForField doc) 158 | ( index, [] ) 159 | irec.fields 160 | 161 | ( u2index, u2fieldsWordListAndBoost ) = 162 | List.foldr 163 | (getWordsForFieldList doc) 164 | ( u1index, fieldsWordListAndBoost ) 165 | irec.listFields 166 | 167 | docTokens = 168 | List.map Tuple.first u2fieldsWordListAndBoost 169 | |> List.foldr Set.union Set.empty 170 | in 171 | if Set.isEmpty docTokens then 172 | Err NoTermsToIndexAfterTokenisation 173 | 174 | else 175 | Ok (addDoc docRef u2fieldsWordListAndBoost docTokens u2index) 176 | 177 | 178 | {-| Add multiple documents. Tries to add all docs and collects errors.. 179 | It does not stop adding at first error encountered. 180 | 181 | The result part List (Int, String) is the list of document index 182 | and the error string message result of adding. 183 | Returns the index unchanged if all documents error when added. 184 | Returns the updated index after adding the documents. 
185 | 186 | -} 187 | addDocs : List doc -> Index doc -> ( Index doc, List ( Int, String ) ) 188 | addDocs docs index = 189 | addDocsCore 0 docs index [] 190 | 191 | -- Worker for addDocs. `docsI` is the position of the head of `docs`; `errors` is built newest first and reversed once at the end. 192 | addDocsCore : 193 | Int 194 | -> List doc 195 | -> Index doc 196 | -> List ( Int, String ) 197 | -> ( Index doc, List ( Int, String ) ) 198 | addDocsCore docsI docs index errors = 199 | case docs of 200 | [] -> 201 | ( index, List.reverse errors ) 202 | 203 | headDoc :: tailDocs -> 204 | case add headDoc index of 205 | Ok u1index -> 206 | addDocsCore (docsI + 1) tailDocs u1index errors 207 | 208 | Err msg -> 209 | addDocsCore (docsI + 1) tailDocs index (( docsI, msg ) :: errors) 210 | 211 | 212 | {-| Reducer to extract tokens from each field String from doc. 213 | -} 214 | getWordsForField : 215 | doc 216 | -> ( doc -> String, Float ) 217 | -> ( Index doc, List ( Set String, Float ) ) 218 | -> ( Index doc, List ( Set String, Float ) ) 219 | getWordsForField doc ( getField, fieldBoost ) ( index, fieldsLists ) = 220 | -- TODO: fieldBoost is only passed through here, it is not part of the token aggregate; consider pairing it with the field elsewhere. 221 | let 222 | ( u1index, tokens ) = 223 | Index.Utils.getTokens index (getField doc) 224 | in 225 | ( u1index, ( Set.fromList tokens, fieldBoost ) :: fieldsLists ) 226 | 227 | 228 | {-| Reducer to extract tokens from each field List String from doc. 229 | -} 230 | getWordsForFieldList : 231 | doc 232 | -> ( doc -> List String, Float ) 233 | -> ( Index doc, List ( Set String, Float ) ) 234 | -> ( Index doc, List ( Set String, Float ) ) 235 | getWordsForFieldList doc ( getFieldList, fieldBoost ) ( index, fieldsLists ) = 236 | let 237 | ( u1index, tokens ) = 238 | Index.Utils.getTokensList index (getFieldList doc) 239 | in 240 | ( u1index, ( Set.fromList tokens, fieldBoost ) :: fieldsLists ) 241 | 242 | 243 | {-| Add the document to the index.
244 | -} 245 | addDoc : String -> List ( Set String, Float ) -> Set String -> Index doc -> Index doc 246 | addDoc docRef fieldTokensAndBoosts docTokens (Index irec) = 247 | let 248 | addTokenScore ( token, score ) trie = 249 | Trie.add ( docRef, score ) token trie 250 | 251 | updatedDocumentStore = 252 | Dict.insert docRef docTokens irec.documentStore 253 | 254 | updatedCorpusTokens = 255 | Set.union irec.corpusTokens docTokens 256 | 257 | -- can the cost of this be reduced ? 258 | updatedCorpusTokensIndex = 259 | Index.Utils.buildOrderIndex updatedCorpusTokens 260 | 261 | tokenAndScores = 262 | List.map 263 | (scoreToken fieldTokensAndBoosts) 264 | (Set.toList docTokens) 265 | 266 | updatedTokenStore = 267 | List.foldr addTokenScore irec.tokenStore tokenAndScores 268 | in 269 | Index 270 | { irec 271 | | documentStore = updatedDocumentStore 272 | , corpusTokens = updatedCorpusTokens 273 | , corpusTokensIndex = updatedCorpusTokensIndex 274 | , tokenStore = updatedTokenStore 275 | , idfCache = Dict.empty 276 | } 277 | 278 | 279 | {-| Return term frequency score for a token in document. 280 | 281 | Overall score for a token is based on the number of fields the word 282 | appears and weighted by boost score on each field. 283 | 284 | -} 285 | scoreToken : List ( Set String, Float ) -> String -> ( String, Float ) 286 | scoreToken fieldTokensAndBoost token = 287 | let 288 | score : ( Set String, Float ) -> Float -> Float 289 | score ( tokenSet, fieldBoost ) scoreSum = 290 | if Set.isEmpty tokenSet then 291 | scoreSum 292 | 293 | else 294 | let 295 | tokenBoost = 296 | if Set.member token tokenSet then 297 | fieldBoost / toFloat (Set.size tokenSet) 298 | 299 | else 300 | 0 301 | in 302 | scoreSum + tokenBoost 303 | in 304 | ( token, List.foldr score 0 fieldTokensAndBoost ) 305 | 306 | 307 | {-| Remove document from an Index if no error result conditions encountered. 308 | 309 | Original function signature retained for backward compatible. 
310 | 311 | See ElmTextSearch documentation for `remove` to see error result conditions. 312 | 313 | This does the following things 314 | 315 | - Remove the document tags from documentStore. 316 | - Remove all the document references in tokenStore. 317 | - It does not modify corpusTokens - as this requires 318 | reprocessing tokens for all documents to recreate corpusTokens. 319 | - This may skew the results over time after many removes but not badly. 320 | - It appears lunr.js operates this way as well for remove. 321 | 322 | -} 323 | remove : doc -> Index doc -> Result String (Index doc) 324 | remove doc index = 325 | case removeT doc index of 326 | Ok value -> 327 | Ok value 328 | 329 | Err err -> 330 | case err of 331 | DocIsNotInIndex -> 332 | Err "Error document is not in index." 333 | 334 | RemoveErrorUniqueRefIsEmpty -> 335 | Err "Error document has an empty unique id (ref)." 336 | 337 | 338 | {-| Remove document from an Index if no error result conditions encountered. 339 | 340 | Variant that supports RemoveError type for Result 341 | 342 | See ElmTextSearch documentation for `remove` to see error result conditions. 343 | 344 | This does the following things 345 | 346 | - Remove the document tags from documentStore. 347 | - Remove all the document references in tokenStore. 348 | - It does not modify corpusTokens - as this requires 349 | reprocessing tokens for all documents to recreate corpusTokens. 350 | - This may skew the results over time after many removes but not badly. 351 | - It appears lunr.js operates this way as well for remove. 
352 | 353 | -} 354 | removeT : doc -> Index doc -> Result RemoveError (Index doc) 355 | removeT doc ((Index irec) as index) = 356 | let 357 | docRef = 358 | irec.ref doc 359 | in 360 | if String.isEmpty docRef then 361 | Err RemoveErrorUniqueRefIsEmpty 362 | 363 | else if not (Index.Utils.refExists docRef index) then 364 | Err DocIsNotInIndex 365 | 366 | else 367 | Ok 368 | (Maybe.withDefault index <| 369 | Maybe.map 370 | (removeDoc docRef index) 371 | (Dict.get docRef irec.documentStore) 372 | ) 373 | 374 | 375 | errorMessageNotIndex : String 376 | errorMessageNotIndex = 377 | "Error document is not in index." 378 | 379 | 380 | {-| Remove the doc by docRef id from the index. 381 | -} 382 | removeDoc : String -> Index doc -> Set String -> Index doc 383 | removeDoc docRef (Index irec) docTokens = 384 | let 385 | removeToken token trie = 386 | Trie.remove token docRef trie 387 | 388 | updatedDocumentStore = 389 | Dict.remove docRef irec.documentStore 390 | 391 | updatedTokenStore = 392 | List.foldr removeToken irec.tokenStore (Set.toList docTokens) 393 | in 394 | Index 395 | { irec 396 | | documentStore = updatedDocumentStore 397 | , tokenStore = updatedTokenStore 398 | , idfCache = Dict.empty 399 | } 400 | 401 | 402 | {-| Update document in Index. Does a remove then add. 403 | See ElmTextSearch documentation for `add` and `remove` to see error result conditions. 404 | -} 405 | update : doc -> Index doc -> Result String (Index doc) 406 | update doc index = 407 | remove doc index 408 | |> Result.andThen (add doc) 409 | 410 | 411 | {-| Add or Update document in Index. 412 | This does an add if the document is not in the index, otherwise a remove then add. 413 | -} 414 | addOrUpdate : doc -> Index doc -> Result String (Index doc) 415 | addOrUpdate doc index = 416 | case removeT doc index of 417 | Ok u1index -> 418 | add doc u1index 419 | 420 | Err DocIsNotInIndex -> 421 | add doc index 422 | 423 | Err RemoveErrorUniqueRefIsEmpty -> 424 | -- delegate to `remove` so the error message text stays defined in one place 425 | remove doc index 426 | 427 | 428 | {-| Search index with query.
429 | See ElmTextSearch documentation for `search` to see error result conditions. 430 | 431 | Original function signature retained for backward compatibility. 432 | 433 | -} 434 | search : String -> Index doc -> Result String ( Index doc, List ( String, Float ) ) 435 | search query index = 436 | case searchT query index of 437 | Ok value -> 438 | Ok value 439 | 440 | Err error -> 441 | Err <| 442 | case error of 443 | IndexIsEmpty -> 444 | "Error there are no documents in index to search." 445 | 446 | QueryIsEmpty -> 447 | "Error query is empty." 448 | 449 | NoTermsToSearchAfterTokenisation -> 450 | "Error after tokenisation there are no terms to search for." 451 | 452 | 453 | {-| Search index with query. 454 | See ElmTextSearch documentation for `search` to see error result conditions. 455 | 456 | Variant that supports SearchError type for Result 457 | 458 | -} 459 | searchT : String -> Index doc -> Result SearchError ( Index doc, List ( String, Float ) ) 460 | searchT query index = 461 | let 462 | ( (Index i1irec) as i1index, tokens ) = 463 | Index.Utils.getTokens index query 464 | 465 | tokenInStore token = 466 | Trie.getNode token i1irec.tokenStore /= Nothing 467 | in 468 | if Dict.isEmpty i1irec.documentStore then 469 | Err IndexIsEmpty 470 | 471 | else if String.isEmpty (String.trim query) then 472 | Err QueryIsEmpty 473 | 474 | else if List.isEmpty tokens then 475 | Err NoTermsToSearchAfterTokenisation 476 | 477 | else if not (List.any tokenInStore tokens) then 478 | Ok ( i1index, [] ) 479 | 480 | else 481 | Ok (searchTokens tokens i1index) 482 | 483 | 484 | {-| Return list of document ref's with score, ordered by score descending. 485 | 486 | This had a bug it used "fields" boosts but did not use "listFields" for all fields indexed. 487 | This meant that if you only indexed with listFields that fieldsBoosts would be zero and 488 | resultant score would end up NaN.
489 | 490 | In addition a second problem was that it makes little to no sense to scale query vector 491 | by average of all fields boost as it does not change the relative score document matches. 492 | So removing boost on queries is a simpler solution than including "listFields" into boosts. 493 | 494 | -} 495 | searchTokens : 496 | List String 497 | -> Index doc 498 | -> ( Index doc, List ( String, Float ) ) 499 | searchTokens tokens index = 500 | let 501 | ( tokenDocSets, queryVector, u1index ) = 502 | Index.Vector.getQueryVector 503 | tokens 504 | index 505 | 506 | ( u2index, matchedDocs ) = 507 | List.foldr 508 | (scoreAndCompare queryVector) 509 | ( u1index, [] ) 510 | (Set.toList (Utils.intersectSets tokenDocSets)) 511 | 512 | -- _ = Debug.log "searchTokens intersect" (Utils.intersectSets tokenDocSets) 513 | in 514 | ( u2index, List.reverse (List.sortBy Tuple.second matchedDocs) ) 515 | -------------------------------------------------------------------------------- /src/Index/Defaults.elm: -------------------------------------------------------------------------------- 1 | module Index.Defaults exposing 2 | ( indexVersion 3 | , elmTextSearchIndexType 4 | , defaultTransformFactories 5 | , defaultFilterFactories 6 | , defaultTokenTrimmerFuncCreator 7 | , defaultStemmerFuncCreator 8 | , defaultStopWordFilterFuncCreator 9 | , defaultInitialTransformFactories 10 | , getDefaultIndexConfig 11 | , getIndexSimpleConfig 12 | ) 13 | 14 | {-| Defaults for indexes and configurations. 
15 | 16 | 17 | ## Index Storage Engine Version and Type 18 | 19 | @docs indexVersion 20 | @docs elmTextSearchIndexType 21 | 22 | 23 | ## Built in Transforms and Filters 24 | 25 | @docs defaultTransformFactories 26 | @docs defaultFilterFactories 27 | @docs defaultTokenTrimmerFuncCreator 28 | @docs defaultStemmerFuncCreator 29 | @docs defaultStopWordFilterFuncCreator 30 | @docs defaultInitialTransformFactories 31 | 32 | 33 | ## Config type adapters 34 | 35 | @docs getDefaultIndexConfig 36 | @docs getIndexSimpleConfig 37 | 38 | Copyright (c) 2016 Robin Luiten 39 | 40 | -} 41 | 42 | import Index.Model as Model 43 | exposing 44 | ( FilterFactory 45 | , IndexSimpleConfig 46 | , TransformFactory 47 | ) 48 | import Index.Utils 49 | import Stemmer 50 | import StopWordFilter 51 | import TokenProcessors 52 | 53 | 54 | {-| The version of index, for loading a saved index. 55 | 56 | This is not the same as package version. 57 | 58 | This needs to change if the encoded format changes. Be careful of updates to 59 | Trie package, if Trie encoding format changes this version needs to change as 60 | well. 61 | 62 | -} 63 | indexVersion : String 64 | indexVersion = 65 | "1.1.0" 66 | 67 | 68 | {-| The type of index defaults to using. 69 | It defines the default token transforms and filters. 70 | -} 71 | elmTextSearchIndexType : String 72 | elmTextSearchIndexType = 73 | "-= ElmTextSearch Index Type 1 =-" 74 | 75 | 76 | {-| Index default transform factories. 77 | -} 78 | defaultTransformFactories : List (TransformFactory doc) 79 | defaultTransformFactories = 80 | [ defaultStemmerFuncCreator 81 | ] 82 | 83 | 84 | {-| Index default transform factories that apply before filters. 85 | -} 86 | defaultInitialTransformFactories : List (TransformFactory doc) 87 | defaultInitialTransformFactories = 88 | [ defaultTokenTrimmerFuncCreator 89 | ] 90 | 91 | 92 | {-| Index default filter factories. 
93 | -} 94 | defaultFilterFactories : List (FilterFactory doc) 95 | defaultFilterFactories = 96 | [ defaultStopWordFilterFuncCreator 97 | ] 98 | 99 | 100 | {-| The default token trimmer transform function creator. 101 | Normally applied first in transform functions. 102 | -} 103 | defaultTokenTrimmerFuncCreator : TransformFactory doc 104 | defaultTokenTrimmerFuncCreator = 105 | Index.Utils.createFuncCreator TokenProcessors.trimmer 106 | 107 | 108 | {-| The default token stemmer transform function creator. 109 | -} 110 | defaultStemmerFuncCreator : TransformFactory doc 111 | defaultStemmerFuncCreator = 112 | Index.Utils.createFuncCreator Stemmer.stem 113 | 114 | 115 | {-| The default stop word filter function creator. 116 | -} 117 | defaultStopWordFilterFuncCreator : FilterFactory doc 118 | defaultStopWordFilterFuncCreator = 119 | StopWordFilter.createDefaultFilterFunc 120 | 121 | 122 | {-| Convert Index.Model.ModelSimpleConfig to Index.Model.Config 123 | Filling in default values for fields not in SimpleConfig 124 | This is the definition of the default index configuration. 
125 | -} 126 | getDefaultIndexConfig : Model.ModelSimpleConfig doc -> Model.Config doc 127 | getDefaultIndexConfig { indexType, ref, fields, listFields } = 128 | { indexType = indexType 129 | , ref = ref 130 | , fields = fields 131 | , listFields = listFields 132 | , initialTransformFactories = defaultInitialTransformFactories 133 | , transformFactories = defaultTransformFactories 134 | , filterFactories = defaultFilterFactories 135 | } 136 | 137 | 138 | {-| Convert ElmTextSearch.IndexSimpleConfig to Index.Model.ModelSimpleConfig, setting indexType to elmTextSearchIndexType. 139 | -} 140 | getIndexSimpleConfig : IndexSimpleConfig doc -> Model.ModelSimpleConfig doc 141 | getIndexSimpleConfig { ref, fields, listFields } = 142 | { indexType = elmTextSearchIndexType 143 | , ref = ref 144 | , fields = fields 145 | , listFields = listFields 146 | } 147 | -------------------------------------------------------------------------------- /src/Index/Load.elm: -------------------------------------------------------------------------------- 1 | module Index.Load exposing (errorPrefix, loadIndex, loadIndexValue, loadIndexValueWith, loadIndexWith) 2 | 3 | {-| Load an index from Value or String 4 | 5 | Copyright (c) 2016 Robin Luiten 6 | 7 | -} 8 | 9 | import Dict 10 | import ElmTextSearch.Json.Decoder as IndexDecoder 11 | import Index.Defaults as Defaults 12 | import Index.Model exposing (..) 13 | import Index.Utils 14 | import Json.Decode as Decode 15 | 16 | 17 | errorPrefix : String 18 | errorPrefix = 19 | "Error cannot load Index." 20 | 21 | 22 | {-| Decode an index with one of provided configs. 23 | 24 | The configurations supplied will be used in the order provided in 25 | the list so the earliest one that matches indexType is used. 26 | Decoding fails if the stored indexVersion differs from Defaults.indexVersion 27 | or if none of the supplied configs matches the stored indexType.
28 | 29 | -} 30 | loadIndexWith : List (Config doc) -> String -> Result Decode.Error (Index doc) 31 | loadIndexWith supportedIndexConfigs inputString = 32 | Decode.decodeString 33 | (IndexDecoder.decoder 34 | |> Decode.andThen (mapIndexConfig supportedIndexConfigs) 35 | |> Decode.andThen createIndex 36 | ) 37 | inputString 38 | 39 | 40 | mapIndexConfig : List (Config doc) -> CodecIndexRecord -> Decode.Decoder ( Config doc, CodecIndexRecord ) 41 | mapIndexConfig supportedIndexConfigs index = 42 | if Defaults.indexVersion /= index.indexVersion then 43 | Decode.fail <| 44 | (errorPrefix 45 | ++ " Version supported is " 46 | ++ Defaults.indexVersion 47 | ++ ". Version tried to load is " 48 | ++ index.indexVersion 49 | ++ "." 50 | ) 51 | 52 | else 53 | let 54 | config = 55 | List.filter 56 | (\cfg -> cfg.indexType == index.indexType) 57 | supportedIndexConfigs 58 | in 59 | case config of 60 | [] -> 61 | Decode.fail <| 62 | (errorPrefix 63 | ++ " Tried to load index of type \"" 64 | ++ index.indexType 65 | ++ "\". It is not in supported index configurations." 
66 | ) 67 | 68 | matchedConfig :: _ -> 69 | Decode.succeed ( matchedConfig, index ) 70 | 71 | 72 | loadIndexValueWith : List (Config doc) -> Decode.Value -> Result Decode.Error (Index doc) 73 | loadIndexValueWith supportedIndexConfigs inputValue = 74 | Decode.decodeValue 75 | (IndexDecoder.decoder 76 | |> Decode.andThen (mapIndexConfig supportedIndexConfigs) 77 | |> Decode.andThen createIndex 78 | ) 79 | inputValue 80 | 81 | 82 | createIndex : ( Config doc, CodecIndexRecord ) -> Decode.Decoder (Index doc) 83 | createIndex ( config, decodedIndex ) = 84 | Decode.succeed <| 85 | Index 86 | { indexVersion = decodedIndex.indexVersion 87 | , indexType = decodedIndex.indexType 88 | , ref = config.ref 89 | , fields = config.fields 90 | , listFields = config.listFields 91 | , initialTransformFactories = config.initialTransformFactories 92 | , transformFactories = config.transformFactories 93 | , filterFactories = config.filterFactories 94 | , documentStore = decodedIndex.documentStore 95 | , corpusTokens = decodedIndex.corpusTokens 96 | , tokenStore = decodedIndex.tokenStore 97 | , corpusTokensIndex = 98 | Index.Utils.buildOrderIndex decodedIndex.corpusTokens 99 | , initialTransforms = Nothing 100 | , transforms = Nothing 101 | , filters = Nothing 102 | , idfCache = Dict.empty 103 | } 104 | 105 | 106 | loadIndex : ModelSimpleConfig doc -> String -> Result Decode.Error (Index doc) 107 | loadIndex simpleConfig inputString = 108 | loadIndexWith 109 | [ Defaults.getDefaultIndexConfig simpleConfig ] 110 | inputString 111 | 112 | 113 | loadIndexValue : ModelSimpleConfig doc -> Decode.Value -> Result Decode.Error (Index doc) 114 | loadIndexValue simpleConfig inputValue = 115 | loadIndexValueWith 116 | [ Defaults.getDefaultIndexConfig simpleConfig ] 117 | inputValue 118 | -------------------------------------------------------------------------------- /src/Index/Model.elm: -------------------------------------------------------------------------------- 1 | module Index.Model 
exposing 2 | ( Index(..) 3 | , IndexSimpleConfig 4 | , CodecIndexRecord, Config, FilterFactory, FilterFunc, FuncFactory, ModelSimpleConfig, TransformFactory, TransformFunc, TransformFunc2 5 | ) 6 | 7 | {-| Define the Index Model 8 | 9 | @docs Index 10 | @docs IndexSimpleConfig 11 | @docs Config 12 | 13 | Copyright (c) 2016 Robin Luiten 14 | 15 | -} 16 | 17 | import Dict exposing (Dict) 18 | import Set exposing (Set) 19 | import Trie exposing (Trie) 20 | 21 | 22 | {-| Func and Factory types used with ElmTextSearch. 23 | -} 24 | type alias FuncFactory doc func = 25 | Index doc -> ( Index doc, func ) 26 | 27 | 28 | type alias TransformFunc = 29 | String -> String 30 | 31 | 32 | {-| Variant of TransformFunc that returns a Maybe, making composition nicer in code. 33 | The old version signalled "no token" with "" rather than with Maybe. 34 | Returning Maybe makes composition a lot better. 35 | -} 36 | type alias TransformFunc2 = 37 | String -> Maybe String 38 | 39 | 40 | type alias TransformFactory doc = 41 | Index doc -> ( Index doc, String -> String ) 42 | 43 | 44 | type alias FilterFunc = 45 | String -> Bool 46 | 47 | 48 | type alias FilterFactory doc = 49 | Index doc -> ( Index doc, String -> Bool ) 50 | 51 | 52 | {-| Index is a full text index for a document type. 
53 | 54 | The internal data model of Index 55 | 56 | - indexType 57 | - a string that can be used on load to provide the correct set of transforms and filters 58 | 59 | - indexVersion 60 | - a version string 61 | 62 | - ref 63 | - how to get at unique id of documents added 64 | 65 | - fields 66 | - list of fields of type String to index from document 67 | - first field is function to get String content of field 68 | - second field Float is a boost to text frequency of tokens in this field 69 | 70 | - listFields 71 | - list of fields of type List String to index from document 72 | - first field is function to get List String content of field 73 | - second field Float is a boost to text frequency of tokens in this field 74 | 75 | - initialTransformFactories 76 | - list of factory functions to create transform functions 77 | - this list of transforms is applied before filters 78 | - the ones in `transformFactories` are applied after filters 79 | 80 | - transformFactories 81 | - list of factory functions to create transform functions 82 | 83 | - filterFactories 84 | - list of factory functions to create filter functions 85 | 86 | - transforms 87 | - the transforms in index token processing 88 | - lazy populated from transformFactories 89 | 90 | - filters 91 | - the filters in index token processing 92 | - lazy populated from filterFactories 93 | 94 | - documentStore 95 | - contains dictionary of document ref to Set of document tokens 96 | 97 | - corpusTokens 98 | - Set of all indexed tokens from all documents in documentStore 99 | 100 | - corpusTokensIndex 101 | - to get the position of a token in the ordered list of corpusTokens 102 | 103 | - tokenStore 104 | - tokenStore is used for efficient storing and lookup of the 105 | reverse index of token to document ref and holding the 106 | token term frequency 107 | 108 | - idfCache 109 | - cached idf (inverse document frequency) scores 110 | - cache is reset (cleared) if any document is added, removed or updated in index 111 | 112 | -} 113 | type Index doc 114 | 
= Index (IndexRecord doc) 115 | 116 | 117 | {-| The Record model in an Index. 118 | -} 119 | type alias IndexRecord doc = 120 | { indexVersion : String 121 | , indexType : String 122 | , ref : doc -> String 123 | , fields : List ( doc -> String, Float ) 124 | , listFields : List ( doc -> List String, Float ) 125 | , initialTransformFactories : List (TransformFactory doc) 126 | , transformFactories : List (TransformFactory doc) 127 | , filterFactories : List (FilterFactory doc) 128 | , documentStore : Dict String (Set String) 129 | , corpusTokens : Set String 130 | , tokenStore : Trie Float 131 | , corpusTokensIndex : Dict String Int 132 | , initialTransforms : Maybe (List TransformFunc2) 133 | , transforms : Maybe (List TransformFunc2) 134 | , filters : Maybe (List TransformFunc2) 135 | , idfCache : Dict String Float 136 | } 137 | 138 | 139 | {-| Simple index config with default token processing. 140 | 141 | Simple still requires configuring the fields for your document type. 142 | See [`ElmTextSearch.SimpleConfig`](ElmTextSearch#SimpleConfig) 143 | for explanations of `ref`, `fields` and `listFields` fields. 144 | 145 | - ElmTextSearch.SimpleConfig does not include `indexType`. 146 | - In this case the user is getting the ElmTextSearch default token processing. 147 | - Index.SimpleConfig includes `indexType`. 148 | 149 | `indexType` is an identifier used to determine the transforms and filters the 150 | index uses for operation. It should be unique for all possible differently 151 | configured indexes you plan to use. 152 | 153 | 154 | ### The default transform factories. 155 | 156 | Index.Defaults.defaultTransformFactories 157 | 158 | 159 | ### The default filter factories. 
160 | 161 | Index.Defaults.defaultFilterFactories 162 | 163 | -} 164 | type alias ModelSimpleConfig doc = 165 | { indexType : String 166 | , ref : doc -> String 167 | , fields : List ( doc -> String, Float ) 168 | , listFields : List ( doc -> List String, Float ) 169 | } 170 | 171 | 172 | {-| Index config with customized token processing. 173 | 174 | If a configuration does not match an index being loaded 175 | you will get an Err Result returned. 176 | 177 | -} 178 | type alias Config doc = 179 | { indexType : String 180 | , ref : doc -> String 181 | , fields : List ( doc -> String, Float ) 182 | , listFields : List ( doc -> List String, Float ) 183 | , initialTransformFactories : List (TransformFactory doc) 184 | , transformFactories : List (TransformFactory doc) 185 | , filterFactories : List (FilterFactory doc) 186 | } 187 | 188 | 189 | {-| Just the fields encoded for an Index. 190 | -} 191 | type alias CodecIndexRecord = 192 | { indexVersion : String 193 | , indexType : String 194 | , documentStore : Dict String (Set String) 195 | , corpusTokens : Set String 196 | , tokenStore : Trie Float 197 | } 198 | 199 | 200 | {-| A SimpleConfig is the least amount of configuration data 201 | required to create an Index. 
202 | -} 203 | type alias IndexSimpleConfig doc = 204 | { ref : doc -> String 205 | , fields : List ( doc -> String, Float ) 206 | , listFields : List ( doc -> List String, Float ) 207 | } 208 | -------------------------------------------------------------------------------- /src/Index/Utils.elm: -------------------------------------------------------------------------------- 1 | module Index.Utils exposing 2 | ( createFuncCreator 3 | , getTokens 4 | , getTokensList 5 | , processTokens 6 | , idf 7 | , refExists 8 | , buildOrderIndex 9 | ) 10 | 11 | {-| Index Utilities 12 | 13 | 14 | ## Functions 15 | 16 | @docs createFuncCreator 17 | @docs getTokens 18 | @docs getTokensList 19 | @docs processTokens 20 | @docs idf 21 | @docs refExists 22 | @docs buildOrderIndex 23 | 24 | Copyright (c) 2016 Robin Luiten 25 | 26 | -} 27 | 28 | import Dict exposing (Dict) 29 | import Index.Model 30 | exposing 31 | ( FilterFactory 32 | , FuncFactory 33 | , Index(..) 34 | , TransformFunc 35 | , TransformFunc2 36 | ) 37 | import Set exposing (Set) 38 | import TokenProcessors 39 | import Trie 40 | 41 | 42 | {-| Create a function creator (FuncFactory) 43 | given the simple Function to start with 44 | -} 45 | createFuncCreator : func -> FuncFactory doc func 46 | createFuncCreator func index = 47 | ( index, func ) 48 | 49 | 50 | {-| Extract tokens from string, and process them. 51 | -} 52 | getTokens : Index doc -> String -> ( Index doc, List String ) 53 | getTokens index string = 54 | processTokens index (TokenProcessors.tokenizer string) 55 | 56 | {-| Extract tokens from a list of strings, and process them. -} 57 | getTokensList : Index doc -> List String -> ( Index doc, List String ) 58 | getTokensList index listString = 59 | processTokens index (TokenProcessors.tokenizerList listString) 60 | 61 | 62 | {-| Transform list of words into tokens for index and search. 63 | 64 | Applies filters and transformers configured in index. 65 | 66 | Applies initial transforms first, then filters, then transforms. 67 | So filters see tokens after the initial transforms but before the remaining transforms. 
68 | 69 | -} 70 | processTokens : Index doc -> List String -> ( Index doc, List String ) 71 | processTokens index tokens = 72 | let 73 | ( u1index, initialTransformTokens ) = 74 | applyInitialTransform index tokens 75 | 76 | ( u2index, filterTokens ) = 77 | applyFilter u1index initialTransformTokens 78 | in 79 | applyTransform u2index filterTokens 80 | 81 | 82 | {-| Apply the transforms to tokens. 83 | If any transform converts a token to an empty string no further transforms 84 | are applied and the empty string is removed from the set of tokens. 85 | -} 86 | applyTransform : Index doc -> List String -> ( Index doc, List String ) 87 | applyTransform index strings = 88 | let 89 | ( u1index, transformList2 ) = 90 | getOrSetTransformList index 91 | in 92 | ( u1index 93 | , List.filterMap 94 | (applyTransformList transformList2) 95 | strings 96 | ) 97 | 98 | 99 | {-| Would prefer to pass just accessors (eg .transforms) to 100 | getOrSetIndexFuncList but so far the types are beating me. 101 | -} 102 | getOrSetTransformList : Index doc -> ( Index doc, List TransformFunc2 ) 103 | getOrSetTransformList index = 104 | getOrSetIndexFuncListA 105 | (\(Index irec) -> irec.transforms) 106 | (\(Index irec) -> irec.transformFactories) 107 | setIndexTransforms 108 | index 109 | 110 | 111 | {-| set Index transforms func field 112 | 113 | Added listFuncs2 114 | 115 | -} 116 | setIndexTransforms : Index doc -> List TransformFunc2 -> Index doc 117 | setIndexTransforms (Index irec) listFuncs2 = 118 | Index { irec | transforms = Just listFuncs2 } 119 | 120 | 121 | applyInitialTransform : Index doc -> List String -> ( Index doc, List String ) 122 | applyInitialTransform index strings = 123 | let 124 | ( u1index, intitialTransformList2 ) = 125 | getOrSetInitialTransformList index 126 | in 127 | ( u1index 128 | , List.filterMap 129 | (applyTransformList intitialTransformList2) 130 | strings 131 | ) 132 | 133 | 134 | getOrSetInitialTransformList : Index doc -> ( Index doc, List 
TransformFunc2 ) 135 | getOrSetInitialTransformList index = 136 | getOrSetIndexFuncListA 137 | (\(Index irec) -> irec.initialTransforms) 138 | (\(Index irec) -> irec.initialTransformFactories) 139 | setIndexInitialTransforms 140 | index 141 | 142 | 143 | setIndexInitialTransforms : Index doc -> List TransformFunc2 -> Index doc 144 | setIndexInitialTransforms (Index irec) listFuncs2 = 145 | Index { irec | initialTransforms = Just listFuncs2 } 146 | 147 | 148 | {-| Apply all transforms in sequence to input token. 149 | Once a transform returns Nothing the result is Nothing (Maybe.andThen skips the rest). 150 | This works; it came from reference learn-maybe/src/Transforms.elm, my test project. 151 | 152 | -} 153 | applyTransformList : List TransformFunc2 -> String -> Maybe String 154 | applyTransformList transforms token = 155 | List.foldl (\t -> Maybe.andThen t) (Just token) transforms 156 | 157 | 158 | {-| Adapt function String -> a 159 | Into String -> Maybe a 160 | Where a result equal to aValue maps to Nothing. 161 | 162 | Not in the module exposing list, so tests cannot import it directly. 163 | -} 164 | adaptFuncStrA : a -> (String -> a) -> (String -> Maybe a) 165 | adaptFuncStrA aValue func = 166 | \string -> 167 | let 168 | result = 169 | func string 170 | in 171 | if result /= aValue then 172 | Just result 173 | 174 | else 175 | Nothing 176 | 177 | {-| Adapt a filter String -> Bool into String -> Maybe String, where False maps to Nothing. -} 178 | adaptFuncStrB : (String -> Bool) -> (String -> Maybe String) 179 | adaptFuncStrB func = 180 | \string -> 181 | let 182 | result = 183 | func string 184 | in 185 | if result then 186 | Just string 187 | 188 | else 189 | Nothing 190 | 191 | 192 | {-| Apply index filters to tokens. 193 | 194 | A token is kept only if every filter returns True for it. 
195 | 196 | -} 197 | applyFilter : Index doc -> List String -> ( Index doc, List String ) 198 | applyFilter index strings = 199 | let 200 | ( u1index, filterList2 ) = 201 | getOrSetFilterList index 202 | in 203 | ( u1index 204 | , List.filterMap 205 | (applyTransformList filterList2) 206 | strings 207 | ) 208 | 209 | 210 | getOrSetFilterList : Index doc -> ( Index doc, List TransformFunc2 ) 211 | getOrSetFilterList index = 212 | getOrSetIndexFuncListB 213 | (\(Index irec) -> irec.filters) 214 | (\(Index irec) -> irec.filterFactories) 215 | setIndexFilters 216 | index 217 | 218 | 219 | {-| set Index filters func field 220 | -} 221 | setIndexFilters : Index doc -> List TransformFunc2 -> Index doc 222 | setIndexFilters (Index irec) listFuncs2 = 223 | Index { irec | filters = Just listFuncs2 } 224 | 225 | 226 | {-| String TransformFunc source type variant. 227 | 228 | See getOrSetIndexFuncListB for FilterFunc variant 229 | Generic type `a` isn't helping me here so splitting for specific types 230 | Dang and these two variants work. 231 | 232 | -} 233 | getOrSetIndexFuncListA : 234 | (Index doc -> Maybe (List TransformFunc2)) 235 | -> (Index doc -> List (FuncFactory doc TransformFunc)) 236 | -> (Index doc -> List TransformFunc2 -> Index doc) 237 | -> Index doc 238 | -> ( Index doc, List TransformFunc2 ) 239 | getOrSetIndexFuncListA getFuncs2 getFactoryFuncs setFuncs index = 240 | case getFuncs2 index of 241 | -- init already run 242 | Just funcList2 -> 243 | ( index, funcList2 ) 244 | 245 | -- build function lists from the factories and store them via setFuncs 246 | _ -> 247 | let 248 | ( u1index, newFuncList ) = 249 | runFactories (getFactoryFuncs index) index 250 | 251 | newFunc2List = 252 | List.map (adaptFuncStrA "") newFuncList 253 | 254 | u2index = 255 | setFuncs u1index newFunc2List 256 | in 257 | ( u2index, newFunc2List ) 258 | 259 | 260 | {-| Variant for FilterFunc hydration 261 | 262 | If I switch FilterFunc to be TransformFunc instead I can share the above code, giving one less variation. 
263 | 264 | -} 265 | getOrSetIndexFuncListB : 266 | (Index doc -> Maybe (List TransformFunc2)) 267 | -> (Index doc -> List (FilterFactory doc)) 268 | -> (Index doc -> List TransformFunc2 -> Index doc) 269 | -> Index doc 270 | -> ( Index doc, List TransformFunc2 ) 271 | getOrSetIndexFuncListB getFuncs2 getFactoryFuncs setFuncs index = 272 | case getFuncs2 index of 273 | -- init already run 274 | Just funcList2 -> 275 | ( index, funcList2 ) 276 | 277 | -- build function lists from the factories and store them via setFuncs 278 | _ -> 279 | let 280 | ( u1index, newFuncList ) = 281 | runFactories (getFactoryFuncs index) index 282 | 283 | newFunc2List = 284 | List.map adaptFuncStrB newFuncList 285 | 286 | u2index = 287 | setFuncs u1index newFunc2List 288 | in 289 | ( u2index, newFunc2List ) 290 | 291 | 292 | {-| Run each of the function factories returning the list of functions. 293 | 294 | Uses foldr with (::) so the returned functions are in the same order as factoryList. 295 | 296 | -} 297 | runFactories : List (FuncFactory doc func) -> Index doc -> ( Index doc, List func ) 298 | runFactories factoryList index = 299 | List.foldr 300 | (\factory ( u1index, funcList ) -> 301 | let 302 | ( u2index, newFunc ) = 303 | factory u1index 304 | in 305 | ( u2index, newFunc :: funcList ) 306 | ) 307 | ( index, [] ) 308 | factoryList 309 | 310 | 311 | {-| Calculate the inverse document frequency for a token in the Index. 312 | 313 | Model will update if token has no cached value for idf. 
314 | 315 | -} 316 | idf : Index doc -> String -> ( Index doc, Float ) 317 | idf ((Index irec) as index) token = 318 | case Dict.get token irec.idfCache of 319 | Nothing -> 320 | calcIdf index token 321 | 322 | Just idfValue -> 323 | ( index, idfValue ) 324 | 325 | 326 | calcIdf : Index doc -> String -> ( Index doc, Float ) 327 | calcIdf (Index irec) token = 328 | let 329 | -- _ = Debug.log("calcIdf") (token) 330 | docFrequency = 331 | toFloat (Trie.valueCount token irec.tokenStore) 332 | 333 | idfLocal = 334 | if docFrequency > 0 then 335 | 1 336 | + logBase 10 337 | (toFloat (Dict.size irec.documentStore) / docFrequency) 338 | 339 | else 340 | toFloat 1 341 | 342 | updatedIdfCache = 343 | Dict.insert token idfLocal irec.idfCache 344 | 345 | u1index = 346 | Index 347 | { irec 348 | | idfCache = updatedIdfCache 349 | } 350 | in 351 | ( u1index, idfLocal ) 352 | 353 | 354 | {-| Return True if document reference is indexed. 355 | -} 356 | refExists : String -> Index doc -> Bool 357 | refExists docRef (Index irec) = 358 | Dict.member docRef irec.documentStore 359 | 360 | 361 | {-| Build an index of string to index from Set where key is 362 | Set word and value is ordered index of word in Set. 363 | -} 364 | buildOrderIndex : Set String -> Dict String Int 365 | buildOrderIndex tokenSet = 366 | let 367 | withIndex = 368 | List.indexedMap Tuple.pair (Set.toList tokenSet) 369 | in 370 | List.foldr (\( i, v ) d -> Dict.insert v i d) Dict.empty withIndex 371 | -------------------------------------------------------------------------------- /src/Index/Vector.elm: -------------------------------------------------------------------------------- 1 | module Index.Vector exposing (buildDocVector, getDocVector, getQueryVector, scoreAndCompare, similarityBoost, updateDocVector, updateSetAndVec) 2 | 3 | {-| Index document vector support. 
4 | 5 | Copyright (c) 2016 Robin Luiten 6 | 7 | -} 8 | 9 | import Dict 10 | import Index.Model exposing (Index(..)) 11 | import Index.Utils 12 | import Maybe 13 | import Set exposing (Set) 14 | import SparseVector exposing (SparseVector) 15 | import String 16 | import Trie 17 | 18 | 19 | {-| Build a query vector and the sets of candidate document matches 20 | for each token in our query tokens. 21 | 22 | Each token in our query will have a separate Set String entry in 23 | the returned List, as all query token document result sets are 24 | intersected together for the final list of documents matched (a logical and 25 | of all the query tokens). 26 | 27 | -} 28 | getQueryVector : 29 | List String 30 | -> Index doc 31 | -> ( List (Set String), SparseVector, Index doc ) 32 | getQueryVector tokens index = 33 | List.foldr 34 | (buildDocVector (List.length tokens)) 35 | ( [], SparseVector.empty, index ) 36 | tokens 37 | 38 | 39 | {-| Update query vector elements to create query vector. 40 | Update the list of documents that match for each query token (baseToken). 41 | -} 42 | buildDocVector : 43 | Int 44 | -> String 45 | -> ( List (Set String), SparseVector, Index doc ) 46 | -> ( List (Set String), SparseVector, Index doc ) 47 | buildDocVector tokensLength baseToken ( docSets, vec, (Index irec) as index ) = 48 | let 49 | termFrequency = 50 | 1 / toFloat tokensLength 51 | 52 | expandedTokens = 53 | Trie.expand baseToken irec.tokenStore 54 | 55 | -- _ = Debug.log("buildDocVector") (tokensLength, baseToken, expandedTokens) 56 | ( docs, vecU1, indexU1 ) = 57 | List.foldr 58 | (updateSetAndVec termFrequency baseToken) 59 | ( Set.empty, vec, index ) 60 | expandedTokens 61 | in 62 | ( docs :: docSets, vecU1, indexU1 ) 63 | 64 | 65 | {-| Calculate Term frequency-inverse document frequency (tf-idf). 66 | Union of documents for each expandedToken for this (base)token. 
67 | -} 68 | updateSetAndVec : 69 | Float 70 | -> String 71 | -> String 72 | -> ( Set String, SparseVector, Index doc ) 73 | -> ( Set String, SparseVector, Index doc ) 74 | updateSetAndVec tf token expandedToken ( docSets, vec, (Index irec) as index ) = 75 | let 76 | ( (Index u1irec) as u1index, keyIdf ) = 77 | Index.Utils.idf index expandedToken 78 | 79 | tfidf = 80 | tf * keyIdf * similarityBoost token expandedToken 81 | 82 | -- _ = Debug.log("updateSetAndVec") (tf, token, expandedToken, (similarityBoost token expandedToken), keyIdf, tfidf) 83 | -- _ = Debug.log("updateSetAndVec corpus") (irec.corpusTokensIndex) 84 | u1vec = 85 | Maybe.withDefault vec <| 86 | Maybe.map 87 | (\pos -> SparseVector.insert pos tfidf vec) 88 | (Dict.get expandedToken irec.corpusTokensIndex) 89 | 90 | expandedTokenDocSet = 91 | Maybe.withDefault Set.empty <| 92 | Maybe.map 93 | (\dict -> Set.fromList (Dict.keys dict)) 94 | (Trie.get expandedToken u1irec.tokenStore) 95 | 96 | u1docSets = 97 | Set.union expandedTokenDocSet docSets 98 | 99 | -- _ = Debug.log("updateSetAndVec u1docSets u1vec") (expandedToken, u1docSets, u1vec) 100 | in 101 | ( u1docSets, u1vec, u1index ) 102 | 103 | 104 | {-| if the expanded token is not an exact match to the token then 105 | penalise the score for this key by how different the key is 106 | to the token. 
107 | -} 108 | similarityBoost : String -> String -> Float 109 | similarityBoost token expandedToken = 110 | if expandedToken == token then 111 | 1 112 | 113 | else 114 | 1 115 | / logBase 10 116 | (toFloat 117 | (max 3 118 | (String.length expandedToken 119 | - String.length token 120 | ) 121 | ) 122 | ) 123 | 124 | 125 | {-| calculate the score for each doc 126 | -} 127 | scoreAndCompare : 128 | SparseVector 129 | -> String 130 | -> ( Index doc, List ( String, Float ) ) 131 | -> ( Index doc, List ( String, Float ) ) 132 | scoreAndCompare queryVector ref ( index, docs ) = 133 | let 134 | ( u1index, docVector ) = 135 | getDocVector index ref 136 | 137 | -- _ = Debug.log("scoreAndCompare") (docVector) 138 | in 139 | ( u1index, ( ref, SparseVector.cosineSimilarity queryVector docVector ) :: docs ) 140 | 141 | 142 | {-| build vector for docRef 143 | -} 144 | getDocVector : Index doc -> String -> ( Index doc, SparseVector ) 145 | getDocVector ((Index irec) as index) docRef = 146 | Maybe.withDefault ( index, SparseVector.empty ) <| 147 | Maybe.map 148 | (\tokenSet -> 149 | List.foldr 150 | (updateDocVector docRef) 151 | ( index, SparseVector.empty ) 152 | (Set.toList tokenSet) 153 | ) 154 | (Dict.get docRef irec.documentStore) 155 | 156 | 157 | {-| reducer for docRef docVector for this token 158 | -} 159 | updateDocVector : String -> String -> ( Index doc, SparseVector ) -> ( Index doc, SparseVector ) 160 | updateDocVector docRef token (( (Index irec) as index, docVector ) as inputTuple) = 161 | Maybe.withDefault inputTuple <| 162 | Maybe.map2 163 | (\position termFrequency -> 164 | let 165 | ( u1index, idfScore ) = 166 | Index.Utils.idf index token 167 | in 168 | ( u1index, SparseVector.insert position (termFrequency * idfScore) docVector ) 169 | ) 170 | (Dict.get token irec.corpusTokensIndex) 171 | (Trie.get token irec.tokenStore 172 | |> Maybe.andThen (Dict.get docRef) 173 | ) 174 | -------------------------------------------------------------------------------- 
/src/StopWordFilter.elm: -------------------------------------------------------------------------------- 1 | module StopWordFilter exposing 2 | ( createDefaultFilterFunc 3 | , stopEnglishWordList 4 | , createFilterFuncWith 5 | , createFilterFunc 6 | ) 7 | 8 | {-| StopWordFilter is an English language stop word list filter, any words 9 | contained in the list are not stored in the index. 10 | 11 | This is intended to be used in the ElmTextSearch token processing pipeline. 12 | 13 | 14 | ### Things to know about stop word lists. 15 | 16 | - Words in document are split on white space and hyphens (by default) to create tokens. 17 | - Tokens have non word characters trimmed from prefix and suffix to improve matching filters. 18 | - Input tokens to create stop word filters should be full words. 19 | - It is more efficient to merge all your stop words into a single 20 | stop word filter. 21 | 22 | 23 | ## create default stop word filter func 24 | 25 | @docs createDefaultFilterFunc 26 | 27 | 28 | ## A default stop word english filter list 29 | 30 | @docs stopEnglishWordList 31 | 32 | 33 | ## Create a custom stop word filter list 34 | 35 | @docs createFilterFuncWith 36 | @docs createFilterFunc 37 | 38 | Copyright (c) 2016 Robin Luiten 39 | 40 | -} 41 | 42 | import Index.Model exposing (FilterFactory) 43 | import Set 44 | 45 | 46 | {-| Default english stop word list to create filter. 
47 | -} 48 | stopEnglishWordList : List String 49 | stopEnglishWordList = 50 | [ "a" 51 | , "able" 52 | , "about" 53 | , "across" 54 | , "after" 55 | , "all" 56 | , "almost" 57 | , "also" 58 | , "am" 59 | , "among" 60 | , "an" 61 | , "and" 62 | , "any" 63 | , "are" 64 | , "as" 65 | , "at" 66 | , "be" 67 | , "because" 68 | , "been" 69 | , "but" 70 | , "by" 71 | , "can" 72 | , "cannot" 73 | , "could" 74 | , "dear" 75 | , "did" 76 | , "do" 77 | , "does" 78 | , "either" 79 | , "else" 80 | , "ever" 81 | , "every" 82 | , "for" 83 | , "from" 84 | , "get" 85 | , "got" 86 | , "had" 87 | , "has" 88 | , "have" 89 | , "he" 90 | , "her" 91 | , "hers" 92 | , "him" 93 | , "his" 94 | , "how" 95 | , "however" 96 | , "i" 97 | , "if" 98 | , "in" 99 | , "into" 100 | , "is" 101 | , "it" 102 | , "its" 103 | , "just" 104 | , "least" 105 | , "let" 106 | , "like" 107 | , "likely" 108 | , "may" 109 | , "me" 110 | , "might" 111 | , "most" 112 | , "must" 113 | , "my" 114 | , "neither" 115 | , "no" 116 | , "nor" 117 | , "not" 118 | , "of" 119 | , "off" 120 | , "often" 121 | , "on" 122 | , "only" 123 | , "or" 124 | , "other" 125 | , "our" 126 | , "own" 127 | , "rather" 128 | , "said" 129 | , "say" 130 | , "says" 131 | , "she" 132 | , "should" 133 | , "since" 134 | , "so" 135 | , "some" 136 | , "than" 137 | , "that" 138 | , "the" 139 | , "their" 140 | , "them" 141 | , "then" 142 | , "there" 143 | , "these" 144 | , "they" 145 | , "this" 146 | , "tis" 147 | , "to" 148 | , "too" 149 | , "twas" 150 | , "us" 151 | , "wants" 152 | , "was" 153 | , "we" 154 | , "were" 155 | , "what" 156 | , "when" 157 | , "where" 158 | , "which" 159 | , "while" 160 | , "who" 161 | , "whom" 162 | , "why" 163 | , "will" 164 | , "with" 165 | , "would" 166 | , "yet" 167 | , "you" 168 | , "your" 169 | ] 170 | 171 | 172 | {-| Default english stop word filter suitable for ElmTextSearch. 
173 | -} 174 | createDefaultFilterFunc : FilterFactory doc 175 | createDefaultFilterFunc index = 176 | createFilterFunc stopEnglishWordList index 177 | 178 | 179 | {-| Create stop word list filter suitable for ElmTextSearch, this version 180 | extends the default word list with the extra words provided. 181 | -} 182 | createFilterFuncWith : List String -> FilterFactory doc 183 | createFilterFuncWith extraWords index = 184 | createFilterFunc (List.append extraWords stopEnglishWordList) index 185 | 186 | 187 | {-| Create stop word filter for provided list of tokens suitable for ElmTextSearch. 188 | 189 | \*\* This creates a stop word filter purely from your own word list, understand 190 | what you are doing and consequences if you use this. \*\* 191 | 192 | The FilterFunc created returns True to allow words into index. 193 | So words found in the provided list of tokens return False. 194 | 195 | -} 196 | createFilterFunc : List String -> FilterFactory doc 197 | createFilterFunc tokens index = 198 | let 199 | tokenSet = 200 | Set.fromList tokens 201 | in 202 | ( index, \word -> not (Set.member word tokenSet) ) 203 | -------------------------------------------------------------------------------- /src/TokenProcessors.elm: -------------------------------------------------------------------------------- 1 | module TokenProcessors exposing 2 | ( tokenizer 3 | , tokenizerList 4 | , tokenizerWith 5 | , tokenizerWithRegex 6 | , tokenizerWithRegexList 7 | , trimmer 8 | , tokenizerWithList 9 | ) 10 | 11 | {-| TokenProcessors for strings. 12 | 13 | 14 | ## Create a tokenizer 15 | 16 | @docs tokenizer 17 | @docs tokenizerList 18 | @docs tokenizerWith 19 | @docs tokenizerWithRegex 20 | @docs tokenizerWithRegexList 21 | @docs tokenizerWithList 22 | 23 | ## Word transformer 24 | 25 | @docs trimmer 26 | 27 | Copyright (c) 2016 Robin Luiten 28 | 29 | -} 30 | 31 | import Regex 32 | exposing 33 | ( Regex 34 | -- , HowMany(..) 
35 | , fromString 36 | , replace 37 | , split 38 | ) 39 | import String exposing (toLower, trim) 40 | 41 | {-| Build a Regex from a pattern, falling back to Regex.never when fromString returns Nothing. -} 42 | forceRegex : String -> Regex 43 | forceRegex = 44 | Maybe.withDefault Regex.never << fromString 45 | 46 | 47 | defaultSeparator : Regex 48 | defaultSeparator = 49 | forceRegex "[\\s\\-]+" 50 | 51 | 52 | {-| Tokenize a String. 53 | Will not return any empty string tokens. 54 | By default this splits on whitespace and hyphens. 55 | -} 56 | tokenizer : String -> List String 57 | tokenizer = 58 | tokenizerWithRegex defaultSeparator 59 | 60 | 61 | {-| Tokenize a List String. 62 | Will not return any empty string tokens. 63 | By default this splits on whitespace and hyphens. 64 | -} 65 | tokenizerList : List String -> List String 66 | tokenizerList = 67 | tokenizerWithRegexList defaultSeparator 68 | 69 | 70 | {-| Tokenize a string. 71 | Will not return any empty string tokens. 72 | Supply your own regex for splitting the string. 73 | -} 74 | tokenizerWithRegex : Regex -> String -> List String 75 | tokenizerWithRegex seperatorRegex data = 76 | let 77 | splitter = 78 | split seperatorRegex << toLower << trim 79 | in 80 | List.filter 81 | (\token -> String.length token > 0) 82 | (splitter data) 83 | 84 | {-| Tokenize a List String with your own regex. Will not return any empty string tokens. -} 85 | tokenizerWithRegexList : Regex -> List String -> List String 86 | tokenizerWithRegexList seperatorRegex listData = 87 | let 88 | splitter = 89 | split seperatorRegex << toLower << trim 90 | 91 | -- foldr appends each string's tokens after the accumulator, so tokens 92 | -- from later entries of listData come first in the result 93 | tokens = 94 | List.foldr 95 | (\str agg -> 96 | List.append agg (splitter str) 97 | ) 98 | [] 99 | listData 100 | in 101 | List.filter 102 | (\token -> String.length token > 0) 103 | tokens 104 | 105 | 106 | {-| Tokenize a String. 107 | Will not return any empty string tokens. 108 | Supply your own String which is turned into a regex for splitting the string. 
109 | -} 110 | tokenizerWith : String -> String -> List String 111 | tokenizerWith seperatorPattern = 112 | tokenizerWithRegex (forceRegex seperatorPattern) 113 | 114 | 115 | {-| Tokenize a List String. 116 | Will not return any empty string tokens. 117 | Supply your own String which is turned into a regex for splitting the string. 118 | -} 119 | tokenizerWithList : String -> List String -> List String 120 | tokenizerWithList seperatorPattern = 121 | tokenizerWithRegexList (forceRegex seperatorPattern) 122 | 123 | 124 | trimmerRegex = 125 | forceRegex "^\\W+|\\W+$" 126 | 127 | 128 | {-| Remove non word characters from start and end of tokens 129 | -} 130 | trimmer : String -> String 131 | trimmer = 132 | replace trimmerRegex (\_ -> "") 133 | -------------------------------------------------------------------------------- /src/Utils.elm: -------------------------------------------------------------------------------- 1 | module Utils exposing (intersectSets) 2 | 3 | {-| Some misc utils 4 | 5 | @docs intersectSets 6 | 7 | Copyright (c) 2016 Robin Luiten 8 | 9 | -} 10 | 11 | import Set exposing (Set) 12 | 13 | 14 | {-| Return intersection of a list of sets 15 | -} 16 | intersectSets : List (Set String) -> Set String 17 | intersectSets sets = 18 | case sets of 19 | [] -> 20 | Set.empty 21 | 22 | h :: tail -> 23 | List.foldr (\set agg -> Set.intersect set agg) h tail 24 | -------------------------------------------------------------------------------- /tests/DefaultTests.elm: -------------------------------------------------------------------------------- 1 | module DefaultTests exposing (testDefaultIndexType) 2 | 3 | import Expect 4 | import Index.Defaults 5 | import Test exposing (..) 
{-| The simple config built by `Index.Defaults.getIndexSimpleConfig` carries the default index type string.
-}
testDefaultIndexType : Test
testDefaultIndexType =
    test "Check Index Type" <|
        \_ ->
            let
                indexConfig =
                    Index.Defaults.getIndexSimpleConfig
                        { ref = .cid
                        , fields = [ ( .title, 5.0 ) ]
                        , listFields = [ ( .body, 1.0 ) ]
                        }
            in
            Expect.equal "-= ElmTextSearch Index Type 1 =-" indexConfig.indexType
--------------------------------------------------------------------------------
/tests/ElmTextSearchTests.elm:
--------------------------------------------------------------------------------
module ElmTextSearchTests exposing (..)

import ElmTextSearch
import ElmTextSearchErrors exposing (AddError(..), RemoveError(..), SearchError(..))
import Expect
import Test exposing (..)


{-| Document record shape used by these tests.
-}
type alias MyDoc =
    { cid : String
    , title : String
    , author : String
    , body : String
    }


{-| Sample document fixture.
-}
doc1_ : MyDoc
doc1_ =
    { cid = "doc1"
    , title = "Examples of a Banana"
    , author = "Sally Apples"
    , body = "Sally writes words about a grown banana."
    }


{-| Fresh index over title (weight 5) and body (weight 1), keyed by `cid`.
-}
getEmptyIndex : () -> ElmTextSearch.Index MyDoc
getEmptyIndex _ =
    ElmTextSearch.new
        { ref = .cid
        , fields = [ ( .title, 5 ), ( .body, 1 ) ]
        , listFields = []
        }


{-| Searching an empty index with `searchT` gives an error that can be
pattern matched as `IndexIsEmpty`.
-}
test_searchT_CanUseErrorResultConstructors : Test
test_searchT_CanUseErrorResultConstructors =
    let
        isIndexIsEmptyError result =
            case result of
                Err IndexIsEmpty ->
                    True

                _ ->
                    False
    in
    test "If can case on error result" <|
        \_ ->
            getEmptyIndex ()
                |> ElmTextSearch.searchT "hello"
                |> isIndexIsEmptyError
                |> Expect.equal True
                |> Expect.onFail "Result should be an error"
--------------------------------------------------------------------------------
/tests/IndexDecoderTests.elm:
--------------------------------------------------------------------------------
module IndexDecoderTests exposing (decodeAndEncodeRoundTripSameTest)

import ElmTextSearch.Json.Decoder as IndexDecoder
import ElmTextSearch.Json.Encoder as IndexEncoder
import Expect
import Json.Decode as Decode
import Json.Encode as Encode
import Test exposing (..)
import TestUtils


{-| Decode an encoded index string, then encode the decoded record back to a compact string.
-}
encodeAndDecodeHelper : String -> String
encodeAndDecodeHelper string =
    let
        decodedRecord =
            Decode.decodeString IndexDecoder.decoder string
                |> TestUtils.getResultIgnoreError
    in
    Encode.encode 0 (IndexEncoder.codecIndexRecordEncoder decodedRecord)


{-| Round trip check done on strings rather than on decoded values.

The original note quotes (its source link is not preserved in this copy):
"Dictionary equality with (==) is unreliable and should not be used."

So the input is decoded, encoded again, and the two strings are compared.

-}
decodeAndEncodeRoundTripSameTest : Test
decodeAndEncodeRoundTripSameTest =
    let
        -- payload in the shape used by the encoder tests
        encodedIndex =
            String.concat
                [ "{\"indexVersion\":\"1.0.0\",\"indexType\":\"- IndexTest Type -\","
                , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
                , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
                , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
                , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
                , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
                , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
                , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
                , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
                ]
    in
    test "decode then encode ensure same" <|
        \_ ->
            Expect.equal encodedIndex (encodeAndDecodeHelper encodedIndex)
--------------------------------------------------------------------------------
/tests/IndexEncoderTests.elm:
--------------------------------------------------------------------------------
module IndexEncoderTests exposing (testEncodeList, testEncoder)

import ElmTextSearch.Json.Encoder as IndexEncoder
import Expect
import Index
import Index.Model exposing (..)
import Json.Encode as Encode
import Test exposing (..)
import TestUtils


{-| Compact JSON text both encoder tests expect for an index holding only "doc1".
-}
encodedIndex : String
encodedIndex =
    String.concat
        [ "{\"indexVersion\":\"1.1.0\",\"indexType\":\"- IndexTest Type -\","
        , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
        , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
        , "\"tokenStore\":{"
        , "\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
        , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
        , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
        , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
        , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
        , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
        ]


{-| An index using plain `fields` for title and body encodes to `encodedIndex`.
-}
testEncoder : Test
testEncoder =
    test "Encode index with doc matches encodedIndex" <|
        \() ->
            Index.new
                { indexType = "- IndexTest Type -"
                , ref = .cid
                , fields = [ ( .title, 5 ), ( .body, 1 ) ]
                , listFields = []
                }
                |> Index.add
                    { cid = "doc1"
                    , title = "Examples of a Banana"
                    , author = "Sally Apples"
                    , body = "Sally writes words about a grown banana."
                    }
                |> TestUtils.getResultIgnoreError
                |> IndexEncoder.encoder
                |> Encode.encode 0
                |> Expect.equal encodedIndex


{-| Same content as `testEncoder`, but the body is supplied through `listFields`
as a list of strings; the encoded output is expected to be identical.
-}
testEncodeList : Test
testEncodeList =
    test "Encode index with doc matches encodedIndex using listFields" <|
        \() ->
            Index.new
                { indexType = "- IndexTest Type -"
                , ref = .cid
                , fields = [ ( .title, 5 ) ]
                , listFields = [ ( .body, 1 ) ]
                }
                |> Index.add
                    { cid = "doc1"
                    , title = "Examples of a Banana"
                    , author = "Sally Apples"
                    , body =
                        [ "Sally writes words "
                        , "about a grown banana."
                        ]
                    }
                |> TestUtils.getResultIgnoreError
                |> IndexEncoder.encoder
                |> Encode.encode 0
                |> Expect.equal encodedIndex
--------------------------------------------------------------------------------
/tests/IndexLoadTests.elm:
--------------------------------------------------------------------------------
module IndexLoadTests exposing
    ( indexfromString1Test
    , loadIndexWith1Test
    , loadIndexWithErr1Test
    , loadIndexWithErr2Test
    )

import ElmTextSearch
import Expect
import Index.Load
import Index.Model exposing (Index(..))
import Json.Decode exposing (Error(..))
import Test exposing (..)
import TestUtils


{-| Loading a payload whose indexVersion is 1.0.1 is rejected with a version error message.
-}
loadIndexWithErr1Test : Test
loadIndexWithErr1Test =
    test "Fails to load an index with wrong index version" <|
        \() ->
            String.concat
                [ "{\"indexVersion\":\"1.0.1\",\"indexType\":\"- IndexTest Type -\","
                , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
                , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
                , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
                , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
                , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
                , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
                , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
                , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
                ]
                |> Index.Load.loadIndexWith
                    [ { indexType = "_______some string"
                      , ref = .cid
                      , fields = [ ( .title, 5 ), ( .body, 1 ) ]
                      , listFields = []
                      , initialTransformFactories = []
                      , transformFactories = []
                      , filterFactories = []
                      }
                    ]
                |> TestUtils.getErrorIgnoreResult
                |> TestUtils.getDecodeErrorFailureMessage
                |> Expect.equal "Error cannot load Index. Version supported is 1.1.0. Version tried to load is 1.0.1."


{-| Loading a payload whose indexType matches none of the supplied configurations is rejected.
-}
loadIndexWithErr2Test : Test
loadIndexWithErr2Test =
    test "Fails to load an index with an indexType not in configuration provided." <|
        \() ->
            String.concat
                [ "{\"indexVersion\":\"1.1.0\",\"indexType\":\"__IndexTest Type -\","
                , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
                , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
                , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
                , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
                , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
                , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
                , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
                , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
                ]
                |> Index.Load.loadIndexWith
                    [ { indexType = "_______some string not matching the encoded index type"
                      , ref = .cid
                      , fields = [ ( .title, 5 ), ( .body, 1 ) ]
                      , listFields = []
                      , initialTransformFactories = []
                      , transformFactories = []
                      , filterFactories = []
                      }
                    ]
                |> TestUtils.getErrorIgnoreResult
                |> TestUtils.getDecodeErrorFailureMessage
                |> Expect.equal "Error cannot load Index. Tried to load index of type \"__IndexTest Type -\". It is not in supported index configurations."


{-| Loading succeeds when one of several supplied configurations has the payload's indexType.
-}
loadIndexWith1Test : Test
loadIndexWith1Test =
    let
        config =
            { indexType = "not set"
            , ref = .cid
            , fields = [ ( .title, 5 ), ( .body, 1 ) ]
            , listFields = []
            , initialTransformFactories = []
            , transformFactories = []
            , filterFactories = []
            }
    in
    test "Load an index. really dumb check" <|
        \() ->
            String.concat
                [ "{\"indexVersion\":\"1.1.0\",\"indexType\":\"_______some string\","
                , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
                , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
                , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
                , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
                , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
                , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
                , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
                , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
                ]
                |> Index.Load.loadIndexWith
                    [ config
                    , { config | indexType = "_______some string" }
                    ]
                |> TestUtils.expectOkWithGoodFailMessage


{-| `ElmTextSearch.fromString` loads a payload carrying the default ElmTextSearch index type.
-}
indexfromString1Test : Test
indexfromString1Test =
    test "Can succesfully load index from string with ElmTextSearch.SimpleConfig." <|
        \() ->
            String.concat
                [ "{\"indexVersion\":\"1.1.0\",\"indexType\":\"-= ElmTextSearch Index Type 1 =-\","
                , "\"documentStore\":{\"doc1\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"]},"
                , "\"corpusTokens\":[\"banana\",\"exampl\",\"grown\",\"salli\",\"word\",\"write\"],"
                , "\"tokenStore\":{\"b\":{\"a\":{\"n\":{\"a\":{\"n\":{\"a\":{\"doc1\":2.7}}}}}},"
                , "\"e\":{\"x\":{\"a\":{\"m\":{\"p\":{\"l\":{\"doc1\":2.5}}}}}},"
                , "\"g\":{\"r\":{\"o\":{\"w\":{\"n\":{\"doc1\":0.2}}}}},"
                , "\"s\":{\"a\":{\"l\":{\"l\":{\"i\":{\"doc1\":0.2}}}}},"
                , "\"w\":{\"o\":{\"r\":{\"d\":{\"doc1\":0.2}}},"
                , "\"r\":{\"i\":{\"t\":{\"e\":{\"doc1\":0.2}}}}}}}"
                ]
                |> ElmTextSearch.fromString
                    { ref = .cid
                    , fields =
                        [ ( .title, 5 )
                        , ( .body, 1 )
                        ]
                    , listFields = []
                    }
                |> TestUtils.expectOkWithGoodFailMessage
--------------------------------------------------------------------------------
/tests/IndexTests.elm:
--------------------------------------------------------------------------------
module IndexTests exposing
    ( addDocAlreadyInIndexReturnsError
    , addDocWithEmptyIdFieldReturnsError
    , addDocWithIndexFieldsEmptyReturnsError
    , addDocumentWithSameIdAsExistingReturnsError
    , addMultipleDocsReturnsErrorListForProblems
    , addOrUpdateDocNotInIndexReturnsSuccess
    , addOrUpdateDocWithSameIdReturnsSuccess
    , idfCacheIsClearedAfterASuccessfulAdd
    , idfCacheIsClearedAfterSuccessfulRemove
    , removeDocRefNotIndexReturnsError
    , removeDocWithEmptyIdFieldReturnsError
    , removeDoesNotBreakSearchResults
    , removeOnlyDocIndexReturnsIsEmpty
    , searchCasesTest
    , searchEmptyIndexReturnsError
    , searchIndexAfter2DocRemovedErrors
    , searchIndexAfterDocRemovedErrors
    , searchListFieldsSingleLetterWithLetterInBody
    , searchSingleLetterWithLetterInTitles
    , searchUsingEmptyQueryReturnsError
    , searchUsingQueryWithOnlyStopWordsWhichMeansEmptyReturnsError
    , searchWithOnlyListFieldsIndexReturnsValidScores
    , updateDocNotInIndexReturnsError
    , updateDocUsesNewDocContent
    )

import Dict
import ElmTextSearch.Json.Encoder as IndexEncoder
import Expect
import Index
import Index.Model exposing (Index(..))
import Json.Encode as Encode
import Test exposing (..)
import TestUtils
import Trie


{-| example record type for tests
-}
type alias MyDoc =
    { cid : String
    , title : String
    , author : String
    , body : String
    }


{-| Variant of `MyDoc` whose body is a list of strings, for `listFields` tests.
-}
type alias MyDoc2 =
    { cid : String
    , title : String
    , author : String
    , body : List String
    }


doc1_ : MyDoc
doc1_ =
    { cid = "doc1"
    , title = "Examples of a Banana"
    , author = "Sally Apples"
    , body = "Sally writes words about a grown banana."
    }


doc2_ : MyDoc
doc2_ =
    { cid = "doc2"
    , title = "Grown Bananas and there appeal"
    , author = "John Banana"
    , body = "An example of apple engineering."
    }


doc3_ : MyDoc
doc3_ =
    { cid = "doc3"
    , title = "Kites and Trees a tail of misery"
    , author = "Adam Winddriven"
    , body = "When a flyer meets an Elm it maybe a problem."
    }


{-| Document whose indexed fields (title and body) are both empty.
-}
doc4_indexFieldsEmpty : { cid : String, title : String, author : String, body : String }
doc4_indexFieldsEmpty =
    { cid = "doc4"
    , title = ""
    , author = "Some Author"
    , body = ""
    }


{-| Document whose reference field `cid` is empty.
-}
doc5_idEmpty : MyDoc
doc5_idEmpty =
    { cid = ""
    , title = "Empty Reference Title"
    , author = "Some Author"
    , body = "Empty Reference Body"
    }


{-| One table driven search case: query `input` run against `indexResult`
must give the document refs in `expect`, in that order.
-}
type alias SearchCaseRecord =
    { name : String
    , input : String
    , expect : List String
    , indexResult : Index MyDoc
    }


searchCasesTest : Test
searchCasesTest =
    describe "Index search tests"
        (List.map searchTestCase
            [ { name = "two docs one with term in title first and body second"
              , input = "example"
              , expect = [ "doc1", "doc2" ]
              , indexResult = getIndexDoc1Doc2 ()
              }
            , { name = "two docs one with term in title first"
              , input = "grown"
              , expect = [ "doc2", "doc1" ]
              , indexResult = getIndexDoc1Doc2 ()
              }
            , { name = "neither document contains both words so return nothing"
              , input = "-misery! .appeal,"
              , expect = []
              , indexResult = getIndexDoc1Doc2 ()
              }
            , { name = "with doc3 returns no docs with both words"
              , input = "-misery! .appeal,"
              , expect = []
              , indexResult = getIndexDoc1Doc2Doc3 ()
              }
            , { name = "returns doc1 and doc2 e expands to example and engineer which exist in both documents."
              , input = "e"
              , expect = [ "doc1", "doc2" ]
              , indexResult = getIndexDoc1Doc2 ()
              }
            , { name = "search \"ex\" returns doc1, doc2 as both contain example."
              , input = "ex"
              , expect = [ "doc1", "doc2" ]
              , indexResult = getIndexDoc1Doc2 ()
              }
            , { name = "search \"en\" returns doc2 as it contains engineering."
              , input = "en"
              , expect = [ "doc2" ]
              , indexResult = getIndexDoc1Doc2 ()
              }
            ]
        )


{-| Turn a `SearchCaseRecord` into a test. A search error is compared as a
single element list holding the error string, so it shows up in the failure output.
-}
searchTestCase : SearchCaseRecord -> Test
searchTestCase { name, input, expect, indexResult } =
    test ("search \"" ++ input ++ "\" " ++ name) <|
        \() ->
            Expect.equal expect <|
                case Index.search input indexResult of
                    Ok ( _, docs ) ->
                        List.map Tuple.first docs

                    Err err ->
                        [ err ]


{-| Empty `MyDoc2` index that indexes only the list field body.
-}
getEmptyIndexMyDoc2IndexOnlyListFields : () -> Index.Index MyDoc2
getEmptyIndexMyDoc2IndexOnlyListFields _ =
    Index.new
        { indexType = "- IndexTest Type -"
        , ref = .cid
        , fields = []
        , listFields =
            [ ( .body, 1 )
            ]
        }


{-| Empty `MyDoc2` index over title (field) and body (list field).
-}
getEmptyIndexMyDoc2 : () -> Index.Index MyDoc2
getEmptyIndexMyDoc2 _ =
    Index.new
        { indexType = "- IndexTest Type -"
        , ref = .cid
        , fields = [ ( .title, 5 ) ]
        , listFields = [ ( .body, 1 ) ]
        }


{-| Empty `MyDoc` index over title and body.
-}
getEmptyIndex : () -> Index.Index MyDoc
getEmptyIndex _ =
    Index.new
        { indexType = "- IndexTest Type -"
        , ref = .cid
        , fields = [ ( .title, 5 ), ( .body, 1 ) ]
        , listFields = []
        }


getIndexDoc1 : () -> Index.Index MyDoc
getIndexDoc1 _ =
    getEmptyIndex ()
        |> Index.add doc1_
        |> TestUtils.getResultIgnoreError


getIndexDoc1Doc2 : () -> Index.Index MyDoc
getIndexDoc1Doc2 _ =
    getIndexDoc1 ()
        |> Index.add doc2_
        |> TestUtils.getResultIgnoreError


getIndexDoc1Doc2Doc3 : () -> Index.Index MyDoc
getIndexDoc1Doc2Doc3 _ =
    getIndexDoc1Doc2 ()
        |> Index.add doc3_
        |> TestUtils.getResultIgnoreError


searchUsingEmptyQueryReturnsError : Test
searchUsingEmptyQueryReturnsError =
    test "empty query returns Err" <|
        \() ->
            getIndexDoc1Doc2 ()
                |> Index.search ""
                |> Expect.equal (Err "Error query is empty.")


searchUsingQueryWithOnlyStopWordsWhichMeansEmptyReturnsError : Test
searchUsingQueryWithOnlyStopWordsWhichMeansEmptyReturnsError =
    test "query full of stop words (filtered out words) returns Err" <|
        \() ->
            getIndexDoc1Doc2 ()
                |> Index.search "if and but "
                |> Expect.equal (Err "Error after tokenisation there are no terms to search for.")


searchEmptyIndexReturnsError : Test
searchEmptyIndexReturnsError =
    test "no document returns Err" <|
        \() ->
            Index.search "hello world"
                (getEmptyIndex ())
                |> Expect.equal (Err "Error there are no documents in index to search.")


{-| Run a search, then remove a document from the index the search returned,
and check the idf cache of the resulting index is empty.
-}
idfCacheIsClearedAfterSuccessfulRemove : Test
idfCacheIsClearedAfterSuccessfulRemove =
    test "idfCache is cleared after a successful remove document." <|
        \() ->
            getIndexDoc1Doc2 ()
                |> Index.search "banana"
                |> TestUtils.getResultIgnoreError
                |> Tuple.first
                |> Index.remove doc1_
                |> TestUtils.getResultIgnoreError
                |> getIdfCache
                |> Dict.isEmpty
                |> Expect.equal True
                |> Expect.onFail "IdfCache should be cleared after document remove"


{-| Run a search, then add a document to the index the search returned,
and check the idf cache of the resulting index is empty.
-}
idfCacheIsClearedAfterASuccessfulAdd : Test
idfCacheIsClearedAfterASuccessfulAdd =
    test "idfCache is cleared after a successful add document." <|
        \() ->
            getIndexDoc1Doc2 ()
                |> Index.search "banana"
                |> TestUtils.getResultIgnoreError
                |> Tuple.first
                |> Index.add doc3_
                |> TestUtils.getResultIgnoreError
                |> getIdfCache
                |> Dict.isEmpty
                |> Expect.equal True
                -- this test exercises add, so the failure text names add (it used to say "remove")
                |> Expect.onFail "IdfCache should be cleared after document add"


addDocWithIndexFieldsEmptyReturnsError : Test
addDocWithIndexFieldsEmptyReturnsError =
    test "Add a doc which has all index fields empty returns Err" <|
        \() ->
            getEmptyIndex ()
                |> Index.add doc4_indexFieldsEmpty
                |> TestUtils.getErrorIgnoreResult
                |> Expect.equal "Error after tokenisation there are no terms to index."


addDocWithEmptyIdFieldReturnsError : Test
addDocWithEmptyIdFieldReturnsError =
    test "Add a doc empty ID field returns Err" <|
        \() ->
            getEmptyIndex ()
                |> Index.add doc5_idEmpty
                |> Expect.equal (Err "Error document has an empty unique id (ref).")


addDocAlreadyInIndexReturnsError : Test
addDocAlreadyInIndexReturnsError =
    test "Add a doc allready in index returns Err" <|
        \() ->
            getIndexDoc1Doc2Doc3 ()
                |> Index.add doc1_
                |> TestUtils.getErrorIgnoreResult
                |> Expect.equal "Error adding document that allready exists."


{-| Read the `idfCache` field out of an index.
-}
getIdfCache : Index doc -> Dict.Dict String Float
getIdfCache (Index irec) =
    irec.idfCache


removeDocRefNotIndexReturnsError : Test
removeDocRefNotIndexReturnsError =
    test "Remove a doc ref not in index returns Err." <|
        \() ->
            getIndexDoc1Doc2 ()
                |> Index.remove doc3_
                |> TestUtils.getErrorIgnoreResult
                |> Expect.equal "Error document is not in index."


removeDocWithEmptyIdFieldReturnsError : Test
removeDocWithEmptyIdFieldReturnsError =
    test "Remove a doc with empty id field is an error." <|
        \() ->
            getEmptyIndex ()
                |> Index.remove doc5_idEmpty
                |> Expect.equal (Err "Error document has an empty unique id (ref).")


searchIndexAfterDocRemovedErrors : Test
searchIndexAfterDocRemovedErrors =
    test "Search index where 1 doc from index was removed fails" <|
        \() ->
            getIndexDoc1 ()
                |> Index.remove doc1_
                |> TestUtils.getResultIgnoreError
                |> Index.search "Sally"
                |> TestUtils.getErrorIgnoreResult
                |> Expect.equal "Error there are no documents in index to search."


searchIndexAfter2DocRemovedErrors : Test
searchIndexAfter2DocRemovedErrors =
    test "Search Index where 2 docs from index removed fails" <|
        \() ->
            getIndexDoc1Doc2 ()
                |> Index.remove doc1_
                |> TestUtils.getResultIgnoreError
                |> Index.remove doc2_
                |> TestUtils.getResultIgnoreError
                |> Index.search "Sally"
                |> TestUtils.getErrorIgnoreResult
                |> Expect.equal "Error there are no documents in index to search."
{-| After removing doc2 a search for "Sally" still finds doc1.
-}
removeDoesNotBreakSearchResults : Test
removeDoesNotBreakSearchResults =
    test "Remove does not break searching" <|
        \_ ->
            let
                indexWithoutDoc2 =
                    getIndexDoc1Doc2 ()
                        |> Index.remove doc2_
                        |> TestUtils.getResultIgnoreError

                ( _, hits ) =
                    Index.search "Sally" indexWithoutDoc2
                        |> TestUtils.getResultIgnoreError
            in
            Expect.equal [ doc1_.cid ] (List.map Tuple.first hits)


{-| Removing the only document leaves no entry for it in the document store
and leaves the token store trie empty.
-}
removeOnlyDocIndexReturnsIsEmpty : Test
removeOnlyDocIndexReturnsIsEmpty =
    let
        (Index emptiedRecord) =
            getIndexDoc1 ()
                |> Index.remove doc1_
                |> TestUtils.getResultIgnoreError
    in
    describe "removing a doc"
        [ test "removes it from document store" <|
            \_ ->
                Dict.member "doc1" emptiedRecord.documentStore
                    |> Expect.equal False
                    |> Expect.onFail "oops its in document store"
        , test "removes trie nodes not leading to a reference. This is not testing trie, testing Index use of trie" <|
            \_ ->
                Trie.isEmpty emptiedRecord.tokenStore
                    |> Expect.equal True
                    |> Expect.onFail "Trie model is not empty"
        ]


{-| `Index.addDocs` reports each failing document as a pair of its position
in the input list and the error message.
-}
addMultipleDocsReturnsErrorListForProblems : Test
addMultipleDocsReturnsErrorListForProblems =
    let
        addErrors docs =
            Index.addDocs docs (getEmptyIndex ())
                |> Tuple.second
    in
    describe "addAllDocs Tests" <|
        [ test "Add multiple docs returning list of docs with errors" <|
            \_ ->
                addErrors [ doc3_, doc4_indexFieldsEmpty ]
                    |> Expect.equal [ ( 1, "Error after tokenisation there are no terms to index." ) ]
        , test "Add multiple docs returning list of errors swap order of documents." <|
            \_ ->
                addErrors [ doc4_indexFieldsEmpty, doc3_ ]
                    |> Expect.equal [ ( 0, "Error after tokenisation there are no terms to index." ) ]
        ]


{-| Add `docs` to `index` (add errors are ignored), run the search, and
return the list of ( ref, score ) results.
-}
helperAddDocsSearchIndexResults : String -> List doc -> Index doc -> List ( String, Float )
helperAddDocsSearchIndexResults query docs index =
    let
        ( filledIndex, _ ) =
            Index.addDocs docs index

        ( _, results ) =
            Index.search query filledIndex
                |> TestUtils.getResultIgnoreError
    in
    results


{-| Reproduces a reported case (link not preserved in this copy):
two docs with titles Question1 and Question2,
a "q" search was not returning both documents.
-}
searchSingleLetterWithLetterInTitles : Test
searchSingleLetterWithLetterInTitles =
    let
        questionDocs =
            [ { cid = "qdoc1"
              , title = "Question1"
              , author = "Sally Apples"
              , body = "Sally writes words about a grown banana."
              }
            , { cid = "qdoc2"
              , title = "Question2"
              , author = "John Banana"
              , body = "An example of apple engineering."
              }
            ]
    in
    test "search single letter reports both documents with word starting with that letter in title field" <|
        \_ ->
            helperAddDocsSearchIndexResults "q" questionDocs (getEmptyIndex ())
                |> List.map Tuple.first
                |> Expect.equal [ "qdoc1", "qdoc2" ]


{-| A word present only in the list field body is found, for both documents.
-}
searchListFieldsSingleLetterWithLetterInBody : Test
searchListFieldsSingleLetterWithLetterInBody =
    let
        listBodyDocs =
            [ { cid = "qdoc1"
              , title = "Question1 Notgreen"
              , author = "Sally Apples"
              , body =
                    [ "Sally writes words about "
                    , "a grown green banana."
                    ]
              }
            , { cid = "qdoc2"
              , title = "Question2 Purple"
              , author = "John Banana"
              , body =
                    [ "An example of "
                    , "green apple engineering."
                    ]
              }
            ]
    in
    test "search finds words in list fields body of MyDoc2" <|
        \_ ->
            helperAddDocsSearchIndexResults "green" listBodyDocs (getEmptyIndexMyDoc2 ())
                |> List.map Tuple.first
                |> Expect.equal [ "qdoc2", "qdoc1" ]


{-| Index configured with `fields = []` and only the list field body; the
match is in the list field. Reproduces a reported bug: no score may be NaN.
-}
searchWithOnlyListFieldsIndexReturnsValidScores : Test
searchWithOnlyListFieldsIndexReturnsValidScores =
    let
        listBodyDocs =
            [ { cid = "qdoc1"
              , title = "Question1 Notgreen"
              , author = "Sally Apples"
              , body =
                    [ "Sally writes words about "
                    , "a grown green banana."
                    ]
              }
            , { cid = "qdoc2"
              , title = "Question2 Purple"
              , author = "John Banana"
              , body =
                    [ "An example of "
                    , "green apple engineering."
                    ]
              }
            ]
    in
    test "search index with only List fields configured, check for NaN values in scores" <|
        \_ ->
            helperAddDocsSearchIndexResults "green" listBodyDocs (getEmptyIndexMyDoc2IndexOnlyListFields ())
                |> List.map Tuple.second
                |> List.any Basics.isNaN
                |> Expect.equal False
                |> Expect.onFail "Expect searchScores to not contain any NaN values"


{-| Adding doc1 to an index that already holds doc1 is an error.
-}
addDocumentWithSameIdAsExistingReturnsError : Test
addDocumentWithSameIdAsExistingReturnsError =
    test "add same document to index produces error" <|
        \_ ->
            Index.add doc1_ (getIndexDoc1 ())
                |> TestUtils.getErrorIgnoreResult
                |> Expect.equal "Error adding document that allready exists."
{-| `Index.addOrUpdate` succeeds for a document that is already in the index.
-}
addOrUpdateDocWithSameIdReturnsSuccess : Test
addOrUpdateDocWithSameIdReturnsSuccess =
    test "addOrUpdate same document does not produce error" <|
        \_ ->
            Index.addOrUpdate doc1_ (getIndexDoc1 ())
                |> TestUtils.isOk
                |> Expect.equal True
                |> Expect.onFail "Expect Ok result to addOrUpdate if doc in index"


{-| `Index.addOrUpdate` succeeds for a document that is not yet in the index.
-}
addOrUpdateDocNotInIndexReturnsSuccess : Test
addOrUpdateDocNotInIndexReturnsSuccess =
    test "addOrUpdate document not in index updates index with new doc" <|
        \_ ->
            Index.addOrUpdate doc1_ (getEmptyIndex ())
                |> TestUtils.isOk
                |> Expect.equal True
                |> Expect.onFail "Expect Ok result to addOrUpdate if doc is new"


{-| `Index.update` does not succeed for a document that is not in the index.
-}
updateDocNotInIndexReturnsError : Test
updateDocNotInIndexReturnsError =
    test "index update with a doc not in index fails" <|
        \_ ->
            Index.update doc1_ (getEmptyIndex ())
                |> TestUtils.isOk
                |> Expect.equal False
                |> Expect.onFail "Updating a doc not in index fails"


{-| Updating a document removes old doc version and adds new doc version.

Written to confirm a bug noticed in the code before fixing it: the encoded
index before and after the update must differ.

-}
updateDocUsesNewDocContent : Test
updateDocUsesNewDocContent =
    let
        encodeIndex index =
            Encode.encode 0 (IndexEncoder.encoder index)

        ( indexBefore, _ ) =
            getEmptyIndex ()
                |> Index.addDocs
                    [ { cid = "qdoc1"
                      , title = "Question1"
                      , author = "Sally Apples"
                      , body = "Sally writes words about a grown banana."
                      }
                    , { cid = "qdoc2"
                      , title = "Question2"
                      , author = "John Banana"
                      , body = "An example of apple engineering."
                      }
                    ]

        indexAfter =
            indexBefore
                |> Index.update
                    { cid = "qdoc1"
                    , title = "Yesterday"
                    , author = "New User"
                    , body = "Completely different document really"
                    }
                |> TestUtils.getResultIgnoreError
    in
    test "updateDoc removes old doc and replaces it so index changes" <|
        \_ ->
            encodeIndex indexBefore
                |> Expect.notEqual (encodeIndex indexAfter)
--------------------------------------------------------------------------------
/tests/IndexUtilsTests.elm:
--------------------------------------------------------------------------------
module IndexUtilsTests exposing
    ( testDefaultTransforms
    , testGetTokens
    , test_processTokens_filterFactories
    , test_processTokens_initialTransformFactories
    , test_processTokens_transformFactories
    )

import Expect
import Index exposing (Index)
import Index.Model exposing (FilterFactory, TransformFactory)
import Index.Utils
import StopWordFilter exposing (createFilterFunc)
import Test exposing (..)


{-| Document record shape used by these tests.
-}
type alias MyDoc =
    { cid : String
    , title : String
    , author : String
    , body : String
    }


{-| Table of ( case name, input text, expected tokens ) run through `testGetTokens`.
-}
testDefaultTransforms : Test
testDefaultTransforms =
    let
        cases =
            [ ( "words of only non word chars removed"
              , "engineering ???"
              , [ "engin" ]
              )
            , ( "stemmer and non word chars removed"
              , ".This was very large.-"
              , [ "veri", "larg" ]
              )
            , ( "stop words removed"
              , "however among the dear .- -"
              , []
              )

            -- Bug https://github.com/rluiten/elm-text-search/issues/10
            , ( "\"on\" in the stop word list should not filter \"one\""
              , "one two three"
                -- note that "one" is transformed to "on" by stemmer
              , [ "on", "two", "three" ]
              )
            ]
    in
    describe "apply default transform tests" (List.map testGetTokens cases)


{-| Build one test: tokens produced by `Index.Utils.getTokens` for `input`
on a default index must equal `expected`.
-}
testGetTokens : ( String, String, List String ) -> Test
testGetTokens ( name, input, expected ) =
    let
        defaultIndex =
            Index.new
                { indexType = "- IndexTest Type -"
                , ref = .cid
                , fields =
                    [ ( .title, 5 )
                    , ( .body, 1 )
                    ]
                , listFields = []
                }
    in
    test ("getTokens \"" ++ input ++ "\" " ++ name) <|
        \_ ->
            Index.Utils.getTokens defaultIndex input
                |> Tuple.second
                |> Expect.equal expected


{-| Index built with `Index.newWith` using exactly the supplied initial
transform, transform and filter factory lists.
-}
createTestIndex1 :
    List (TransformFactory MyDoc)
    -> List (TransformFactory MyDoc)
    -> List (FilterFactory MyDoc)
    -> Index MyDoc
createTestIndex1 initialTransforms transforms filters =
    Index.newWith
        { indexType = "- IndexTest Type -"
        , ref = .cid
        , fields =
            [ ( .title, 5 )
            , ( .body, 1 )
            ]
        , listFields = []
        , initialTransformFactories = initialTransforms
        , transformFactories = transforms
        , filterFactories = filters
        }


{-| Two `String.dropRight 1` transforms supplied as `transformFactories`.
-}
test_processTokens_transformFactories : Test
test_processTokens_transformFactories =
    let
        dropLastChar =
            Index.Utils.createFuncCreator (String.dropRight 1)

        index =
            createTestIndex1 [] [ dropLastChar, dropLastChar ] []
    in
    test "test processTokens transformFactories list" <|
        \_ ->
            Index.Utils.processTokens index [ "awords", "btesting", "ca" ]
                |> Tuple.second
                |> Expect.equal [ "awor", "btesti" ]


{-| A `String.dropLeft 1` then a `String.dropRight 1` transform supplied as
`initialTransformFactories`.
-}
test_processTokens_initialTransformFactories : Test
test_processTokens_initialTransformFactories =
    let
        index =
            createTestIndex1
                [ Index.Utils.createFuncCreator (String.dropLeft 1)
                , Index.Utils.createFuncCreator (String.dropRight 1)
                ]
                []
                []
    in
    test "test processTokens initialTransformFactories list" <|
        \_ ->
            Index.Utils.processTokens index [ "pwords", "qtesting", "ra" ]
                |> Tuple.second
                |> Expect.equal [ "word", "testin" ]


{-| Two stop word filters supplied as `filterFactories`; both listed words are dropped.
-}
test_processTokens_filterFactories : Test
test_processTokens_filterFactories =
    let
        index =
            createTestIndex1
                []
                []
                [ createFilterFunc [ "special" ], createFilterFunc [ "swimming" ] ]
    in
    test "test processTokens filterFactories list" <|
        \_ ->
            Index.Utils.processTokens index [ "word", "special", "puzzle", "swimming" ]
                |> Tuple.second
                |> Expect.equal [ "word", "puzzle" ]
--------------------------------------------------------------------------------
/tests/SearchIndexTests.elm:
--------------------------------------------------------------------------------
module SearchIndexTests exposing (saveAndLoadSameTest, searchReturnsEmptyResult, searchReturnsValidResult)

{- Save and Load index check search results same -}

import ElmTextSearch
import Expect
import Index.Model exposing (Index(..), IndexSimpleConfig)
import Test exposing (..)
import TestUtils


{-| Document record indexed by the tests in this module.
-}
type alias MyDoc =
    { cid : String
    , title : String
    , author : String
    , body : String
    }


{-| Index configuration shared by the tests: `cid` is the reference,
`title` has weight 5 and `body` weight 1; no list fields.
-}
configElmTextSearchMyDoc : IndexSimpleConfig MyDoc
configElmTextSearchMyDoc =
    { ref = .cid
    , fields =
        [ ( .title, 5 )
        , ( .body, 1 )
        ]
    , listFields = []
    }


{-| First sample document (arguments are cid, title, author, body).
-}
doc1 : MyDoc
doc1 =
    MyDoc
        "doc1"
        "Examples of a Banana"
        "Sally Apples"
        "Sally writes words about a grown banana."


{-| Second sample document (arguments are cid, title, author, body).
-}
doc2 : MyDoc
doc2 =
    MyDoc
        "doc2"
        "Words about a vehicle"
        "John Barrel"
        "All about a vehicle in exile."


{-| A new index built from `configElmTextSearchMyDoc`.
-}
getEmptyIndex : () -> Index MyDoc
getEmptyIndex () =
    ElmTextSearch.new configElmTextSearchMyDoc


{-| The empty index with `doc1` added; crashes (via the TestUtils helper) if
the add fails.
-}
getIndexDoc1 : () -> Index MyDoc
getIndexDoc1 () =
    TestUtils.getResultIgnoreError
        (ElmTextSearch.add doc1 (getEmptyIndex ()))


{-| The `doc1` index with `doc2` added as well.
-}
getIndexDoc1Doc2 : () -> Index MyDoc
getIndexDoc1Doc2 () =
    TestUtils.getResultIgnoreError
        (ElmTextSearch.add doc2 (getIndexDoc1 ()))


{-| Search `index` for `query`, unwrap the result and keep only the list of
matches (the second element of the returned tuple).
-}
runSearch : String -> Index MyDoc -> List ( String, Float )
runSearch query index =
    ElmTextSearch.search query index
        |> TestUtils.getResultIgnoreError
        |> Tuple.second


{-| Searching the two-document index for "foreign" yields no matches.
-}
searchReturnsEmptyResult : Test
searchReturnsEmptyResult =
    test "Search returns empty result" <|
        \() ->
            Expect.equal [] (runSearch "foreign" (getIndexDoc1Doc2 ()))


{-| Searching the two-document index for "exile" yields `doc2` with the
pinned score.
-}
searchReturnsValidResult : Test
searchReturnsValidResult =
    test "Search returns valid result" <|
        \() ->
            Expect.equal
                [ ( "doc2", 0.13898344497096093 ) ]
                (runSearch "exile" (getIndexDoc1Doc2 ()))


{-| Run `search` against `index` and against a copy of it that was stored to a
string and loaded back. Returns `( matches from original, matches from reloaded )`.
-}
searchIndexSearchSavedLoadedIndex : String -> Index MyDoc -> ( List ( String, Float ), List ( String, Float ) )
searchIndexSearchSavedLoadedIndex search index =
    let
        reloaded =
            index
                |> ElmTextSearch.storeToString
                |> ElmTextSearch.fromString configElmTextSearchMyDoc
                |> TestUtils.getResultIgnoreError
    in
    ( runSearch search index, runSearch search reloaded )


{-| For a query with no matches and a query with a match, the original index
and its stored-then-loaded copy must return equal results.
-}
saveAndLoadSameTest : Test
saveAndLoadSameTest =
    let
        sameResultsTest ( label, query ) =
            test label <|
                \() ->
                    let
                        ( resultA, resultsB ) =
                            searchIndexSearchSavedLoadedIndex query (getIndexDoc1Doc2 ())
                    in
                    Expect.equal resultA resultsB
    in
    describe "results same before and after save and load index" <|
        List.map sameResultsTest
            [ ( "x Search result of nothing for Index same as for Save and Loaded Index."
              , "foreign"
              )
            , ( "x Search result of something for Index same as for Save and Loaded Index."
              , "exile"
              )
            ]
--------------------------------------------------------------------------------
/tests/StopWordFilterTests.elm:
--------------------------------------------------------------------------------
module StopWordFilterTests exposing (newIndex, stopWordFilterTest, tests)

import ElmTextSearch
import Expect
import StopWordFilter
import Test exposing (..)


{-| Document record used to type the index in these tests.
-}
type alias ExampleDocType =
    { cid : String
    , title : String
    , author : String
    , body : String
    }


{-| Index created with `ElmTextSearch.new`: `cid` is the reference, `title`
has weight 5, `body` weight 1, and there are no list fields.
-}
newIndex : ElmTextSearch.Index ExampleDocType
newIndex =
    ElmTextSearch.new
        { ref = .cid
        , fields =
            [ ( .title, 5 )
            , ( .body, 1 )
            ]
        , listFields = []
        }


{-| One test per entry of `StopWordFilter.stopEnglishWordList`.
-}
tests : Test
tests =
    StopWordFilter.stopEnglishWordList
        |> List.map stopWordFilterTest
        |> describe "check stopEnglishWordList against default token processing"


{-| Test that the filter function produced by
`StopWordFilter.createDefaultFilterFunc newIndex` returns `False` for `word`.
-}
stopWordFilterTest : String -> Test
stopWordFilterTest word =
    let
        -- createDefaultFilterFunc returns a pair; only the filter function is used.
        defaultFilter =
            Tuple.second (StopWordFilter.createDefaultFilterFunc newIndex)

        label =
            String.concat
                [ "This word \""
                , word
                , "\" got past default stop word filter in error."
                ]
    in
    test label <|
        \() ->
            defaultFilter word
                |> Expect.equal False
                |> Expect.onFail "These should all be stopped"
--------------------------------------------------------------------------------
/tests/TestUtils.elm:
--------------------------------------------------------------------------------
module TestUtils exposing
    ( expectOkWithGoodFailMessage
    , getDecodeErrorFailureMessage
    , getErrorIgnoreResult
    , getResultIgnoreError
    , isErr
    , isOk
    )

{-| Utilities to make test cases simpler.
-}

import Expect
import Index
import Index.Model exposing (Index(..))
import Json.Decode exposing (Error(..))
import Test exposing (..)


{-| Pass when `result` is `Ok`. When it is `Err`, fail with
"Result Err not expected: " followed by the text from
`getDecodeErrorFailureMessage`.
-}
expectOkWithGoodFailMessage : Result Error a -> Expect.Expectation
expectOkWithGoodFailMessage result =
    case result of
        Ok _ ->
            -- `Expect.onFail` only replaces the message of a failing
            -- expectation, so wrapping `Expect.pass` in it had no effect.
            Expect.pass

        Err error ->
            Expect.fail
                (String.concat
                    [ "Result Err not expected: "
                    , getDecodeErrorFailureMessage error
                    ]
                )


{-| Unwrap an `Ok` value. Test-only helper: an `Err` crashes via `Debug.todo`.
-}
getResultIgnoreError : Result error a -> a
getResultIgnoreError result =
    case result of
        Ok value ->
            value

        Err _ ->
            Debug.todo "Ignoring failure for testing"


{-| Unwrap an `Err` value. Test-only helper: an `Ok` crashes via `Debug.todo`.
-}
getErrorIgnoreResult : Result error a -> error
getErrorIgnoreResult result =
    case result of
        Ok _ ->
            Debug.todo "Ignoring value for testing"

        Err error ->
            error


{-| Turn a JSON decode error into text for a test failure message.

A `Failure` yields exactly its own message, unchanged from before. Every other
variant (`Field`, `Index`, `OneOf`) is rendered with
`Json.Decode.errorToString`; previously those variants crashed through
`Debug.todo`, hiding the real decode error from the test output.

-}
getDecodeErrorFailureMessage : Error -> String
getDecodeErrorFailureMessage error =
    case error of
        Failure message _ ->
            message

        _ ->
            Json.Decode.errorToString error


{-| `True` for `Ok`, `False` for `Err`.
-}
isOk : Result e a -> Bool
isOk x =
    case x of
        Ok _ ->
            True

        Err _ ->
            False


{-| `True` for `Err`, `False` for `Ok`.
-}
isErr : Result e a -> Bool
isErr x =
    case x of
        Ok _ ->
            False

        Err _ ->
            True
--------------------------------------------------------------------------------
/tests/TokenProcessorTests.elm:
--------------------------------------------------------------------------------
module TokenProcessorTests exposing (tokenizerTest, tokenizerTests, trimmerTest, trimmerTests)

import Expect
import Test exposing (..)
import TokenProcessors


{-| Table-driven tests of `TokenProcessors.tokenizer`; each case is
`( test name, input string, expected tokens )`.
-}
tokenizerTests : Test
tokenizerTests =
    let
        cases =
            [ ( "splitting simple strings into tokens"
              , "this is a simple string"
              , [ "this", "is", "a", "simple", "string" ]
              )
            , ( "downcasing tokens"
              , "FOO BAR"
              , [ "foo", "bar" ]
              )
            , ( "splitting strings with hyphens"
              , "take the New York-San Francisco flight"
              , [ "take", "the", "new", "york", "san", "francisco", "flight" ]
              )
            , ( "splitting strings with hyphens and spaces"
              , "Solve for A - B"
              , [ "solve", "for", "a", "b" ]
              )
            , ( "leading - in query should not cause extra token ?"
              , "-misery! .appeal,"
              , [ "misery!", ".appeal," ]
              )
            ]
    in
    cases
        |> List.map tokenizerTest
        |> describe "Lunr TokenProcessors tokenizer tests"


{-| Build one test named `name` that expects
`TokenProcessors.tokenizer testString` to equal `expectedTokens`.
-}
tokenizerTest : ( String, String, List String ) -> Test
tokenizerTest ( name, testString, expectedTokens ) =
    test name <|
        \() ->
            TokenProcessors.tokenizer testString
                |> Expect.equal expectedTokens


{-| Table-driven tests of `TokenProcessors.trimmer`; each case is
`( input, expected output )`.
-}
trimmerTests : Test
trimmerTests =
    let
        cases =
            [ ( "023hello", "023hello" )
            , ( "=hello", "hello" )
            , ( "hello.", "hello" )
            , ( ",hello,", "hello" )
            , ( ",hello_,", "hello_" )
            , ( "40%", "40" )
            ]
    in
    cases
        |> List.map trimmerTest
        |> describe "Lunr TokenProcessors trimmer tests"


{-| Build one test expecting `TokenProcessors.trimmer testString` to equal
`expectedString`; the test name shows both strings.
-}
trimmerTest : ( String, String ) -> Test
trimmerTest ( testString, expectedString ) =
    let
        label =
            String.concat [ "trimmer ", testString, " -> ", expectedString ]
    in
    test label <|
        \() ->
            TokenProcessors.trimmer testString
                |> Expect.equal expectedString
--------------------------------------------------------------------------------