├── .github └── workflows │ └── haskell-ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── changelog ├── demo ├── ExtractDescriptionTerms.hs ├── ExtractNameTerms.hs ├── HaddockHtml.hs ├── HaddockLex.x ├── HaddockParse.y ├── HaddockTypes.hs ├── Main.hs ├── PackageIndexUtils.hs └── PackageSearch.hs ├── full-text-search.cabal ├── src └── Data │ ├── SearchEngine.hs │ └── SearchEngine │ ├── Autosuggest.hs │ ├── BM25F.hs │ ├── DocFeatVals.hs │ ├── DocIdSet.hs │ ├── DocTermIds.hs │ ├── Query.hs │ ├── SearchIndex.hs │ ├── TermBag.hs │ ├── Types.hs │ └── Update.hs └── tests ├── Main.hs └── Test └── Data └── SearchEngine ├── DocIdSet.hs └── TermBag.hs /.github/workflows/haskell-ci.yml: -------------------------------------------------------------------------------- 1 | # This GitHub workflow config has been generated by a script via 2 | # 3 | # haskell-ci 'github' 'full-text-search.cabal' '--local-ghc-options' '-Wall -Werror' 4 | # 5 | # To regenerate the script (for example after adjusting tested-with) run 6 | # 7 | # haskell-ci regenerate 8 | # 9 | # For more information, see https://github.com/haskell-CI/haskell-ci 10 | # 11 | # version: 0.19.20250115 12 | # 13 | # REGENDATA ("0.19.20250115",["github","full-text-search.cabal","--local-ghc-options","-Wall -Werror"]) 14 | # 15 | name: Haskell-CI 16 | on: 17 | - push 18 | - pull_request 19 | jobs: 20 | linux: 21 | name: Haskell-CI - Linux - ${{ matrix.compiler }} 22 | runs-on: ubuntu-20.04 23 | timeout-minutes: 24 | 60 25 | container: 26 | image: buildpack-deps:jammy 27 | continue-on-error: ${{ matrix.allow-failure }} 28 | strategy: 29 | matrix: 30 | include: 31 | - compiler: ghc-9.12.1 32 | compilerKind: ghc 33 | compilerVersion: 9.12.1 34 | setup-method: ghcup 35 | allow-failure: false 36 | - compiler: ghc-9.10.1 37 | compilerKind: ghc 38 | compilerVersion: 9.10.1 39 | setup-method: ghcup 40 | allow-failure: false 41 | - compiler: ghc-9.8.2 42 | compilerKind: ghc 43 | compilerVersion: 9.8.2 44 | setup-method: ghcup 45 | allow-failure: false 46 | - compiler: ghc-9.6.6 47 | compilerKind: ghc 48 | compilerVersion: 9.6.6 49 | setup-method: ghcup 50 | allow-failure: false 51 | - compiler: ghc-9.4.8 52 | compilerKind: ghc 53 | compilerVersion: 9.4.8 54 | setup-method: ghcup 55 | allow-failure: false 56 | - compiler: ghc-9.2.8 57 | compilerKind: ghc 58 | compilerVersion: 9.2.8 59 | setup-method: ghcup 60 | allow-failure: false 61 | fail-fast: false 62 | steps: 63 | - name: apt-get install 64 | run: | 65 | apt-get update 66 | apt-get install -y --no-install-recommends gnupg ca-certificates dirmngr curl git software-properties-common libtinfo5 67 | - name: Install GHCup 68 | run: | 69 | mkdir -p "$HOME/.ghcup/bin" 70 | curl -sL https://downloads.haskell.org/ghcup/0.1.30.0/x86_64-linux-ghcup-0.1.30.0 > "$HOME/.ghcup/bin/ghcup" 71 | chmod a+x "$HOME/.ghcup/bin/ghcup" 72 | - name: Install cabal-install 73 | run: | 74 | "$HOME/.ghcup/bin/ghcup" install cabal 3.12.1.0 || (cat "$HOME"/.ghcup/logs/*.* && false) 75 | echo "CABAL=$HOME/.ghcup/bin/cabal-3.12.1.0 -vnormal+nowrap" >> "$GITHUB_ENV" 76 | - name: Install GHC (GHCup) 77 | if: matrix.setup-method == 'ghcup' 78 | run: | 79 | "$HOME/.ghcup/bin/ghcup" install ghc "$HCVER" || (cat "$HOME"/.ghcup/logs/*.* && false) 80 | HC=$("$HOME/.ghcup/bin/ghcup" whereis ghc "$HCVER") 81 | HCPKG=$(echo "$HC" | sed 's#ghc$#ghc-pkg#') 82 | HADDOCK=$(echo "$HC" | sed 's#ghc$#haddock#') 83 | echo "HC=$HC" >> "$GITHUB_ENV" 84 | echo "HCPKG=$HCPKG" >> "$GITHUB_ENV" 85 | echo "HADDOCK=$HADDOCK" >> "$GITHUB_ENV" 86 | env: 87 | 
HCKIND: ${{ matrix.compilerKind }} 88 | HCNAME: ${{ matrix.compiler }} 89 | HCVER: ${{ matrix.compilerVersion }} 90 | - name: Set PATH and environment variables 91 | run: | 92 | echo "$HOME/.cabal/bin" >> $GITHUB_PATH 93 | echo "LANG=C.UTF-8" >> "$GITHUB_ENV" 94 | echo "CABAL_DIR=$HOME/.cabal" >> "$GITHUB_ENV" 95 | echo "CABAL_CONFIG=$HOME/.cabal/config" >> "$GITHUB_ENV" 96 | HCNUMVER=$(${HC} --numeric-version|perl -ne '/^(\d+)\.(\d+)\.(\d+)(\.(\d+))?$/; print(10000 * $1 + 100 * $2 + ($3 == 0 ? $5 != 1 : $3))') 97 | echo "HCNUMVER=$HCNUMVER" >> "$GITHUB_ENV" 98 | echo "ARG_TESTS=--enable-tests" >> "$GITHUB_ENV" 99 | echo "ARG_BENCH=--enable-benchmarks" >> "$GITHUB_ENV" 100 | echo "HEADHACKAGE=false" >> "$GITHUB_ENV" 101 | echo "ARG_COMPILER=--$HCKIND --with-compiler=$HC" >> "$GITHUB_ENV" 102 | env: 103 | HCKIND: ${{ matrix.compilerKind }} 104 | HCNAME: ${{ matrix.compiler }} 105 | HCVER: ${{ matrix.compilerVersion }} 106 | - name: env 107 | run: | 108 | env 109 | - name: write cabal config 110 | run: | 111 | mkdir -p $CABAL_DIR 112 | cat >> $CABAL_CONFIG <> $CABAL_CONFIG < cabal-plan.xz 145 | echo 'f62ccb2971567a5f638f2005ad3173dba14693a45154c1508645c52289714cb2 cabal-plan.xz' | sha256sum -c - 146 | xz -d < cabal-plan.xz > $HOME/.cabal/bin/cabal-plan 147 | rm -f cabal-plan.xz 148 | chmod a+x $HOME/.cabal/bin/cabal-plan 149 | cabal-plan --version 150 | - name: checkout 151 | uses: actions/checkout@v4 152 | with: 153 | path: source 154 | - name: initial cabal.project for sdist 155 | run: | 156 | touch cabal.project 157 | echo "packages: $GITHUB_WORKSPACE/source/." >> cabal.project 158 | cat cabal.project 159 | - name: sdist 160 | run: | 161 | mkdir -p sdist 162 | $CABAL sdist all --output-dir $GITHUB_WORKSPACE/sdist 163 | - name: unpack 164 | run: | 165 | mkdir -p unpacked 166 | find sdist -maxdepth 1 -type f -name '*.tar.gz' -exec tar -C $GITHUB_WORKSPACE/unpacked -xzvf {} \; 167 | - name: generate cabal.project 168 | run: | 169 | PKGDIR_full_text_search="$(find "$GITHUB_WORKSPACE/unpacked" -maxdepth 1 -type d -regex '.*/full-text-search-[0-9.]*')" 170 | echo "PKGDIR_full_text_search=${PKGDIR_full_text_search}" >> "$GITHUB_ENV" 171 | rm -f cabal.project cabal.project.local 172 | touch cabal.project 173 | touch cabal.project.local 174 | echo "packages: ${PKGDIR_full_text_search}" >> cabal.project 175 | echo "package full-text-search" >> cabal.project 176 | echo " ghc-options: -Werror=missing-methods" >> cabal.project 177 | cat >> cabal.project <> cabal.project.local 182 | cat cabal.project 183 | cat cabal.project.local 184 | - name: dump install plan 185 | run: | 186 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dry-run all 187 | cabal-plan 188 | - name: restore cache 189 | uses: actions/cache/restore@v4 190 | with: 191 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 192 | path: ~/.cabal/store 193 | restore-keys: ${{ runner.os }}-${{ matrix.compiler }}- 194 | - name: install dependencies 195 | run: | 196 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks --dependencies-only -j2 all 197 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dependencies-only -j2 all 198 | - name: build w/o tests 199 | run: | 200 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 201 | - name: build 202 | run: | 203 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --write-ghc-environment-files=always 204 | - name: tests 205 | run: | 206 | $CABAL v2-test $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --test-show-details=direct 207 | - name: 
cabal check 208 | run: | 209 | cd ${PKGDIR_full_text_search} || false 210 | ${CABAL} -vnormal check 211 | - name: haddock 212 | run: | 213 | $CABAL v2-haddock --disable-documentation --haddock-all $ARG_COMPILER --with-haddock $HADDOCK $ARG_TESTS $ARG_BENCH all 214 | - name: unconstrained build 215 | run: | 216 | rm -f cabal.project.local 217 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 218 | - name: save cache 219 | if: always() 220 | uses: actions/cache/save@v4 221 | with: 222 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 223 | path: ~/.cabal/store 224 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | dist-newstyle 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013-2014 Duncan Coutts, 2014 Well-Typed LLP 2 | 2014 IRIS Connect Ltd. 3 | 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | 17 | * Neither the name of Duncan Coutts nor the names of other 18 | contributors may be used to endorse or promote products derived 19 | from this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # full-text-search 2 | An in-memory full text search engine library. It lets you run full-text queries on a collection of your documents. 
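
A minimal usage sketch (the `Note` document type, its fields, and the parameter values below are made up for illustration; the shape of the code follows the package-search demo in `demo/PackageSearch.hs`):

```haskell
import Data.SearchEngine
import Data.Ix (Ix)
import Data.Text (Text)
import qualified Data.Text as T

-- A hypothetical document type: a note with a numeric key, a title and a body.
data Note = Note { noteId :: Int, noteTitle :: Text, noteBody :: Text }

data NoteField = TitleField | BodyField
  deriving (Eq, Ord, Enum, Bounded, Ix, Show)

noteSearchConfig :: SearchConfig Note Int NoteField NoFeatures
noteSearchConfig =
    SearchConfig {
      documentKey          = noteId,
      extractDocumentTerms = extractTokens,
      transformQueryTerm   = \term _field -> T.toCaseFold term,
      documentFeatureValue = const noFeatures
    }
  where
    extractTokens :: Note -> NoteField -> [Term]
    extractTokens note TitleField = tokens (noteTitle note)
    extractTokens note BodyField  = tokens (noteBody note)

    -- Real code would typically also stem terms and drop stop words.
    tokens :: Text -> [Term]
    tokens = T.words . T.toCaseFold

noteRankParams :: SearchRankParameters NoteField NoFeatures
noteRankParams =
    SearchRankParameters {
      paramK1                         = 1.5,
      paramB                          = const 0.5,
      paramFieldWeights               = fieldWeight,
      paramFeatureWeights             = noFeatures,
      paramFeatureFunctions           = noFeatures,
      paramResultsetSoftLimit         = 200,
      paramResultsetHardLimit         = 400,
      paramAutosuggestPrefilterLimit  = 500,
      paramAutosuggestPostfilterLimit = 500
    }
  where
    -- Weight title matches more heavily than body matches.
    fieldWeight :: NoteField -> Float
    fieldWeight TitleField = 20
    fieldWeight BodyField  = 1

-- Index a collection of notes and run a query, returning keys in rank order.
searchNotes :: [Note] -> [Term] -> [Int]
searchNotes notes terms =
    query (insertDocs notes (initSearchEngine noteSearchConfig noteRankParams)) terms
```

The demo shows a fuller configuration, including stemming and stop-word removal.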
3 | 4 | See http://hackage.haskell.org/package/full-text-search 5 | -------------------------------------------------------------------------------- /changelog: -------------------------------------------------------------------------------- 1 | 0.2.2.3 Ben Gamari January 2025 2 | * Introduce compatibility with GHCs up to 9.12 3 | * Drop compatibility with GHC versions pre-9.2 4 | 5 | 0.2.2.2 Adam Gundry March 2023 6 | * Fix bug in 0.2.2.1 autosuggest patch 7 | 8 | 0.2.2.1 Adam Gundry March 2023 9 | * Fix autosuggest query performance bug on large datasets 10 | 11 | 0.2.2.0 Adam Gundry November 2022 12 | * Add queryAutosuggestPredicate and queryAutosuggestMatchingDocuments 13 | * Compatibility with GHC 8.10 to 9.4 and new package versions 14 | 15 | 0.2.1.4 Mikolaj Konarski August 2017 16 | * Compatibility with GHC 8.0.2 and new package versions (no API changes) 17 | 18 | 0.2.1.3 Duncan Coutts May 2015 19 | * Compatibility with GHC 7.10 (no API changes) 20 | 21 | 0.2.1.2 Adam Gundry July 2014 22 | * Compatibility with GHC 7.8 (no API changes) 23 | 24 | 0.2.1.1 Duncan Coutts May 2014 25 | * Fix for NaN scores with a doc field that is empty for all docs 26 | * Add instance Show NoFeatures 27 | 28 | 0.2.1.0 Duncan Coutts March 2014 29 | * Add auto-complete / auto-suggest feature 30 | * Add demo program 31 | * Moved QC props into a separate test suite 32 | * Work sponsored by IRIS Connect Ltd. 33 | 34 | 0.2.0.0 Duncan Coutts Feb 2014 35 | * Initial version as a separate library 36 | 37 | 0.1.1.0 Duncan Coutts Nov 2013 38 | * Add "explain" mode for query 39 | * Add non-term feature scores 40 | 41 | 0.1.0.1 Duncan Coutts Sept 2013 42 | * Fix bug in index update (thanks to Matthew Gruen) 43 | 44 | 0.1.0.0 Duncan Coutts July 2013 45 | * Initial version (included in hackage-server) 46 | 47 | -------------------------------------------------------------------------------- /demo/ExtractDescriptionTerms.hs: -------------------------------------------------------------------------------- 1 | 2 | module ExtractDescriptionTerms ( 3 | extractSynopsisTerms, 4 | extractDescriptionTerms 5 | ) where 6 | 7 | import Data.Text (Text) 8 | import qualified Data.Text as T 9 | import Data.Set (Set) 10 | import qualified Data.Set as Set 11 | import Data.Char 12 | import qualified NLP.Tokenize as NLP 13 | import qualified NLP.Snowball as NLP 14 | import Control.Monad ((>=>)) 15 | import Data.Maybe 16 | 17 | import HaddockTypes as Haddock 18 | import HaddockHtml as Haddock (markup) 19 | import qualified HaddockParse as Haddock (parseHaddockParagraphs) 20 | import qualified HaddockLex as Haddock (tokenise) 21 | 22 | 23 | extractSynopsisTerms :: Set Text -> String -> [Text] 24 | extractSynopsisTerms stopWords = 25 | NLP.stems NLP.English 26 | . filter (`Set.notMember` stopWords) 27 | . map (T.toCaseFold . T.pack) 28 | . concatMap splitTok 29 | . filter (not . ignoreTok) 30 | . NLP.tokenize 31 | 32 | 33 | ignoreTok :: String -> Bool 34 | ignoreTok = all isPunctuation 35 | 36 | splitTok :: String -> [String] 37 | splitTok tok = 38 | case go tok of 39 | toks@(_:_:_) -> tok:toks 40 | toks -> toks 41 | where 42 | go remaining = 43 | case break (\c -> c == ')' || c == '-' || c == '/') remaining of 44 | ([], _:trailing) -> go trailing 45 | (leading, _:trailing) -> leading : go trailing 46 | ([], []) -> [] 47 | (leading, []) -> leading : [] 48 | 49 | 50 | extractDescriptionTerms :: Set Text -> String -> [Text] 51 | extractDescriptionTerms stopWords = 52 | NLP.stems NLP.English 53 | . filter (`Set.notMember` stopWords) 54 | . 
map (T.toCaseFold . T.pack) 55 | . maybe 56 | [] --TODO: something here 57 | ( filter (not . ignoreTok) 58 | . NLP.tokenize 59 | . concat . Haddock.markup termsMarkup) 60 | . (Haddock.tokenise >=> Haddock.parseHaddockParagraphs) 61 | 62 | termsMarkup :: DocMarkup String [String] 63 | termsMarkup = Markup { 64 | markupEmpty = [], 65 | markupString = \s -> [s], 66 | markupParagraph = id, 67 | markupAppend = (++), 68 | markupIdentifier = \s -> [s], 69 | markupModule = const [], -- i.e. filter these out 70 | markupEmphasis = id, 71 | markupMonospaced = \s -> if length s > 1 then [] else s, 72 | markupUnorderedList = concat, 73 | markupOrderedList = concat, 74 | markupDefList = concatMap (\(d,t) -> d ++ t), 75 | markupCodeBlock = const [], 76 | markupHyperlink = \(Hyperlink _url mLabel) -> maybeToList mLabel, 77 | --TODO: extract main part of hostname 78 | markupPic = const [], 79 | markupAName = const [] 80 | } 81 | 82 | {- 83 | ------------------- 84 | -- Main experiment 85 | -- 86 | 87 | main = do 88 | pkgsFile <- readFile "pkgs" 89 | let mostFreq :: [String] 90 | pkgs :: [PackageDescription] 91 | (mostFreq, pkgs) = read pkgsFile 92 | 93 | stopWordsFile <- T.readFile "stopwords.txt" 94 | -- wordsFile <- T.readFile "/usr/share/dict/words" 95 | -- let ws = Set.fromList (map T.toLower $ T.lines wordsFile) 96 | 97 | 98 | print "reading file" 99 | evaluate (length mostFreq + length pkgs) 100 | print "done" 101 | 102 | let stopWords = Set.fromList $ T.lines stopWordsFile 103 | print stopWords 104 | 105 | sequence_ 106 | [ putStrLn $ display (packageName pkg) ++ ": " 107 | ++ --intercalate ", " 108 | (description pkg) ++ "\n" 109 | ++ intercalate ", " 110 | (map T.unpack $ extractDescriptionTerms stopWords (description pkg)) ++ "\n" 111 | | pkg <- pkgs 112 | , let pkgname = display (packageName pkg) ] 113 | -} 114 | -------------------------------------------------------------------------------- /demo/ExtractNameTerms.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 2 | 3 | module ExtractNameTerms ( 4 | extractPackageNameTerms, 5 | extractModuleNameTerms, 6 | ) where 7 | 8 | import Data.Text (Text) 9 | import qualified Data.Text as T 10 | import Data.Char (isUpper, isDigit) 11 | import Data.List 12 | import Data.List.Split hiding (Splitter) 13 | import Data.Maybe (maybeToList) 14 | 15 | import Data.Functor.Identity 16 | import Control.Applicative 17 | import Control.Monad 18 | import Control.Monad.List 19 | import Control.Monad.Writer 20 | import Control.Monad.State 21 | 22 | 23 | extractModuleNameTerms :: String -> [Text] 24 | extractModuleNameTerms modname = 25 | map T.toCaseFold $ 26 | nub $ 27 | map T.pack $ 28 | flip runSplitter modname $ do 29 | _ <- forEachPart splitDot 30 | _ <- forEachPart splitCamlCase 31 | satisfy (not . singleChar) 32 | get >>= emit 33 | 34 | extractPackageNameTerms :: String -> [Text] 35 | extractPackageNameTerms pkgname = 36 | map T.toCaseFold $ 37 | nub $ 38 | map T.pack $ 39 | flip runSplitter pkgname $ do 40 | 41 | fstComponentHyphen <- forEachPart splitHyphen 42 | 43 | satisfy (`notElem` ["hs", "haskell"]) 44 | 45 | _ <- forEachPart stripPrefixH 46 | 47 | fstComponentCaml <- forEachPart splitCamlCase 48 | 49 | fstComponent2 <- forEachPart splitOn2 50 | 51 | when (fstComponentHyphen && fstComponentCaml && fstComponent2) $ do 52 | forEachPartAndWhole stripPrefix_h 53 | _ <- forEachPart (maybeToList . stripPrefix "lib") 54 | _ <- forEachPart (maybeToList . 
stripSuffix "lib") 55 | _ <- forEachPart stripSuffixNum 56 | satisfy (not . singleChar) 57 | 58 | get >>= emit 59 | 60 | newtype Split a = Split (StateT String (ListT (WriterT [String] Identity)) a) 61 | deriving (Functor, Applicative, Alternative, Monad, MonadPlus, MonadState String) 62 | 63 | emit :: String -> Split () 64 | emit x = Split (lift (lift (tell [x]))) 65 | 66 | forEach :: [a] -> Split a 67 | forEach = msum . map return 68 | 69 | runSplitter :: Split () -> String -> [String] 70 | runSplitter (Split m) s = snd (runWriter (runListT (runStateT m s))) 71 | 72 | singleChar :: String -> Bool 73 | singleChar [_] = True 74 | singleChar _ = False 75 | 76 | satisfy :: (String -> Bool) -> Split () 77 | satisfy p = get >>= guard . p 78 | 79 | forEachPart :: (String -> [String]) -> Split Bool 80 | forEachPart parts = do 81 | t <- get 82 | case parts t of 83 | [] -> return True 84 | [t'] | t == t' -> return True 85 | ts -> do emit t 86 | (t', n) <- forEach (zip ts [1::Int ..]) 87 | put t' 88 | return (n==1) 89 | 90 | forEachPartAndWhole :: (String -> [String]) -> Split () 91 | forEachPartAndWhole parts = do 92 | t <- get 93 | case parts t of 94 | [] -> return () 95 | ts -> forEach (t:ts) >>= put 96 | 97 | 98 | splitDot :: String -> [String] 99 | splitDot = split (dropBlanks $ dropDelims $ whenElt (=='.')) 100 | 101 | splitHyphen :: String -> [String] 102 | splitHyphen = split (dropBlanks $ dropDelims $ whenElt (=='-')) 103 | 104 | splitCamlCase :: String -> [String] 105 | splitCamlCase = split (dropInitBlank $ condense $ keepDelimsL $ whenElt isUpper) 106 | 107 | stripPrefixH :: String -> [String] 108 | stripPrefixH ('H':'S':frag) | all isUpper frag = [frag] 109 | stripPrefixH "HTTP" = [] 110 | stripPrefixH ('H':frag@(c:_)) | isUpper c = [frag] 111 | stripPrefixH _ = [] 112 | 113 | stripPrefix_h :: String -> [String] 114 | stripPrefix_h "http" = [] 115 | stripPrefix_h "html" = [] 116 | stripPrefix_h ('h':'s':frag) = ['s':frag, frag] 117 | stripPrefix_h ('h':frag) {- | Set.notMember (T.pack w) ws -} = [frag] 118 | stripPrefix_h _ = [] 119 | 120 | stripSuffix :: String -> String -> Maybe String 121 | stripSuffix s t = fmap reverse (stripPrefix (reverse s) (reverse t)) 122 | 123 | stripSuffixNum :: String -> [String] 124 | stripSuffixNum s 125 | | null rd || null rs' = [] 126 | | otherwise = [s', d] 127 | where 128 | rs = reverse s 129 | (rd, rs') = span isDigit rs 130 | d = reverse rd 131 | s' = reverse rs' 132 | 133 | splitOn2 :: String -> [String] 134 | splitOn2 t = 135 | case break (=='2') t of 136 | (from@(_:_), '2':to@(c:_)) 137 | | not (isDigit c) 138 | , not (length from == 1 && length to == 1) 139 | -> [from, to] 140 | _ -> [] 141 | 142 | {- 143 | ------------------- 144 | -- Main experiment 145 | -- 146 | 147 | main = do 148 | pkgsFile <- readFile "pkgs" 149 | let mostFreq :: [String] 150 | pkgs :: [PackageDescription] 151 | (mostFreq, pkgs) = read pkgsFile 152 | 153 | -- wordsFile <- T.readFile "/usr/share/dict/words" 154 | -- let ws = Set.fromList (map T.toLower $ T.lines wordsFile) 155 | 156 | 157 | print "reading file" 158 | evaluate (length mostFreq + length pkgs) 159 | print "done" 160 | 161 | sequence_ 162 | [ putStrLn $ display (packageName pkg) ++ " -> " 163 | ++ intercalate ", " (map T.unpack $ extractNameTerms pkgname) 164 | | pkg <- pkgs 165 | , let pkgname = display (packageName pkg) ] 166 | 167 | main = do 168 | pkgsFile <- readFile "pkgs3" 169 | let pkgs :: [PackageDescription] 170 | pkgs = map read (lines pkgsFile) 171 | 172 | -- print "forcing pkgs..." 
173 | -- evaluate (foldl' (\a p -> seq p a) () pkgs) 174 | 175 | sequence_ 176 | [ putStrLn $ display (packageName pkg) ++ ": " ++ display mod ++ " -> " 177 | ++ intercalate ", " (map T.unpack $ extractModuleNameTerms (display mod)) 178 | | pkg <- pkgs 179 | , Just lib <- [library pkg] 180 | , let mods = exposedModules lib 181 | , mod <- mods ] 182 | -} 183 | -------------------------------------------------------------------------------- /demo/HaddockHtml.hs: -------------------------------------------------------------------------------- 1 | -- stolen from Haddock's Util.hs and Doc.hs 2 | module HaddockHtml (markup, docAppend, docParagraph) where 3 | 4 | import HaddockTypes 5 | import Data.Char (isSpace) 6 | 7 | markup :: DocMarkup id a -> Doc id -> a 8 | markup m DocEmpty = markupEmpty m 9 | markup m (DocAppend d1 d2) = markupAppend m (markup m d1) (markup m d2) 10 | markup m (DocString s) = markupString m s 11 | markup m (DocParagraph d) = markupParagraph m (markup m d) 12 | markup m (DocIdentifier x) = markupIdentifier m x 13 | markup m (DocModule mod0) = markupModule m mod0 14 | markup m (DocEmphasis d) = markupEmphasis m (markup m d) 15 | markup m (DocMonospaced d) = markupMonospaced m (markup m d) 16 | markup m (DocUnorderedList ds) = markupUnorderedList m (map (markup m) ds) 17 | markup m (DocOrderedList ds) = markupOrderedList m (map (markup m) ds) 18 | markup m (DocDefList ds) = markupDefList m (map (markupPair m) ds) 19 | markup m (DocCodeBlock d) = markupCodeBlock m (markup m d) 20 | markup m (DocHyperlink l) = markupHyperlink m l 21 | markup m (DocAName ref) = markupAName m ref 22 | markup m (DocPic img) = markupPic m img 23 | 24 | markupPair :: DocMarkup id a -> (Doc id, Doc id) -> (a, a) 25 | markupPair m (a,b) = (markup m a, markup m b) 26 | 27 | -- ----------------------------------------------------------------------------- 28 | -- ** Smart constructors 29 | 30 | -- used to make parsing easier; we group the list items later 31 | docAppend :: Doc id -> Doc id -> Doc id 32 | docAppend (DocUnorderedList ds1) (DocUnorderedList ds2) 33 | = DocUnorderedList (ds1++ds2) 34 | docAppend (DocUnorderedList ds1) (DocAppend (DocUnorderedList ds2) d) 35 | = DocAppend (DocUnorderedList (ds1++ds2)) d 36 | docAppend (DocOrderedList ds1) (DocOrderedList ds2) 37 | = DocOrderedList (ds1++ds2) 38 | docAppend (DocOrderedList ds1) (DocAppend (DocOrderedList ds2) d) 39 | = DocAppend (DocOrderedList (ds1++ds2)) d 40 | docAppend (DocDefList ds1) (DocDefList ds2) 41 | = DocDefList (ds1++ds2) 42 | docAppend (DocDefList ds1) (DocAppend (DocDefList ds2) d) 43 | = DocAppend (DocDefList (ds1++ds2)) d 44 | docAppend DocEmpty d = d 45 | docAppend d DocEmpty = d 46 | docAppend d1 d2 47 | = DocAppend d1 d2 48 | 49 | 50 | -- again to make parsing easier - we spot a paragraph whose only item 51 | -- is a DocMonospaced and make it into a DocCodeBlock 52 | docParagraph :: Doc id -> Doc id 53 | docParagraph (DocMonospaced p) 54 | = DocCodeBlock (docCodeBlock p) 55 | docParagraph (DocAppend (DocString s1) (DocMonospaced p)) 56 | | all isSpace s1 57 | = DocCodeBlock (docCodeBlock p) 58 | docParagraph (DocAppend (DocString s1) 59 | (DocAppend (DocMonospaced p) (DocString s2))) 60 | | all isSpace s1 && all isSpace s2 61 | = DocCodeBlock (docCodeBlock p) 62 | docParagraph (DocAppend (DocMonospaced p) (DocString s2)) 63 | | all isSpace s2 64 | = DocCodeBlock (docCodeBlock p) 65 | docParagraph p 66 | = DocParagraph p 67 | 68 | 69 | -- Drop trailing whitespace from @..@ code blocks. 
Otherwise this: 70 | -- 71 | -- -- @ 72 | -- -- foo 73 | -- -- @ 74 | -- 75 | -- turns into (DocCodeBlock "\nfoo\n ") which when rendered in HTML 76 | -- gives an extra vertical space after the code block. The single space 77 | -- on the final line seems to trigger the extra vertical space. 78 | -- 79 | docCodeBlock :: Doc id -> Doc id 80 | docCodeBlock (DocString s) 81 | = DocString (reverse $ dropWhile (`elem` " \t") $ reverse s) 82 | docCodeBlock (DocAppend l r) 83 | = DocAppend l (docCodeBlock r) 84 | docCodeBlock d = d 85 | -------------------------------------------------------------------------------- /demo/HaddockLex.x: -------------------------------------------------------------------------------- 1 | -- 2 | -- Haddock - A Haskell Documentation Tool 3 | -- 4 | -- (c) Simon Marlow 2002 5 | -- 6 | 7 | { 8 | -- Disable warnings that the generated code causes 9 | {-# OPTIONS_GHC -fno-warn-deprecated-flags 10 | -fno-warn-unused-binds 11 | -fno-warn-unused-imports 12 | -fno-warn-unused-matches 13 | -fno-warn-missing-signatures 14 | -fno-warn-tabs #-} 15 | module HaddockLex ( 16 | Token(..), 17 | tokenise 18 | ) where 19 | 20 | import Data.Char 21 | import Data.Word (Word8) 22 | import qualified Data.Bits 23 | import Numeric 24 | import Control.Monad (liftM) 25 | import HaddockTypes (RdrName) 26 | } 27 | 28 | $ws = $white # \n 29 | $digit = [0-9] 30 | $hexdigit = [0-9a-fA-F] 31 | $special = [\"\@] 32 | $alphanum = [A-Za-z0-9] 33 | $ident = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~\:] 34 | 35 | :- 36 | 37 | -- beginning of a paragraph 38 | <0,para> { 39 | $ws* \n ; 40 | $ws* \> { begin birdtrack } 41 | $ws* [\*\-] { token TokBullet `andBegin` string } 42 | $ws* \[ { token TokDefStart `andBegin` def } 43 | $ws* \( $digit+ \) { token TokNumber `andBegin` string } 44 | $ws* { begin string } 45 | } 46 | 47 | -- beginning of a line 48 | { 49 | $ws* \> { begin birdtrack } 50 | $ws* \n { token TokPara `andBegin` para } 51 | -- Here, we really want to be able to say 52 | -- $ws* (\n | ) { token TokPara `andBegin` para} 53 | -- because otherwise a trailing line of whitespace will result in 54 | -- a spurious TokString at the end of a docstring. We don't have , 55 | -- though (NOW I realise what it was for :-). To get around this, we always 56 | -- append \n to the end of a docstring. 57 | () { begin string } 58 | } 59 | 60 | .* \n? { strtokenNL TokBirdTrack `andBegin` line } 61 | 62 | { 63 | $special { strtoken $ \s -> TokSpecial (head s) } 64 | \<\< [^\<\>]* \>\> { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) } 65 | \< [^\<\>]* \> { strtoken $ \s -> TokURL (init (tail s)) } 66 | \# [^\#]* \# { strtoken $ \s -> TokAName (init (tail s)) } 67 | \/ [^\/]* \/ { strtoken $ \s -> TokEmphasis (init (tail s)) } 68 | [\'\`] $ident+ [\'\`] { ident } 69 | \\ . { strtoken (TokString . tail) } 70 | "&#" $digit+ \; { strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] } 71 | "&#" [xX] $hexdigit+ \; { strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n]; _ -> error "hexParser: Can't happen" } 72 | -- allow special characters through if they don't fit one of the previous 73 | -- patterns. 74 | [\/\'\`\<\#\&\\] { strtoken TokString } 75 | [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line } 76 | [^ $special \/ \< \# \n \'\` \& \\ \]]+ { strtoken TokString } 77 | } 78 | 79 | { 80 | \] { token TokDefEnd `andBegin` string } 81 | } 82 | 83 | -- ']' doesn't have any special meaning outside of the [...] 
at the beginning 84 | -- of a definition paragraph. 85 | { 86 | \] { strtoken TokString } 87 | } 88 | 89 | { 90 | data Token 91 | = TokPara 92 | | TokNumber 93 | | TokBullet 94 | | TokDefStart 95 | | TokDefEnd 96 | | TokSpecial Char 97 | | TokIdent RdrName 98 | | TokString String 99 | | TokURL String 100 | | TokPic String 101 | | TokEmphasis String 102 | | TokAName String 103 | | TokBirdTrack String 104 | deriving Show 105 | 106 | -- ----------------------------------------------------------------------------- 107 | -- Alex support stuff 108 | 109 | type StartCode = Int 110 | type Action = String -> StartCode -> (StartCode -> Maybe [Token]) -> Maybe [Token] 111 | 112 | --TODO: we ought to switch to ByteString input. 113 | type AlexInput = (Char, [Word8], String) 114 | 115 | -- | For alex >= 3 116 | -- 117 | -- See also alexGetChar 118 | alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) 119 | alexGetByte (c,(b:bs),s) = Just (b,(c,bs,s)) 120 | alexGetByte (c,[],[]) = Nothing 121 | alexGetByte (_,[],(c:s)) = case utf8Encode c of 122 | (b:bs) -> Just (b, (c, bs, s)) 123 | 124 | -- | Encode a Haskell String to a list of Word8 values, in UTF8 format. 125 | utf8Encode :: Char -> [Word8] 126 | utf8Encode = map fromIntegral . go . ord 127 | where 128 | go oc 129 | | oc <= 0x7f = [oc] 130 | 131 | | oc <= 0x7ff = [ 0xc0 + (oc `Data.Bits.shiftR` 6) 132 | , 0x80 + oc Data.Bits..&. 0x3f 133 | ] 134 | 135 | | oc <= 0xffff = [ 0xe0 + (oc `Data.Bits.shiftR` 12) 136 | , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f) 137 | , 0x80 + oc Data.Bits..&. 0x3f 138 | ] 139 | | otherwise = [ 0xf0 + (oc `Data.Bits.shiftR` 18) 140 | , 0x80 + ((oc `Data.Bits.shiftR` 12) Data.Bits..&. 0x3f) 141 | , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f) 142 | , 0x80 + oc Data.Bits..&. 0x3f 143 | ] 144 | 145 | -- | For alex < 3 146 | -- 147 | -- See also alexGetByte 148 | alexGetChar :: AlexInput -> Maybe (Char, AlexInput) 149 | alexGetChar (_, _, []) = Nothing 150 | alexGetChar (_, _, c:cs) = Just (c, (c,[],cs)) 151 | 152 | alexInputPrevChar (c,_) = c 153 | 154 | tokenise :: String -> Maybe [Token] 155 | tokenise str = 156 | go ('\n', [], eofHack str) para 157 | where 158 | go inp@(_,_,str') sc = 159 | case alexScan inp sc of 160 | AlexEOF -> Just [] 161 | AlexError _ -> Nothing 162 | AlexSkip inp' _ -> go inp' sc 163 | AlexToken inp' len act -> act (take len str') sc (\sc' -> go inp' sc') 164 | 165 | -- NB. we add a final \n to the string, (see comment in the beginning of line 166 | -- production above). 167 | eofHack str = str++"\n" 168 | 169 | andBegin :: Action -> StartCode -> Action 170 | andBegin act new_sc = \str _ cont -> act str new_sc cont 171 | 172 | token :: Token -> Action 173 | token t = \_ sc cont -> liftM (t :) (cont sc) 174 | 175 | strtoken, strtokenNL :: (String -> Token) -> Action 176 | strtoken t = \str sc cont -> liftM (t str :) (cont sc) 177 | strtokenNL t = \str sc cont -> liftM (t (filter (/= '\r') str) :) (cont sc) 178 | -- ^ We only want LF line endings in our internal doc string format, so we 179 | -- filter out all CRs. 
180 | 181 | begin :: StartCode -> Action 182 | begin sc = \_ _ cont -> cont sc 183 | 184 | -- ----------------------------------------------------------------------------- 185 | -- Lex a string as a Haskell identifier 186 | 187 | ident :: Action 188 | ident str sc cont = liftM (TokIdent str :) (cont sc) 189 | } 190 | -------------------------------------------------------------------------------- /demo/HaddockParse.y: -------------------------------------------------------------------------------- 1 | { 2 | -- Disable warnings that the generated code causes 3 | {-# OPTIONS_GHC -fno-warn-deprecated-flags 4 | -fno-warn-missing-signatures 5 | -fno-warn-unused-binds 6 | -fno-warn-unused-matches 7 | -fno-warn-lazy-unlifted-bindings 8 | -fno-warn-name-shadowing 9 | -fno-warn-incomplete-patterns 10 | -fno-warn-tabs #-} 11 | module HaddockParse (parseHaddockParagraphs) where 12 | 13 | import HaddockLex 14 | import HaddockHtml 15 | import HaddockTypes 16 | import Data.Char (isSpace) 17 | } 18 | 19 | %expect 0 20 | 21 | %tokentype { Token } 22 | 23 | %token 24 | '@' { TokSpecial '@' } 25 | '[' { TokDefStart } 26 | ']' { TokDefEnd } 27 | DQUO { TokSpecial '\"' } 28 | URL { TokURL $$ } 29 | PIC { TokPic $$ } 30 | ANAME { TokAName $$ } 31 | '/../' { TokEmphasis $$ } 32 | '-' { TokBullet } 33 | '(n)' { TokNumber } 34 | '>..' { TokBirdTrack $$ } 35 | IDENT { TokIdent $$ } 36 | PARA { TokPara } 37 | STRING { TokString $$ } 38 | 39 | %monad { Maybe } 40 | 41 | %name parseHaddockParagraphs doc 42 | %name parseHaddockString seq 43 | 44 | %% 45 | 46 | doc :: { Doc RdrName } 47 | : apara PARA doc { docAppend $1 $3 } 48 | | PARA doc { $2 } 49 | | apara { $1 } 50 | | {- empty -} { DocEmpty } 51 | 52 | apara :: { Doc RdrName } 53 | : ulpara { DocUnorderedList [$1] } 54 | | olpara { DocOrderedList [$1] } 55 | | defpara { DocDefList [$1] } 56 | | para { $1 } 57 | 58 | ulpara :: { Doc RdrName } 59 | : '-' para { $2 } 60 | 61 | olpara :: { Doc RdrName } 62 | : '(n)' para { $2 } 63 | 64 | defpara :: { (Doc RdrName, Doc RdrName) } 65 | : '[' seq ']' seq { ($2, $4) } 66 | 67 | para :: { Doc RdrName } 68 | : seq { docParagraph $1 } 69 | | codepara { DocCodeBlock $1 } 70 | 71 | codepara :: { Doc RdrName } 72 | : '>..' codepara { docAppend (DocString $1) $2 } 73 | | '>..' { DocString $1 } 74 | 75 | seq :: { Doc RdrName } 76 | : elem seq { docAppend $1 $2 } 77 | | elem { $1 } 78 | 79 | elem :: { Doc RdrName } 80 | : elem1 { $1 } 81 | | '@' seq1 '@' { DocMonospaced $2 } 82 | 83 | seq1 :: { Doc RdrName } 84 | : PARA seq1 { docAppend (DocString "\n") $2 } 85 | | elem1 seq1 { docAppend $1 $2 } 86 | | elem1 { $1 } 87 | 88 | elem1 :: { Doc RdrName } 89 | : STRING { DocString $1 } 90 | | '/../' { DocEmphasis (DocString $1) } 91 | | URL { DocHyperlink (makeHyperlink $1) } 92 | | PIC { DocPic $1 } 93 | | ANAME { DocAName $1 } 94 | | IDENT { DocIdentifier $1 } 95 | | DQUO strings DQUO { DocModule $2 } 96 | 97 | strings :: { String } 98 | : STRING { $1 } 99 | | STRING strings { $1 ++ $2 } 100 | 101 | { 102 | happyError :: [Token] -> Maybe a 103 | happyError toks = Nothing 104 | 105 | -- | Create a `Hyperlink` from given string. 106 | -- 107 | -- A hyperlink consists of a URL and an optional label. The label is separated 108 | -- from the url by one or more whitespace characters. 109 | makeHyperlink :: String -> Hyperlink 110 | makeHyperlink input = case break isSpace $ strip input of 111 | (url, "") -> Hyperlink url Nothing 112 | (url, label) -> Hyperlink url (Just . 
dropWhile isSpace $ label) 113 | 114 | -- | Remove all leading and trailing whitespace 115 | strip :: String -> String 116 | strip = dropWhile isSpace . reverse . dropWhile isSpace . reverse 117 | } 118 | -------------------------------------------------------------------------------- /demo/HaddockTypes.hs: -------------------------------------------------------------------------------- 1 | -- stolen from Haddock's Types.hs 2 | module HaddockTypes where 3 | 4 | data Doc id 5 | = DocEmpty 6 | | DocAppend (Doc id) (Doc id) 7 | | DocString String 8 | | DocParagraph (Doc id) 9 | | DocIdentifier id 10 | | DocModule String 11 | | DocEmphasis (Doc id) 12 | | DocMonospaced (Doc id) 13 | | DocUnorderedList [Doc id] 14 | | DocOrderedList [Doc id] 15 | | DocDefList [(Doc id, Doc id)] 16 | | DocCodeBlock (Doc id) 17 | | DocHyperlink Hyperlink 18 | | DocPic String 19 | | DocAName String 20 | 21 | data Hyperlink = Hyperlink 22 | { hyperlinkUrl :: String 23 | , hyperlinkLabel :: Maybe String 24 | } deriving (Eq, Show) 25 | 26 | -- | DocMarkup is a set of instructions for marking up documentation. 27 | -- In fact, it's really just a mapping from 'Doc' to some other 28 | -- type [a], where [a] is usually the type of the output (HTML, say). 29 | 30 | data DocMarkup id a = Markup { 31 | markupEmpty :: a, 32 | markupString :: String -> a, 33 | markupParagraph :: a -> a, 34 | markupAppend :: a -> a -> a, 35 | markupIdentifier :: id -> a, 36 | markupModule :: String -> a, 37 | markupEmphasis :: a -> a, 38 | markupMonospaced :: a -> a, 39 | markupUnorderedList :: [a] -> a, 40 | markupOrderedList :: [a] -> a, 41 | markupDefList :: [(a,a)] -> a, 42 | markupCodeBlock :: a -> a, 43 | markupHyperlink :: Hyperlink -> a, 44 | markupAName :: String -> a, 45 | markupPic :: String -> a 46 | } 47 | 48 | type RdrName = String 49 | -------------------------------------------------------------------------------- /demo/Main.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import Data.SearchEngine 4 | import PackageSearch 5 | 6 | import PackageIndexUtils 7 | 8 | import Data.List 9 | import qualified Data.Map as Map 10 | import qualified Data.Text as T 11 | import qualified Data.Text.IO as T 12 | import Data.Time 13 | 14 | import Control.Monad 15 | import Control.Exception 16 | import System.IO 17 | import System.Directory 18 | import System.Exit 19 | 20 | import Distribution.PackageDescription (PackageDescription) 21 | import Distribution.PackageDescription.Configuration (flattenPackageDescription) 22 | import Distribution.Package (packageVersion, packageName) 23 | import Distribution.Text (display) 24 | 25 | 26 | ------------------- 27 | -- Main experiment 28 | -- 29 | 30 | main :: IO () 31 | main = do 32 | putStrLn "reading 00-index.tar..." 33 | pkgs <- readPackages 34 | 35 | putStrLn "forcing pkgs..." 36 | evaluate (foldl' (\a p -> seq p a) () pkgs) 37 | 38 | let searchengine = insertDocs pkgs initialPkgSearchEngine 39 | 40 | putStrLn "constructing index..." 
41 | printTiming "done" $ 42 | evaluate searchengine >> return () 43 | putStrLn $ "search engine invariant: " ++ show (invariant searchengine) 44 | 45 | -- print [ avgFieldLength ctx s | s <- [minBound..maxBound] ] 46 | 47 | -- print $ take 100 $ sortBy (flip compare) $ map Set.size $ Map.elems (termMap searchindex) 48 | -- T.putStr $ T.unlines $ Map.keys (termMap searchindex) 49 | -- let SearchEngine{searchIndex=SearchIndex{termMap, termIdMap, docKeyMap, docIdMap}} = searchengine 50 | -- print (Map.size termMap, IntMap.size termIdMap, Map.size docKeyMap, IntMap.size docIdMap) 51 | 52 | let loop = do 53 | putStr "search term> " 54 | hFlush stdout 55 | t <- T.getLine 56 | unless (T.null t) $ do 57 | putStrLn "Ranked results:" 58 | let rankedResults = queryExplain searchengine (T.words t) 59 | 60 | putStr $ unlines 61 | [ show (overallScore explanation) ++ ": " ++ display pkgname 62 | | (explanation, pkgname) <- take 10 rankedResults ] 63 | 64 | loop 65 | return () 66 | loop 67 | 68 | printTiming :: String -> IO () -> IO () 69 | printTiming msg action = do 70 | t <- getCurrentTime 71 | action 72 | t' <- getCurrentTime 73 | putStrLn (msg ++ ". time: " ++ show (diffUTCTime t' t)) 74 | 75 | readPackages :: IO [PackageDescription] 76 | readPackages = do 77 | exists <- doesFileExist "00-index.tar" 78 | when (not exists) $ do 79 | putStrLn "This program needs a 00-index.tar package index." 80 | putStrLn "Please grab 00-index.tar.gz from hackage and gunzip it." 81 | exitFailure 82 | 83 | pkgs <- PackageIndexUtils.readPackageIndexFile "00-index.tar" 84 | let latestPkgs = Map.fromListWith 85 | (\a b -> if packageVersion (fst a) > packageVersion (fst b) 86 | then a else b) 87 | [ (packageName pkgid, (pkgid, pkg)) 88 | | (pkgid, pkg) <- pkgs ] 89 | 90 | return . map (flattenPackageDescription . snd) 91 | . Map.elems 92 | $ latestPkgs 93 | 94 | -------------------------------------------------------------------------------- /demo/PackageIndexUtils.hs: -------------------------------------------------------------------------------- 1 | ----------------------------------------------------------------------------- 2 | -- | 3 | -- Module : Distribution.Client.IndexUtils 4 | -- Copyright : (c) Duncan Coutts 2008 5 | -- License : BSD-like 6 | -- 7 | -- Maintainer : duncan@community.haskell.org 8 | -- Stability : provisional 9 | -- Portability : portable 10 | -- 11 | -- Extra utils related to the package indexes. 12 | ----------------------------------------------------------------------------- 13 | module PackageIndexUtils ( 14 | readPackageIndexFile 15 | ) where 16 | 17 | import qualified Codec.Archive.Tar as Tar 18 | 19 | import Distribution.Package 20 | ( PackageId, PackageIdentifier(..), PackageName(..) ) 21 | import Distribution.PackageDescription 22 | ( GenericPackageDescription ) 23 | import Distribution.PackageDescription.Parse 24 | ( parsePackageDescription ) 25 | import Distribution.ParseUtils 26 | ( ParseResult(..) ) 27 | import Distribution.Text 28 | ( simpleParse ) 29 | import Distribution.Simple.Utils 30 | ( fromUTF8 ) 31 | 32 | import Data.Maybe (fromMaybe) 33 | import qualified Data.ByteString.Lazy as BS 34 | import qualified Data.ByteString.Lazy.Char8 as BS.Char8 35 | import Data.ByteString.Lazy (ByteString) 36 | import System.FilePath (takeExtension, splitDirectories, normalise) 37 | 38 | 39 | readPackageIndexFile :: FilePath -> IO [(PackageId, GenericPackageDescription)] 40 | readPackageIndexFile indexFile = 41 | either fail return 42 | . 
parsePackageIndex 43 | =<< BS.readFile indexFile 44 | 45 | -- | Parse an uncompressed \"00-index.tar\" repository index file represented 46 | -- as a 'ByteString'. 47 | -- 48 | parsePackageIndex :: ByteString 49 | -> Either String [(PackageId, GenericPackageDescription)] 50 | parsePackageIndex = accum [] . Tar.read 51 | where 52 | accum pkgs es = case es of 53 | Tar.Fail err -> Left (show err) 54 | Tar.Done -> Right (reverse pkgs) 55 | Tar.Next e es' -> accum pkgs' es' 56 | where 57 | pkgs' = extract pkgs e 58 | 59 | extract pkgs entry = 60 | fromMaybe pkgs $ tryExtractPkg 61 | where 62 | tryExtractPkg = do 63 | (pkgid, pkg) <- extractPkg entry 64 | return ((pkgid, pkg):pkgs) 65 | 66 | 67 | extractPkg :: Tar.Entry -> Maybe (PackageId, GenericPackageDescription) 68 | extractPkg entry = case Tar.entryContent entry of 69 | Tar.NormalFile content _ 70 | | takeExtension fileName == ".cabal" 71 | -> case splitDirectories (normalise fileName) of 72 | [pkgname,vers,_] -> case simpleParse vers of 73 | Just ver -> Just (pkgid, descr) 74 | where 75 | pkgid = PackageIdentifier (PackageName pkgname) ver 76 | parsed = parsePackageDescription . fromUTF8 . BS.Char8.unpack 77 | $ content 78 | descr = case parsed of 79 | ParseOk _ d -> d 80 | _ -> error $ "Couldn't read cabal file " 81 | ++ show fileName 82 | _ -> Nothing 83 | _ -> Nothing 84 | _ -> Nothing 85 | where 86 | fileName = Tar.entryPath entry 87 | -------------------------------------------------------------------------------- /demo/PackageSearch.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings, NamedFieldPuns #-} 2 | module PackageSearch ( 3 | PkgSearchEngine, 4 | initialPkgSearchEngine, 5 | defaultSearchRankParameters, 6 | PkgDocField(..), 7 | ) where 8 | 9 | import Data.SearchEngine 10 | 11 | import ExtractNameTerms 12 | import ExtractDescriptionTerms 13 | 14 | import Data.Ix 15 | import Data.Set (Set) 16 | import qualified Data.Set as Set 17 | import Data.Text (Text) 18 | import qualified Data.Text as T 19 | import NLP.Snowball 20 | 21 | import Distribution.Package 22 | import Distribution.PackageDescription 23 | import Distribution.Text (display) 24 | 25 | 26 | type PkgSearchEngine = SearchEngine 27 | PackageDescription 28 | PackageName 29 | PkgDocField 30 | NoFeatures 31 | 32 | data PkgDocField = NameField 33 | | SynopsisField 34 | | DescriptionField 35 | deriving (Eq, Ord, Enum, Bounded, Ix, Show) 36 | 37 | initialPkgSearchEngine :: PkgSearchEngine 38 | initialPkgSearchEngine = 39 | initSearchEngine pkgSearchConfig defaultSearchRankParameters 40 | 41 | pkgSearchConfig :: SearchConfig PackageDescription 42 | PackageName PkgDocField NoFeatures 43 | pkgSearchConfig = 44 | SearchConfig { 45 | documentKey = packageName, 46 | extractDocumentTerms = extractTokens, 47 | transformQueryTerm = normaliseQueryToken, 48 | documentFeatureValue = const noFeatures 49 | } 50 | where 51 | extractTokens :: PackageDescription -> PkgDocField -> [Text] 52 | extractTokens pkg NameField = extractPackageNameTerms 53 | (display $ packageName pkg) 54 | extractTokens pkg SynopsisField = extractSynopsisTerms 55 | stopWords (synopsis pkg) 56 | extractTokens pkg DescriptionField = extractDescriptionTerms 57 | stopWords (description pkg) 58 | 59 | normaliseQueryToken :: Text -> PkgDocField -> Text 60 | normaliseQueryToken tok = 61 | let tokFold = T.toCaseFold tok 62 | tokStem = stem English tokFold 63 | in \field -> case field of 64 | NameField -> tokFold 65 | SynopsisField -> tokStem 66 | 
DescriptionField -> tokStem 67 | 68 | defaultSearchRankParameters :: SearchRankParameters PkgDocField NoFeatures 69 | defaultSearchRankParameters = 70 | SearchRankParameters { 71 | paramK1, 72 | paramB, 73 | paramFieldWeights, 74 | paramFeatureWeights = noFeatures, 75 | paramFeatureFunctions = noFeatures, 76 | paramResultsetSoftLimit = 200, 77 | paramResultsetHardLimit = 400, 78 | paramAutosuggestPrefilterLimit = 500, 79 | paramAutosuggestPostfilterLimit = 500 80 | } 81 | where 82 | paramK1 :: Float 83 | paramK1 = 1.5 84 | 85 | paramB :: PkgDocField -> Float 86 | paramB NameField = 0.9 87 | paramB SynopsisField = 0.5 88 | paramB DescriptionField = 0.5 89 | 90 | paramFieldWeights :: PkgDocField -> Float 91 | paramFieldWeights NameField = 20 92 | paramFieldWeights SynopsisField = 5 93 | paramFieldWeights DescriptionField = 1 94 | 95 | 96 | stopWords :: Set Term 97 | stopWords = 98 | Set.fromList 99 | ["haskell","library","simple","using","interface","functions", 100 | "implementation","package","support","'s","based","for","a","and","the", 101 | "to","of","with","in","an","on","from","that","as","into","by","is", 102 | "some","which","or","like","your","other","can","at","over","be","it", 103 | "within","their","this","but","are","get","one","all","you","so","only", 104 | "now","how","where","when","up","has","been","about","them","then","see", 105 | "no","do","than","should","out","off","much","if","i","have","also"] 106 | 107 | -------------------------------------------------------------------------------- /full-text-search.cabal: -------------------------------------------------------------------------------- 1 | cabal-version: 3.0 2 | 3 | name: full-text-search 4 | version: 0.2.2.3 5 | synopsis: In-memory full text search engine 6 | 7 | description: 8 | An in-memory full text search engine library. It lets you 9 | run full-text queries on a collection of your documents. 10 | 11 | Features: 12 | 13 | * Keyword queries and auto-complete\/auto-suggest queries. 14 | 15 | * Can search over any type of \"document\". 16 | (You explain how to extract search terms from them.) 17 | 18 | * Supports documents with multiple fields 19 | (e.g. title, body) 20 | 21 | * Supports documents with non-term features 22 | (e.g. quality score, page rank) 23 | 24 | * Uses the state of the art BM25F ranking function 25 | 26 | * Adjustable ranking parameters (including field weights 27 | and non-term feature scores) 28 | 29 | * In-memory but quite compact. It does not keep a copy of 30 | your original documents. 31 | 32 | * Quick incremental index updates, making it possible to 33 | keep your text search in-sync with your data. 34 | 35 | It is independent of the document type, so you have to 36 | write the document-specific parts: extracting search terms 37 | and any stop words, case-normalisation or stemming. This 38 | is quite easy using libraries such as 39 | and 40 | . 41 | 42 | The source package includes a demo to illustrate how to 43 | use the library. The demo is a simplified version of how 44 | the library is used in the 45 | 46 | where it provides the backend for the package search feature. 47 | 48 | bug-reports: https://github.com/well-typed/full-text-search/issues 49 | license: BSD-3-Clause 50 | license-file: LICENSE 51 | author: Duncan Coutts 52 | maintainer: Duncan Coutts , 53 | Adam Gundry 54 | copyright: 2013-2014 Duncan Coutts, 2014 Well-Typed LLP, 55 | 2014-2023 IRIS Connect Ltd. 
56 | category: Data, Text, Search 57 | build-type: Simple 58 | extra-doc-files: changelog 59 | 60 | tested-with: GHC ==9.2.8 || ==9.4.8 || ==9.6.6 || ==9.8.2 || ==9.10.1 || ==9.12.1 61 | 62 | source-repository head 63 | type: git 64 | location: git@github.com:well-typed/full-text-search.git 65 | 66 | flag build-search-demo 67 | default: False 68 | description: Build a little program illustrating the use of the library 69 | manual: True 70 | 71 | common base-deps 72 | build-depends: base >=4.16 && <4.22, 73 | array >=0.4 && <0.6, 74 | vector >=0.11 && <0.14, 75 | containers >=0.4 && <0.8, 76 | text >=0.11 && <2.2 77 | 78 | library 79 | import: base-deps 80 | exposed-modules: Data.SearchEngine, 81 | Data.SearchEngine.BM25F 82 | other-modules: Data.SearchEngine.Types, 83 | Data.SearchEngine.Update, 84 | Data.SearchEngine.Query, 85 | Data.SearchEngine.Autosuggest, 86 | Data.SearchEngine.SearchIndex, 87 | Data.SearchEngine.DocFeatVals, 88 | Data.SearchEngine.TermBag, 89 | Data.SearchEngine.DocTermIds, 90 | Data.SearchEngine.DocIdSet 91 | hs-source-dirs: src 92 | other-extensions: BangPatterns, 93 | NamedFieldPuns, 94 | RecordWildCards, 95 | GeneralizedNewtypeDeriving, 96 | ScopedTypeVariables 97 | default-language: Haskell2010 98 | ghc-options: -Wall -funbox-strict-fields 99 | 100 | 101 | executable search-demo 102 | main-is: Main.hs 103 | other-modules: PackageSearch 104 | ExtractNameTerms 105 | ExtractDescriptionTerms 106 | PackageIndexUtils 107 | -- support code for package descriptions: 108 | HaddockHtml 109 | HaddockLex 110 | HaddockParse 111 | HaddockTypes 112 | hs-source-dirs: demo 113 | if !flag(build-search-demo) 114 | buildable: False 115 | else 116 | build-depends: full-text-search, 117 | base, 118 | text, 119 | containers, 120 | array, 121 | tokenize >= 0.1 && <0.4, 122 | snowball >= 1.0 && <1.1, 123 | transformers >= 0.5 && <0.6, 124 | split >= 0.2 && <0.3, 125 | Cabal >= 1.14 && <3.15, 126 | bytestring >= 0.12 && <0.13, 127 | filepath >= 1.5 && <1.6, 128 | directory >= 1.3 && <1.5, 129 | tar >= 0.6 && <0.7, 130 | time >= 1.14 && <1.15, 131 | mtl >= 2.2 && <2.4 132 | build-tool-depends: alex:alex, happy:happy 133 | default-language: Haskell2010 134 | other-extensions: GeneralizedNewtypeDeriving 135 | ghc-options: -Wall 136 | 137 | test-suite qc-props 138 | import: base-deps 139 | type: exitcode-stdio-1.0 140 | main-is: Main.hs 141 | hs-source-dirs: src, tests 142 | build-depends: QuickCheck ==2.*, 143 | tasty >=0.8, 144 | tasty-quickcheck >=0.8 145 | other-modules: Test.Data.SearchEngine.TermBag, 146 | Test.Data.SearchEngine.DocIdSet, 147 | Data.SearchEngine.DocTermIds, 148 | Data.SearchEngine.DocIdSet, 149 | Data.SearchEngine.TermBag 150 | default-language: Haskell2010 151 | ghc-options: -Wall 152 | -------------------------------------------------------------------------------- /src/Data/SearchEngine.hs: -------------------------------------------------------------------------------- 1 | module Data.SearchEngine ( 2 | 3 | -- * Basic interface 4 | 5 | -- ** Querying 6 | Term, 7 | query, 8 | 9 | -- *** Query auto-completion \/ auto-suggestion 10 | queryAutosuggest, 11 | ResultsFilter(..), 12 | queryAutosuggestPredicate, 13 | queryAutosuggestMatchingDocuments, 14 | 15 | -- ** Making a search engine instance 16 | initSearchEngine, 17 | SearchEngine, 18 | SearchConfig(..), 19 | SearchRankParameters(..), 20 | FeatureFunction(..), 21 | 22 | -- ** Helper type for non-term features 23 | NoFeatures, 24 | noFeatures, 25 | 26 | -- ** Managing documents to be searched 27 | insertDoc, 28 | 
insertDocs, 29 | deleteDoc, 30 | 31 | -- * Explain mode for query result rankings 32 | queryExplain, 33 | Explanation(..), 34 | setRankParams, 35 | 36 | -- * Internal sanity check 37 | invariant, 38 | ) where 39 | 40 | import Data.SearchEngine.Types 41 | import Data.SearchEngine.Update 42 | import Data.SearchEngine.Query 43 | import Data.SearchEngine.Autosuggest 44 | 45 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/Autosuggest.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, NamedFieldPuns, RecordWildCards, 2 | ScopedTypeVariables #-} 3 | 4 | module Data.SearchEngine.Autosuggest ( 5 | 6 | -- * Query auto-completion \/ auto-suggestion 7 | queryAutosuggest, 8 | ResultsFilter(..), 9 | 10 | queryAutosuggestPredicate, 11 | queryAutosuggestMatchingDocuments 12 | 13 | ) where 14 | 15 | import Data.SearchEngine.Types 16 | import Data.SearchEngine.Query (ResultsFilter(..)) 17 | import qualified Data.SearchEngine.Query as Query 18 | import qualified Data.SearchEngine.SearchIndex as SI 19 | import qualified Data.SearchEngine.DocIdSet as DocIdSet 20 | import qualified Data.SearchEngine.DocTermIds as DocTermIds 21 | import qualified Data.SearchEngine.BM25F as BM25F 22 | 23 | import Data.Ix 24 | import Data.Ord 25 | import Data.List 26 | import Data.Maybe 27 | import qualified Data.Map as Map 28 | import qualified Data.IntSet as IntSet 29 | import qualified Data.Vector.Unboxed as Vec 30 | 31 | 32 | -- | Execute an \"auto-suggest\" query. This is where one of the search terms 33 | -- is an incomplete prefix and we are looking for possible completions of that 34 | -- search term, and result documents to go with the possible completions. 35 | -- 36 | -- An auto-suggest query only gives useful results when the 'SearchEngine' is 37 | -- configured to use a non-term feature score. That is, when we can give 38 | -- documents an importance score independent of what terms we are looking for. 39 | -- This is because an auto-suggest query is backwards from a normal query: we 40 | -- are asking for relevant terms occurring in important or popular documents 41 | -- so we need some notion of important or popular. Without this we would just 42 | -- be ranking based on term frequency which while it makes sense for normal 43 | -- \"forward\" queries is pretty meaningless for auto-suggest \"reverse\" 44 | -- queries. Indeed for single-term auto-suggest queries the ranking function 45 | -- we use will assign 0 for all documents and completions if there is no 46 | -- non-term feature scores. 47 | -- 48 | queryAutosuggest :: (Ix field, Bounded field, Ix feature, Bounded feature) => 49 | SearchEngine doc key field feature -> 50 | ResultsFilter key -> 51 | [Term] -> Term -> ([(Term, Float)], [(key, Float)]) 52 | queryAutosuggest se resultsFilter precedingTerms partialTerm = 53 | 54 | step_external 55 | . step_rank 56 | . step_scoreDs 57 | . step_scoreTs 58 | . step_cache 59 | . step_postfilterlimit 60 | . step_filter 61 | . step_prefilterlimit 62 | . 
step_process 63 | $ step_prep 64 | precedingTerms partialTerm 65 | 66 | where 67 | -- Construct the auto-suggest query from the query terms 68 | step_prep pre_ts t = mkAutosuggestQuery se pre_ts t 69 | 70 | -- Find the appropriate subset of ts and ds 71 | -- and an intermediate result that will be useful later: 72 | -- { (t, ds ∩ ds_t) | t ∈ ts, ds ∩ ds_t ≠ ∅ } 73 | step_process (ts, ds, pre_ts) = (ts', ds', tdss', pre_ts) 74 | where 75 | (tdss', ts', ds') = processAutosuggestQuery se (ts, ds, pre_ts) 76 | 77 | -- If the number of docs results is huge then we may not want to bother 78 | -- and just return no results. Even the filtering of a huge number of 79 | -- docs can be expensive. 80 | step_prefilterlimit args@(_, ds, _, _) 81 | | withinPrefilterLimit se ds = args 82 | | otherwise = ([], DocIdSet.empty, [], []) 83 | 84 | -- Filter ds to those that are visible for this query 85 | -- and at the same time, do the docid -> docinfo lookup 86 | -- (needed at this step anyway to do the filter) 87 | step_filter (ts, ds, tdss, pre_ts) = (ts, ds_info, tdss, pre_ts) 88 | where 89 | ds_info = filterAutosuggestQuery se resultsFilter ds 90 | 91 | -- If the number of docs results is huge then we may not want to bother 92 | -- and just return no results. Scoring a large number of docs is expensive. 93 | step_postfilterlimit args@(_, ds_info, _, _) 94 | | withinPostfilterLimit se ds_info = args 95 | | otherwise = ([], [], [], []) 96 | 97 | -- For all ds, calculate and cache a couple bits of info needed 98 | -- later for scoring completion terms and doc results 99 | step_cache (ts, ds_info, tdss, pre_ts) = (ds_info', tdss) 100 | where 101 | ds_info' = cacheDocScoringInfo se ts ds_info pre_ts 102 | 103 | -- Score the completion terms 104 | step_scoreTs (ds_info, tdss) = (ds_info, tdss, ts_scored) 105 | where 106 | ts_scored = scoreAutosuggestQueryCompletions tdss ds_info 107 | 108 | -- Score the doc results (making use of the completion scores) 109 | step_scoreDs (ds_info, tdss, ts_scored) = (ts_scored, ds_scored) 110 | where 111 | ds_scored = scoreAutosuggestQueryResults tdss ds_info ts_scored 112 | 113 | -- Rank the completions and results based on their scores 114 | step_rank = sortResults 115 | 116 | -- Convert from internal Ids into external forms: Term and doc key 117 | step_external = convertIdsToExternal se 118 | 119 | 120 | -- | Given an incomplete prefix query, find the set of documents that match 121 | -- possible completions of that query. This should be less computationally 122 | -- expensive than 'queryAutosuggest' as it does not do any ranking of documents. 123 | -- However, it does not apply the pre-filter or post-filter limits, and the list 124 | -- may be large when the query terms occur in many documents. The order of 125 | -- returned keys is unspecified. 126 | queryAutosuggestMatchingDocuments :: (Ix field, Bounded field, Ord key) => 127 | SearchEngine doc key field feature -> 128 | [Term] -> Term -> [key] 129 | queryAutosuggestMatchingDocuments se@SearchEngine{searchIndex} precedingTerms partialTerm = 130 | let (_, _, ds) = processAutosuggestQuery se (mkAutosuggestQuery se precedingTerms partialTerm) 131 | in map (SI.getDocKey searchIndex) (DocIdSet.toList ds) 132 | 133 | -- | Given an incomplete prefix query, return a predicate that indicates whether 134 | -- a key is in the set of documents that match possible completions of that 135 | -- query. 
This is equivalent to calling 'queryAutosuggestMatchingDocuments' and 136 | -- testing whether the key is in the list, but should be more efficient. 137 | -- 138 | -- This does not apply the pre-filter or post-filter limits. 139 | queryAutosuggestPredicate :: (Ix field, Bounded field, Ord key) => 140 | SearchEngine doc key field feature -> 141 | [Term] -> Term -> (key -> Bool) 142 | queryAutosuggestPredicate se@SearchEngine{searchIndex} precedingTerms partialTerm = 143 | let (_, _, ds) = processAutosuggestQuery se (mkAutosuggestQuery se precedingTerms partialTerm) 144 | in (\ key -> maybe False (flip DocIdSet.member ds) (SI.lookupDocKeyDocId searchIndex key)) 145 | 146 | 147 | -- We apply hard limits both before and after filtering. 148 | -- The post-filter limit is to avoid scoring 1000s of documents. 149 | -- The pre-filter limit is to avoid filtering 1000s of docs (which in some 150 | -- apps may be expensive itself) 151 | 152 | withinPrefilterLimit :: SearchEngine doc key field feature -> 153 | DocIdSet -> Bool 154 | withinPrefilterLimit SearchEngine{searchRankParams} ds = 155 | DocIdSet.size ds <= paramAutosuggestPrefilterLimit searchRankParams 156 | 157 | withinPostfilterLimit :: SearchEngine doc key field feature -> 158 | [a] -> Bool 159 | withinPostfilterLimit SearchEngine{searchRankParams} ds_info = 160 | length ds_info <= paramAutosuggestPostfilterLimit searchRankParams 161 | 162 | 163 | sortResults :: (Ord av, Ord bv) => ([(a,av)], [(b,bv)]) -> ([(a,av)], [(b,bv)]) 164 | sortResults (xs, ys) = 165 | ( sortBySndDescending xs 166 | , sortBySndDescending ys ) 167 | where 168 | sortBySndDescending :: Ord v => [(x,v)] -> [(x,v)] 169 | sortBySndDescending = sortBy (flip (comparing snd)) 170 | 171 | convertIdsToExternal :: SearchEngine doc key field feature -> 172 | ([(TermId, v)], [(DocId, v)]) -> ([(Term, v)], [(key, v)]) 173 | convertIdsToExternal SearchEngine{searchIndex} (termids, docids) = 174 | ( [ (SI.getTerm searchIndex termid, s) | (termid, s) <- termids ] 175 | , [ (SI.getDocKey searchIndex docid, s) | (docid, s) <- docids ] 176 | ) 177 | 178 | 179 | -- From Bast and Weber: 180 | -- 181 | -- An autocompletion query is a pair (T, D), where T is a range of terms 182 | -- (all possible completions of the last term which the user has started 183 | -- typing) and D is a set of documents (the hits for the preceding part of 184 | -- the query). 185 | -- 186 | -- We augment this with the preceding terms because we will need these to 187 | -- score the set of documents D. 188 | -- 189 | -- Note that the set D will be the entire collection in the case that the 190 | -- preceding part of the query is empty. For efficiency we represent that 191 | -- case specially with Maybe. 
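--
-- Concretely, in the 'AutosuggestQuery' triple defined below, the first
-- component is T, as a map from each candidate completion to the set of
-- documents it occurs in; the second is D, with 'Nothing' standing for the
-- whole collection; and the third is the ids of the preceding terms, which
-- we keep because we need them later for scoring.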
192 | 193 | type AutosuggestQuery = (Map.Map TermId DocIdSet, Maybe DocIdSet, [TermId]) 194 | 195 | mkAutosuggestQuery :: (Ix field, Bounded field) => 196 | SearchEngine doc key field feature -> 197 | [Term] -> Term -> AutosuggestQuery 198 | mkAutosuggestQuery se@SearchEngine{ searchIndex } 199 | precedingTerms partialTerm = 200 | (completionTerms, precedingDocHits, precedingTerms') 201 | where 202 | completionTerms = 203 | Map.unions 204 | [ Map.fromList (SI.lookupTermsByPrefix searchIndex partialTerm') 205 | | partialTerm' <- Query.expandTransformedQueryTerm se partialTerm 206 | ] 207 | 208 | (precedingTerms', precedingDocHits) 209 | | null precedingTerms = ([], Nothing) 210 | | otherwise = fmap carefulUnions 211 | (lookupRawResults precedingTerms) 212 | 213 | -- For the preceding terms, we compute the union of the sets of documents in 214 | -- which they appear. This means that a query like "Apple Blackberry C" 215 | -- will look for documents containing "Apple" or "Blackberry", then later 216 | -- intersect that set with documents containing completions of "C". 217 | -- 218 | -- In general we want to use union rather than intersection here, because 219 | -- the preceding terms might contain some useful and some missing terms, and 220 | -- if we took the intersection we would end up with no results; thus we rely 221 | -- on scoring to rank the best matches highest. 222 | -- 223 | -- However, this leads to an issue: if some of the terms are extremely 224 | -- common, we might end up taking unions of very large document sets, which 225 | -- is a performance disaster. We address this by unioning only sets smaller 226 | -- than the pre-filter limit (but falling back on the whole collection if 227 | -- all sets are too large). This means that: 228 | -- 229 | -- * A query containing a mixture of common and uncommon preceding terms 230 | -- will be completed/ranked solely based on the uncommon terms. For 231 | -- example, "Apple Blackberry C" will be equivalent to "Blackberry C" if 232 | -- there are many apples. 233 | -- 234 | -- * A query containing only common preceding terms will be 235 | -- completed/ranked as if only the final term was present. For example, 236 | -- "Apple Blackberry C" will be equivalent to "C" if there are many 237 | -- apples and blackberries. 238 | -- 239 | carefulUnions :: [DocIdSet] -> Maybe DocIdSet 240 | carefulUnions dss 241 | | null dss = Just DocIdSet.empty 242 | | null dss' = Nothing 243 | | otherwise = Just (DocIdSet.unions dss') 244 | where 245 | dss' = filter (withinPrefilterLimit se) dss 246 | 247 | lookupRawResults :: [Term] -> ([TermId], [DocIdSet]) 248 | lookupRawResults ts = 249 | unzip $ catMaybes 250 | [ SI.lookupTerm searchIndex t' 251 | | t <- ts 252 | , t' <- Query.expandTransformedQueryTerm se t 253 | ] 254 | 255 | 256 | 257 | -- From Bast and Weber: 258 | -- 259 | -- To process the query means to compute the subset T' ⊆ T of terms that 260 | -- occur in at least one document from D, as well as the subset D' ⊆ D of 261 | -- documents that contain at least one of these words. 262 | -- 263 | -- The obvious way to use an inverted index to process an autocompletion 264 | -- query (T, D) is to compute, for each t ∈ T, the intersections D ∩ Dt. 265 | -- Then, T' is simply the set of all t for which the intersection was 266 | -- non-empty, and D' is the union of all (non-empty) intersections. 267 | -- 268 | -- We will do this but additionally we will return all the non-empty 269 | -- intersections because they will be useful when scoring. 
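--
-- As a standalone sketch of that computation, using plain 'Data.Set'
-- (assuming the usual qualified import) rather than the compact 'DocIdSet'
-- representation used below ('processNaive' is illustrative only, not part
-- of this module):
--
-- > processNaive :: Ord doc => [(term, Set doc)] -> Set doc
-- >              -> ([(term, Set doc)], [term], Set doc)
-- > processNaive completions ds =
-- >     (tdss, map fst tdss, Set.unions (map snd tdss))
-- >   where
-- >     tdss = [ (t, i) | (t, ds_t) <- completions
-- >                     , let i = ds `Set.intersection` ds_t
-- >                     , not (Set.null i) ]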
270 | 271 | processAutosuggestQuery :: SearchEngine doc key field feature -> 272 | AutosuggestQuery -> 273 | ([(TermId, DocIdSet)], [TermId], DocIdSet) 274 | processAutosuggestQuery se (completionTerms, precedingDocHits, _) 275 | -- Check all the individual document sets are smaller than the pre-filter 276 | -- limit. If any are larger, their union must also be too large, so we return 277 | -- no results now rather than having to compute the union (which may be 278 | -- expensive) only for it to inevitably hit the limit. 279 | | all (withinPrefilterLimit se) docSets = 280 | ( completionTermAndDocSets 281 | , completionTerms' 282 | , allTermDocSet 283 | ) 284 | | otherwise = ([], [], DocIdSet.empty) 285 | where 286 | -- We look up each candidate completion to find the set of documents 287 | -- it appears in, and filtering (intersecting) down to just those 288 | -- appearing in the existing partial query results (if any). 289 | -- Candidate completions not appearing at all within the existing 290 | -- partial query results are excluded at this stage. 291 | -- 292 | -- We have to keep these doc sets for the whole process, so we keep 293 | -- them as the compact DocIdSet type. 294 | -- 295 | completionTermAndDocSets :: [(TermId, DocIdSet)] 296 | completionTermAndDocSets = 297 | [ (t, ds_t') 298 | | (t, ds_t) <- Map.toList completionTerms 299 | , let ds_t' = case precedingDocHits of 300 | Just ds -> ds `DocIdSet.intersection` ds_t 301 | Nothing -> ds_t 302 | , not (DocIdSet.null ds_t') 303 | ] 304 | 305 | -- The remaining candidate completions 306 | completionTerms' :: [TermId] 307 | docSets :: [DocIdSet] 308 | (completionTerms', docSets) = unzip completionTermAndDocSets 309 | 310 | -- The union of all these is this set of documents that form the results. 311 | allTermDocSet :: DocIdSet 312 | allTermDocSet = DocIdSet.unions docSets 313 | 314 | 315 | filterAutosuggestQuery :: SearchEngine doc key field feature -> 316 | ResultsFilter key -> 317 | DocIdSet -> 318 | [(DocId, (key, DocTermIds field, DocFeatVals feature))] 319 | filterAutosuggestQuery SearchEngine{ searchIndex } resultsFilter ds = 320 | case resultsFilter of 321 | NoFilter -> 322 | [ (docid, doc) 323 | | docid <- DocIdSet.toList ds 324 | , let doc = SI.lookupDocId searchIndex docid ] 325 | 326 | FilterPredicate predicate -> 327 | [ (docid, doc) 328 | | docid <- DocIdSet.toList ds 329 | , let doc@(k,_,_) = SI.lookupDocId searchIndex docid 330 | , predicate k ] 331 | 332 | FilterBulkPredicate bulkPredicate -> 333 | [ (docid, doc) 334 | | let docids = DocIdSet.toList ds 335 | docinf = map (SI.lookupDocId searchIndex) docids 336 | keep = bulkPredicate [ k | (k,_,_) <- docinf ] 337 | , (docid, doc, True) <- zip3 docids docinf keep ] 338 | 339 | 340 | -- Scoring 341 | ------------- 342 | -- 343 | -- From Bast and Weber: 344 | -- In practice, only a selection of items from these lists can and will be 345 | -- presented to the user, and it is of course crucial that the most relevant 346 | -- completions and hits are selected. 347 | -- 348 | -- A standard approach for this task in ad-hoc retrieval is to have a 349 | -- precomputed score for each term-in-document pair, and when a query is 350 | -- being processed, to aggregate these scores for each candidate document, 351 | -- and return documents with the highest such aggregated scores. 
352 | -- 353 | -- Both INV and HYB can be easily adapted to implement any such scoring and 354 | -- aggregation scheme: store by each term-in-document pair its precomputed 355 | -- score, and when intersecting, aggregate the scores. A decision has to be 356 | -- made on how to reconcile scores from different completions within the 357 | -- same document. We suggest the following: when merging the intersections 358 | -- (which gives the set D' according to Definition 1), compute for each 359 | -- document in D' the maximal score achieved for some completion in T' 360 | -- contained in that document, and compute for each completion in T' the 361 | -- maximal score achieved for a hit from D' achieved for this completion. 362 | -- 363 | -- So firstly let us explore what this means and then discuss why it does not 364 | -- work for BM25. 365 | -- 366 | -- The "precomputed score for each term-in-document pair" refers to the bm25 367 | -- score for this term in this document (and obviously doesn't have to be 368 | -- precomputed, though that'd be faster). 369 | -- 370 | -- So the score for a document d ∈ D' is: 371 | -- maximum of score for d ∈ D ∩ Dt, for any t ∈ T' 372 | -- 373 | -- While the score for a completion t ∈ T' is: 374 | -- maximum of score for d ∈ D ∩ Dt 375 | -- 376 | -- So for documents we say their score is their best score for any of the 377 | -- completion terms they contain. While for completions we say their score 378 | -- is their best score for any of the documents they appear in. 379 | -- 380 | -- For a scoring function like BM25 this appears not to be a good method, both 381 | -- in principle and in practice. Consider what terms get high BM25 scores: 382 | -- very rare ones. So this means we're going to score highly documents that 383 | -- contain the least frequent terms, and completions that are themselves very 384 | -- rare. This is silly. 385 | -- 386 | -- Another important thing to note is that if we use this scoring method then 387 | -- we are using the BM25 score in a way that makes no sense. The BM25 scores 388 | -- for different documents for the /same/ set of terms are comparable. The 389 | -- scores for the same or for different documents with /different/ terms are 390 | -- simply not comparable. 391 | -- 392 | -- This also makes sense if you consider what question the BM25 score is 393 | -- answering: "what is the likelihood that this document is relevant given that 394 | -- I judge these terms to be relevant". However, an auto-suggest query is 395 | -- different: "what is the likelihood that this term is relevant given the 396 | -- importance/popularity of the documents (and any preceding terms I've judged 397 | -- to be relevant)". They are both conditional likelihood questions but with 398 | -- different premises. 399 | -- 400 | -- More generally, term frequency information simply isn't useful for 401 | -- auto-suggest queries. We don't want results that have the most obscure terms 402 | -- nor the most common terms, not even something in-between. Term frequency 403 | -- just doesn't tell us anything unless we've already judged terms to be 404 | -- relevant, and in an auto-suggest query we've not done that yet. 405 | -- 406 | -- What we really need is information on the importance/popularity of the 407 | -- documents. We can actually do something with that. 408 | -- 409 | -- So, instead we follow a different strategy. We require that we have 410 | -- importance/popularity info for the documents. 
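--
-- (In this library that importance/popularity information is what the
-- non-term feature scores are for. A hypothetical application might declare
--
-- > data PkgFeature = Downloads deriving (Eq, Ord, Enum, Bounded, Ix, Show)
--
-- and pair a 'LogarithmicFunction' in 'paramFeatureFunctions' with a
-- per-document download count as the feature value; the details of the
-- configuration are up to the application.)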
411 | -- 412 | -- A first approximation would be to rank result documents by their importance 413 | -- and completion terms by the sum of the importance of the documents each 414 | -- term appears in. 415 | -- 416 | -- Score for a document d ∈ D' 417 | -- importance score for d 418 | -- 419 | -- Score for a completion t ∈ T' 420 | -- sum of importance score for d ∈ D ∩ Dt 421 | -- 422 | -- The only problem with this is that just because a term appears in an 423 | -- important document, doesn't mean that term is /about/ that document, or to 424 | -- put it another way, that term may not be relevant for that document. For 425 | -- example common words like "the" likely appear in all important documents 426 | -- but this doesn't really tell us anything because "the" isn't an important 427 | -- keyword. 428 | -- 429 | -- So what we want to do is to weight the document importance by the relevance 430 | -- of the keyword to the document. So now if we have an important document and 431 | -- a relevant keyword for that document then we get a high score, but an 432 | -- irrelevant term like "the" would get a very low weighting and so would not 433 | -- contribute much to the score, even for very important documents. 434 | -- 435 | -- The intuition is that we will score term completions by taking the 436 | -- document importance weighted by the relevance of that term to that document 437 | -- and summing over all the documents where the term occurs. 438 | -- 439 | -- We define document importance (for the set D') to be the BM25F score for 440 | -- the documents with any preceding terms. So this includes the non-term 441 | -- feature score for the importance/popularity, and also takes account of 442 | -- preceding terms if there were any. 443 | -- 444 | -- We define term relevance (for terms in documents) to be the BM25F score for 445 | -- that term in that document as a fraction of the total BM25F score for all 446 | -- terms in the document. Thus the relevance of all terms in a document sums 447 | -- to 1. 448 | -- 449 | -- Now we can re-weight the document importance by the term relevance: 450 | -- 451 | -- Score for a completion t ∈ T' 452 | -- sum (for d ∈ D ∩ Dt) of ( importance for d * relevance for t in d ) 453 | -- 454 | -- And now for document result scores. We don't want to just stick with the 455 | -- innate document importance. We want to re-weight by the completion term 456 | -- scores: 457 | -- 458 | -- Score for a document d ∈ D' 459 | -- sum (for t ∈ T' ∩ d) (importance score for d * score for completion t) 460 | -- 461 | -- Clear as mud? 462 | 463 | type DocImportance = Float 464 | type TermRelevanceBreakdown = Map.Map TermId Float 465 | 466 | -- | Precompute the document importance and the term relevance breakdown for 467 | -- all the documents. This will be used in scoring the term completions 468 | -- and the result documents. They will all be used and some used many 469 | -- times so it's better to compute up-front and share. 470 | -- 471 | -- This is actually the expensive bit (which is why we've filtered already). 
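--
-- A tiny worked example of how this cached info gets used (all numbers are
-- made up): suppose the completions \"vector\" and \"vectors\" both occur in
-- documents d1 (importance 10) and d2 (importance 2), with relevances 0.5
-- and 0.1 for \"vector\", and 0.05 and 0.4 for \"vectors\". Then
--
-- >   score "vector"  = 10 * 0.5  + 2 * 0.1 = 5.2
-- >   score "vectors" = 10 * 0.05 + 2 * 0.4 = 1.3
--
-- and the result documents score d1: 10 * 5.2 + 10 * 1.3 = 65 and
-- d2: 2 * 5.2 + 2 * 1.3 = 13.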
472 | -- 473 | cacheDocScoringInfo :: (Ix field, Bounded field, Ix feature, Bounded feature) => 474 | SearchEngine doc key field feature -> 475 | [TermId] -> 476 | [(DocId, (key, DocTermIds field, DocFeatVals feature))] -> 477 | [TermId] -> 478 | Map.Map DocId (DocImportance, TermRelevanceBreakdown) 479 | cacheDocScoringInfo se completionTerms allTermDocInfo precedingTerms = 480 | Map.fromList 481 | [ (docid, (docImportance, termRelevances)) 482 | | (docid, (_dockey, doctermids, docfeatvals)) <- allTermDocInfo 483 | , let docImportance = Query.relevanceScore se precedingTerms 484 | doctermids docfeatvals 485 | termRelevances = relevanceBreakdown se doctermids docfeatvals 486 | completionTerms 487 | ] 488 | 489 | -- | Calculate the relevance of each of a given set of terms to the given 490 | -- document. 491 | -- 492 | -- We define the \"relevance\" of each term in a document to be its 493 | -- term-in-document score as a fraction of the total of the scores for all 494 | -- terms in the document. Thus the sum of all the relevance values in the 495 | -- document is 1. 496 | -- 497 | -- Note: we have to calculate the relevance for all terms in the document 498 | -- but we only keep the relevance value for the terms of interest. 499 | -- 500 | relevanceBreakdown :: forall doc key field feature. 501 | (Ix field, Bounded field, Ix feature, Bounded feature) => 502 | SearchEngine doc key field feature -> 503 | DocTermIds field -> DocFeatVals feature -> 504 | [TermId] -> TermRelevanceBreakdown 505 | relevanceBreakdown SearchEngine{ bm25Context } doctermids docfeatvals ts = 506 | let -- We'll calculate the bm25 score for each term in this document 507 | bm25Doc = Query.indexDocToBM25Doc doctermids docfeatvals 508 | 509 | -- Cache the info that depends only on this doc, not the terms 510 | termScore :: (TermId -> (field -> Int) -> Float) 511 | termScore = BM25F.scoreTermsBulk bm25Context bm25Doc 512 | 513 | -- The DocTermIds has the info we need to do bulk scoring, but it's 514 | -- a sparse representation, so we first convert it to a dense table 515 | term :: Int -> TermId 516 | count :: Int -> field -> Int 517 | (!numTerms, term, count) = DocTermIds.denseTable doctermids 518 | 519 | -- We generate the vector of scores for all terms, based on looking up 520 | -- the termid and the per-field counts in the dense table 521 | termScores :: Vec.Vector Float 522 | !termScores = Vec.generate numTerms $ \i -> 523 | termScore (term i) (\f -> count i f) 524 | 525 | -- We keep only the values for the terms we're interested in 526 | -- and normalise so we get the relevence fraction 527 | !scoreSum = Vec.sum termScores 528 | !tset = IntSet.fromList (map fromEnum ts) 529 | in Map.fromList 530 | . Vec.toList 531 | . Vec.map (\(t,s) -> (t, s/scoreSum)) 532 | . Vec.filter (\(t,_) -> fromEnum t `IntSet.member` tset) 533 | . Vec.imap (\i s -> (term i, s)) 534 | $ termScores 535 | 536 | 537 | scoreAutosuggestQueryCompletions :: [(TermId, DocIdSet)] 538 | -> Map.Map DocId (Float, Map.Map TermId Float) 539 | -> [(TermId, Float)] 540 | scoreAutosuggestQueryCompletions completionTermAndDocSets allTermDocInfo = 541 | [ (t, candidateScore t ds_t) 542 | | (t, ds_t) <- completionTermAndDocSets ] 543 | where 544 | -- The score for a completion is the sum of the importance of the 545 | -- documents in which that completion occurs, weighted by the relevance 546 | -- of the term to each document. 
For example we can have a very 547 | -- important document and our completion term is highly relevant to it 548 | -- or we could have a large number of moderately important documents 549 | -- that our term is quite relevant to. In either example the completion 550 | -- term would score highly. 551 | candidateScore :: TermId -> DocIdSet -> Float 552 | candidateScore t ds_t = 553 | sum [ docImportance * termRelevance 554 | | Just (docImportance, termRelevances) <- 555 | map (`Map.lookup` allTermDocInfo) (DocIdSet.toList ds_t) 556 | , let termRelevance = termRelevances Map.! t 557 | ] 558 | 559 | 560 | scoreAutosuggestQueryResults :: [(TermId, DocIdSet)] -> 561 | Map.Map DocId (Float, Map.Map TermId Float) -> 562 | [(TermId, Float)] -> 563 | [(DocId, Float)] 564 | scoreAutosuggestQueryResults completionTermAndDocSets allTermDocInfo 565 | scoredCandidates = 566 | Map.toList $ Map.fromListWith (+) 567 | [ (docid, docImportance * score_t) 568 | | ((_, ds_t), (_, score_t)) <- zip completionTermAndDocSets scoredCandidates 569 | , let docids = DocIdSet.toList ds_t 570 | docinfo = map (`Map.lookup` allTermDocInfo) docids 571 | , (docid, Just (docImportance, _)) <- zip docids docinfo 572 | ] 573 | 574 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/BM25F.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecordWildCards, BangPatterns, ScopedTypeVariables #-} 2 | 3 | -- | An implementation of BM25F ranking. See: 4 | -- 5 | -- * A quick overview: 6 | -- 7 | -- * /The Probabilistic Relevance Framework: BM25 and Beyond/ 8 | -- 9 | -- 10 | -- * /An Introduction to Information Retrieval/ 11 | -- 12 | -- 13 | module Data.SearchEngine.BM25F ( 14 | -- * The ranking function 15 | score, 16 | Context(..), 17 | FeatureFunction(..), 18 | Doc(..), 19 | -- ** Specialised variants 20 | scoreTermsBulk, 21 | 22 | -- * Explaining the score 23 | Explanation(..), 24 | explain, 25 | ) where 26 | 27 | import Data.Ix 28 | import Data.Array.Unboxed 29 | 30 | data Context term field feature = Context { 31 | numDocsTotal :: !Int, 32 | avgFieldLength :: field -> Float, 33 | numDocsWithTerm :: term -> Int, 34 | paramK1 :: !Float, 35 | paramB :: field -> Float, 36 | -- consider minimum length to prevent massive B bonus? 37 | fieldWeight :: field -> Float, 38 | featureWeight :: feature -> Float, 39 | featureFunction :: feature -> FeatureFunction 40 | } 41 | 42 | data Doc term field feature = Doc { 43 | docFieldLength :: field -> Int, 44 | docFieldTermFrequency :: field -> term -> Int, 45 | docFeatureValue :: feature -> Float 46 | } 47 | 48 | 49 | -- | The BM25F score for a document for a given set of terms. 
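--
-- In the notation of the functions below, the score of a document for a list
-- of terms @ts@ is (as a summary of the code, not new behaviour):
--
-- >   sum [ idf t * tf' t / (k1 + tf' t) | t <- ts ]
-- >     + sum [ w_i * v_i (f_i doc) | i <- features ]
-- >   where
-- >     tf' t = sum [ w_f * tf t f / _B_f | f <- fields ]
-- >     _B_f  = (1 - b_f) + b_f * fieldLength f / avgFieldLength f
-- >     idf t = log ((n - n_t + 0.5) / (n_t + 0.5))
--
-- i.e. a field-weighted, length-normalised term frequency put through the
-- usual BM25 saturation, plus the weighted non-term feature scores.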
50 | -- 51 | score :: (Ix field, Bounded field, Ix feature, Bounded feature) => 52 | Context term field feature -> 53 | Doc term field feature -> [term] -> Float 54 | score ctx doc terms = 55 | sum (map (weightedTermScore ctx doc) terms) 56 | + sum (map (weightedNonTermScore ctx doc) features) 57 | 58 | where 59 | features = range (minBound, maxBound) 60 | 61 | 62 | weightedTermScore :: (Ix field, Bounded field) => 63 | Context term field feature -> 64 | Doc term field feature -> term -> Float 65 | weightedTermScore ctx doc t = 66 | weightIDF ctx t * tf' 67 | / (k1 + tf') 68 | where 69 | tf' = weightedDocTermFrequency ctx doc t 70 | k1 = paramK1 ctx 71 | 72 | 73 | weightIDF :: Context term field feature -> term -> Float 74 | weightIDF ctx t = 75 | log ((n - n_t + 0.5) / (n_t + 0.5)) 76 | where 77 | n = fromIntegral (numDocsTotal ctx) 78 | n_t = fromIntegral (numDocsWithTerm ctx t) 79 | 80 | 81 | weightedDocTermFrequency :: (Ix field, Bounded field) => 82 | Context term field feature -> 83 | Doc term field feature -> term -> Float 84 | weightedDocTermFrequency ctx doc t = 85 | sum [ w_f * tf_f / _B_f 86 | | field <- range (minBound, maxBound) 87 | , let w_f = fieldWeight ctx field 88 | tf_f = fromIntegral (docFieldTermFrequency doc field t) 89 | _B_f = lengthNorm ctx doc field 90 | , not (isNaN _B_f) 91 | ] 92 | -- When the avgFieldLength is 0 we have a field which is empty for all 93 | -- documents. Unfortunately it leads to a NaN because the 94 | -- docFieldTermFrequency will also be 0 so we get 0/0. What we want to 95 | -- do in this situation is have that field contribute nothing to the 96 | -- score. The simplest way to achieve that is to skip if _B_f is NaN. 97 | -- So I think this is fine and not an ugly hack. 98 | 99 | lengthNorm :: Context term field feature -> 100 | Doc term field feature -> field -> Float 101 | lengthNorm ctx doc field = 102 | (1-b_f) + b_f * sl_f / avgsl_f 103 | where 104 | b_f = paramB ctx field 105 | sl_f = fromIntegral (docFieldLength doc field) 106 | avgsl_f = avgFieldLength ctx field 107 | 108 | 109 | weightedNonTermScore :: (Ix feature, Bounded feature) => 110 | Context term field feature -> 111 | Doc term field feature -> feature -> Float 112 | weightedNonTermScore ctx doc feature = 113 | w_f * _V_f f_f 114 | where 115 | w_f = featureWeight ctx feature 116 | _V_f = applyFeatureFunction (featureFunction ctx feature) 117 | f_f = docFeatureValue doc feature 118 | 119 | 120 | data FeatureFunction 121 | = LogarithmicFunction Float -- ^ @log (\lambda_i + f_i)@ 122 | | RationalFunction Float -- ^ @f_i / (\lambda_i + f_i)@ 123 | | SigmoidFunction Float Float -- ^ @1 / (\lambda + exp(-(\lambda' * f_i))@ 124 | 125 | applyFeatureFunction :: FeatureFunction -> (Float -> Float) 126 | applyFeatureFunction (LogarithmicFunction p1) = \fi -> log (p1 + fi) 127 | applyFeatureFunction (RationalFunction p1) = \fi -> fi / (p1 + fi) 128 | applyFeatureFunction (SigmoidFunction p1 p2) = \fi -> 1 / (p1 + exp (-fi * p2)) 129 | 130 | 131 | ----------------------------- 132 | -- Bulk scoring of many terms 133 | -- 134 | 135 | -- | Most of the time we want to score several different documents for the same 136 | -- set of terms, but sometimes we want to score one document for many terms 137 | -- and in that case we can save a bit of work by doing it in bulk. It lets us 138 | -- calculate once and share things that depend only on the document, and not 139 | -- the term. 
140 | -- 141 | -- To take advantage of the sharing you must partially apply and name the 142 | -- per-doc score function, e.g. 143 | -- 144 | -- > let score :: term -> (field -> Int) -> Float 145 | -- > score = scoreTermsBulk ctx doc 146 | -- > in sum [ score t (\f -> counts ! (t, f)) | t <- ts ] 147 | -- 148 | scoreTermsBulk :: forall field term feature. (Ix field, Bounded field) => 149 | Context term field feature -> 150 | Doc term field feature -> 151 | (term -> (field -> Int) -> Float) 152 | scoreTermsBulk ctx doc = 153 | -- This is just a rearrangement of weightedTermScore and 154 | -- weightedDocTermFrequency above, with the doc-constant bits hoisted out. 155 | 156 | \t tFreq -> 157 | let !tf' = sum [ w!f * tf_f / _B!f 158 | | f <- range (minBound, maxBound) 159 | , let tf_f = fromIntegral (tFreq f) 160 | _B_f = _B!f 161 | , not (isNaN _B_f) 162 | ] 163 | 164 | in weightIDF ctx t * tf' 165 | / (k1 + tf') 166 | where 167 | -- So long as the caller does the partial application thing then these 168 | -- values can all be shared between many calls with different terms. 169 | 170 | !k1 = paramK1 ctx 171 | w, _B :: UArray field Float 172 | !w = array (minBound, maxBound) 173 | [ (field, fieldWeight ctx field) 174 | | field <- range (minBound, maxBound) ] 175 | !_B = array (minBound, maxBound) 176 | [ (field, lengthNorm ctx doc field) 177 | | field <- range (minBound, maxBound) ] 178 | 179 | 180 | ------------------ 181 | -- Explanation 182 | -- 183 | 184 | -- | A breakdown of the BM25F score, to explain somewhat how it relates to 185 | -- the inputs, and so you can compare the scores of different documents. 186 | -- 187 | data Explanation field feature term = Explanation { 188 | -- | The overall score is the sum of the 'termScores', 'positionScore' 189 | -- and 'nonTermScores' 190 | overallScore :: Float, 191 | 192 | -- | There is a score contribution from each query term. This is the 193 | -- score for the term across all fields in the document (but see 194 | -- 'termFieldScores'). 195 | termScores :: [(term, Float)], 196 | {- 197 | -- | There is a score contribution for positional information. Terms 198 | -- appearing in the document close together give a bonus. 199 | positionScore :: [(field, Float)], 200 | -} 201 | -- | The document can have an innate bonus score independent of the terms 202 | -- in the query. For example this might be a popularity score. 203 | nonTermScores :: [(feature, Float)], 204 | 205 | -- | This does /not/ contribute to the 'overallScore'. It is an 206 | -- indication of how the 'termScores' relates to per-field scores. 207 | -- Note however that the term score for all fields is /not/ simply the 208 | -- sum of the per-field scores. The point of the BM25F scoring function 209 | -- is that a linear combination of per-field scores is wrong, and BM25F 210 | -- does a more cunning non-linear combination. 211 | -- 212 | -- However, it is still useful as an indication to see scores for each 213 | -- field for a term, to see how they compare. 
214 | -- 215 | termFieldScores :: [(term, [(field, Float)])] 216 | } 217 | deriving Show 218 | 219 | instance Functor (Explanation field feature) where 220 | fmap f e@Explanation{..} = 221 | e { 222 | termScores = [ (f t, s) | (t, s) <- termScores ], 223 | termFieldScores = [ (f t, fs) | (t, fs) <- termFieldScores ] 224 | } 225 | 226 | explain :: (Ix field, Bounded field, Ix feature, Bounded feature) => 227 | Context term field feature -> 228 | Doc term field feature -> [term] -> Explanation field feature term 229 | explain ctx doc ts = 230 | Explanation {..} 231 | where 232 | overallScore = sum (map snd termScores) 233 | -- + sum (map snd positionScore) 234 | + sum (map snd nonTermScores) 235 | termScores = [ (t, weightedTermScore ctx doc t) | t <- ts ] 236 | -- positionScore = [ (f, 0) | f <- range (minBound, maxBound) ] 237 | nonTermScores = [ (feature, weightedNonTermScore ctx doc feature) 238 | | feature <- range (minBound, maxBound) ] 239 | 240 | termFieldScores = 241 | [ (t, fieldScores) 242 | | t <- ts 243 | , let fieldScores = 244 | [ (f, weightedTermScore ctx' doc t) 245 | | f <- range (minBound, maxBound) 246 | , let ctx' = ctx { fieldWeight = fieldWeightOnly f } 247 | ] 248 | ] 249 | fieldWeightOnly f f' | sameField f f' = fieldWeight ctx f' 250 | | otherwise = 0 251 | 252 | sameField f f' = index (minBound, maxBound) f 253 | == index (minBound, maxBound) f' 254 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/DocFeatVals.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving #-} 2 | module Data.SearchEngine.DocFeatVals ( 3 | DocFeatVals, 4 | featureValue, 5 | create, 6 | ) where 7 | 8 | import Data.SearchEngine.DocTermIds (vecIndexIx, vecCreateIx) 9 | import Data.Vector (Vector) 10 | import Data.Ix (Ix) 11 | 12 | 13 | -- | Storage for the non-term feature values i a document. 
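--
-- A small usage sketch (the 'Feature' type here is hypothetical):
--
-- > data Feature = Stars | Forks deriving (Eq, Ord, Enum, Bounded, Ix, Show)
-- >
-- > fv :: DocFeatVals Feature
-- > fv = create (\f -> case f of Stars -> 120; Forks -> 7)
-- >
-- > featureValue fv Forks   -- 7.0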
14 | -- 15 | newtype DocFeatVals feature = DocFeatVals (Vector Float) 16 | deriving (Show) 17 | 18 | featureValue :: (Ix feature, Bounded feature) => DocFeatVals feature -> feature -> Float 19 | featureValue (DocFeatVals featVec) = vecIndexIx featVec 20 | 21 | create :: (Ix feature, Bounded feature) => 22 | (feature -> Float) -> DocFeatVals feature 23 | create docFeatVals = 24 | DocFeatVals (vecCreateIx docFeatVals) 25 | 26 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/DocIdSet.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving, MultiParamTypeClasses, 2 | TypeFamilies #-} 3 | module Data.SearchEngine.DocIdSet ( 4 | DocId(DocId), 5 | DocIdSet(..), 6 | null, 7 | size, 8 | empty, 9 | singleton, 10 | fromList, 11 | toList, 12 | insert, 13 | delete, 14 | member, 15 | union, 16 | unions, 17 | intersection, 18 | invariant, 19 | ) where 20 | 21 | import Data.Word 22 | import qualified Data.Vector.Unboxed as Vec 23 | import qualified Data.Vector.Unboxed.Mutable as MVec 24 | import qualified Data.Vector.Generic as GVec 25 | import qualified Data.Vector.Generic.Mutable as GMVec 26 | import Control.Monad.ST 27 | import Control.Monad (liftM) 28 | import qualified Data.Set as Set 29 | import qualified Data.List as List 30 | import Data.Function (on) 31 | 32 | import Prelude hiding (null) 33 | 34 | 35 | newtype DocId = DocId { unDocId :: Word32 } 36 | deriving (Eq, Ord, Show, Enum, Bounded) 37 | 38 | newtype DocIdSet = DocIdSet (Vec.Vector DocId) 39 | deriving (Eq, Show) 40 | 41 | -- represented as a sorted sequence of ids 42 | invariant :: DocIdSet -> Bool 43 | invariant (DocIdSet vec) = 44 | strictlyAscending (Vec.toList vec) 45 | where 46 | strictlyAscending (a:xs@(b:_)) = a < b && strictlyAscending xs 47 | strictlyAscending _ = True 48 | 49 | 50 | size :: DocIdSet -> Int 51 | size (DocIdSet vec) = Vec.length vec 52 | 53 | null :: DocIdSet -> Bool 54 | null (DocIdSet vec) = Vec.null vec 55 | 56 | empty :: DocIdSet 57 | empty = DocIdSet Vec.empty 58 | 59 | singleton :: DocId -> DocIdSet 60 | singleton = DocIdSet . Vec.singleton 61 | 62 | fromList :: [DocId] -> DocIdSet 63 | fromList = DocIdSet . Vec.fromList . Set.toAscList . Set.fromList 64 | 65 | toList :: DocIdSet -> [DocId] 66 | toList (DocIdSet vec) = Vec.toList vec 67 | 68 | insert :: DocId -> DocIdSet -> DocIdSet 69 | insert x (DocIdSet vec) = 70 | case binarySearch vec 0 (Vec.length vec - 1) x of 71 | (_, True) -> DocIdSet vec 72 | (i, False) -> case Vec.splitAt i vec of 73 | (before, after) -> 74 | DocIdSet (Vec.concat [before, Vec.singleton x, after]) 75 | 76 | delete :: DocId -> DocIdSet -> DocIdSet 77 | delete x (DocIdSet vec) = 78 | case binarySearch vec 0 (Vec.length vec - 1) x of 79 | (_, False) -> DocIdSet vec 80 | (i, True) -> case Vec.splitAt i vec of 81 | (before, after) -> 82 | DocIdSet (before Vec.++ Vec.tail after) 83 | 84 | member :: DocId -> DocIdSet -> Bool 85 | member x (DocIdSet vec) = snd (binarySearch vec 0 (Vec.length vec - 1) x) 86 | 87 | binarySearch :: Vec.Vector DocId -> Int -> Int -> DocId -> (Int, Bool) 88 | binarySearch vec !a !b !key 89 | | a > b = (a, False) 90 | | otherwise = 91 | let mid = (a + b) `div` 2 92 | in case compare key (vec Vec.! 
mid) of 93 | LT -> binarySearch vec a (mid-1) key 94 | EQ -> (mid, True) 95 | GT -> binarySearch vec (mid+1) b key 96 | 97 | unions :: [DocIdSet] -> DocIdSet 98 | unions = List.foldl' union empty 99 | -- a bit more effecient if we merge small ones first 100 | . List.sortBy (compare `on` size) 101 | 102 | union :: DocIdSet -> DocIdSet -> DocIdSet 103 | union x y | null x = y 104 | | null y = x 105 | union (DocIdSet xs) (DocIdSet ys) = 106 | DocIdSet (Vec.create (MVec.new sizeBound >>= writeMergedUnion xs ys)) 107 | where 108 | sizeBound = Vec.length xs + Vec.length ys 109 | 110 | writeMergedUnion :: Vec.Vector DocId -> Vec.Vector DocId -> 111 | MVec.MVector s DocId -> ST s (MVec.MVector s DocId) 112 | writeMergedUnion xs0 ys0 !out = do 113 | i <- go xs0 ys0 0 114 | return $! MVec.take i out 115 | where 116 | go !xs !ys !i 117 | | Vec.null xs = do Vec.copy (MVec.slice i (Vec.length ys) out) ys 118 | return (i + Vec.length ys) 119 | | Vec.null ys = do Vec.copy (MVec.slice i (Vec.length xs) out) xs 120 | return (i + Vec.length xs) 121 | | otherwise = let x = Vec.head xs; y = Vec.head ys 122 | in case compare x y of 123 | GT -> do MVec.write out i y 124 | go xs (Vec.tail ys) (i+1) 125 | EQ -> do MVec.write out i x 126 | go (Vec.tail xs) (Vec.tail ys) (i+1) 127 | LT -> do MVec.write out i x 128 | go (Vec.tail xs) ys (i+1) 129 | 130 | intersection :: DocIdSet -> DocIdSet -> DocIdSet 131 | intersection x y | null x = empty 132 | | null y = empty 133 | intersection (DocIdSet xs) (DocIdSet ys) = 134 | DocIdSet (Vec.create (MVec.new sizeBound >>= writeMergedIntersection xs ys)) 135 | where 136 | sizeBound = max (Vec.length xs) (Vec.length ys) 137 | 138 | writeMergedIntersection :: Vec.Vector DocId -> Vec.Vector DocId -> 139 | MVec.MVector s DocId -> ST s (MVec.MVector s DocId) 140 | writeMergedIntersection xs0 ys0 !out = do 141 | i <- go xs0 ys0 0 142 | return $! 
MVec.take i out 143 | where 144 | go !xs !ys !i 145 | | Vec.null xs = return i 146 | | Vec.null ys = return i 147 | | otherwise = let x = Vec.head xs; y = Vec.head ys 148 | in case compare x y of 149 | GT -> go xs (Vec.tail ys) i 150 | EQ -> do MVec.write out i x 151 | go (Vec.tail xs) (Vec.tail ys) (i+1) 152 | LT -> go (Vec.tail xs) ys i 153 | 154 | ------------------------------------------------------------------------------ 155 | -- verbose Unbox instances 156 | -- 157 | 158 | instance MVec.Unbox DocId 159 | 160 | newtype instance MVec.MVector s DocId = MV_DocId (MVec.MVector s Word32) 161 | 162 | instance GMVec.MVector MVec.MVector DocId where 163 | basicLength (MV_DocId v) = GMVec.basicLength v 164 | basicUnsafeSlice i l (MV_DocId v) = MV_DocId (GMVec.basicUnsafeSlice i l v) 165 | basicUnsafeNew l = MV_DocId `liftM` GMVec.basicUnsafeNew l 166 | basicInitialize (MV_DocId v) = GMVec.basicInitialize v 167 | basicUnsafeReplicate l x = MV_DocId `liftM` GMVec.basicUnsafeReplicate l (unDocId x) 168 | basicUnsafeRead (MV_DocId v) i = DocId `liftM` GMVec.basicUnsafeRead v i 169 | basicUnsafeWrite (MV_DocId v) i x = GMVec.basicUnsafeWrite v i (unDocId x) 170 | basicClear (MV_DocId v) = GMVec.basicClear v 171 | basicSet (MV_DocId v) x = GMVec.basicSet v (unDocId x) 172 | basicUnsafeGrow (MV_DocId v) l = MV_DocId `liftM` GMVec.basicUnsafeGrow v l 173 | basicUnsafeCopy (MV_DocId v) (MV_DocId v') = GMVec.basicUnsafeCopy v v' 174 | basicUnsafeMove (MV_DocId v) (MV_DocId v') = GMVec.basicUnsafeMove v v' 175 | basicOverlaps (MV_DocId v) (MV_DocId v') = GMVec.basicOverlaps v v' 176 | {-# INLINE basicLength #-} 177 | {-# INLINE basicUnsafeSlice #-} 178 | {-# INLINE basicOverlaps #-} 179 | {-# INLINE basicUnsafeNew #-} 180 | {-# INLINE basicInitialize #-} 181 | {-# INLINE basicUnsafeReplicate #-} 182 | {-# INLINE basicUnsafeRead #-} 183 | {-# INLINE basicUnsafeWrite #-} 184 | {-# INLINE basicClear #-} 185 | {-# INLINE basicSet #-} 186 | {-# INLINE basicUnsafeCopy #-} 187 | {-# INLINE basicUnsafeMove #-} 188 | {-# INLINE basicUnsafeGrow #-} 189 | 190 | newtype instance Vec.Vector DocId = V_DocId (Vec.Vector Word32) 191 | 192 | instance GVec.Vector Vec.Vector DocId where 193 | basicUnsafeFreeze (MV_DocId mv) = V_DocId `liftM` GVec.basicUnsafeFreeze mv 194 | basicUnsafeThaw (V_DocId v) = MV_DocId `liftM` GVec.basicUnsafeThaw v 195 | basicLength (V_DocId v) = GVec.basicLength v 196 | basicUnsafeSlice i l (V_DocId v) = V_DocId (GVec.basicUnsafeSlice i l v) 197 | basicUnsafeIndexM (V_DocId v) i = DocId `liftM` GVec.basicUnsafeIndexM v i 198 | basicUnsafeCopy (MV_DocId mv) 199 | (V_DocId v) = GVec.basicUnsafeCopy mv v 200 | elemseq (V_DocId v) x = GVec.elemseq v (unDocId x) 201 | {-# INLINE basicUnsafeFreeze #-} 202 | {-# INLINE basicUnsafeThaw #-} 203 | {-# INLINE basicLength #-} 204 | {-# INLINE basicUnsafeSlice #-} 205 | {-# INLINE basicUnsafeIndexM #-} 206 | {-# INLINE basicUnsafeCopy #-} 207 | {-# INLINE elemseq #-} 208 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/DocTermIds.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving #-} 2 | module Data.SearchEngine.DocTermIds ( 3 | DocTermIds, 4 | TermId, 5 | fieldLength, 6 | fieldTermCount, 7 | fieldElems, 8 | create, 9 | denseTable, 10 | vecIndexIx, 11 | vecCreateIx, 12 | ) where 13 | 14 | import Data.SearchEngine.TermBag (TermBag, TermId) 15 | import qualified Data.SearchEngine.TermBag as TermBag 16 | 17 | 
import Data.Vector (Vector, (!)) 18 | import qualified Data.Vector as Vec 19 | import qualified Data.Vector.Unboxed as UVec 20 | import Data.Ix (Ix) 21 | import qualified Data.Ix as Ix 22 | 23 | 24 | -- | The 'TermId's for the 'Term's that occur in a document. Documents may have 25 | -- multiple fields and the 'DocTerms' type holds them separately for each field. 26 | -- 27 | newtype DocTermIds field = DocTermIds (Vector TermBag) 28 | deriving (Show) 29 | 30 | getField :: (Ix field, Bounded field) => DocTermIds field -> field -> TermBag 31 | getField (DocTermIds fieldVec) = vecIndexIx fieldVec 32 | 33 | create :: (Ix field, Bounded field) => 34 | (field -> [TermId]) -> DocTermIds field 35 | create docTermIds = 36 | DocTermIds (vecCreateIx (TermBag.fromList . docTermIds)) 37 | 38 | -- | The number of terms in a field within the document. 39 | fieldLength :: (Ix field, Bounded field) => DocTermIds field -> field -> Int 40 | fieldLength docterms field = 41 | TermBag.size (getField docterms field) 42 | 43 | -- | /O(log n)/ The frequency of a particular term in a field within the document. 44 | -- 45 | fieldTermCount :: (Ix field, Bounded field) => 46 | DocTermIds field -> field -> TermId -> Int 47 | fieldTermCount docterms field termid = 48 | fromIntegral (TermBag.termCount (getField docterms field) termid) 49 | 50 | fieldElems :: (Ix field, Bounded field) => DocTermIds field -> field -> [TermId] 51 | fieldElems docterms field = 52 | TermBag.elems (getField docterms field) 53 | 54 | -- | The 'DocTermIds' is really a sparse 2d array, and doing lookups with 55 | -- 'fieldTermCount' has a O(log n) cost. This function converts to a dense 56 | -- tabular representation which then enables linear scans. 57 | -- 58 | denseTable :: (Ix field, Bounded field) => DocTermIds field -> 59 | (Int, Int -> TermId, Int -> field -> Int) 60 | denseTable (DocTermIds fieldVec) = 61 | let (!termids, !termcounts) = TermBag.denseTable (Vec.toList fieldVec) 62 | !numTerms = UVec.length termids 63 | in ( numTerms 64 | , \i -> termids UVec.! i 65 | , \i ix -> let j = Ix.index (minBound, maxBound) ix 66 | in fromIntegral (termcounts UVec.! (j * numTerms + i)) 67 | ) 68 | 69 | --------------------------------- 70 | -- Vector indexed by Ix Bounded 71 | -- 72 | 73 | vecIndexIx :: (Ix ix, Bounded ix) => Vector a -> ix -> a 74 | vecIndexIx vec ix = vec ! 
Ix.index (minBound, maxBound) ix 75 | 76 | vecCreateIx :: (Ix ix, Bounded ix) => (ix -> a) -> Vector a 77 | vecCreateIx f = Vec.fromListN (Ix.rangeSize bounds) 78 | [ y | ix <- Ix.range bounds, let !y = f ix ] 79 | where 80 | bounds = (minBound, maxBound) 81 | 82 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/Query.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, NamedFieldPuns, RecordWildCards #-} 2 | 3 | module Data.SearchEngine.Query ( 4 | 5 | -- * Querying 6 | query, 7 | ResultsFilter(..), 8 | 9 | -- * Explain mode for query result rankings 10 | queryExplain, 11 | BM25F.Explanation(..), 12 | setRankParams, 13 | 14 | -- ** Utils used by autosuggest 15 | relevanceScore, 16 | indexDocToBM25Doc, 17 | expandTransformedQueryTerm, 18 | ) where 19 | 20 | import Data.SearchEngine.Types 21 | import qualified Data.SearchEngine.SearchIndex as SI 22 | import qualified Data.SearchEngine.DocIdSet as DocIdSet 23 | import qualified Data.SearchEngine.DocTermIds as DocTermIds 24 | import qualified Data.SearchEngine.DocFeatVals as DocFeatVals 25 | import qualified Data.SearchEngine.BM25F as BM25F 26 | 27 | import Data.Ix 28 | import Data.List 29 | import Data.Function 30 | import Data.Maybe 31 | 32 | 33 | -- | Execute a normal query. Find the documents in which one or more of 34 | -- the search terms appear and return them in ranked order. 35 | -- 36 | -- The number of documents returned is limited by the 'paramResultsetSoftLimit' 37 | -- and 'paramResultsetHardLimit' paramaters. This also limits the cost of the 38 | -- query (which is primarily the cost of scoring each document). 39 | -- 40 | -- The given terms are all assumed to be complete (as opposed to prefixes 41 | -- like with 'queryAutosuggest'). 42 | -- 43 | query :: (Ix field, Bounded field, Ix feature, Bounded feature) => 44 | SearchEngine doc key field feature -> 45 | [Term] -> [key] 46 | query se@SearchEngine{ searchIndex, 47 | searchRankParams = SearchRankParameters{..} } 48 | terms = 49 | 50 | let -- Start by transforming/normalising all the query terms. 51 | -- This can be done differently for each field we search by. 52 | lookupTerms :: [Term] 53 | lookupTerms = concatMap (expandTransformedQueryTerm se) terms 54 | 55 | -- Then we look up all the normalised terms in the index. 56 | rawresults :: [Maybe (TermId, DocIdSet)] 57 | rawresults = map (SI.lookupTerm searchIndex) lookupTerms 58 | 59 | -- For the terms that occur in the index, this gives us the term's id 60 | -- and the set of documents that the term occurs in. 61 | termids :: [TermId] 62 | docidsets :: [DocIdSet] 63 | (termids, docidsets) = unzip (catMaybes rawresults) 64 | 65 | -- We looked up the documents that *any* of the term occur in (not all) 66 | -- so this could be rather a lot of docs if the user uses a few common 67 | -- terms. Scoring these result docs is a non-trivial cost so we want to 68 | -- limit the number that we have to score. The standard trick is to 69 | -- consider the doc sets in the order of size, smallest to biggest. Once 70 | -- we have gone over a certain threshold of docs then don't bother with 71 | -- the doc sets for the remaining terms. This tends to work because the 72 | -- scoring gives lower weight to terms that occur in many documents. 
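        -- For example (the limits here are made up): with a soft limit of 200
        -- and a hard limit of 400, doc sets of sizes 10, 50, 300 and 5000
        -- (assumed disjoint) would be merged as 10, then 10+50 = 60 (still
        -- within the soft limit), then 60+300 = 360 (over the soft limit, so
        -- it is added and we stop); the 5000-document set is never considered.
        -- See 'pruneRelevantResults' below.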
73 | unrankedResults :: DocIdSet 74 | unrankedResults = pruneRelevantResults 75 | paramResultsetSoftLimit 76 | paramResultsetHardLimit 77 | docidsets 78 | 79 | --TODO: technically this isn't quite correct. Because each field can 80 | -- be normalised differently, we can end up with different termids for 81 | -- the same original search term, and then we score those as if they 82 | -- were different terms, which makes a difference when the term appears 83 | -- in multiple fields (exactly the case BM25F is supposed to deal with). 84 | -- What we ought to have instead is an Array (Int, field) TermId, and 85 | -- make the scoring use the appropriate termid for each field, but to 86 | -- consider them the "same" term. 87 | in rankResults se termids (DocIdSet.toList unrankedResults) 88 | 89 | -- | Before looking up a term in the main index we need to normalise it 90 | -- using the 'transformQueryTerm'. Of course the transform can be different 91 | -- for different fields, so we have to collect all the forms (eliminating 92 | -- duplicates). 93 | -- 94 | expandTransformedQueryTerm :: (Ix field, Bounded field) => 95 | SearchEngine doc key field feature -> 96 | Term -> [Term] 97 | expandTransformedQueryTerm SearchEngine{searchConfig} term = 98 | nub [ transformForField field 99 | | let transformForField = transformQueryTerm searchConfig term 100 | , field <- range (minBound, maxBound) ] 101 | 102 | 103 | rankResults :: (Ix field, Bounded field, Ix feature, Bounded feature) => 104 | SearchEngine doc key field feature -> 105 | [TermId] -> [DocId] -> [key] 106 | rankResults se@SearchEngine{searchIndex} queryTerms docids = 107 | map snd 108 | $ sortBy (flip compare `on` fst) 109 | [ (relevanceScore se queryTerms doctermids docfeatvals, dockey) 110 | | docid <- docids 111 | , let (dockey, doctermids, docfeatvals) = SI.lookupDocId searchIndex docid ] 112 | 113 | relevanceScore :: (Ix field, Bounded field, Ix feature, Bounded feature) => 114 | SearchEngine doc key field feature -> 115 | [TermId] -> DocTermIds field -> DocFeatVals feature -> Float 116 | relevanceScore SearchEngine{bm25Context} queryTerms doctermids docfeatvals = 117 | BM25F.score bm25Context doc queryTerms 118 | where 119 | doc = indexDocToBM25Doc doctermids docfeatvals 120 | 121 | indexDocToBM25Doc :: (Ix field, Bounded field, Ix feature, Bounded feature) => 122 | DocTermIds field -> 123 | DocFeatVals feature -> 124 | BM25F.Doc TermId field feature 125 | indexDocToBM25Doc doctermids docfeatvals = 126 | BM25F.Doc { 127 | BM25F.docFieldLength = DocTermIds.fieldLength doctermids, 128 | BM25F.docFieldTermFrequency = DocTermIds.fieldTermCount doctermids, 129 | BM25F.docFeatureValue = DocFeatVals.featureValue docfeatvals 130 | } 131 | 132 | pruneRelevantResults :: Int -> Int -> [DocIdSet] -> DocIdSet 133 | pruneRelevantResults softLimit hardLimit = 134 | -- Look at the docsets starting with the smallest ones. Smaller docsets 135 | -- correspond to the rarer terms, which are the ones that score most highly. 136 | go DocIdSet.empty . sortBy (compare `on` DocIdSet.size) 137 | where 138 | go !acc [] = acc 139 | go !acc (d:ds) 140 | -- If this is the first one, we add it anyway, otherwise we're in 141 | -- danger of returning no results at all. 142 | | DocIdSet.null acc = go d ds 143 | -- We consider the size our docset would be if we add this extra one... 144 | -- If it puts us over the hard limit then stop. 
145 | | size > hardLimit = acc 146 | -- If it puts us over soft limit then we add it and stop 147 | | size > softLimit = DocIdSet.union acc d 148 | -- Otherwise we can add it and carry on to consider the remainder 149 | | otherwise = go (DocIdSet.union acc d) ds 150 | where 151 | size = DocIdSet.size acc + DocIdSet.size d 152 | 153 | 154 | -------------------------------- 155 | -- Normal query with explanation 156 | -- 157 | 158 | queryExplain :: (Ix field, Bounded field, Ix feature, Bounded feature) => 159 | SearchEngine doc key field feature -> 160 | [Term] -> [(BM25F.Explanation field feature Term, key)] 161 | queryExplain se@SearchEngine{ searchIndex, 162 | searchConfig = SearchConfig{transformQueryTerm}, 163 | searchRankParams = SearchRankParameters{..} } 164 | terms = 165 | 166 | -- See 'query' above for explanation. Really we ought to combine them. 167 | let lookupTerms :: [Term] 168 | lookupTerms = [ term' 169 | | term <- terms 170 | , let transformForField = transformQueryTerm term 171 | , term' <- nub [ transformForField field 172 | | field <- range (minBound, maxBound) ] 173 | ] 174 | 175 | rawresults :: [Maybe (TermId, DocIdSet)] 176 | rawresults = map (SI.lookupTerm searchIndex) lookupTerms 177 | 178 | termids :: [TermId] 179 | docidsets :: [DocIdSet] 180 | (termids, docidsets) = unzip (catMaybes rawresults) 181 | 182 | unrankedResults :: DocIdSet 183 | unrankedResults = pruneRelevantResults 184 | paramResultsetSoftLimit 185 | paramResultsetHardLimit 186 | docidsets 187 | 188 | in rankExplainResults se termids (DocIdSet.toList unrankedResults) 189 | 190 | rankExplainResults :: (Ix field, Bounded field, Ix feature, Bounded feature) => 191 | SearchEngine doc key field feature -> 192 | [TermId] -> 193 | [DocId] -> 194 | [(BM25F.Explanation field feature Term, key)] 195 | rankExplainResults se@SearchEngine{searchIndex} queryTerms docids = 196 | sortBy (flip compare `on` (BM25F.overallScore . fst)) 197 | [ (explainRelevanceScore se queryTerms doctermids docfeatvals, dockey) 198 | | docid <- docids 199 | , let (dockey, doctermids, docfeatvals) = SI.lookupDocId searchIndex docid ] 200 | 201 | 202 | explainRelevanceScore :: (Ix field, Bounded field, Ix feature, Bounded feature) => 203 | SearchEngine doc key field feature -> 204 | [TermId] -> 205 | DocTermIds field -> 206 | DocFeatVals feature -> 207 | BM25F.Explanation field feature Term 208 | explainRelevanceScore SearchEngine{bm25Context, searchIndex} 209 | queryTerms doctermids docfeatvals = 210 | fmap (SI.getTerm searchIndex) (BM25F.explain bm25Context doc queryTerms) 211 | where 212 | doc = indexDocToBM25Doc doctermids docfeatvals 213 | 214 | 215 | setRankParams :: SearchRankParameters field feature -> 216 | SearchEngine doc key field feature -> 217 | SearchEngine doc key field feature 218 | setRankParams params@SearchRankParameters{..} se = 219 | se { 220 | searchRankParams = params, 221 | bm25Context = (bm25Context se) { 222 | BM25F.paramK1 = paramK1, 223 | BM25F.paramB = paramB, 224 | BM25F.fieldWeight = paramFieldWeights, 225 | BM25F.featureWeight = paramFeatureWeights, 226 | BM25F.featureFunction = paramFeatureFunctions 227 | } 228 | } 229 | 230 | 231 | -------------------------------- 232 | -- Results filter 233 | -- 234 | 235 | -- | In some applications it is necessary to enforce some security or 236 | -- visibility rule about the query results (e.g. in a typical DB-based 237 | -- application different users can see different data items). 
Typically 238 | -- it would be too expensive to build different search indexes for the 239 | -- different contexts and so the strategy is to use one index containing 240 | -- everything and filter for visibility in the results. This means the 241 | -- filter condition is different for different queries (e.g. performed 242 | -- on behalf of different users). 243 | -- 244 | -- Filtering the results after a query is possible but not the most efficient 245 | -- thing to do because we've had to score all the not-visible documents. 246 | -- The better thing to do is to filter as part of the query, this way we can 247 | -- filter before the expensive scoring. 248 | -- 249 | -- We provide one further optimisation: bulk predicates. In some applications 250 | -- it can be quicker to check the security\/visibility of a whole bunch of 251 | -- results all in one go. 252 | -- 253 | data ResultsFilter key = NoFilter 254 | | FilterPredicate (key -> Bool) 255 | | FilterBulkPredicate ([key] -> [Bool]) 256 | --TODO: allow filtering & non-feature score lookup in one bulk op 257 | 258 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/SearchIndex.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, NamedFieldPuns #-} 2 | 3 | module Data.SearchEngine.SearchIndex ( 4 | SearchIndex, 5 | Term, 6 | TermId, 7 | DocId, 8 | 9 | emptySearchIndex, 10 | insertDoc, 11 | deleteDoc, 12 | 13 | docCount, 14 | lookupTerm, 15 | lookupTermsByPrefix, 16 | lookupTermId, 17 | lookupDocId, 18 | lookupDocKey, 19 | lookupDocKeyDocId, 20 | 21 | getTerm, 22 | getDocKey, 23 | 24 | invariant, 25 | ) where 26 | 27 | import Data.SearchEngine.DocIdSet (DocIdSet, DocId) 28 | import qualified Data.SearchEngine.DocIdSet as DocIdSet 29 | import Data.SearchEngine.DocTermIds (DocTermIds, TermId, vecIndexIx, vecCreateIx) 30 | import qualified Data.SearchEngine.DocTermIds as DocTermIds 31 | import Data.SearchEngine.DocFeatVals (DocFeatVals) 32 | import qualified Data.SearchEngine.DocFeatVals as DocFeatVals 33 | 34 | import Data.Ix (Ix) 35 | import qualified Data.Ix as Ix 36 | import Data.Map (Map) 37 | import qualified Data.Map as Map 38 | import Data.IntMap (IntMap) 39 | import qualified Data.IntMap as IntMap 40 | import qualified Data.Set as Set 41 | import Data.Text (Text) 42 | import qualified Data.Text as T 43 | import qualified Data.List as List 44 | 45 | import Control.Exception (assert) 46 | 47 | -- | Terms are short strings, usually whole words. 48 | -- 49 | type Term = Text 50 | 51 | -- | The search index is essentially a many-to-many mapping between documents 52 | -- and terms. Each document contains many terms and each term occurs in many 53 | -- documents. It is a bidirectional mapping as we need to support lookups in 54 | -- both directions. 55 | -- 56 | -- Documents are identified by a key (in Ord) while terms are text values. 57 | -- Inside the index however we assign compact numeric ids to both documents and 58 | -- terms. The advantage of this is a much more compact in-memory representation 59 | -- and the disadvantage is greater complexity. In particular it means we have 60 | -- to manage bidirectional mappings between document keys and ids, and between 61 | -- terms and term ids. 
62 | -- 63 | -- So the mappings we maintain can be depicted as: 64 | -- 65 | -- > Term <-- 1:1 --> TermId 66 | -- > \ ^ 67 | -- > \ | 68 | -- > 1:many many:many 69 | -- > \ | 70 | -- > \-> v 71 | -- > DocKey <-- 1:1 --> DocId 72 | -- 73 | -- For efficiency, these details are exposed in the interface. In particular 74 | -- the mapping from TermId to many DocIds is exposed via a 'DocIdSet', 75 | -- and the mapping from DocIds to TermIds is exposed via 'DocTermIds'. 76 | -- 77 | -- The main reason we need to keep the DocId -> TermId is to allow for 78 | -- efficient incremental updates. 79 | -- 80 | data SearchIndex key field feature = SearchIndex { 81 | -- the indexes 82 | termMap :: !(Map Term TermInfo), 83 | termIdMap :: !(IntMap TermIdInfo), 84 | docIdMap :: !(IntMap (DocInfo key field feature)), 85 | docKeyMap :: !(Map key DocId), 86 | 87 | -- auto-increment key counters 88 | nextTermId :: TermId, 89 | nextDocId :: DocId 90 | } 91 | deriving Show 92 | 93 | data TermInfo = TermInfo !TermId !DocIdSet 94 | deriving Show 95 | 96 | data TermIdInfo = TermIdInfo !Term !DocIdSet 97 | deriving (Show, Eq) 98 | 99 | data DocInfo key field feature = DocInfo !key !(DocTermIds field) 100 | !(DocFeatVals feature) 101 | deriving Show 102 | 103 | 104 | ----------------------- 105 | -- SearchIndex basics 106 | -- 107 | 108 | emptySearchIndex :: SearchIndex key field feature 109 | emptySearchIndex = 110 | SearchIndex 111 | Map.empty 112 | IntMap.empty 113 | IntMap.empty 114 | Map.empty 115 | minBound 116 | minBound 117 | 118 | checkInvariant :: (Ord key, Ix field, Bounded field) => 119 | SearchIndex key field feature -> SearchIndex key field feature 120 | checkInvariant si = assert (invariant si) si 121 | 122 | invariant :: (Ord key, Ix field, Bounded field) => 123 | SearchIndex key field feature -> Bool 124 | invariant SearchIndex{termMap, termIdMap, docKeyMap, docIdMap} = 125 | and [ IntMap.lookup (fromEnum termId) termIdMap 126 | == Just (TermIdInfo term docidset) 127 | | (term, (TermInfo termId docidset)) <- Map.assocs termMap ] 128 | && and [ case Map.lookup term termMap of 129 | Just (TermInfo termId' docidset') -> toEnum termId == termId' 130 | && docidset == docidset' 131 | Nothing -> False 132 | | (termId, (TermIdInfo term docidset)) <- IntMap.assocs termIdMap ] 133 | && and [ case IntMap.lookup (fromEnum docId) docIdMap of 134 | Just (DocInfo docKey' _ _) -> docKey == docKey' 135 | Nothing -> False 136 | | (docKey, docId) <- Map.assocs docKeyMap ] 137 | && and [ Map.lookup docKey docKeyMap == Just (toEnum docId) 138 | | (docId, DocInfo docKey _ _) <- IntMap.assocs docIdMap ] 139 | && and [ DocIdSet.invariant docIdSet 140 | | (_term, (TermInfo _ docIdSet)) <- Map.assocs termMap ] 141 | && and [ any (\field -> DocTermIds.fieldTermCount docterms field termId > 0) fields 142 | | (_term, (TermInfo termId docIdSet)) <- Map.assocs termMap 143 | , docId <- DocIdSet.toList docIdSet 144 | , let DocInfo _ docterms _ = docIdMap IntMap.! 
fromEnum docId ] 145 | && and [ IntMap.member (fromEnum termid) termIdMap 146 | | (_docId, DocInfo _ docTerms _) <- IntMap.assocs docIdMap 147 | , field <- fields 148 | , termid <- DocTermIds.fieldElems docTerms field ] 149 | where 150 | fields = Ix.range (minBound, maxBound) 151 | 152 | 153 | ------------------- 154 | -- Lookups 155 | -- 156 | 157 | docCount :: SearchIndex key field feature -> Int 158 | docCount SearchIndex{docIdMap} = IntMap.size docIdMap 159 | 160 | lookupTerm :: SearchIndex key field feature -> Term -> Maybe (TermId, DocIdSet) 161 | lookupTerm SearchIndex{termMap} term = 162 | case Map.lookup term termMap of 163 | Nothing -> Nothing 164 | Just (TermInfo termid docidset) -> Just (termid, docidset) 165 | 166 | lookupTermsByPrefix :: SearchIndex key field feature -> 167 | Term -> [(TermId, DocIdSet)] 168 | lookupTermsByPrefix SearchIndex{termMap} term = 169 | [ (termid, docidset) 170 | | (TermInfo termid docidset) <- lookupPrefix term termMap ] 171 | 172 | lookupTermId :: SearchIndex key field feature -> TermId -> DocIdSet 173 | lookupTermId SearchIndex{termIdMap} termid = 174 | case IntMap.lookup (fromEnum termid) termIdMap of 175 | Nothing -> error $ "lookupTermId: not found " ++ show termid 176 | Just (TermIdInfo _ docidset) -> docidset 177 | 178 | lookupDocId :: SearchIndex key field feature -> 179 | DocId -> (key, DocTermIds field, DocFeatVals feature) 180 | lookupDocId SearchIndex{docIdMap} docid = 181 | case IntMap.lookup (fromEnum docid) docIdMap of 182 | Nothing -> errNotFound 183 | Just (DocInfo key doctermids docfeatvals) -> (key, doctermids, docfeatvals) 184 | where 185 | errNotFound = error $ "lookupDocId: not found " ++ show docid 186 | 187 | lookupDocKey :: Ord key => SearchIndex key field feature -> 188 | key -> Maybe (DocTermIds field) 189 | lookupDocKey SearchIndex{docKeyMap, docIdMap} key = do 190 | case Map.lookup key docKeyMap of 191 | Nothing -> Nothing 192 | Just docid -> 193 | case IntMap.lookup (fromEnum docid) docIdMap of 194 | Nothing -> error "lookupDocKey: internal error" 195 | Just (DocInfo _key doctermids _) -> Just doctermids 196 | 197 | lookupDocKeyDocId :: Ord key => SearchIndex key field feature -> key -> Maybe DocId 198 | lookupDocKeyDocId SearchIndex{docKeyMap} key = Map.lookup key docKeyMap 199 | 200 | 201 | getTerm :: SearchIndex key field feature -> TermId -> Term 202 | getTerm SearchIndex{termIdMap} termId = 203 | case termIdMap IntMap.! fromEnum termId of TermIdInfo term _ -> term 204 | 205 | getTermId :: SearchIndex key field feature -> Term -> TermId 206 | getTermId SearchIndex{termMap} term = 207 | case termMap Map.! term of TermInfo termid _ -> termid 208 | 209 | getDocKey :: SearchIndex key field feature -> DocId -> key 210 | getDocKey SearchIndex{docIdMap} docid = 211 | case docIdMap IntMap.! fromEnum docid of 212 | DocInfo dockey _ _ -> dockey 213 | 214 | getDocTermIds :: SearchIndex key field feature -> DocId -> DocTermIds field 215 | getDocTermIds SearchIndex{docIdMap} docid = 216 | case docIdMap IntMap.! fromEnum docid of 217 | DocInfo _ doctermids _ -> doctermids 218 | 219 | -------------------- 220 | -- Insert & delete 221 | -- 222 | 223 | -- Procedure for adding a new doc... 224 | -- (key, field -> [Term]) 225 | -- alloc docid for key 226 | -- add term occurences for docid (include rev map for termid) 227 | -- construct indexdoc now that we have all the term -> termid entries 228 | -- insert indexdoc 229 | 230 | -- Procedure for updating a doc... 
231 | -- (key, field -> [Term]) 232 | -- find docid for key 233 | -- lookup old terms for docid (using termid rev map) 234 | -- calc term occurrences to add, term occurrences to delete 235 | -- add new term occurrences, delete old term occurrences 236 | -- construct indexdoc now that we have all the term -> termid entries 237 | -- insert indexdoc 238 | 239 | -- Procedure for deleting a doc... 240 | -- (key, field -> [Term]) 241 | -- find docid for key 242 | -- lookup old terms for docid (using termid rev map) 243 | -- delete old term occurrences 244 | -- delete indexdoc 245 | 246 | -- | This is the representation for documents to be added to the index. 247 | -- Documents may 248 | -- 249 | type DocTerms field = field -> [Term] 250 | type DocFeatureValues feature = feature -> Float 251 | 252 | insertDoc :: (Ord key, Ix field, Bounded field, Ix feature, Bounded feature) => 253 | key -> DocTerms field -> DocFeatureValues feature -> 254 | SearchIndex key field feature -> SearchIndex key field feature 255 | insertDoc key userDocTerms userDocFeats si@SearchIndex{docKeyMap} 256 | | Just docid <- Map.lookup key docKeyMap 257 | = -- Some older version of the doc is already present in the index, 258 | -- So we keep its docid. Now have to update the doc itself 259 | -- and update the terms by removing old ones and adding new ones. 260 | let oldTermsIds = getDocTermIds si docid 261 | userDocTerms' = memoiseDocTerms userDocTerms 262 | newTerms = docTermSet userDocTerms' 263 | oldTerms = docTermIdsTermSet si oldTermsIds 264 | -- We optimise for the typical case of significant overlap between 265 | -- the terms in the old and new versions of the document. 266 | delTerms = oldTerms `Set.difference` newTerms 267 | addTerms = newTerms `Set.difference` oldTerms 268 | 269 | -- Note: adding the doc relies on all the terms being in the termMap 270 | -- already, so we first add all the term occurences for the docid. 271 | in checkInvariant 272 | . insertDocIdToDocEntry docid key userDocTerms' userDocFeats 273 | . insertTermToDocIdEntries (Set.toList addTerms) docid 274 | . deleteTermToDocIdEntries (Set.toList delTerms) docid 275 | $ si 276 | 277 | | otherwise 278 | = -- We're dealing with a new doc, so allocate a docid for the key 279 | let (si', docid) = allocFreshDocId si 280 | userDocTerms' = memoiseDocTerms userDocTerms 281 | addTerms = docTermSet userDocTerms' 282 | 283 | -- Note: adding the doc relies on all the terms being in the termMap 284 | -- already, so we first add all the term occurences for the docid. 285 | in checkInvariant 286 | . insertDocIdToDocEntry docid key userDocTerms' userDocFeats 287 | . insertDocKeyToIdEntry key docid 288 | . insertTermToDocIdEntries (Set.toList addTerms) docid 289 | $ si' 290 | 291 | deleteDoc :: (Ord key, Ix field, Bounded field) => 292 | key -> 293 | SearchIndex key field feature -> SearchIndex key field feature 294 | deleteDoc key si@SearchIndex{docKeyMap} 295 | | Just docid <- Map.lookup key docKeyMap 296 | = let oldTermsIds = getDocTermIds si docid 297 | oldTerms = docTermIdsTermSet si oldTermsIds 298 | in checkInvariant 299 | . deleteDocEntry docid key 300 | . 
deleteTermToDocIdEntries (Set.toList oldTerms) docid 301 | $ si 302 | 303 | | otherwise = si 304 | 305 | 306 | ---------------------------------- 307 | -- Insert & delete support utils 308 | -- 309 | 310 | 311 | memoiseDocTerms :: (Ix field, Bounded field) => DocTerms field -> DocTerms field 312 | memoiseDocTerms docTermsFn = 313 | \field -> vecIndexIx vec field 314 | where 315 | vec = vecCreateIx docTermsFn 316 | 317 | docTermSet :: (Bounded t, Ix t) => DocTerms t -> Set.Set Term 318 | docTermSet docterms = 319 | Set.unions [ Set.fromList (docterms field) 320 | | field <- Ix.range (minBound, maxBound) ] 321 | 322 | docTermIdsTermSet :: (Bounded field, Ix field) => 323 | SearchIndex key field feature -> 324 | DocTermIds field -> Set.Set Term 325 | docTermIdsTermSet si doctermids = 326 | Set.unions [ Set.fromList terms 327 | | field <- Ix.range (minBound, maxBound) 328 | , let termids = DocTermIds.fieldElems doctermids field 329 | terms = map (getTerm si) termids ] 330 | 331 | -- 332 | -- The Term <-> DocId mapping 333 | -- 334 | 335 | -- | Add an entry into the 'Term' to 'DocId' mapping. 336 | insertTermToDocIdEntry :: Term -> DocId -> 337 | SearchIndex key field feature -> 338 | SearchIndex key field feature 339 | insertTermToDocIdEntry term !docid si@SearchIndex{termMap, termIdMap, nextTermId} = 340 | case Map.lookup term termMap of 341 | Nothing -> 342 | let docIdSet' = DocIdSet.singleton docid 343 | !termInfo' = TermInfo nextTermId docIdSet' 344 | !termIdInfo' = TermIdInfo term docIdSet' 345 | in si { termMap = Map.insert term termInfo' termMap 346 | , termIdMap = IntMap.insert (fromEnum nextTermId) 347 | termIdInfo' termIdMap 348 | , nextTermId = succ nextTermId } 349 | 350 | Just (TermInfo termId docIdSet) -> 351 | let docIdSet' = DocIdSet.insert docid docIdSet 352 | !termInfo' = TermInfo termId docIdSet' 353 | !termIdInfo' = TermIdInfo term docIdSet' 354 | in si { termMap = Map.insert term termInfo' termMap 355 | , termIdMap = IntMap.insert (fromEnum termId) 356 | termIdInfo' termIdMap 357 | } 358 | 359 | -- | Add multiple entries into the 'Term' to 'DocId' mapping: many terms that 360 | -- map to the same document. 361 | insertTermToDocIdEntries :: [Term] -> DocId -> 362 | SearchIndex key field feature -> 363 | SearchIndex key field feature 364 | insertTermToDocIdEntries terms !docid si = 365 | List.foldl' (\si' term -> insertTermToDocIdEntry term docid si') si terms 366 | 367 | -- | Delete an entry from the 'Term' to 'DocId' mapping. 368 | deleteTermToDocIdEntry :: Term -> DocId -> 369 | SearchIndex key field feature -> 370 | SearchIndex key field feature 371 | deleteTermToDocIdEntry term !docid si@SearchIndex{termMap, termIdMap} = 372 | case Map.lookup term termMap of 373 | Nothing -> si 374 | Just (TermInfo termId docIdSet) -> 375 | let docIdSet' = DocIdSet.delete docid docIdSet 376 | !termInfo' = TermInfo termId docIdSet' 377 | !termIdInfo' = TermIdInfo term docIdSet' 378 | in if DocIdSet.null docIdSet' 379 | then si { termMap = Map.delete term termMap 380 | , termIdMap = IntMap.delete (fromEnum termId) termIdMap } 381 | else si { termMap = Map.insert term termInfo' termMap 382 | , termIdMap = IntMap.insert (fromEnum termId) 383 | termIdInfo' termIdMap 384 | } 385 | 386 | -- | Delete multiple entries from the 'Term' to 'DocId' mapping: many terms 387 | -- that map to the same document. 
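--
-- This is a strict left fold of 'deleteTermToDocIdEntry' (above); when a
-- term's last remaining document is removed, the term itself is dropped from
-- both 'termMap' and 'termIdMap', so deleting a document cannot leave empty
-- 'DocIdSet's behind.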
388 | deleteTermToDocIdEntries :: [Term] -> DocId -> 389 | SearchIndex key field feature -> 390 | SearchIndex key field feature 391 | deleteTermToDocIdEntries terms !docid si = 392 | List.foldl' (\si' term -> deleteTermToDocIdEntry term docid si') si terms 393 | 394 | -- 395 | -- The DocId <-> Doc mapping 396 | -- 397 | 398 | allocFreshDocId :: SearchIndex key field feature -> 399 | (SearchIndex key field feature, DocId) 400 | allocFreshDocId si@SearchIndex{nextDocId} = 401 | let !si' = si { nextDocId = succ nextDocId } 402 | in (si', nextDocId) 403 | 404 | insertDocKeyToIdEntry :: Ord key => key -> DocId -> 405 | SearchIndex key field feature -> 406 | SearchIndex key field feature 407 | insertDocKeyToIdEntry dockey !docid si@SearchIndex{docKeyMap} = 408 | si { docKeyMap = Map.insert dockey docid docKeyMap } 409 | 410 | insertDocIdToDocEntry :: (Ix field, Bounded field, 411 | Ix feature, Bounded feature) => 412 | DocId -> key -> 413 | DocTerms field -> 414 | DocFeatureValues feature -> 415 | SearchIndex key field feature -> 416 | SearchIndex key field feature 417 | insertDocIdToDocEntry !docid dockey userdocterms userdocfeats 418 | si@SearchIndex{docIdMap} = 419 | let doctermids = DocTermIds.create (map (getTermId si) . userdocterms) 420 | docfeatvals= DocFeatVals.create userdocfeats 421 | !docinfo = DocInfo dockey doctermids docfeatvals 422 | in si { docIdMap = IntMap.insert (fromEnum docid) docinfo docIdMap } 423 | 424 | deleteDocEntry :: Ord key => DocId -> key -> 425 | SearchIndex key field feature -> SearchIndex key field feature 426 | deleteDocEntry docid key si@SearchIndex{docIdMap, docKeyMap} = 427 | si { docIdMap = IntMap.delete (fromEnum docid) docIdMap 428 | , docKeyMap = Map.delete key docKeyMap } 429 | 430 | -- 431 | -- Data.Map utils 432 | -- 433 | 434 | -- Data.Map does not support prefix lookups directly (unlike a trie) 435 | -- but we can implement it reasonably efficiently using split: 436 | 437 | -- | Lookup values for a range of keys (inclusive lower bound and exclusive 438 | -- upper bound) 439 | -- 440 | lookupRange :: Ord k => (k, k) -> Map k v -> [v] 441 | lookupRange (lb, ub) m = 442 | let (_, mv, gt) = Map.splitLookup lb m 443 | (between, _) = Map.split ub gt 444 | in case mv of 445 | Just v -> v : Map.elems between 446 | Nothing -> Map.elems between 447 | 448 | lookupPrefix :: Text -> Map Text v -> [v] 449 | lookupPrefix t _ | T.null t = [] 450 | lookupPrefix t m = lookupRange (t, prefixUpperBound t) m 451 | 452 | prefixUpperBound :: Text -> Text 453 | prefixUpperBound = succLast . 
T.dropWhileEnd (== maxBound) 454 | where 455 | succLast t = T.init t `T.snoc` succ (T.last t) 456 | 457 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/TermBag.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving, MultiParamTypeClasses, 2 | TypeFamilies #-} 3 | {-# LANGUAGE CPP #-} 4 | #if __GLASGOW_HASKELL__ >= 908 5 | {-# OPTIONS_GHC -Wno-x-partial #-} 6 | #endif 7 | 8 | module Data.SearchEngine.TermBag ( 9 | TermId(TermId), TermCount, 10 | TermBag, 11 | size, 12 | fromList, 13 | toList, 14 | elems, 15 | termCount, 16 | denseTable, 17 | invariant 18 | ) where 19 | 20 | import qualified Data.Vector.Unboxed as Vec 21 | import qualified Data.Vector.Unboxed.Mutable as MVec 22 | import qualified Data.Vector.Generic as GVec 23 | import qualified Data.Vector.Generic.Mutable as GMVec 24 | import Control.Monad.ST 25 | import Control.Monad (liftM) 26 | import qualified Data.Map as Map 27 | import Data.Word (Word32, Word8) 28 | import Data.Bits 29 | import qualified Data.List as List 30 | import Data.Function (on) 31 | 32 | newtype TermId = TermId { unTermId :: Word32 } 33 | deriving (Eq, Ord, Show, Enum) 34 | 35 | instance Bounded TermId where 36 | minBound = TermId 0 37 | maxBound = TermId 0x00FFFFFF 38 | 39 | data TermBag = TermBag !Int !(Vec.Vector TermIdAndCount) 40 | deriving Show 41 | 42 | -- We sneakily stuff both the TermId and the bag count into one 32bit word 43 | type TermIdAndCount = Word32 44 | type TermCount = Word8 45 | 46 | -- Bottom 24 bits is the TermId, top 8 bits is the bag count 47 | termIdAndCount :: TermId -> Int -> TermIdAndCount 48 | termIdAndCount (TermId termid) freq = 49 | (min (fromIntegral freq) 255 `shiftL` 24) 50 | .|. (termid .&. 0x00FFFFFF) 51 | 52 | getTermId :: TermIdAndCount -> TermId 53 | getTermId word = TermId (word .&. 0x00FFFFFF) 54 | 55 | getTermCount :: TermIdAndCount -> TermCount 56 | getTermCount word = fromIntegral (word `shiftR` 24) 57 | 58 | invariant :: TermBag -> Bool 59 | invariant (TermBag _ vec) = 60 | strictlyAscending (Vec.toList vec) 61 | where 62 | strictlyAscending (a:xs@(b:_)) = getTermId a < getTermId b 63 | && strictlyAscending xs 64 | strictlyAscending _ = True 65 | 66 | size :: TermBag -> Int 67 | size (TermBag sz _) = sz 68 | 69 | elems :: TermBag -> [TermId] 70 | elems (TermBag _ vec) = map getTermId (Vec.toList vec) 71 | 72 | toList :: TermBag -> [(TermId, TermCount)] 73 | toList (TermBag _ vec) = [ (getTermId x, getTermCount x) 74 | | x <- Vec.toList vec ] 75 | 76 | termCount :: TermBag -> TermId -> TermCount 77 | termCount (TermBag _ vec) = 78 | binarySearch 0 (Vec.length vec - 1) 79 | where 80 | binarySearch :: Int -> Int -> TermId -> TermCount 81 | binarySearch !a !b !key 82 | | a > b = 0 83 | | otherwise = 84 | let mid = (a + b) `div` 2 85 | tidAndCount = vec Vec.! mid 86 | in case compare key (getTermId tidAndCount) of 87 | LT -> binarySearch a (mid-1) key 88 | EQ -> getTermCount tidAndCount 89 | GT -> binarySearch (mid+1) b key 90 | 91 | fromList :: [TermId] -> TermBag 92 | fromList termids = 93 | let bag = Map.fromListWith (+) [ (t, 1) | t <- termids ] 94 | sz = Map.foldl' (+) 0 bag 95 | vec = Vec.fromListN (Map.size bag) 96 | [ termIdAndCount termid freq 97 | | (termid, freq) <- Map.toAscList bag ] 98 | in TermBag sz vec 99 | 100 | -- | Given a bunch of term bags, merge them into a table for easier subsequent 101 | -- processing. This is bascially a sparse to dense conversion. 
Missing entries 102 | -- are filled in with 0. We represent the table as one vector for the 103 | -- term ids and a 2d array for the counts. 104 | -- 105 | -- Unfortunately vector does not directly support 2d arrays and array does 106 | -- not make it easy to trim arrays. 107 | -- 108 | denseTable :: [TermBag] -> (Vec.Vector TermId, Vec.Vector TermCount) 109 | denseTable termbags = 110 | (tids, tcts) 111 | where 112 | -- First merge the TermIds into one array 113 | -- then make a linear pass to create the counts array 114 | -- filling in 0s or the counts as we find them 115 | !numBags = length termbags 116 | !tids = unionsTermId termbags 117 | !numTerms = Vec.length tids 118 | !numCounts = numTerms * numBags 119 | !tcts = Vec.create (do 120 | out <- MVec.new numCounts 121 | sequence_ 122 | [ writeMergedTermCounts tids bag out i 123 | | (n, TermBag _ bag) <- zip [0..] termbags 124 | , let i = n * numTerms ] 125 | return out 126 | ) 127 | 128 | writeMergedTermCounts :: Vec.Vector TermId -> Vec.Vector TermIdAndCount -> 129 | MVec.MVector s TermCount -> Int -> ST s () 130 | writeMergedTermCounts xs0 ys0 !out i0 = 131 | -- assume xs & ys are sorted, and ys contains a subset of xs 132 | go xs0 ys0 i0 133 | where 134 | go !xs !ys !i 135 | | Vec.null ys = MVec.set (MVec.slice i (Vec.length xs) out) 0 136 | | Vec.null xs = return () 137 | | otherwise = let x = Vec.head xs 138 | ytc = Vec.head ys 139 | y = getTermId ytc 140 | c = getTermCount ytc 141 | in case x == y of 142 | True -> do MVec.write out i c 143 | go (Vec.tail xs) (Vec.tail ys) (i+1) 144 | False -> do MVec.write out i 0 145 | go (Vec.tail xs) ys (i+1) 146 | 147 | -- | Given a set of term bags, form the set of TermIds 148 | -- 149 | unionsTermId :: [TermBag] -> Vec.Vector TermId 150 | unionsTermId tbs = 151 | case List.sortBy (compare `on` bagVecLength) tbs of 152 | [] -> Vec.empty 153 | [TermBag _ xs] -> (Vec.map getTermId xs) 154 | (x0:x1:xs) -> List.foldl' union3 (union2 x0 x1) xs 155 | where 156 | bagVecLength (TermBag _ vec) = Vec.length vec 157 | 158 | union2 :: TermBag -> TermBag -> Vec.Vector TermId 159 | union2 (TermBag _ xs) (TermBag _ ys) = 160 | Vec.create (MVec.new sizeBound >>= writeMergedUnion2 xs ys) 161 | where 162 | sizeBound = Vec.length xs + Vec.length ys 163 | 164 | writeMergedUnion2 :: Vec.Vector TermIdAndCount -> Vec.Vector TermIdAndCount -> 165 | MVec.MVector s TermId -> ST s (MVec.MVector s TermId) 166 | writeMergedUnion2 xs0 ys0 !out = do 167 | i <- go xs0 ys0 0 168 | return $! 
MVec.take i out 169 | where 170 | go !xs !ys !i 171 | | Vec.null xs = do Vec.copy (MVec.slice i (Vec.length ys) out) 172 | (Vec.map getTermId ys) 173 | return (i + Vec.length ys) 174 | | Vec.null ys = do Vec.copy (MVec.slice i (Vec.length xs) out) 175 | (Vec.map getTermId xs) 176 | return (i + Vec.length xs) 177 | | otherwise = let x = getTermId (Vec.head xs) 178 | y = getTermId (Vec.head ys) 179 | in case compare x y of 180 | GT -> do MVec.write out i y 181 | go xs (Vec.tail ys) (i+1) 182 | EQ -> do MVec.write out i x 183 | go (Vec.tail xs) (Vec.tail ys) (i+1) 184 | LT -> do MVec.write out i x 185 | go (Vec.tail xs) ys (i+1) 186 | 187 | union3 :: Vec.Vector TermId -> TermBag -> Vec.Vector TermId 188 | union3 xs (TermBag _ ys) = 189 | Vec.create (MVec.new sizeBound >>= writeMergedUnion3 xs ys) 190 | where 191 | sizeBound = Vec.length xs + Vec.length ys 192 | 193 | writeMergedUnion3 :: Vec.Vector TermId -> Vec.Vector TermIdAndCount -> 194 | MVec.MVector s TermId -> ST s (MVec.MVector s TermId) 195 | writeMergedUnion3 xs0 ys0 !out = do 196 | i <- go xs0 ys0 0 197 | return $! MVec.take i out 198 | where 199 | go !xs !ys !i 200 | | Vec.null xs = do Vec.copy (MVec.slice i (Vec.length ys) out) 201 | (Vec.map getTermId ys) 202 | return (i + Vec.length ys) 203 | | Vec.null ys = do Vec.copy (MVec.slice i (Vec.length xs) out) xs 204 | return (i + Vec.length xs) 205 | | otherwise = let x = Vec.head xs 206 | y = getTermId (Vec.head ys) 207 | in case compare x y of 208 | GT -> do MVec.write out i y 209 | go xs (Vec.tail ys) (i+1) 210 | EQ -> do MVec.write out i x 211 | go (Vec.tail xs) (Vec.tail ys) (i+1) 212 | LT -> do MVec.write out i x 213 | go (Vec.tail xs) ys (i+1) 214 | 215 | ------------------------------------------------------------------------------ 216 | -- verbose Unbox instances 217 | -- 218 | 219 | instance MVec.Unbox TermId 220 | 221 | newtype instance MVec.MVector s TermId = MV_TermId (MVec.MVector s Word32) 222 | 223 | instance GMVec.MVector MVec.MVector TermId where 224 | basicLength (MV_TermId v) = GMVec.basicLength v 225 | basicUnsafeSlice i l (MV_TermId v) = MV_TermId (GMVec.basicUnsafeSlice i l v) 226 | basicUnsafeNew l = MV_TermId `liftM` GMVec.basicUnsafeNew l 227 | basicInitialize (MV_TermId v) = GMVec.basicInitialize v 228 | basicUnsafeReplicate l x = MV_TermId `liftM` GMVec.basicUnsafeReplicate l (unTermId x) 229 | basicUnsafeRead (MV_TermId v) i = TermId `liftM` GMVec.basicUnsafeRead v i 230 | basicUnsafeWrite (MV_TermId v) i x = GMVec.basicUnsafeWrite v i (unTermId x) 231 | basicClear (MV_TermId v) = GMVec.basicClear v 232 | basicSet (MV_TermId v) x = GMVec.basicSet v (unTermId x) 233 | basicUnsafeGrow (MV_TermId v) l = MV_TermId `liftM` GMVec.basicUnsafeGrow v l 234 | basicUnsafeCopy (MV_TermId v) (MV_TermId v') = GMVec.basicUnsafeCopy v v' 235 | basicUnsafeMove (MV_TermId v) (MV_TermId v') = GMVec.basicUnsafeMove v v' 236 | basicOverlaps (MV_TermId v) (MV_TermId v') = GMVec.basicOverlaps v v' 237 | {-# INLINE basicLength #-} 238 | {-# INLINE basicUnsafeSlice #-} 239 | {-# INLINE basicOverlaps #-} 240 | {-# INLINE basicUnsafeNew #-} 241 | {-# INLINE basicInitialize #-} 242 | {-# INLINE basicUnsafeReplicate #-} 243 | {-# INLINE basicUnsafeRead #-} 244 | {-# INLINE basicUnsafeWrite #-} 245 | {-# INLINE basicClear #-} 246 | {-# INLINE basicSet #-} 247 | {-# INLINE basicUnsafeCopy #-} 248 | {-# INLINE basicUnsafeMove #-} 249 | {-# INLINE basicUnsafeGrow #-} 250 | 251 | newtype instance Vec.Vector TermId = V_TermId (Vec.Vector Word32) 252 | 253 | instance GVec.Vector Vec.Vector 
TermId where 254 | basicUnsafeFreeze (MV_TermId mv) = V_TermId `liftM` GVec.basicUnsafeFreeze mv 255 | basicUnsafeThaw (V_TermId v) = MV_TermId `liftM` GVec.basicUnsafeThaw v 256 | basicLength (V_TermId v) = GVec.basicLength v 257 | basicUnsafeSlice i l (V_TermId v) = V_TermId (GVec.basicUnsafeSlice i l v) 258 | basicUnsafeIndexM (V_TermId v) i = TermId `liftM` GVec.basicUnsafeIndexM v i 259 | basicUnsafeCopy (MV_TermId mv) 260 | (V_TermId v) = GVec.basicUnsafeCopy mv v 261 | elemseq (V_TermId v) x = GVec.elemseq v (unTermId x) 262 | {-# INLINE basicUnsafeFreeze #-} 263 | {-# INLINE basicUnsafeThaw #-} 264 | {-# INLINE basicLength #-} 265 | {-# INLINE basicUnsafeSlice #-} 266 | {-# INLINE basicUnsafeIndexM #-} 267 | {-# INLINE basicUnsafeCopy #-} 268 | {-# INLINE elemseq #-} 269 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE NamedFieldPuns, RecordWildCards #-} 2 | 3 | module Data.SearchEngine.Types ( 4 | -- * Search engine types and helper functions 5 | SearchEngine(..), 6 | SearchConfig(..), 7 | SearchRankParameters(..), 8 | BM25F.FeatureFunction(..), 9 | initSearchEngine, 10 | cacheBM25Context, 11 | 12 | -- ** Helper type for non-term features 13 | NoFeatures, 14 | noFeatures, 15 | 16 | -- * Re-export SearchIndex and other types 17 | SearchIndex, Term, TermId, 18 | DocIdSet, DocId, 19 | DocTermIds, DocFeatVals, 20 | 21 | -- * Internal sanity check 22 | invariant, 23 | ) where 24 | 25 | import Data.SearchEngine.SearchIndex (SearchIndex, Term, TermId) 26 | import qualified Data.SearchEngine.SearchIndex as SI 27 | import Data.SearchEngine.DocIdSet (DocIdSet, DocId) 28 | import qualified Data.SearchEngine.DocIdSet as DocIdSet 29 | import Data.SearchEngine.DocFeatVals (DocFeatVals) 30 | import Data.SearchEngine.DocTermIds (DocTermIds) 31 | import qualified Data.SearchEngine.BM25F as BM25F 32 | 33 | import Data.Ix 34 | import Data.Array.Unboxed 35 | 36 | 37 | 38 | data SearchConfig doc key field feature = SearchConfig { 39 | documentKey :: doc -> key, 40 | extractDocumentTerms :: doc -> field -> [Term], 41 | transformQueryTerm :: Term -> field -> Term, 42 | documentFeatureValue :: doc -> feature -> Float 43 | } 44 | 45 | data SearchRankParameters field feature = SearchRankParameters { 46 | paramK1 :: !Float, 47 | paramB :: field -> Float, 48 | paramFieldWeights :: field -> Float, 49 | paramFeatureWeights :: feature -> Float, 50 | paramFeatureFunctions :: feature -> BM25F.FeatureFunction, 51 | 52 | paramResultsetSoftLimit :: !Int, 53 | paramResultsetHardLimit :: !Int, 54 | paramAutosuggestPrefilterLimit :: !Int, 55 | paramAutosuggestPostfilterLimit :: !Int 56 | } 57 | 58 | data SearchEngine doc key field feature = SearchEngine { 59 | searchIndex :: !(SearchIndex key field feature), 60 | searchConfig :: !(SearchConfig doc key field feature), 61 | searchRankParams :: !(SearchRankParameters field feature), 62 | 63 | -- cached info 64 | sumFieldLengths :: !(UArray field Int), 65 | bm25Context :: BM25F.Context TermId field feature 66 | } 67 | 68 | invariant :: (Ord key, Ix field, Bounded field) => 69 | SearchEngine doc key field feature -> Bool 70 | invariant SearchEngine{searchIndex} = 71 | SI.invariant searchIndex 72 | -- && check caches 73 | 74 | initSearchEngine :: (Ix field, Bounded field, Ix feature, Bounded feature) => 75 | SearchConfig doc key field feature -> 76 | SearchRankParameters field feature -> 77 | SearchEngine doc key 
field feature 78 | initSearchEngine config params = 79 | cacheBM25Context 80 | SearchEngine { 81 | searchIndex = SI.emptySearchIndex, 82 | searchConfig = config, 83 | searchRankParams = params, 84 | sumFieldLengths = listArray (minBound, maxBound) (repeat 0), 85 | bm25Context = undefined 86 | } 87 | 88 | cacheBM25Context :: Ix field => 89 | SearchEngine doc key field feature -> 90 | SearchEngine doc key field feature 91 | cacheBM25Context 92 | se@SearchEngine { 93 | searchRankParams = SearchRankParameters{..}, 94 | searchIndex, 95 | sumFieldLengths 96 | } 97 | = se { bm25Context = bm25Context' } 98 | where 99 | bm25Context' = BM25F.Context { 100 | BM25F.numDocsTotal = SI.docCount searchIndex, 101 | BM25F.avgFieldLength = \f -> fromIntegral (sumFieldLengths ! f) 102 | / fromIntegral (SI.docCount searchIndex), 103 | BM25F.numDocsWithTerm = DocIdSet.size . SI.lookupTermId searchIndex, 104 | BM25F.paramK1 = paramK1, 105 | BM25F.paramB = paramB, 106 | BM25F.fieldWeight = paramFieldWeights, 107 | BM25F.featureWeight = paramFeatureWeights, 108 | BM25F.featureFunction = paramFeatureFunctions 109 | } 110 | 111 | 112 | ----------------------------- 113 | 114 | data NoFeatures = NoFeatures 115 | deriving (Eq, Ord, Bounded, Show) 116 | 117 | instance Ix NoFeatures where 118 | range _ = [] 119 | inRange _ _ = False 120 | index _ _ = -1 121 | 122 | noFeatures :: NoFeatures -> a 123 | noFeatures _ = error "noFeatures" 124 | 125 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/Update.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, NamedFieldPuns, RecordWildCards #-} 2 | 3 | module Data.SearchEngine.Update ( 4 | 5 | -- * Managing documents to be searched 6 | insertDoc, 7 | insertDocs, 8 | deleteDoc, 9 | 10 | ) where 11 | 12 | import Data.SearchEngine.Types 13 | import qualified Data.SearchEngine.SearchIndex as SI 14 | import qualified Data.SearchEngine.DocTermIds as DocTermIds 15 | 16 | import qualified Data.List as List 17 | import Data.Ix 18 | import Data.Array.Unboxed 19 | 20 | 21 | insertDocs :: (Ord key, Ix field, Bounded field, Ix feature, Bounded feature) => 22 | [doc] -> 23 | SearchEngine doc key field feature -> 24 | SearchEngine doc key field feature 25 | insertDocs docs se = List.foldl' (\se' doc -> insertDoc doc se') se docs 26 | 27 | 28 | insertDoc :: (Ord key, Ix field, Bounded field, Ix feature, Bounded feature) => 29 | doc -> 30 | SearchEngine doc key field feature -> 31 | SearchEngine doc key field feature 32 | insertDoc doc se@SearchEngine{ searchConfig = SearchConfig { 33 | documentKey, 34 | extractDocumentTerms, 35 | documentFeatureValue 36 | } 37 | , searchIndex } = 38 | let key = documentKey doc 39 | searchIndex' = SI.insertDoc key (extractDocumentTerms doc) 40 | (documentFeatureValue doc) 41 | searchIndex 42 | oldDoc = SI.lookupDocKey searchIndex key 43 | newDoc = SI.lookupDocKey searchIndex' key 44 | 45 | in cacheBM25Context $ 46 | updateCachedFieldLengths oldDoc newDoc $ 47 | se { searchIndex = searchIndex' } 48 | 49 | 50 | deleteDoc :: (Ord key, Ix field, Bounded field) => 51 | key -> 52 | SearchEngine doc key field feature -> 53 | SearchEngine doc key field feature 54 | deleteDoc key se@SearchEngine{searchIndex} = 55 | let searchIndex' = SI.deleteDoc key searchIndex 56 | oldDoc = SI.lookupDocKey searchIndex key 57 | 58 | in cacheBM25Context $ 59 | updateCachedFieldLengths oldDoc Nothing $ 60 | se { searchIndex = searchIndex' } 61 | 62 | 63 | 
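-- A minimal usage sketch (illustrative only: the 'Pkg' document type, its
-- field accessors and the 'PkgField' enumeration below are hypothetical, and
-- the 'SearchRankParameters' value @rankParams@ is assumed to be defined
-- elsewhere):
--
-- > data PkgField = NameField | DescriptionField
-- >   deriving (Eq, Ord, Enum, Bounded, Ix, Show)
-- >
-- > config :: SearchConfig Pkg Text PkgField NoFeatures
-- > config = SearchConfig
-- >   { documentKey          = pkgName
-- >   , extractDocumentTerms = \pkg field -> case field of
-- >                              NameField        -> T.words (pkgName pkg)
-- >                              DescriptionField -> T.words (pkgDescription pkg)
-- >   , transformQueryTerm   = \term _field -> T.toLower term
-- >   , documentFeatureValue = \_pkg -> noFeatures
-- >   }
-- >
-- > engine :: SearchEngine Pkg Text PkgField NoFeatures
-- > engine = insertDocs pkgs (initSearchEngine config rankParams)
--
-- The 'insertDoc'\/'deleteDoc' operations above keep the cached per-field
-- length sums and the BM25F context in sync via 'updateCachedFieldLengths'
-- (below) and 'cacheBM25Context'.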
updateCachedFieldLengths :: (Ix field, Bounded field) => 64 | Maybe (DocTermIds field) -> Maybe (DocTermIds field) -> 65 | SearchEngine doc key field feature -> 66 | SearchEngine doc key field feature 67 | updateCachedFieldLengths Nothing (Just newDoc) se@SearchEngine{sumFieldLengths} = 68 | se { 69 | sumFieldLengths = 70 | array (bounds sumFieldLengths) 71 | [ (i, n + DocTermIds.fieldLength newDoc i) 72 | | (i, n) <- assocs sumFieldLengths ] 73 | } 74 | updateCachedFieldLengths (Just oldDoc) (Just newDoc) se@SearchEngine{sumFieldLengths} = 75 | se { 76 | sumFieldLengths = 77 | array (bounds sumFieldLengths) 78 | [ (i, n - DocTermIds.fieldLength oldDoc i 79 | + DocTermIds.fieldLength newDoc i) 80 | | (i, n) <- assocs sumFieldLengths ] 81 | } 82 | updateCachedFieldLengths (Just oldDoc) Nothing se@SearchEngine{sumFieldLengths} = 83 | se { 84 | sumFieldLengths = 85 | array (bounds sumFieldLengths) 86 | [ (i, n - DocTermIds.fieldLength oldDoc i) 87 | | (i, n) <- assocs sumFieldLengths ] 88 | } 89 | updateCachedFieldLengths Nothing Nothing se = se 90 | 91 | -------------------------------------------------------------------------------- /tests/Main.hs: -------------------------------------------------------------------------------- 1 | 2 | import qualified Test.Data.SearchEngine.DocIdSet as DocIdSet 3 | import qualified Test.Data.SearchEngine.TermBag as TermBag 4 | 5 | import Test.Tasty 6 | import Test.Tasty.QuickCheck 7 | 8 | 9 | main :: IO () 10 | main = defaultMain $ 11 | testGroup "" 12 | [ docIdSetTests 13 | , termBagTests 14 | ] 15 | 16 | docIdSetTests :: TestTree 17 | docIdSetTests = 18 | testGroup "TermIdSet" 19 | [ testProperty "prop_insert" DocIdSet.prop_insert 20 | , testProperty "prop_delete" DocIdSet.prop_delete 21 | , testProperty "prop_delete'" DocIdSet.prop_delete' 22 | , testProperty "prop_union" DocIdSet.prop_union 23 | , testProperty "prop_union'" DocIdSet.prop_union' 24 | ] 25 | 26 | termBagTests :: TestTree 27 | termBagTests = 28 | testGroup "TermBag" 29 | [ testProperty "prop_invariant" TermBag.prop_invariant 30 | , testProperty "prop_elems" TermBag.prop_elems 31 | , testProperty "prop_fromList" TermBag.prop_fromList 32 | , testProperty "prop_size" TermBag.prop_size 33 | , testProperty "prop_termCount" TermBag.prop_termCount 34 | , testProperty "prop_denseTable1" TermBag.prop_denseTable1 35 | , testProperty "prop_denseTable2" TermBag.prop_denseTable2 36 | ] 37 | 38 | 39 | -------------------------------------------------------------------------------- /tests/Test/Data/SearchEngine/DocIdSet.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | module Test.Data.SearchEngine.DocIdSet where 3 | 4 | import Data.SearchEngine.DocIdSet (DocIdSet(DocIdSet), DocId(DocId)) 5 | import qualified Data.SearchEngine.DocIdSet as DocIdSet 6 | 7 | import qualified Data.Vector.Unboxed as Vec 8 | import qualified Data.List as List 9 | import Test.QuickCheck 10 | 11 | 12 | instance Arbitrary DocIdSet where 13 | arbitrary = DocIdSet.fromList `fmap` (listOf arbitrary) 14 | 15 | instance Arbitrary DocId where 16 | arbitrary = DocId `fmap` choose (0,15) 17 | 18 | 19 | prop_insert :: DocIdSet -> DocId -> Bool 20 | prop_insert dset x = 21 | let dset' = DocIdSet.insert x dset 22 | in DocIdSet.invariant dset && DocIdSet.invariant dset' 23 | && all (`member` dset') (x : DocIdSet.toList dset) 24 | 25 | prop_delete :: DocIdSet -> DocId -> Bool 26 | prop_delete dset x = 27 | let dset' = DocIdSet.delete x dset 28 | in 
DocIdSet.invariant dset && DocIdSet.invariant dset' 29 | && all (`member` dset') (List.delete x (DocIdSet.toList dset)) 30 | && not (x `member` dset') 31 | 32 | prop_delete' :: DocIdSet -> Bool 33 | prop_delete' dset = 34 | all (prop_delete dset) (DocIdSet.toList dset) 35 | 36 | prop_union :: DocIdSet -> DocIdSet -> Bool 37 | prop_union dset1 dset2 = 38 | let dset = DocIdSet.union dset1 dset2 39 | dset' = DocIdSet.fromList (List.union (DocIdSet.toList dset1) (DocIdSet.toList dset2)) 40 | 41 | in DocIdSet.invariant dset && DocIdSet.invariant dset' 42 | && dset == dset' 43 | 44 | prop_union' :: DocIdSet -> DocIdSet -> Bool 45 | prop_union' dset1 dset2 = 46 | let dset = DocIdSet.union dset1 dset2 47 | dset' = List.foldl' (\s i -> DocIdSet.insert i s) dset1 (DocIdSet.toList dset2) 48 | dset'' = List.foldl' (\s i -> DocIdSet.insert i s) dset2 (DocIdSet.toList dset1) 49 | in DocIdSet.invariant dset && DocIdSet.invariant dset' && DocIdSet.invariant dset'' 50 | && dset == dset' 51 | && dset' == dset'' 52 | 53 | member :: DocId -> DocIdSet -> Bool 54 | member x (DocIdSet vec) = 55 | x `List.elem` Vec.toList vec 56 | 57 | -------------------------------------------------------------------------------- /tests/Test/Data/SearchEngine/TermBag.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -Wno-orphans #-} 2 | {-# LANGUAGE CPP #-} 3 | #if __GLASGOW_HASKELL__ >= 908 4 | {-# OPTIONS_GHC -Wno-x-partial #-} 5 | #endif 6 | 7 | module Test.Data.SearchEngine.TermBag where 8 | 9 | import Data.SearchEngine.TermBag 10 | 11 | import qualified Data.Vector.Unboxed as Vec 12 | import qualified Data.List as List 13 | import Test.QuickCheck 14 | 15 | 16 | instance Arbitrary TermBag where 17 | arbitrary = fromList `fmap` (listOf arbitrary) 18 | 19 | instance Arbitrary TermId where 20 | arbitrary = TermId `fmap` choose (0,5) 21 | 22 | prop_invariant :: TermBag -> Bool 23 | prop_invariant = invariant 24 | 25 | prop_elems :: [TermId] -> Bool 26 | prop_elems tids = 27 | (map head . List.group . List.sort) tids 28 | == (elems . fromList) tids 29 | 30 | prop_fromList :: [TermId] -> Bool 31 | prop_fromList tids = 32 | (map (\g -> (head g, fromIntegral (length g `min` 255))) 33 | . List.group . List.sort) tids 34 | == (toList . fromList) tids 35 | 36 | prop_size :: [TermId] -> Bool 37 | prop_size tids = 38 | (size . fromList) tids == length tids 39 | 40 | prop_termCount :: [TermId] -> Bool 41 | prop_termCount tids = 42 | and [ termCount bag tid == count 43 | | let bag = fromList tids 44 | , (tid, count) <- toList bag 45 | ] 46 | 47 | prop_denseTable1 :: [TermBag] -> Bool 48 | prop_denseTable1 bags = 49 | Vec.toList terms == (List.sort . foldr List.union [] . map elems) bags 50 | where 51 | (terms, _) = denseTable bags 52 | 53 | prop_denseTable2 :: [TermBag] -> Bool 54 | prop_denseTable2 bags = 55 | and [ termCount bag (terms Vec.! t) == counts Vec.! (b * numTerms + t) 56 | | let (terms, counts) = denseTable bags 57 | numTerms = Vec.length terms 58 | , (b, bag) <- zip [0..] bags 59 | , t <- [0..Vec.length terms - 1] 60 | ] 61 | --------------------------------------------------------------------------------
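-- Note (an illustrative aside, not part of any module above): prop_denseTable2
-- relies on denseTable laying its counts out row-major, one row of numTerms
-- counts per input bag. For example, with two bags over the merged term ids
-- [t0,t1,t2] (numTerms = 3), the count of term t2 in bag 1 sits at index
-- 1 * 3 + 2 = 5 of the counts vector, which is exactly the (b * numTerms + t)
-- index the property checks.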