├── .github └── workflows │ └── haskell-ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── changelog ├── demo ├── ExtractDescriptionTerms.hs ├── ExtractNameTerms.hs ├── HaddockHtml.hs ├── HaddockLex.x ├── HaddockParse.y ├── HaddockTypes.hs ├── Main.hs ├── PackageIndexUtils.hs └── PackageSearch.hs ├── full-text-search.cabal ├── src └── Data │ ├── SearchEngine.hs │ └── SearchEngine │ ├── Autosuggest.hs │ ├── BM25F.hs │ ├── DocFeatVals.hs │ ├── DocIdSet.hs │ ├── DocTermIds.hs │ ├── Query.hs │ ├── SearchIndex.hs │ ├── TermBag.hs │ ├── Types.hs │ └── Update.hs └── tests ├── Main.hs └── Test └── Data └── SearchEngine ├── DocIdSet.hs └── TermBag.hs /.github/workflows/haskell-ci.yml: -------------------------------------------------------------------------------- 1 | # This GitHub workflow config has been generated by a script via 2 | # 3 | # haskell-ci 'github' 'full-text-search.cabal' '--local-ghc-options' '-Wall -Werror' 4 | # 5 | # To regenerate the script (for example after adjusting tested-with) run 6 | # 7 | # haskell-ci regenerate 8 | # 9 | # For more information, see https://github.com/haskell-CI/haskell-ci 10 | # 11 | # version: 0.19.20250115 12 | # 13 | # REGENDATA ("0.19.20250115",["github","full-text-search.cabal","--local-ghc-options","-Wall -Werror"]) 14 | # 15 | name: Haskell-CI 16 | on: 17 | - push 18 | - pull_request 19 | jobs: 20 | linux: 21 | name: Haskell-CI - Linux - ${{ matrix.compiler }} 22 | runs-on: ubuntu-20.04 23 | timeout-minutes: 24 | 60 25 | container: 26 | image: buildpack-deps:jammy 27 | continue-on-error: ${{ matrix.allow-failure }} 28 | strategy: 29 | matrix: 30 | include: 31 | - compiler: ghc-9.12.1 32 | compilerKind: ghc 33 | compilerVersion: 9.12.1 34 | setup-method: ghcup 35 | allow-failure: false 36 | - compiler: ghc-9.10.1 37 | compilerKind: ghc 38 | compilerVersion: 9.10.1 39 | setup-method: ghcup 40 | allow-failure: false 41 | - compiler: ghc-9.8.2 42 | compilerKind: ghc 43 | compilerVersion: 9.8.2 44 | setup-method: ghcup 45 | allow-failure: false 46 | - compiler: ghc-9.6.6 47 | compilerKind: ghc 48 | compilerVersion: 9.6.6 49 | setup-method: ghcup 50 | allow-failure: false 51 | - compiler: ghc-9.4.8 52 | compilerKind: ghc 53 | compilerVersion: 9.4.8 54 | setup-method: ghcup 55 | allow-failure: false 56 | - compiler: ghc-9.2.8 57 | compilerKind: ghc 58 | compilerVersion: 9.2.8 59 | setup-method: ghcup 60 | allow-failure: false 61 | fail-fast: false 62 | steps: 63 | - name: apt-get install 64 | run: | 65 | apt-get update 66 | apt-get install -y --no-install-recommends gnupg ca-certificates dirmngr curl git software-properties-common libtinfo5 67 | - name: Install GHCup 68 | run: | 69 | mkdir -p "$HOME/.ghcup/bin" 70 | curl -sL https://downloads.haskell.org/ghcup/0.1.30.0/x86_64-linux-ghcup-0.1.30.0 > "$HOME/.ghcup/bin/ghcup" 71 | chmod a+x "$HOME/.ghcup/bin/ghcup" 72 | - name: Install cabal-install 73 | run: | 74 | "$HOME/.ghcup/bin/ghcup" install cabal 3.12.1.0 || (cat "$HOME"/.ghcup/logs/*.* && false) 75 | echo "CABAL=$HOME/.ghcup/bin/cabal-3.12.1.0 -vnormal+nowrap" >> "$GITHUB_ENV" 76 | - name: Install GHC (GHCup) 77 | if: matrix.setup-method == 'ghcup' 78 | run: | 79 | "$HOME/.ghcup/bin/ghcup" install ghc "$HCVER" || (cat "$HOME"/.ghcup/logs/*.* && false) 80 | HC=$("$HOME/.ghcup/bin/ghcup" whereis ghc "$HCVER") 81 | HCPKG=$(echo "$HC" | sed 's#ghc$#ghc-pkg#') 82 | HADDOCK=$(echo "$HC" | sed 's#ghc$#haddock#') 83 | echo "HC=$HC" >> "$GITHUB_ENV" 84 | echo "HCPKG=$HCPKG" >> "$GITHUB_ENV" 85 | echo "HADDOCK=$HADDOCK" >> "$GITHUB_ENV" 86 | env: 87 | 
HCKIND: ${{ matrix.compilerKind }} 88 | HCNAME: ${{ matrix.compiler }} 89 | HCVER: ${{ matrix.compilerVersion }} 90 | - name: Set PATH and environment variables 91 | run: | 92 | echo "$HOME/.cabal/bin" >> $GITHUB_PATH 93 | echo "LANG=C.UTF-8" >> "$GITHUB_ENV" 94 | echo "CABAL_DIR=$HOME/.cabal" >> "$GITHUB_ENV" 95 | echo "CABAL_CONFIG=$HOME/.cabal/config" >> "$GITHUB_ENV" 96 | HCNUMVER=$(${HC} --numeric-version|perl -ne '/^(\d+)\.(\d+)\.(\d+)(\.(\d+))?$/; print(10000 * $1 + 100 * $2 + ($3 == 0 ? $5 != 1 : $3))') 97 | echo "HCNUMVER=$HCNUMVER" >> "$GITHUB_ENV" 98 | echo "ARG_TESTS=--enable-tests" >> "$GITHUB_ENV" 99 | echo "ARG_BENCH=--enable-benchmarks" >> "$GITHUB_ENV" 100 | echo "HEADHACKAGE=false" >> "$GITHUB_ENV" 101 | echo "ARG_COMPILER=--$HCKIND --with-compiler=$HC" >> "$GITHUB_ENV" 102 | env: 103 | HCKIND: ${{ matrix.compilerKind }} 104 | HCNAME: ${{ matrix.compiler }} 105 | HCVER: ${{ matrix.compilerVersion }} 106 | - name: env 107 | run: | 108 | env 109 | - name: write cabal config 110 | run: | 111 | mkdir -p $CABAL_DIR 112 | cat >> $CABAL_CONFIG <> $CABAL_CONFIG < cabal-plan.xz 145 | echo 'f62ccb2971567a5f638f2005ad3173dba14693a45154c1508645c52289714cb2 cabal-plan.xz' | sha256sum -c - 146 | xz -d < cabal-plan.xz > $HOME/.cabal/bin/cabal-plan 147 | rm -f cabal-plan.xz 148 | chmod a+x $HOME/.cabal/bin/cabal-plan 149 | cabal-plan --version 150 | - name: checkout 151 | uses: actions/checkout@v4 152 | with: 153 | path: source 154 | - name: initial cabal.project for sdist 155 | run: | 156 | touch cabal.project 157 | echo "packages: $GITHUB_WORKSPACE/source/." >> cabal.project 158 | cat cabal.project 159 | - name: sdist 160 | run: | 161 | mkdir -p sdist 162 | $CABAL sdist all --output-dir $GITHUB_WORKSPACE/sdist 163 | - name: unpack 164 | run: | 165 | mkdir -p unpacked 166 | find sdist -maxdepth 1 -type f -name '*.tar.gz' -exec tar -C $GITHUB_WORKSPACE/unpacked -xzvf {} \; 167 | - name: generate cabal.project 168 | run: | 169 | PKGDIR_full_text_search="$(find "$GITHUB_WORKSPACE/unpacked" -maxdepth 1 -type d -regex '.*/full-text-search-[0-9.]*')" 170 | echo "PKGDIR_full_text_search=${PKGDIR_full_text_search}" >> "$GITHUB_ENV" 171 | rm -f cabal.project cabal.project.local 172 | touch cabal.project 173 | touch cabal.project.local 174 | echo "packages: ${PKGDIR_full_text_search}" >> cabal.project 175 | echo "package full-text-search" >> cabal.project 176 | echo " ghc-options: -Werror=missing-methods" >> cabal.project 177 | cat >> cabal.project <> cabal.project.local 182 | cat cabal.project 183 | cat cabal.project.local 184 | - name: dump install plan 185 | run: | 186 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dry-run all 187 | cabal-plan 188 | - name: restore cache 189 | uses: actions/cache/restore@v4 190 | with: 191 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 192 | path: ~/.cabal/store 193 | restore-keys: ${{ runner.os }}-${{ matrix.compiler }}- 194 | - name: install dependencies 195 | run: | 196 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks --dependencies-only -j2 all 197 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dependencies-only -j2 all 198 | - name: build w/o tests 199 | run: | 200 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 201 | - name: build 202 | run: | 203 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --write-ghc-environment-files=always 204 | - name: tests 205 | run: | 206 | $CABAL v2-test $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --test-show-details=direct 207 | - name: 
cabal check 208 | run: | 209 | cd ${PKGDIR_full_text_search} || false 210 | ${CABAL} -vnormal check 211 | - name: haddock 212 | run: | 213 | $CABAL v2-haddock --disable-documentation --haddock-all $ARG_COMPILER --with-haddock $HADDOCK $ARG_TESTS $ARG_BENCH all 214 | - name: unconstrained build 215 | run: | 216 | rm -f cabal.project.local 217 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 218 | - name: save cache 219 | if: always() 220 | uses: actions/cache/save@v4 221 | with: 222 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 223 | path: ~/.cabal/store 224 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | dist-newstyle 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013-2014 Duncan Coutts, 2014 Well-Typed LLP 2 | 2014 IRIS Connect Ltd. 3 | 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | 17 | * Neither the name of Duncan Coutts nor the names of other 18 | contributors may be used to endorse or promote products derived 19 | from this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # full-text-search 2 | An in-memory full text search engine library. It lets you run full-text queries on a collection of your documents. 
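
A minimal usage sketch (the `Note` document type, its fields, and the parameter values below are made up for illustration; the shape of the code follows the package-search demo in `demo/PackageSearch.hs`):

```haskell
import Data.SearchEngine
import Data.Ix (Ix)
import Data.Text (Text)
import qualified Data.Text as T

-- A hypothetical document type: a note with a numeric key, a title and a body.
data Note = Note { noteId :: Int, noteTitle :: Text, noteBody :: Text }

data NoteField = TitleField | BodyField
  deriving (Eq, Ord, Enum, Bounded, Ix, Show)

noteSearchConfig :: SearchConfig Note Int NoteField NoFeatures
noteSearchConfig =
    SearchConfig {
      documentKey          = noteId,
      extractDocumentTerms = extractTokens,
      transformQueryTerm   = \term _field -> T.toCaseFold term,
      documentFeatureValue = const noFeatures
    }
  where
    extractTokens :: Note -> NoteField -> [Term]
    extractTokens note TitleField = tokens (noteTitle note)
    extractTokens note BodyField  = tokens (noteBody note)

    -- Real code would typically also stem terms and drop stop words.
    tokens :: Text -> [Term]
    tokens = T.words . T.toCaseFold

noteRankParams :: SearchRankParameters NoteField NoFeatures
noteRankParams =
    SearchRankParameters {
      paramK1                         = 1.5,
      paramB                          = const 0.5,
      paramFieldWeights               = fieldWeight,
      paramFeatureWeights             = noFeatures,
      paramFeatureFunctions           = noFeatures,
      paramResultsetSoftLimit         = 200,
      paramResultsetHardLimit         = 400,
      paramAutosuggestPrefilterLimit  = 500,
      paramAutosuggestPostfilterLimit = 500
    }
  where
    -- Weight title matches more heavily than body matches.
    fieldWeight :: NoteField -> Float
    fieldWeight TitleField = 20
    fieldWeight BodyField  = 1

-- Index a collection of notes and run a query, returning keys in rank order.
searchNotes :: [Note] -> [Term] -> [Int]
searchNotes notes terms =
    query (insertDocs notes (initSearchEngine noteSearchConfig noteRankParams)) terms
```

The demo shows a fuller configuration, including stemming and stop-word removal.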
3 | 4 | See http://hackage.haskell.org/package/full-text-search 5 | -------------------------------------------------------------------------------- /changelog: -------------------------------------------------------------------------------- 1 | 0.2.2.3 Ben Gamari January 2025 2 | * Introduce compatibility with GHCs up to 9.12 3 | * Drop compatibility with GHC versions pre-9.2 4 | 5 | 0.2.2.2 Adam Gundry March 2023 6 | * Fix bug in 0.2.2.1 autosuggest patch 7 | 8 | 0.2.2.1 Adam Gundry March 2023 9 | * Fix autosuggest query performance bug on large datasets 10 | 11 | 0.2.2.0 Adam Gundry November 2022 12 | * Add queryAutosuggestPredicate and queryAutosuggestMatchingDocuments 13 | * Compatibility with GHC 8.10 to 9.4 and new package versions 14 | 15 | 0.2.1.4 Mikolaj Konarski August 2017 16 | * Compatibility with GHC 8.0.2 and new package versions (no API changes) 17 | 18 | 0.2.1.3 Duncan Coutts May 2015 19 | * Compatibility with GHC 7.10 (no API changes) 20 | 21 | 0.2.1.2 Adam Gundry July 2014 22 | * Compatibility with GHC 7.8 (no API changes) 23 | 24 | 0.2.1.1 Duncan Coutts May 2014 25 | * Fix for NaN scores with a doc field that is empty for all docs 26 | * Add instance Show NoFeatures 27 | 28 | 0.2.1.0 Duncan Coutts March 2014 29 | * Add auto-complete / auto-suggest feature 30 | * Add demo program 31 | * Moved QC props into a separate test suite 32 | * Work sponsored by IRIS Connect Ltd. 33 | 34 | 0.2.0.0 Duncan Coutts Feb 2014 35 | * Initial version as a separate library 36 | 37 | 0.1.1.0 Duncan Coutts Nov 2013 38 | * Add "explain" mode for query 39 | * Add non-term feature scores 40 | 41 | 0.1.0.1 Duncan Coutts Sept 2013 42 | * Fix bug in index update (thanks to Matthew Gruen) 43 | 44 | 0.1.0.0 Duncan Coutts July 2013 45 | * Initial version (included in hackage-server) 46 | 47 | -------------------------------------------------------------------------------- /demo/ExtractDescriptionTerms.hs: -------------------------------------------------------------------------------- 1 | 2 | module ExtractDescriptionTerms ( 3 | extractSynopsisTerms, 4 | extractDescriptionTerms 5 | ) where 6 | 7 | import Data.Text (Text) 8 | import qualified Data.Text as T 9 | import Data.Set (Set) 10 | import qualified Data.Set as Set 11 | import Data.Char 12 | import qualified NLP.Tokenize as NLP 13 | import qualified NLP.Snowball as NLP 14 | import Control.Monad ((>=>)) 15 | import Data.Maybe 16 | 17 | import HaddockTypes as Haddock 18 | import HaddockHtml as Haddock (markup) 19 | import qualified HaddockParse as Haddock (parseHaddockParagraphs) 20 | import qualified HaddockLex as Haddock (tokenise) 21 | 22 | 23 | extractSynopsisTerms :: Set Text -> String -> [Text] 24 | extractSynopsisTerms stopWords = 25 | NLP.stems NLP.English 26 | . filter (`Set.notMember` stopWords) 27 | . map (T.toCaseFold . T.pack) 28 | . concatMap splitTok 29 | . filter (not . ignoreTok) 30 | . NLP.tokenize 31 | 32 | 33 | ignoreTok :: String -> Bool 34 | ignoreTok = all isPunctuation 35 | 36 | splitTok :: String -> [String] 37 | splitTok tok = 38 | case go tok of 39 | toks@(_:_:_) -> tok:toks 40 | toks -> toks 41 | where 42 | go remaining = 43 | case break (\c -> c == ')' || c == '-' || c == '/') remaining of 44 | ([], _:trailing) -> go trailing 45 | (leading, _:trailing) -> leading : go trailing 46 | ([], []) -> [] 47 | (leading, []) -> leading : [] 48 | 49 | 50 | extractDescriptionTerms :: Set Text -> String -> [Text] 51 | extractDescriptionTerms stopWords = 52 | NLP.stems NLP.English 53 | . filter (`Set.notMember` stopWords) 54 | . 
map (T.toCaseFold . T.pack) 55 | . maybe 56 | [] --TODO: something here 57 | ( filter (not . ignoreTok) 58 | . NLP.tokenize 59 | . concat . Haddock.markup termsMarkup) 60 | . (Haddock.tokenise >=> Haddock.parseHaddockParagraphs) 61 | 62 | termsMarkup :: DocMarkup String [String] 63 | termsMarkup = Markup { 64 | markupEmpty = [], 65 | markupString = \s -> [s], 66 | markupParagraph = id, 67 | markupAppend = (++), 68 | markupIdentifier = \s -> [s], 69 | markupModule = const [], -- i.e. filter these out 70 | markupEmphasis = id, 71 | markupMonospaced = \s -> if length s > 1 then [] else s, 72 | markupUnorderedList = concat, 73 | markupOrderedList = concat, 74 | markupDefList = concatMap (\(d,t) -> d ++ t), 75 | markupCodeBlock = const [], 76 | markupHyperlink = \(Hyperlink _url mLabel) -> maybeToList mLabel, 77 | --TODO: extract main part of hostname 78 | markupPic = const [], 79 | markupAName = const [] 80 | } 81 | 82 | {- 83 | ------------------- 84 | -- Main experiment 85 | -- 86 | 87 | main = do 88 | pkgsFile <- readFile "pkgs" 89 | let mostFreq :: [String] 90 | pkgs :: [PackageDescription] 91 | (mostFreq, pkgs) = read pkgsFile 92 | 93 | stopWordsFile <- T.readFile "stopwords.txt" 94 | -- wordsFile <- T.readFile "/usr/share/dict/words" 95 | -- let ws = Set.fromList (map T.toLower $ T.lines wordsFile) 96 | 97 | 98 | print "reading file" 99 | evaluate (length mostFreq + length pkgs) 100 | print "done" 101 | 102 | let stopWords = Set.fromList $ T.lines stopWordsFile 103 | print stopWords 104 | 105 | sequence_ 106 | [ putStrLn $ display (packageName pkg) ++ ": " 107 | ++ --intercalate ", " 108 | (description pkg) ++ "\n" 109 | ++ intercalate ", " 110 | (map T.unpack $ extractDescriptionTerms stopWords (description pkg)) ++ "\n" 111 | | pkg <- pkgs 112 | , let pkgname = display (packageName pkg) ] 113 | -} 114 | -------------------------------------------------------------------------------- /demo/ExtractNameTerms.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 2 | 3 | module ExtractNameTerms ( 4 | extractPackageNameTerms, 5 | extractModuleNameTerms, 6 | ) where 7 | 8 | import Data.Text (Text) 9 | import qualified Data.Text as T 10 | import Data.Char (isUpper, isDigit) 11 | import Data.List 12 | import Data.List.Split hiding (Splitter) 13 | import Data.Maybe (maybeToList) 14 | 15 | import Data.Functor.Identity 16 | import Control.Applicative 17 | import Control.Monad 18 | import Control.Monad.List 19 | import Control.Monad.Writer 20 | import Control.Monad.State 21 | 22 | 23 | extractModuleNameTerms :: String -> [Text] 24 | extractModuleNameTerms modname = 25 | map T.toCaseFold $ 26 | nub $ 27 | map T.pack $ 28 | flip runSplitter modname $ do 29 | _ <- forEachPart splitDot 30 | _ <- forEachPart splitCamlCase 31 | satisfy (not . singleChar) 32 | get >>= emit 33 | 34 | extractPackageNameTerms :: String -> [Text] 35 | extractPackageNameTerms pkgname = 36 | map T.toCaseFold $ 37 | nub $ 38 | map T.pack $ 39 | flip runSplitter pkgname $ do 40 | 41 | fstComponentHyphen <- forEachPart splitHyphen 42 | 43 | satisfy (`notElem` ["hs", "haskell"]) 44 | 45 | _ <- forEachPart stripPrefixH 46 | 47 | fstComponentCaml <- forEachPart splitCamlCase 48 | 49 | fstComponent2 <- forEachPart splitOn2 50 | 51 | when (fstComponentHyphen && fstComponentCaml && fstComponent2) $ do 52 | forEachPartAndWhole stripPrefix_h 53 | _ <- forEachPart (maybeToList . stripPrefix "lib") 54 | _ <- forEachPart (maybeToList . 
stripSuffix "lib") 55 | _ <- forEachPart stripSuffixNum 56 | satisfy (not . singleChar) 57 | 58 | get >>= emit 59 | 60 | newtype Split a = Split (StateT String (ListT (WriterT [String] Identity)) a) 61 | deriving (Functor, Applicative, Alternative, Monad, MonadPlus, MonadState String) 62 | 63 | emit :: String -> Split () 64 | emit x = Split (lift (lift (tell [x]))) 65 | 66 | forEach :: [a] -> Split a 67 | forEach = msum . map return 68 | 69 | runSplitter :: Split () -> String -> [String] 70 | runSplitter (Split m) s = snd (runWriter (runListT (runStateT m s))) 71 | 72 | singleChar :: String -> Bool 73 | singleChar [_] = True 74 | singleChar _ = False 75 | 76 | satisfy :: (String -> Bool) -> Split () 77 | satisfy p = get >>= guard . p 78 | 79 | forEachPart :: (String -> [String]) -> Split Bool 80 | forEachPart parts = do 81 | t <- get 82 | case parts t of 83 | [] -> return True 84 | [t'] | t == t' -> return True 85 | ts -> do emit t 86 | (t', n) <- forEach (zip ts [1::Int ..]) 87 | put t' 88 | return (n==1) 89 | 90 | forEachPartAndWhole :: (String -> [String]) -> Split () 91 | forEachPartAndWhole parts = do 92 | t <- get 93 | case parts t of 94 | [] -> return () 95 | ts -> forEach (t:ts) >>= put 96 | 97 | 98 | splitDot :: String -> [String] 99 | splitDot = split (dropBlanks $ dropDelims $ whenElt (=='.')) 100 | 101 | splitHyphen :: String -> [String] 102 | splitHyphen = split (dropBlanks $ dropDelims $ whenElt (=='-')) 103 | 104 | splitCamlCase :: String -> [String] 105 | splitCamlCase = split (dropInitBlank $ condense $ keepDelimsL $ whenElt isUpper) 106 | 107 | stripPrefixH :: String -> [String] 108 | stripPrefixH ('H':'S':frag) | all isUpper frag = [frag] 109 | stripPrefixH "HTTP" = [] 110 | stripPrefixH ('H':frag@(c:_)) | isUpper c = [frag] 111 | stripPrefixH _ = [] 112 | 113 | stripPrefix_h :: String -> [String] 114 | stripPrefix_h "http" = [] 115 | stripPrefix_h "html" = [] 116 | stripPrefix_h ('h':'s':frag) = ['s':frag, frag] 117 | stripPrefix_h ('h':frag) {- | Set.notMember (T.pack w) ws -} = [frag] 118 | stripPrefix_h _ = [] 119 | 120 | stripSuffix :: String -> String -> Maybe String 121 | stripSuffix s t = fmap reverse (stripPrefix (reverse s) (reverse t)) 122 | 123 | stripSuffixNum :: String -> [String] 124 | stripSuffixNum s 125 | | null rd || null rs' = [] 126 | | otherwise = [s', d] 127 | where 128 | rs = reverse s 129 | (rd, rs') = span isDigit rs 130 | d = reverse rd 131 | s' = reverse rs' 132 | 133 | splitOn2 :: String -> [String] 134 | splitOn2 t = 135 | case break (=='2') t of 136 | (from@(_:_), '2':to@(c:_)) 137 | | not (isDigit c) 138 | , not (length from == 1 && length to == 1) 139 | -> [from, to] 140 | _ -> [] 141 | 142 | {- 143 | ------------------- 144 | -- Main experiment 145 | -- 146 | 147 | main = do 148 | pkgsFile <- readFile "pkgs" 149 | let mostFreq :: [String] 150 | pkgs :: [PackageDescription] 151 | (mostFreq, pkgs) = read pkgsFile 152 | 153 | -- wordsFile <- T.readFile "/usr/share/dict/words" 154 | -- let ws = Set.fromList (map T.toLower $ T.lines wordsFile) 155 | 156 | 157 | print "reading file" 158 | evaluate (length mostFreq + length pkgs) 159 | print "done" 160 | 161 | sequence_ 162 | [ putStrLn $ display (packageName pkg) ++ " -> " 163 | ++ intercalate ", " (map T.unpack $ extractNameTerms pkgname) 164 | | pkg <- pkgs 165 | , let pkgname = display (packageName pkg) ] 166 | 167 | main = do 168 | pkgsFile <- readFile "pkgs3" 169 | let pkgs :: [PackageDescription] 170 | pkgs = map read (lines pkgsFile) 171 | 172 | -- print "forcing pkgs..." 
173 | -- evaluate (foldl' (\a p -> seq p a) () pkgs) 174 | 175 | sequence_ 176 | [ putStrLn $ display (packageName pkg) ++ ": " ++ display mod ++ " -> " 177 | ++ intercalate ", " (map T.unpack $ extractModuleNameTerms (display mod)) 178 | | pkg <- pkgs 179 | , Just lib <- [library pkg] 180 | , let mods = exposedModules lib 181 | , mod <- mods ] 182 | -} 183 | -------------------------------------------------------------------------------- /demo/HaddockHtml.hs: -------------------------------------------------------------------------------- 1 | -- stolen from Haddock's Util.hs and Doc.hs 2 | module HaddockHtml (markup, docAppend, docParagraph) where 3 | 4 | import HaddockTypes 5 | import Data.Char (isSpace) 6 | 7 | markup :: DocMarkup id a -> Doc id -> a 8 | markup m DocEmpty = markupEmpty m 9 | markup m (DocAppend d1 d2) = markupAppend m (markup m d1) (markup m d2) 10 | markup m (DocString s) = markupString m s 11 | markup m (DocParagraph d) = markupParagraph m (markup m d) 12 | markup m (DocIdentifier x) = markupIdentifier m x 13 | markup m (DocModule mod0) = markupModule m mod0 14 | markup m (DocEmphasis d) = markupEmphasis m (markup m d) 15 | markup m (DocMonospaced d) = markupMonospaced m (markup m d) 16 | markup m (DocUnorderedList ds) = markupUnorderedList m (map (markup m) ds) 17 | markup m (DocOrderedList ds) = markupOrderedList m (map (markup m) ds) 18 | markup m (DocDefList ds) = markupDefList m (map (markupPair m) ds) 19 | markup m (DocCodeBlock d) = markupCodeBlock m (markup m d) 20 | markup m (DocHyperlink l) = markupHyperlink m l 21 | markup m (DocAName ref) = markupAName m ref 22 | markup m (DocPic img) = markupPic m img 23 | 24 | markupPair :: DocMarkup id a -> (Doc id, Doc id) -> (a, a) 25 | markupPair m (a,b) = (markup m a, markup m b) 26 | 27 | -- ----------------------------------------------------------------------------- 28 | -- ** Smart constructors 29 | 30 | -- used to make parsing easier; we group the list items later 31 | docAppend :: Doc id -> Doc id -> Doc id 32 | docAppend (DocUnorderedList ds1) (DocUnorderedList ds2) 33 | = DocUnorderedList (ds1++ds2) 34 | docAppend (DocUnorderedList ds1) (DocAppend (DocUnorderedList ds2) d) 35 | = DocAppend (DocUnorderedList (ds1++ds2)) d 36 | docAppend (DocOrderedList ds1) (DocOrderedList ds2) 37 | = DocOrderedList (ds1++ds2) 38 | docAppend (DocOrderedList ds1) (DocAppend (DocOrderedList ds2) d) 39 | = DocAppend (DocOrderedList (ds1++ds2)) d 40 | docAppend (DocDefList ds1) (DocDefList ds2) 41 | = DocDefList (ds1++ds2) 42 | docAppend (DocDefList ds1) (DocAppend (DocDefList ds2) d) 43 | = DocAppend (DocDefList (ds1++ds2)) d 44 | docAppend DocEmpty d = d 45 | docAppend d DocEmpty = d 46 | docAppend d1 d2 47 | = DocAppend d1 d2 48 | 49 | 50 | -- again to make parsing easier - we spot a paragraph whose only item 51 | -- is a DocMonospaced and make it into a DocCodeBlock 52 | docParagraph :: Doc id -> Doc id 53 | docParagraph (DocMonospaced p) 54 | = DocCodeBlock (docCodeBlock p) 55 | docParagraph (DocAppend (DocString s1) (DocMonospaced p)) 56 | | all isSpace s1 57 | = DocCodeBlock (docCodeBlock p) 58 | docParagraph (DocAppend (DocString s1) 59 | (DocAppend (DocMonospaced p) (DocString s2))) 60 | | all isSpace s1 && all isSpace s2 61 | = DocCodeBlock (docCodeBlock p) 62 | docParagraph (DocAppend (DocMonospaced p) (DocString s2)) 63 | | all isSpace s2 64 | = DocCodeBlock (docCodeBlock p) 65 | docParagraph p 66 | = DocParagraph p 67 | 68 | 69 | -- Drop trailing whitespace from @..@ code blocks. 
Otherwise this: 70 | -- 71 | -- -- @ 72 | -- -- foo 73 | -- -- @ 74 | -- 75 | -- turns into (DocCodeBlock "\nfoo\n ") which when rendered in HTML 76 | -- gives an extra vertical space after the code block. The single space 77 | -- on the final line seems to trigger the extra vertical space. 78 | -- 79 | docCodeBlock :: Doc id -> Doc id 80 | docCodeBlock (DocString s) 81 | = DocString (reverse $ dropWhile (`elem` " \t") $ reverse s) 82 | docCodeBlock (DocAppend l r) 83 | = DocAppend l (docCodeBlock r) 84 | docCodeBlock d = d 85 | -------------------------------------------------------------------------------- /demo/HaddockLex.x: -------------------------------------------------------------------------------- 1 | -- 2 | -- Haddock - A Haskell Documentation Tool 3 | -- 4 | -- (c) Simon Marlow 2002 5 | -- 6 | 7 | { 8 | -- Disable warnings that the generated code causes 9 | {-# OPTIONS_GHC -fno-warn-deprecated-flags 10 | -fno-warn-unused-binds 11 | -fno-warn-unused-imports 12 | -fno-warn-unused-matches 13 | -fno-warn-missing-signatures 14 | -fno-warn-tabs #-} 15 | module HaddockLex ( 16 | Token(..), 17 | tokenise 18 | ) where 19 | 20 | import Data.Char 21 | import Data.Word (Word8) 22 | import qualified Data.Bits 23 | import Numeric 24 | import Control.Monad (liftM) 25 | import HaddockTypes (RdrName) 26 | } 27 | 28 | $ws = $white # \n 29 | $digit = [0-9] 30 | $hexdigit = [0-9a-fA-F] 31 | $special = [\"\@] 32 | $alphanum = [A-Za-z0-9] 33 | $ident = [$alphanum \'\_\.\!\#\$\%\&\*\+\/\<\=\>\?\@\\\\\^\|\-\~\:] 34 | 35 | :- 36 | 37 | -- beginning of a paragraph 38 | <0,para> { 39 | $ws* \n ; 40 | $ws* \> { begin birdtrack } 41 | $ws* [\*\-] { token TokBullet `andBegin` string } 42 | $ws* \[ { token TokDefStart `andBegin` def } 43 | $ws* \( $digit+ \) { token TokNumber `andBegin` string } 44 | $ws* { begin string } 45 | } 46 | 47 | -- beginning of a line 48 | { 49 | $ws* \> { begin birdtrack } 50 | $ws* \n { token TokPara `andBegin` para } 51 | -- Here, we really want to be able to say 52 | -- $ws* (\n | ) { token TokPara `andBegin` para} 53 | -- because otherwise a trailing line of whitespace will result in 54 | -- a spurious TokString at the end of a docstring. We don't have , 55 | -- though (NOW I realise what it was for :-). To get around this, we always 56 | -- append \n to the end of a docstring. 57 | () { begin string } 58 | } 59 | 60 | .* \n? { strtokenNL TokBirdTrack `andBegin` line } 61 | 62 | { 63 | $special { strtoken $ \s -> TokSpecial (head s) } 64 | \<\< [^\<\>]* \>\> { strtoken $ \s -> TokPic (init $ init $ tail $ tail s) } 65 | \< [^\<\>]* \> { strtoken $ \s -> TokURL (init (tail s)) } 66 | \# [^\#]* \# { strtoken $ \s -> TokAName (init (tail s)) } 67 | \/ [^\/]* \/ { strtoken $ \s -> TokEmphasis (init (tail s)) } 68 | [\'\`] $ident+ [\'\`] { ident } 69 | \\ . { strtoken (TokString . tail) } 70 | "&#" $digit+ \; { strtoken $ \s -> TokString [chr (read (init (drop 2 s)))] } 71 | "&#" [xX] $hexdigit+ \; { strtoken $ \s -> case readHex (init (drop 3 s)) of [(n,_)] -> TokString [chr n]; _ -> error "hexParser: Can't happen" } 72 | -- allow special characters through if they don't fit one of the previous 73 | -- patterns. 74 | [\/\'\`\<\#\&\\] { strtoken TokString } 75 | [^ $special \/ \< \# \n \'\` \& \\ \]]* \n { strtokenNL TokString `andBegin` line } 76 | [^ $special \/ \< \# \n \'\` \& \\ \]]+ { strtoken TokString } 77 | } 78 | 79 | { 80 | \] { token TokDefEnd `andBegin` string } 81 | } 82 | 83 | -- ']' doesn't have any special meaning outside of the [...] 
at the beginning 84 | -- of a definition paragraph. 85 | { 86 | \] { strtoken TokString } 87 | } 88 | 89 | { 90 | data Token 91 | = TokPara 92 | | TokNumber 93 | | TokBullet 94 | | TokDefStart 95 | | TokDefEnd 96 | | TokSpecial Char 97 | | TokIdent RdrName 98 | | TokString String 99 | | TokURL String 100 | | TokPic String 101 | | TokEmphasis String 102 | | TokAName String 103 | | TokBirdTrack String 104 | deriving Show 105 | 106 | -- ----------------------------------------------------------------------------- 107 | -- Alex support stuff 108 | 109 | type StartCode = Int 110 | type Action = String -> StartCode -> (StartCode -> Maybe [Token]) -> Maybe [Token] 111 | 112 | --TODO: we ought to switch to ByteString input. 113 | type AlexInput = (Char, [Word8], String) 114 | 115 | -- | For alex >= 3 116 | -- 117 | -- See also alexGetChar 118 | alexGetByte :: AlexInput -> Maybe (Word8,AlexInput) 119 | alexGetByte (c,(b:bs),s) = Just (b,(c,bs,s)) 120 | alexGetByte (c,[],[]) = Nothing 121 | alexGetByte (_,[],(c:s)) = case utf8Encode c of 122 | (b:bs) -> Just (b, (c, bs, s)) 123 | 124 | -- | Encode a Haskell String to a list of Word8 values, in UTF8 format. 125 | utf8Encode :: Char -> [Word8] 126 | utf8Encode = map fromIntegral . go . ord 127 | where 128 | go oc 129 | | oc <= 0x7f = [oc] 130 | 131 | | oc <= 0x7ff = [ 0xc0 + (oc `Data.Bits.shiftR` 6) 132 | , 0x80 + oc Data.Bits..&. 0x3f 133 | ] 134 | 135 | | oc <= 0xffff = [ 0xe0 + (oc `Data.Bits.shiftR` 12) 136 | , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f) 137 | , 0x80 + oc Data.Bits..&. 0x3f 138 | ] 139 | | otherwise = [ 0xf0 + (oc `Data.Bits.shiftR` 18) 140 | , 0x80 + ((oc `Data.Bits.shiftR` 12) Data.Bits..&. 0x3f) 141 | , 0x80 + ((oc `Data.Bits.shiftR` 6) Data.Bits..&. 0x3f) 142 | , 0x80 + oc Data.Bits..&. 0x3f 143 | ] 144 | 145 | -- | For alex < 3 146 | -- 147 | -- See also alexGetByte 148 | alexGetChar :: AlexInput -> Maybe (Char, AlexInput) 149 | alexGetChar (_, _, []) = Nothing 150 | alexGetChar (_, _, c:cs) = Just (c, (c,[],cs)) 151 | 152 | alexInputPrevChar (c,_) = c 153 | 154 | tokenise :: String -> Maybe [Token] 155 | tokenise str = 156 | go ('\n', [], eofHack str) para 157 | where 158 | go inp@(_,_,str') sc = 159 | case alexScan inp sc of 160 | AlexEOF -> Just [] 161 | AlexError _ -> Nothing 162 | AlexSkip inp' _ -> go inp' sc 163 | AlexToken inp' len act -> act (take len str') sc (\sc' -> go inp' sc') 164 | 165 | -- NB. we add a final \n to the string, (see comment in the beginning of line 166 | -- production above). 167 | eofHack str = str++"\n" 168 | 169 | andBegin :: Action -> StartCode -> Action 170 | andBegin act new_sc = \str _ cont -> act str new_sc cont 171 | 172 | token :: Token -> Action 173 | token t = \_ sc cont -> liftM (t :) (cont sc) 174 | 175 | strtoken, strtokenNL :: (String -> Token) -> Action 176 | strtoken t = \str sc cont -> liftM (t str :) (cont sc) 177 | strtokenNL t = \str sc cont -> liftM (t (filter (/= '\r') str) :) (cont sc) 178 | -- ^ We only want LF line endings in our internal doc string format, so we 179 | -- filter out all CRs. 
180 | 181 | begin :: StartCode -> Action 182 | begin sc = \_ _ cont -> cont sc 183 | 184 | -- ----------------------------------------------------------------------------- 185 | -- Lex a string as a Haskell identifier 186 | 187 | ident :: Action 188 | ident str sc cont = liftM (TokIdent str :) (cont sc) 189 | } 190 | -------------------------------------------------------------------------------- /demo/HaddockParse.y: -------------------------------------------------------------------------------- 1 | { 2 | -- Disable warnings that the generated code causes 3 | {-# OPTIONS_GHC -fno-warn-deprecated-flags 4 | -fno-warn-missing-signatures 5 | -fno-warn-unused-binds 6 | -fno-warn-unused-matches 7 | -fno-warn-lazy-unlifted-bindings 8 | -fno-warn-name-shadowing 9 | -fno-warn-incomplete-patterns 10 | -fno-warn-tabs #-} 11 | module HaddockParse (parseHaddockParagraphs) where 12 | 13 | import HaddockLex 14 | import HaddockHtml 15 | import HaddockTypes 16 | import Data.Char (isSpace) 17 | } 18 | 19 | %expect 0 20 | 21 | %tokentype { Token } 22 | 23 | %token 24 | '@' { TokSpecial '@' } 25 | '[' { TokDefStart } 26 | ']' { TokDefEnd } 27 | DQUO { TokSpecial '\"' } 28 | URL { TokURL $$ } 29 | PIC { TokPic $$ } 30 | ANAME { TokAName $$ } 31 | '/../' { TokEmphasis $$ } 32 | '-' { TokBullet } 33 | '(n)' { TokNumber } 34 | '>..' { TokBirdTrack $$ } 35 | IDENT { TokIdent $$ } 36 | PARA { TokPara } 37 | STRING { TokString $$ } 38 | 39 | %monad { Maybe } 40 | 41 | %name parseHaddockParagraphs doc 42 | %name parseHaddockString seq 43 | 44 | %% 45 | 46 | doc :: { Doc RdrName } 47 | : apara PARA doc { docAppend $1 $3 } 48 | | PARA doc { $2 } 49 | | apara { $1 } 50 | | {- empty -} { DocEmpty } 51 | 52 | apara :: { Doc RdrName } 53 | : ulpara { DocUnorderedList [$1] } 54 | | olpara { DocOrderedList [$1] } 55 | | defpara { DocDefList [$1] } 56 | | para { $1 } 57 | 58 | ulpara :: { Doc RdrName } 59 | : '-' para { $2 } 60 | 61 | olpara :: { Doc RdrName } 62 | : '(n)' para { $2 } 63 | 64 | defpara :: { (Doc RdrName, Doc RdrName) } 65 | : '[' seq ']' seq { ($2, $4) } 66 | 67 | para :: { Doc RdrName } 68 | : seq { docParagraph $1 } 69 | | codepara { DocCodeBlock $1 } 70 | 71 | codepara :: { Doc RdrName } 72 | : '>..' codepara { docAppend (DocString $1) $2 } 73 | | '>..' { DocString $1 } 74 | 75 | seq :: { Doc RdrName } 76 | : elem seq { docAppend $1 $2 } 77 | | elem { $1 } 78 | 79 | elem :: { Doc RdrName } 80 | : elem1 { $1 } 81 | | '@' seq1 '@' { DocMonospaced $2 } 82 | 83 | seq1 :: { Doc RdrName } 84 | : PARA seq1 { docAppend (DocString "\n") $2 } 85 | | elem1 seq1 { docAppend $1 $2 } 86 | | elem1 { $1 } 87 | 88 | elem1 :: { Doc RdrName } 89 | : STRING { DocString $1 } 90 | | '/../' { DocEmphasis (DocString $1) } 91 | | URL { DocHyperlink (makeHyperlink $1) } 92 | | PIC { DocPic $1 } 93 | | ANAME { DocAName $1 } 94 | | IDENT { DocIdentifier $1 } 95 | | DQUO strings DQUO { DocModule $2 } 96 | 97 | strings :: { String } 98 | : STRING { $1 } 99 | | STRING strings { $1 ++ $2 } 100 | 101 | { 102 | happyError :: [Token] -> Maybe a 103 | happyError toks = Nothing 104 | 105 | -- | Create a `Hyperlink` from given string. 106 | -- 107 | -- A hyperlink consists of a URL and an optional label. The label is separated 108 | -- from the url by one or more whitespace characters. 109 | makeHyperlink :: String -> Hyperlink 110 | makeHyperlink input = case break isSpace $ strip input of 111 | (url, "") -> Hyperlink url Nothing 112 | (url, label) -> Hyperlink url (Just . 
dropWhile isSpace $ label) 113 | 114 | -- | Remove all leading and trailing whitespace 115 | strip :: String -> String 116 | strip = dropWhile isSpace . reverse . dropWhile isSpace . reverse 117 | } 118 | -------------------------------------------------------------------------------- /demo/HaddockTypes.hs: -------------------------------------------------------------------------------- 1 | -- stolen from Haddock's Types.hs 2 | module HaddockTypes where 3 | 4 | data Doc id 5 | = DocEmpty 6 | | DocAppend (Doc id) (Doc id) 7 | | DocString String 8 | | DocParagraph (Doc id) 9 | | DocIdentifier id 10 | | DocModule String 11 | | DocEmphasis (Doc id) 12 | | DocMonospaced (Doc id) 13 | | DocUnorderedList [Doc id] 14 | | DocOrderedList [Doc id] 15 | | DocDefList [(Doc id, Doc id)] 16 | | DocCodeBlock (Doc id) 17 | | DocHyperlink Hyperlink 18 | | DocPic String 19 | | DocAName String 20 | 21 | data Hyperlink = Hyperlink 22 | { hyperlinkUrl :: String 23 | , hyperlinkLabel :: Maybe String 24 | } deriving (Eq, Show) 25 | 26 | -- | DocMarkup is a set of instructions for marking up documentation. 27 | -- In fact, it's really just a mapping from 'Doc' to some other 28 | -- type [a], where [a] is usually the type of the output (HTML, say). 29 | 30 | data DocMarkup id a = Markup { 31 | markupEmpty :: a, 32 | markupString :: String -> a, 33 | markupParagraph :: a -> a, 34 | markupAppend :: a -> a -> a, 35 | markupIdentifier :: id -> a, 36 | markupModule :: String -> a, 37 | markupEmphasis :: a -> a, 38 | markupMonospaced :: a -> a, 39 | markupUnorderedList :: [a] -> a, 40 | markupOrderedList :: [a] -> a, 41 | markupDefList :: [(a,a)] -> a, 42 | markupCodeBlock :: a -> a, 43 | markupHyperlink :: Hyperlink -> a, 44 | markupAName :: String -> a, 45 | markupPic :: String -> a 46 | } 47 | 48 | type RdrName = String 49 | -------------------------------------------------------------------------------- /demo/Main.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import Data.SearchEngine 4 | import PackageSearch 5 | 6 | import PackageIndexUtils 7 | 8 | import Data.List 9 | import qualified Data.Map as Map 10 | import qualified Data.Text as T 11 | import qualified Data.Text.IO as T 12 | import Data.Time 13 | 14 | import Control.Monad 15 | import Control.Exception 16 | import System.IO 17 | import System.Directory 18 | import System.Exit 19 | 20 | import Distribution.PackageDescription (PackageDescription) 21 | import Distribution.PackageDescription.Configuration (flattenPackageDescription) 22 | import Distribution.Package (packageVersion, packageName) 23 | import Distribution.Text (display) 24 | 25 | 26 | ------------------- 27 | -- Main experiment 28 | -- 29 | 30 | main :: IO () 31 | main = do 32 | putStrLn "reading 00-index.tar..." 33 | pkgs <- readPackages 34 | 35 | putStrLn "forcing pkgs..." 36 | evaluate (foldl' (\a p -> seq p a) () pkgs) 37 | 38 | let searchengine = insertDocs pkgs initialPkgSearchEngine 39 | 40 | putStrLn "constructing index..." 
41 | printTiming "done" $ 42 | evaluate searchengine >> return () 43 | putStrLn $ "search engine invariant: " ++ show (invariant searchengine) 44 | 45 | -- print [ avgFieldLength ctx s | s <- [minBound..maxBound] ] 46 | 47 | -- print $ take 100 $ sortBy (flip compare) $ map Set.size $ Map.elems (termMap searchindex) 48 | -- T.putStr $ T.unlines $ Map.keys (termMap searchindex) 49 | -- let SearchEngine{searchIndex=SearchIndex{termMap, termIdMap, docKeyMap, docIdMap}} = searchengine 50 | -- print (Map.size termMap, IntMap.size termIdMap, Map.size docKeyMap, IntMap.size docIdMap) 51 | 52 | let loop = do 53 | putStr "search term> " 54 | hFlush stdout 55 | t <- T.getLine 56 | unless (T.null t) $ do 57 | putStrLn "Ranked results:" 58 | let rankedResults = queryExplain searchengine (T.words t) 59 | 60 | putStr $ unlines 61 | [ show (overallScore explanation) ++ ": " ++ display pkgname 62 | | (explanation, pkgname) <- take 10 rankedResults ] 63 | 64 | loop 65 | return () 66 | loop 67 | 68 | printTiming :: String -> IO () -> IO () 69 | printTiming msg action = do 70 | t <- getCurrentTime 71 | action 72 | t' <- getCurrentTime 73 | putStrLn (msg ++ ". time: " ++ show (diffUTCTime t' t)) 74 | 75 | readPackages :: IO [PackageDescription] 76 | readPackages = do 77 | exists <- doesFileExist "00-index.tar" 78 | when (not exists) $ do 79 | putStrLn "This program needs a 00-index.tar package index." 80 | putStrLn "Please grab 00-index.tar.gz from hackage and gunzip it." 81 | exitFailure 82 | 83 | pkgs <- PackageIndexUtils.readPackageIndexFile "00-index.tar" 84 | let latestPkgs = Map.fromListWith 85 | (\a b -> if packageVersion (fst a) > packageVersion (fst b) 86 | then a else b) 87 | [ (packageName pkgid, (pkgid, pkg)) 88 | | (pkgid, pkg) <- pkgs ] 89 | 90 | return . map (flattenPackageDescription . snd) 91 | . Map.elems 92 | $ latestPkgs 93 | 94 | -------------------------------------------------------------------------------- /demo/PackageIndexUtils.hs: -------------------------------------------------------------------------------- 1 | ----------------------------------------------------------------------------- 2 | -- | 3 | -- Module : Distribution.Client.IndexUtils 4 | -- Copyright : (c) Duncan Coutts 2008 5 | -- License : BSD-like 6 | -- 7 | -- Maintainer : duncan@community.haskell.org 8 | -- Stability : provisional 9 | -- Portability : portable 10 | -- 11 | -- Extra utils related to the package indexes. 12 | ----------------------------------------------------------------------------- 13 | module PackageIndexUtils ( 14 | readPackageIndexFile 15 | ) where 16 | 17 | import qualified Codec.Archive.Tar as Tar 18 | 19 | import Distribution.Package 20 | ( PackageId, PackageIdentifier(..), PackageName(..) ) 21 | import Distribution.PackageDescription 22 | ( GenericPackageDescription ) 23 | import Distribution.PackageDescription.Parse 24 | ( parsePackageDescription ) 25 | import Distribution.ParseUtils 26 | ( ParseResult(..) ) 27 | import Distribution.Text 28 | ( simpleParse ) 29 | import Distribution.Simple.Utils 30 | ( fromUTF8 ) 31 | 32 | import Data.Maybe (fromMaybe) 33 | import qualified Data.ByteString.Lazy as BS 34 | import qualified Data.ByteString.Lazy.Char8 as BS.Char8 35 | import Data.ByteString.Lazy (ByteString) 36 | import System.FilePath (takeExtension, splitDirectories, normalise) 37 | 38 | 39 | readPackageIndexFile :: FilePath -> IO [(PackageId, GenericPackageDescription)] 40 | readPackageIndexFile indexFile = 41 | either fail return 42 | . 
parsePackageIndex 43 | =<< BS.readFile indexFile 44 | 45 | -- | Parse an uncompressed \"00-index.tar\" repository index file represented 46 | -- as a 'ByteString'. 47 | -- 48 | parsePackageIndex :: ByteString 49 | -> Either String [(PackageId, GenericPackageDescription)] 50 | parsePackageIndex = accum [] . Tar.read 51 | where 52 | accum pkgs es = case es of 53 | Tar.Fail err -> Left (show err) 54 | Tar.Done -> Right (reverse pkgs) 55 | Tar.Next e es' -> accum pkgs' es' 56 | where 57 | pkgs' = extract pkgs e 58 | 59 | extract pkgs entry = 60 | fromMaybe pkgs $ tryExtractPkg 61 | where 62 | tryExtractPkg = do 63 | (pkgid, pkg) <- extractPkg entry 64 | return ((pkgid, pkg):pkgs) 65 | 66 | 67 | extractPkg :: Tar.Entry -> Maybe (PackageId, GenericPackageDescription) 68 | extractPkg entry = case Tar.entryContent entry of 69 | Tar.NormalFile content _ 70 | | takeExtension fileName == ".cabal" 71 | -> case splitDirectories (normalise fileName) of 72 | [pkgname,vers,_] -> case simpleParse vers of 73 | Just ver -> Just (pkgid, descr) 74 | where 75 | pkgid = PackageIdentifier (PackageName pkgname) ver 76 | parsed = parsePackageDescription . fromUTF8 . BS.Char8.unpack 77 | $ content 78 | descr = case parsed of 79 | ParseOk _ d -> d 80 | _ -> error $ "Couldn't read cabal file " 81 | ++ show fileName 82 | _ -> Nothing 83 | _ -> Nothing 84 | _ -> Nothing 85 | where 86 | fileName = Tar.entryPath entry 87 | -------------------------------------------------------------------------------- /demo/PackageSearch.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings, NamedFieldPuns #-} 2 | module PackageSearch ( 3 | PkgSearchEngine, 4 | initialPkgSearchEngine, 5 | defaultSearchRankParameters, 6 | PkgDocField(..), 7 | ) where 8 | 9 | import Data.SearchEngine 10 | 11 | import ExtractNameTerms 12 | import ExtractDescriptionTerms 13 | 14 | import Data.Ix 15 | import Data.Set (Set) 16 | import qualified Data.Set as Set 17 | import Data.Text (Text) 18 | import qualified Data.Text as T 19 | import NLP.Snowball 20 | 21 | import Distribution.Package 22 | import Distribution.PackageDescription 23 | import Distribution.Text (display) 24 | 25 | 26 | type PkgSearchEngine = SearchEngine 27 | PackageDescription 28 | PackageName 29 | PkgDocField 30 | NoFeatures 31 | 32 | data PkgDocField = NameField 33 | | SynopsisField 34 | | DescriptionField 35 | deriving (Eq, Ord, Enum, Bounded, Ix, Show) 36 | 37 | initialPkgSearchEngine :: PkgSearchEngine 38 | initialPkgSearchEngine = 39 | initSearchEngine pkgSearchConfig defaultSearchRankParameters 40 | 41 | pkgSearchConfig :: SearchConfig PackageDescription 42 | PackageName PkgDocField NoFeatures 43 | pkgSearchConfig = 44 | SearchConfig { 45 | documentKey = packageName, 46 | extractDocumentTerms = extractTokens, 47 | transformQueryTerm = normaliseQueryToken, 48 | documentFeatureValue = const noFeatures 49 | } 50 | where 51 | extractTokens :: PackageDescription -> PkgDocField -> [Text] 52 | extractTokens pkg NameField = extractPackageNameTerms 53 | (display $ packageName pkg) 54 | extractTokens pkg SynopsisField = extractSynopsisTerms 55 | stopWords (synopsis pkg) 56 | extractTokens pkg DescriptionField = extractDescriptionTerms 57 | stopWords (description pkg) 58 | 59 | normaliseQueryToken :: Text -> PkgDocField -> Text 60 | normaliseQueryToken tok = 61 | let tokFold = T.toCaseFold tok 62 | tokStem = stem English tokFold 63 | in \field -> case field of 64 | NameField -> tokFold 65 | SynopsisField -> tokStem 66 | 
DescriptionField -> tokStem 67 | 68 | defaultSearchRankParameters :: SearchRankParameters PkgDocField NoFeatures 69 | defaultSearchRankParameters = 70 | SearchRankParameters { 71 | paramK1, 72 | paramB, 73 | paramFieldWeights, 74 | paramFeatureWeights = noFeatures, 75 | paramFeatureFunctions = noFeatures, 76 | paramResultsetSoftLimit = 200, 77 | paramResultsetHardLimit = 400, 78 | paramAutosuggestPrefilterLimit = 500, 79 | paramAutosuggestPostfilterLimit = 500 80 | } 81 | where 82 | paramK1 :: Float 83 | paramK1 = 1.5 84 | 85 | paramB :: PkgDocField -> Float 86 | paramB NameField = 0.9 87 | paramB SynopsisField = 0.5 88 | paramB DescriptionField = 0.5 89 | 90 | paramFieldWeights :: PkgDocField -> Float 91 | paramFieldWeights NameField = 20 92 | paramFieldWeights SynopsisField = 5 93 | paramFieldWeights DescriptionField = 1 94 | 95 | 96 | stopWords :: Set Term 97 | stopWords = 98 | Set.fromList 99 | ["haskell","library","simple","using","interface","functions", 100 | "implementation","package","support","'s","based","for","a","and","the", 101 | "to","of","with","in","an","on","from","that","as","into","by","is", 102 | "some","which","or","like","your","other","can","at","over","be","it", 103 | "within","their","this","but","are","get","one","all","you","so","only", 104 | "now","how","where","when","up","has","been","about","them","then","see", 105 | "no","do","than","should","out","off","much","if","i","have","also"] 106 | 107 | -------------------------------------------------------------------------------- /full-text-search.cabal: -------------------------------------------------------------------------------- 1 | cabal-version: 3.0 2 | 3 | name: full-text-search 4 | version: 0.2.2.3 5 | synopsis: In-memory full text search engine 6 | 7 | description: 8 | An in-memory full text search engine library. It lets you 9 | run full-text queries on a collection of your documents. 10 | 11 | Features: 12 | 13 | * Keyword queries and auto-complete\/auto-suggest queries. 14 | 15 | * Can search over any type of \"document\". 16 | (You explain how to extract search terms from them.) 17 | 18 | * Supports documents with multiple fields 19 | (e.g. title, body) 20 | 21 | * Supports documents with non-term features 22 | (e.g. quality score, page rank) 23 | 24 | * Uses the state of the art BM25F ranking function 25 | 26 | * Adjustable ranking parameters (including field weights 27 | and non-term feature scores) 28 | 29 | * In-memory but quite compact. It does not keep a copy of 30 | your original documents. 31 | 32 | * Quick incremental index updates, making it possible to 33 | keep your text search in-sync with your data. 34 | 35 | It is independent of the document type, so you have to 36 | write the document-specific parts: extracting search terms 37 | and any stop words, case-normalisation or stemming. This 38 | is quite easy using libraries such as 39 | and 40 | . 41 | 42 | The source package includes a demo to illustrate how to 43 | use the library. The demo is a simplified version of how 44 | the library is used in the 45 | 46 | where it provides the backend for the package search feature. 47 | 48 | bug-reports: https://github.com/well-typed/full-text-search/issues 49 | license: BSD-3-Clause 50 | license-file: LICENSE 51 | author: Duncan Coutts 52 | maintainer: Duncan Coutts , 53 | Adam Gundry 54 | copyright: 2013-2014 Duncan Coutts, 2014 Well-Typed LLP, 55 | 2014-2023 IRIS Connect Ltd. 
56 | category: Data, Text, Search 57 | build-type: Simple 58 | extra-doc-files: changelog 59 | 60 | tested-with: GHC ==9.2.8 || ==9.4.8 || ==9.6.6 || ==9.8.2 || ==9.10.1 || ==9.12.1 61 | 62 | source-repository head 63 | type: git 64 | location: git@github.com:well-typed/full-text-search.git 65 | 66 | flag build-search-demo 67 | default: False 68 | description: Build a little program illustrating the use of the library 69 | manual: True 70 | 71 | common base-deps 72 | build-depends: base >=4.16 && <4.22, 73 | array >=0.4 && <0.6, 74 | vector >=0.11 && <0.14, 75 | containers >=0.4 && <0.8, 76 | text >=0.11 && <2.2 77 | 78 | library 79 | import: base-deps 80 | exposed-modules: Data.SearchEngine, 81 | Data.SearchEngine.BM25F 82 | other-modules: Data.SearchEngine.Types, 83 | Data.SearchEngine.Update, 84 | Data.SearchEngine.Query, 85 | Data.SearchEngine.Autosuggest, 86 | Data.SearchEngine.SearchIndex, 87 | Data.SearchEngine.DocFeatVals, 88 | Data.SearchEngine.TermBag, 89 | Data.SearchEngine.DocTermIds, 90 | Data.SearchEngine.DocIdSet 91 | hs-source-dirs: src 92 | other-extensions: BangPatterns, 93 | NamedFieldPuns, 94 | RecordWildCards, 95 | GeneralizedNewtypeDeriving, 96 | ScopedTypeVariables 97 | default-language: Haskell2010 98 | ghc-options: -Wall -funbox-strict-fields 99 | 100 | 101 | executable search-demo 102 | main-is: Main.hs 103 | other-modules: PackageSearch 104 | ExtractNameTerms 105 | ExtractDescriptionTerms 106 | PackageIndexUtils 107 | -- support code for package descriptions: 108 | HaddockHtml 109 | HaddockLex 110 | HaddockParse 111 | HaddockTypes 112 | hs-source-dirs: demo 113 | if !flag(build-search-demo) 114 | buildable: False 115 | else 116 | build-depends: full-text-search, 117 | base, 118 | text, 119 | containers, 120 | array, 121 | tokenize >= 0.1 && <0.4, 122 | snowball >= 1.0 && <1.1, 123 | transformers >= 0.5 && <0.6, 124 | split >= 0.2 && <0.3, 125 | Cabal >= 1.14 && <3.15, 126 | bytestring >= 0.12 && <0.13, 127 | filepath >= 1.5 && <1.6, 128 | directory >= 1.3 && <1.5, 129 | tar >= 0.6 && <0.7, 130 | time >= 1.14 && <1.15, 131 | mtl >= 2.2 && <2.4 132 | build-tool-depends: alex:alex, happy:happy 133 | default-language: Haskell2010 134 | other-extensions: GeneralizedNewtypeDeriving 135 | ghc-options: -Wall 136 | 137 | test-suite qc-props 138 | import: base-deps 139 | type: exitcode-stdio-1.0 140 | main-is: Main.hs 141 | hs-source-dirs: src, tests 142 | build-depends: QuickCheck ==2.*, 143 | tasty >=0.8, 144 | tasty-quickcheck >=0.8 145 | other-modules: Test.Data.SearchEngine.TermBag, 146 | Test.Data.SearchEngine.DocIdSet, 147 | Data.SearchEngine.DocTermIds, 148 | Data.SearchEngine.DocIdSet, 149 | Data.SearchEngine.TermBag 150 | default-language: Haskell2010 151 | ghc-options: -Wall 152 | -------------------------------------------------------------------------------- /src/Data/SearchEngine.hs: -------------------------------------------------------------------------------- 1 | module Data.SearchEngine ( 2 | 3 | -- * Basic interface 4 | 5 | -- ** Querying 6 | Term, 7 | query, 8 | 9 | -- *** Query auto-completion \/ auto-suggestion 10 | queryAutosuggest, 11 | ResultsFilter(..), 12 | queryAutosuggestPredicate, 13 | queryAutosuggestMatchingDocuments, 14 | 15 | -- ** Making a search engine instance 16 | initSearchEngine, 17 | SearchEngine, 18 | SearchConfig(..), 19 | SearchRankParameters(..), 20 | FeatureFunction(..), 21 | 22 | -- ** Helper type for non-term features 23 | NoFeatures, 24 | noFeatures, 25 | 26 | -- ** Managing documents to be searched 27 | insertDoc, 28 | 
insertDocs, 29 | deleteDoc, 30 | 31 | -- * Explain mode for query result rankings 32 | queryExplain, 33 | Explanation(..), 34 | setRankParams, 35 | 36 | -- * Internal sanity check 37 | invariant, 38 | ) where 39 | 40 | import Data.SearchEngine.Types 41 | import Data.SearchEngine.Update 42 | import Data.SearchEngine.Query 43 | import Data.SearchEngine.Autosuggest 44 | 45 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/Autosuggest.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, NamedFieldPuns, RecordWildCards, 2 | ScopedTypeVariables #-} 3 | 4 | module Data.SearchEngine.Autosuggest ( 5 | 6 | -- * Query auto-completion \/ auto-suggestion 7 | queryAutosuggest, 8 | ResultsFilter(..), 9 | 10 | queryAutosuggestPredicate, 11 | queryAutosuggestMatchingDocuments 12 | 13 | ) where 14 | 15 | import Data.SearchEngine.Types 16 | import Data.SearchEngine.Query (ResultsFilter(..)) 17 | import qualified Data.SearchEngine.Query as Query 18 | import qualified Data.SearchEngine.SearchIndex as SI 19 | import qualified Data.SearchEngine.DocIdSet as DocIdSet 20 | import qualified Data.SearchEngine.DocTermIds as DocTermIds 21 | import qualified Data.SearchEngine.BM25F as BM25F 22 | 23 | import Data.Ix 24 | import Data.Ord 25 | import Data.List 26 | import Data.Maybe 27 | import qualified Data.Map as Map 28 | import qualified Data.IntSet as IntSet 29 | import qualified Data.Vector.Unboxed as Vec 30 | 31 | 32 | -- | Execute an \"auto-suggest\" query. This is where one of the search terms 33 | -- is an incomplete prefix and we are looking for possible completions of that 34 | -- search term, and result documents to go with the possible completions. 35 | -- 36 | -- An auto-suggest query only gives useful results when the 'SearchEngine' is 37 | -- configured to use a non-term feature score. That is, when we can give 38 | -- documents an importance score independent of what terms we are looking for. 39 | -- This is because an auto-suggest query is backwards from a normal query: we 40 | -- are asking for relevant terms occurring in important or popular documents 41 | -- so we need some notion of important or popular. Without this we would just 42 | -- be ranking based on term frequency which while it makes sense for normal 43 | -- \"forward\" queries is pretty meaningless for auto-suggest \"reverse\" 44 | -- queries. Indeed for single-term auto-suggest queries the ranking function 45 | -- we use will assign 0 for all documents and completions if there is no 46 | -- non-term feature scores. 47 | -- 48 | queryAutosuggest :: (Ix field, Bounded field, Ix feature, Bounded feature) => 49 | SearchEngine doc key field feature -> 50 | ResultsFilter key -> 51 | [Term] -> Term -> ([(Term, Float)], [(key, Float)]) 52 | queryAutosuggest se resultsFilter precedingTerms partialTerm = 53 | 54 | step_external 55 | . step_rank 56 | . step_scoreDs 57 | . step_scoreTs 58 | . step_cache 59 | . step_postfilterlimit 60 | . step_filter 61 | . step_prefilterlimit 62 | . 
step_process 63 | $ step_prep 64 | precedingTerms partialTerm 65 | 66 | where 67 | -- Construct the auto-suggest query from the query terms 68 | step_prep pre_ts t = mkAutosuggestQuery se pre_ts t 69 | 70 | -- Find the appropriate subset of ts and ds 71 | -- and an intermediate result that will be useful later: 72 | -- { (t, ds ∩ ds_t) | t ∈ ts, ds ∩ ds_t ≠ ∅ } 73 | step_process (ts, ds, pre_ts) = (ts', ds', tdss', pre_ts) 74 | where 75 | (tdss', ts', ds') = processAutosuggestQuery se (ts, ds, pre_ts) 76 | 77 | -- If the number of docs results is huge then we may not want to bother 78 | -- and just return no results. Even the filtering of a huge number of 79 | -- docs can be expensive. 80 | step_prefilterlimit args@(_, ds, _, _) 81 | | withinPrefilterLimit se ds = args 82 | | otherwise = ([], DocIdSet.empty, [], []) 83 | 84 | -- Filter ds to those that are visible for this query 85 | -- and at the same time, do the docid -> docinfo lookup 86 | -- (needed at this step anyway to do the filter) 87 | step_filter (ts, ds, tdss, pre_ts) = (ts, ds_info, tdss, pre_ts) 88 | where 89 | ds_info = filterAutosuggestQuery se resultsFilter ds 90 | 91 | -- If the number of docs results is huge then we may not want to bother 92 | -- and just return no results. Scoring a large number of docs is expensive. 93 | step_postfilterlimit args@(_, ds_info, _, _) 94 | | withinPostfilterLimit se ds_info = args 95 | | otherwise = ([], [], [], []) 96 | 97 | -- For all ds, calculate and cache a couple bits of info needed 98 | -- later for scoring completion terms and doc results 99 | step_cache (ts, ds_info, tdss, pre_ts) = (ds_info', tdss) 100 | where 101 | ds_info' = cacheDocScoringInfo se ts ds_info pre_ts 102 | 103 | -- Score the completion terms 104 | step_scoreTs (ds_info, tdss) = (ds_info, tdss, ts_scored) 105 | where 106 | ts_scored = scoreAutosuggestQueryCompletions tdss ds_info 107 | 108 | -- Score the doc results (making use of the completion scores) 109 | step_scoreDs (ds_info, tdss, ts_scored) = (ts_scored, ds_scored) 110 | where 111 | ds_scored = scoreAutosuggestQueryResults tdss ds_info ts_scored 112 | 113 | -- Rank the completions and results based on their scores 114 | step_rank = sortResults 115 | 116 | -- Convert from internal Ids into external forms: Term and doc key 117 | step_external = convertIdsToExternal se 118 | 119 | 120 | -- | Given an incomplete prefix query, find the set of documents that match 121 | -- possible completions of that query. This should be less computationally 122 | -- expensive than 'queryAutosuggest' as it does not do any ranking of documents. 123 | -- However, it does not apply the pre-filter or post-filter limits, and the list 124 | -- may be large when the query terms occur in many documents. The order of 125 | -- returned keys is unspecified. 126 | queryAutosuggestMatchingDocuments :: (Ix field, Bounded field, Ord key) => 127 | SearchEngine doc key field feature -> 128 | [Term] -> Term -> [key] 129 | queryAutosuggestMatchingDocuments se@SearchEngine{searchIndex} precedingTerms partialTerm = 130 | let (_, _, ds) = processAutosuggestQuery se (mkAutosuggestQuery se precedingTerms partialTerm) 131 | in map (SI.getDocKey searchIndex) (DocIdSet.toList ds) 132 | 133 | -- | Given an incomplete prefix query, return a predicate that indicates whether 134 | -- a key is in the set of documents that match possible completions of that 135 | -- query. 
This is equivalent to calling 'queryAutosuggestMatchingDocuments' and 136 | -- testing whether the key is in the list, but should be more efficient. 137 | -- 138 | -- This does not apply the pre-filter or post-filter limits. 139 | queryAutosuggestPredicate :: (Ix field, Bounded field, Ord key) => 140 | SearchEngine doc key field feature -> 141 | [Term] -> Term -> (key -> Bool) 142 | queryAutosuggestPredicate se@SearchEngine{searchIndex} precedingTerms partialTerm = 143 | let (_, _, ds) = processAutosuggestQuery se (mkAutosuggestQuery se precedingTerms partialTerm) 144 | in (\ key -> maybe False (flip DocIdSet.member ds) (SI.lookupDocKeyDocId searchIndex key)) 145 | 146 | 147 | -- We apply hard limits both before and after filtering. 148 | -- The post-filter limit is to avoid scoring 1000s of documents. 149 | -- The pre-filter limit is to avoid filtering 1000s of docs (which in some 150 | -- apps may be expensive itself) 151 | 152 | withinPrefilterLimit :: SearchEngine doc key field feature -> 153 | DocIdSet -> Bool 154 | withinPrefilterLimit SearchEngine{searchRankParams} ds = 155 | DocIdSet.size ds <= paramAutosuggestPrefilterLimit searchRankParams 156 | 157 | withinPostfilterLimit :: SearchEngine doc key field feature -> 158 | [a] -> Bool 159 | withinPostfilterLimit SearchEngine{searchRankParams} ds_info = 160 | length ds_info <= paramAutosuggestPostfilterLimit searchRankParams 161 | 162 | 163 | sortResults :: (Ord av, Ord bv) => ([(a,av)], [(b,bv)]) -> ([(a,av)], [(b,bv)]) 164 | sortResults (xs, ys) = 165 | ( sortBySndDescending xs 166 | , sortBySndDescending ys ) 167 | where 168 | sortBySndDescending :: Ord v => [(x,v)] -> [(x,v)] 169 | sortBySndDescending = sortBy (flip (comparing snd)) 170 | 171 | convertIdsToExternal :: SearchEngine doc key field feature -> 172 | ([(TermId, v)], [(DocId, v)]) -> ([(Term, v)], [(key, v)]) 173 | convertIdsToExternal SearchEngine{searchIndex} (termids, docids) = 174 | ( [ (SI.getTerm searchIndex termid, s) | (termid, s) <- termids ] 175 | , [ (SI.getDocKey searchIndex docid, s) | (docid, s) <- docids ] 176 | ) 177 | 178 | 179 | -- From Bast and Weber: 180 | -- 181 | -- An autocompletion query is a pair (T, D), where T is a range of terms 182 | -- (all possible completions of the last term which the user has started 183 | -- typing) and D is a set of documents (the hits for the preceding part of 184 | -- the query). 185 | -- 186 | -- We augment this with the preceding terms because we will need these to 187 | -- score the set of documents D. 188 | -- 189 | -- Note that the set D will be the entire collection in the case that the 190 | -- preceding part of the query is empty. For efficiency we represent that 191 | -- case specially with Maybe. 
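--
-- Concretely, in the 'AutosuggestQuery' triple defined below, the first
-- component is T, as a map from each candidate completion to the set of
-- documents it occurs in; the second is D, with 'Nothing' standing for the
-- whole collection; and the third is the ids of the preceding terms, which
-- we keep because we need them later for scoring.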
192 | 193 | type AutosuggestQuery = (Map.Map TermId DocIdSet, Maybe DocIdSet, [TermId]) 194 | 195 | mkAutosuggestQuery :: (Ix field, Bounded field) => 196 | SearchEngine doc key field feature -> 197 | [Term] -> Term -> AutosuggestQuery 198 | mkAutosuggestQuery se@SearchEngine{ searchIndex } 199 | precedingTerms partialTerm = 200 | (completionTerms, precedingDocHits, precedingTerms') 201 | where 202 | completionTerms = 203 | Map.unions 204 | [ Map.fromList (SI.lookupTermsByPrefix searchIndex partialTerm') 205 | | partialTerm' <- Query.expandTransformedQueryTerm se partialTerm 206 | ] 207 | 208 | (precedingTerms', precedingDocHits) 209 | | null precedingTerms = ([], Nothing) 210 | | otherwise = fmap carefulUnions 211 | (lookupRawResults precedingTerms) 212 | 213 | -- For the preceding terms, we compute the union of the sets of documents in 214 | -- which they appear. This means that a query like "Apple Blackberry C" 215 | -- will look for documents containing "Apple" or "Blackberry", then later 216 | -- intersect that set with documents containing completions of "C". 217 | -- 218 | -- In general we want to use union rather than intersection here, because 219 | -- the preceding terms might contain some useful and some missing terms, and 220 | -- if we took the intersection we would end up with no results; thus we rely 221 | -- on scoring to rank the best matches highest. 222 | -- 223 | -- However, this leads to an issue: if some of the terms are extremely 224 | -- common, we might end up taking unions of very large document sets, which 225 | -- is a performance disaster. We address this by unioning only sets smaller 226 | -- than the pre-filter limit (but falling back on the whole collection if 227 | -- all sets are too large). This means that: 228 | -- 229 | -- * A query containing a mixture of common and uncommon preceding terms 230 | -- will be completed/ranked solely based on the uncommon terms. For 231 | -- example, "Apple Blackberry C" will be equivalent to "Blackberry C" if 232 | -- there are many apples. 233 | -- 234 | -- * A query containing only common preceding terms will be 235 | -- completed/ranked as if only the final term was present. For example, 236 | -- "Apple Blackberry C" will be equivalent to "C" if there are many 237 | -- apples and blackberries. 238 | -- 239 | carefulUnions :: [DocIdSet] -> Maybe DocIdSet 240 | carefulUnions dss 241 | | null dss = Just DocIdSet.empty 242 | | null dss' = Nothing 243 | | otherwise = Just (DocIdSet.unions dss') 244 | where 245 | dss' = filter (withinPrefilterLimit se) dss 246 | 247 | lookupRawResults :: [Term] -> ([TermId], [DocIdSet]) 248 | lookupRawResults ts = 249 | unzip $ catMaybes 250 | [ SI.lookupTerm searchIndex t' 251 | | t <- ts 252 | , t' <- Query.expandTransformedQueryTerm se t 253 | ] 254 | 255 | 256 | 257 | -- From Bast and Weber: 258 | -- 259 | -- To process the query means to compute the subset T' ⊆ T of terms that 260 | -- occur in at least one document from D, as well as the subset D' ⊆ D of 261 | -- documents that contain at least one of these words. 262 | -- 263 | -- The obvious way to use an inverted index to process an autocompletion 264 | -- query (T, D) is to compute, for each t ∈ T, the intersections D ∩ Dt. 265 | -- Then, T' is simply the set of all t for which the intersection was 266 | -- non-empty, and D' is the union of all (non-empty) intersections. 267 | -- 268 | -- We will do this but additionally we will return all the non-empty 269 | -- intersections because they will be useful when scoring. 
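--
-- As a standalone sketch of that computation, using plain 'Data.Set'
-- (assuming the usual qualified import) rather than the compact 'DocIdSet'
-- representation used below ('processNaive' is illustrative only, not part
-- of this module):
--
-- > processNaive :: Ord doc => [(term, Set doc)] -> Set doc
-- >              -> ([(term, Set doc)], [term], Set doc)
-- > processNaive completions ds =
-- >     (tdss, map fst tdss, Set.unions (map snd tdss))
-- >   where
-- >     tdss = [ (t, i) | (t, ds_t) <- completions
-- >                     , let i = ds `Set.intersection` ds_t
-- >                     , not (Set.null i) ]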
270 | 271 | processAutosuggestQuery :: SearchEngine doc key field feature -> 272 | AutosuggestQuery -> 273 | ([(TermId, DocIdSet)], [TermId], DocIdSet) 274 | processAutosuggestQuery se (completionTerms, precedingDocHits, _) 275 | -- Check all the individual document sets are smaller than the pre-filter 276 | -- limit. If any are larger, their union must also be too large, so we return 277 | -- no results now rather than having to compute the union (which may be 278 | -- expensive) only for it to inevitably hit the limit. 279 | | all (withinPrefilterLimit se) docSets = 280 | ( completionTermAndDocSets 281 | , completionTerms' 282 | , allTermDocSet 283 | ) 284 | | otherwise = ([], [], DocIdSet.empty) 285 | where 286 | -- We look up each candidate completion to find the set of documents 287 | -- it appears in, and filtering (intersecting) down to just those 288 | -- appearing in the existing partial query results (if any). 289 | -- Candidate completions not appearing at all within the existing 290 | -- partial query results are excluded at this stage. 291 | -- 292 | -- We have to keep these doc sets for the whole process, so we keep 293 | -- them as the compact DocIdSet type. 294 | -- 295 | completionTermAndDocSets :: [(TermId, DocIdSet)] 296 | completionTermAndDocSets = 297 | [ (t, ds_t') 298 | | (t, ds_t) <- Map.toList completionTerms 299 | , let ds_t' = case precedingDocHits of 300 | Just ds -> ds `DocIdSet.intersection` ds_t 301 | Nothing -> ds_t 302 | , not (DocIdSet.null ds_t') 303 | ] 304 | 305 | -- The remaining candidate completions 306 | completionTerms' :: [TermId] 307 | docSets :: [DocIdSet] 308 | (completionTerms', docSets) = unzip completionTermAndDocSets 309 | 310 | -- The union of all these is this set of documents that form the results. 311 | allTermDocSet :: DocIdSet 312 | allTermDocSet = DocIdSet.unions docSets 313 | 314 | 315 | filterAutosuggestQuery :: SearchEngine doc key field feature -> 316 | ResultsFilter key -> 317 | DocIdSet -> 318 | [(DocId, (key, DocTermIds field, DocFeatVals feature))] 319 | filterAutosuggestQuery SearchEngine{ searchIndex } resultsFilter ds = 320 | case resultsFilter of 321 | NoFilter -> 322 | [ (docid, doc) 323 | | docid <- DocIdSet.toList ds 324 | , let doc = SI.lookupDocId searchIndex docid ] 325 | 326 | FilterPredicate predicate -> 327 | [ (docid, doc) 328 | | docid <- DocIdSet.toList ds 329 | , let doc@(k,_,_) = SI.lookupDocId searchIndex docid 330 | , predicate k ] 331 | 332 | FilterBulkPredicate bulkPredicate -> 333 | [ (docid, doc) 334 | | let docids = DocIdSet.toList ds 335 | docinf = map (SI.lookupDocId searchIndex) docids 336 | keep = bulkPredicate [ k | (k,_,_) <- docinf ] 337 | , (docid, doc, True) <- zip3 docids docinf keep ] 338 | 339 | 340 | -- Scoring 341 | ------------- 342 | -- 343 | -- From Bast and Weber: 344 | -- In practice, only a selection of items from these lists can and will be 345 | -- presented to the user, and it is of course crucial that the most relevant 346 | -- completions and hits are selected. 347 | -- 348 | -- A standard approach for this task in ad-hoc retrieval is to have a 349 | -- precomputed score for each term-in-document pair, and when a query is 350 | -- being processed, to aggregate these scores for each candidate document, 351 | -- and return documents with the highest such aggregated scores. 
352 | -- 353 | -- Both INV and HYB can be easily adapted to implement any such scoring and 354 | -- aggregation scheme: store by each term-in-document pair its precomputed 355 | -- score, and when intersecting, aggregate the scores. A decision has to be 356 | -- made on how to reconcile scores from different completions within the 357 | -- same document. We suggest the following: when merging the intersections 358 | -- (which gives the set D' according to Definition 1), compute for each 359 | -- document in D' the maximal score achieved for some completion in T' 360 | -- contained in that document, and compute for each completion in T' the 361 | -- maximal score achieved for a hit from D' achieved for this completion. 362 | -- 363 | -- So firstly let us explore what this means and then discuss why it does not 364 | -- work for BM25. 365 | -- 366 | -- The "precomputed score for each term-in-document pair" refers to the bm25 367 | -- score for this term in this document (and obviously doesn't have to be 368 | -- precomputed, though that'd be faster). 369 | -- 370 | -- So the score for a document d ∈ D' is: 371 | -- maximum of score for d ∈ D ∩ Dt, for any t ∈ T' 372 | -- 373 | -- While the score for a completion t ∈ T' is: 374 | -- maximum of score for d ∈ D ∩ Dt 375 | -- 376 | -- So for documents we say their score is their best score for any of the 377 | -- completion terms they contain. While for completions we say their score 378 | -- is their best score for any of the documents they appear in. 379 | -- 380 | -- For a scoring function like BM25 this appears not to be a good method, both 381 | -- in principle and in practice. Consider what terms get high BM25 scores: 382 | -- very rare ones. So this means we're going to score highly documents that 383 | -- contain the least frequent terms, and completions that are themselves very 384 | -- rare. This is silly. 385 | -- 386 | -- Another important thing to note is that if we use this scoring method then 387 | -- we are using the BM25 score in a way that makes no sense. The BM25 scores 388 | -- for different documents for the /same/ set of terms are comparable. The 389 | -- scores for the same or for different documents with /different/ terms are 390 | -- simply not comparable. 391 | -- 392 | -- This also makes sense if you consider what question the BM25 score is 393 | -- answering: "what is the likelihood that this document is relevant given that 394 | -- I judge these terms to be relevant". However, an auto-suggest query is 395 | -- different: "what is the likelihood that this term is relevant given the 396 | -- importance/popularity of the documents (and any preceding terms I've judged 397 | -- to be relevant)". They are both conditional likelihood questions but with 398 | -- different premises. 399 | -- 400 | -- More generally, term frequency information simply isn't useful for 401 | -- auto-suggest queries. We don't want results that have the most obscure terms 402 | -- nor the most common terms, not even something in-between. Term frequency 403 | -- just doesn't tell us anything unless we've already judged terms to be 404 | -- relevant, and in an auto-suggest query we've not done that yet. 405 | -- 406 | -- What we really need is information on the importance/popularity of the 407 | -- documents. We can actually do something with that. 408 | -- 409 | -- So, instead we follow a different strategy. We require that we have 410 | -- importance/popularity info for the documents. 
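--
-- (In this library that importance/popularity information is what the
-- non-term feature scores are for. A hypothetical application might declare
--
-- > data PkgFeature = Downloads deriving (Eq, Ord, Enum, Bounded, Ix, Show)
--
-- and pair a 'LogarithmicFunction' in 'paramFeatureFunctions' with a
-- per-document download count as the feature value; the details of the
-- configuration are up to the application.)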
411 | -- 412 | -- A first approximation would be to rank result documents by their importance 413 | -- and completion terms by the sum of the importance of the documents each 414 | -- term appears in. 415 | -- 416 | -- Score for a document d ∈ D' 417 | -- importance score for d 418 | -- 419 | -- Score for a completion t ∈ T' 420 | -- sum of importance score for d ∈ D ∩ Dt 421 | -- 422 | -- The only problem with this is that just because a term appears in an 423 | -- important document, doesn't mean that term is /about/ that document, or to 424 | -- put it another way, that term may not be relevant for that document. For 425 | -- example common words like "the" likely appear in all important documents 426 | -- but this doesn't really tell us anything because "the" isn't an important 427 | -- keyword. 428 | -- 429 | -- So what we want to do is to weight the document importance by the relevance 430 | -- of the keyword to the document. So now if we have an important document and 431 | -- a relevant keyword for that document then we get a high score, but an 432 | -- irrelevant term like "the" would get a very low weighting and so would not 433 | -- contribute much to the score, even for very important documents. 434 | -- 435 | -- The intuition is that we will score term completions by taking the 436 | -- document importance weighted by the relevance of that term to that document 437 | -- and summing over all the documents where the term occurs. 438 | -- 439 | -- We define document importance (for the set D') to be the BM25F score for 440 | -- the documents with any preceding terms. So this includes the non-term 441 | -- feature score for the importance/popularity, and also takes account of 442 | -- preceding terms if there were any. 443 | -- 444 | -- We define term relevance (for terms in documents) to be the BM25F score for 445 | -- that term in that document as a fraction of the total BM25F score for all 446 | -- terms in the document. Thus the relevance of all terms in a document sums 447 | -- to 1. 448 | -- 449 | -- Now we can re-weight the document importance by the term relevance: 450 | -- 451 | -- Score for a completion t ∈ T' 452 | -- sum (for d ∈ D ∩ Dt) of ( importance for d * relevance for t in d ) 453 | -- 454 | -- And now for document result scores. We don't want to just stick with the 455 | -- innate document importance. We want to re-weight by the completion term 456 | -- scores: 457 | -- 458 | -- Score for a document d ∈ D' 459 | -- sum (for t ∈ T' ∩ d) (importance score for d * score for completion t) 460 | -- 461 | -- Clear as mud? 462 | 463 | type DocImportance = Float 464 | type TermRelevanceBreakdown = Map.Map TermId Float 465 | 466 | -- | Precompute the document importance and the term relevance breakdown for 467 | -- all the documents. This will be used in scoring the term completions 468 | -- and the result documents. They will all be used and some used many 469 | -- times so it's better to compute up-front and share. 470 | -- 471 | -- This is actually the expensive bit (which is why we've filtered already). 
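--
-- A tiny worked example of how this cached info gets used (all numbers are
-- made up): suppose the completions \"vector\" and \"vectors\" both occur in
-- documents d1 (importance 10) and d2 (importance 2), with relevances 0.5
-- and 0.1 for \"vector\", and 0.05 and 0.4 for \"vectors\". Then
--
-- >   score "vector"  = 10 * 0.5  + 2 * 0.1 = 5.2
-- >   score "vectors" = 10 * 0.05 + 2 * 0.4 = 1.3
--
-- and the result documents score d1: 10 * 5.2 + 10 * 1.3 = 65 and
-- d2: 2 * 5.2 + 2 * 1.3 = 13.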
472 | -- 473 | cacheDocScoringInfo :: (Ix field, Bounded field, Ix feature, Bounded feature) => 474 | SearchEngine doc key field feature -> 475 | [TermId] -> 476 | [(DocId, (key, DocTermIds field, DocFeatVals feature))] -> 477 | [TermId] -> 478 | Map.Map DocId (DocImportance, TermRelevanceBreakdown) 479 | cacheDocScoringInfo se completionTerms allTermDocInfo precedingTerms = 480 | Map.fromList 481 | [ (docid, (docImportance, termRelevances)) 482 | | (docid, (_dockey, doctermids, docfeatvals)) <- allTermDocInfo 483 | , let docImportance = Query.relevanceScore se precedingTerms 484 | doctermids docfeatvals 485 | termRelevances = relevanceBreakdown se doctermids docfeatvals 486 | completionTerms 487 | ] 488 | 489 | -- | Calculate the relevance of each of a given set of terms to the given 490 | -- document. 491 | -- 492 | -- We define the \"relevance\" of each term in a document to be its 493 | -- term-in-document score as a fraction of the total of the scores for all 494 | -- terms in the document. Thus the sum of all the relevance values in the 495 | -- document is 1. 496 | -- 497 | -- Note: we have to calculate the relevance for all terms in the document 498 | -- but we only keep the relevance value for the terms of interest. 499 | -- 500 | relevanceBreakdown :: forall doc key field feature. 501 | (Ix field, Bounded field, Ix feature, Bounded feature) => 502 | SearchEngine doc key field feature -> 503 | DocTermIds field -> DocFeatVals feature -> 504 | [TermId] -> TermRelevanceBreakdown 505 | relevanceBreakdown SearchEngine{ bm25Context } doctermids docfeatvals ts = 506 | let -- We'll calculate the bm25 score for each term in this document 507 | bm25Doc = Query.indexDocToBM25Doc doctermids docfeatvals 508 | 509 | -- Cache the info that depends only on this doc, not the terms 510 | termScore :: (TermId -> (field -> Int) -> Float) 511 | termScore = BM25F.scoreTermsBulk bm25Context bm25Doc 512 | 513 | -- The DocTermIds has the info we need to do bulk scoring, but it's 514 | -- a sparse representation, so we first convert it to a dense table 515 | term :: Int -> TermId 516 | count :: Int -> field -> Int 517 | (!numTerms, term, count) = DocTermIds.denseTable doctermids 518 | 519 | -- We generate the vector of scores for all terms, based on looking up 520 | -- the termid and the per-field counts in the dense table 521 | termScores :: Vec.Vector Float 522 | !termScores = Vec.generate numTerms $ \i -> 523 | termScore (term i) (\f -> count i f) 524 | 525 | -- We keep only the values for the terms we're interested in 526 | -- and normalise so we get the relevence fraction 527 | !scoreSum = Vec.sum termScores 528 | !tset = IntSet.fromList (map fromEnum ts) 529 | in Map.fromList 530 | . Vec.toList 531 | . Vec.map (\(t,s) -> (t, s/scoreSum)) 532 | . Vec.filter (\(t,_) -> fromEnum t `IntSet.member` tset) 533 | . Vec.imap (\i s -> (term i, s)) 534 | $ termScores 535 | 536 | 537 | scoreAutosuggestQueryCompletions :: [(TermId, DocIdSet)] 538 | -> Map.Map DocId (Float, Map.Map TermId Float) 539 | -> [(TermId, Float)] 540 | scoreAutosuggestQueryCompletions completionTermAndDocSets allTermDocInfo = 541 | [ (t, candidateScore t ds_t) 542 | | (t, ds_t) <- completionTermAndDocSets ] 543 | where 544 | -- The score for a completion is the sum of the importance of the 545 | -- documents in which that completion occurs, weighted by the relevance 546 | -- of the term to each document. 
For example we can have a very 547 | -- important document and our completion term is highly relevant to it 548 | -- or we could have a large number of moderately important documents 549 | -- that our term is quite relevant to. In either example the completion 550 | -- term would score highly. 551 | candidateScore :: TermId -> DocIdSet -> Float 552 | candidateScore t ds_t = 553 | sum [ docImportance * termRelevance 554 | | Just (docImportance, termRelevances) <- 555 | map (`Map.lookup` allTermDocInfo) (DocIdSet.toList ds_t) 556 | , let termRelevance = termRelevances Map.! t 557 | ] 558 | 559 | 560 | scoreAutosuggestQueryResults :: [(TermId, DocIdSet)] -> 561 | Map.Map DocId (Float, Map.Map TermId Float) -> 562 | [(TermId, Float)] -> 563 | [(DocId, Float)] 564 | scoreAutosuggestQueryResults completionTermAndDocSets allTermDocInfo 565 | scoredCandidates = 566 | Map.toList $ Map.fromListWith (+) 567 | [ (docid, docImportance * score_t) 568 | | ((_, ds_t), (_, score_t)) <- zip completionTermAndDocSets scoredCandidates 569 | , let docids = DocIdSet.toList ds_t 570 | docinfo = map (`Map.lookup` allTermDocInfo) docids 571 | , (docid, Just (docImportance, _)) <- zip docids docinfo 572 | ] 573 | 574 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/BM25F.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecordWildCards, BangPatterns, ScopedTypeVariables #-} 2 | 3 | -- | An implementation of BM25F ranking. See: 4 | -- 5 | -- * A quick overview: 6 | -- 7 | -- * /The Probabilistic Relevance Framework: BM25 and Beyond/ 8 | -- 9 | -- 10 | -- * /An Introduction to Information Retrieval/ 11 | -- 12 | -- 13 | module Data.SearchEngine.BM25F ( 14 | -- * The ranking function 15 | score, 16 | Context(..), 17 | FeatureFunction(..), 18 | Doc(..), 19 | -- ** Specialised variants 20 | scoreTermsBulk, 21 | 22 | -- * Explaining the score 23 | Explanation(..), 24 | explain, 25 | ) where 26 | 27 | import Data.Ix 28 | import Data.Array.Unboxed 29 | 30 | data Context term field feature = Context { 31 | numDocsTotal :: !Int, 32 | avgFieldLength :: field -> Float, 33 | numDocsWithTerm :: term -> Int, 34 | paramK1 :: !Float, 35 | paramB :: field -> Float, 36 | -- consider minimum length to prevent massive B bonus? 37 | fieldWeight :: field -> Float, 38 | featureWeight :: feature -> Float, 39 | featureFunction :: feature -> FeatureFunction 40 | } 41 | 42 | data Doc term field feature = Doc { 43 | docFieldLength :: field -> Int, 44 | docFieldTermFrequency :: field -> term -> Int, 45 | docFeatureValue :: feature -> Float 46 | } 47 | 48 | 49 | -- | The BM25F score for a document for a given set of terms. 
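--
-- In the notation of the functions below, the score of a document for a list
-- of terms @ts@ is (as a summary of the code, not new behaviour):
--
-- >   sum [ idf t * tf' t / (k1 + tf' t) | t <- ts ]
-- >     + sum [ w_i * v_i (f_i doc) | i <- features ]
-- >   where
-- >     tf' t = sum [ w_f * tf t f / _B_f | f <- fields ]
-- >     _B_f  = (1 - b_f) + b_f * fieldLength f / avgFieldLength f
-- >     idf t = log ((n - n_t + 0.5) / (n_t + 0.5))
--
-- i.e. a field-weighted, length-normalised term frequency put through the
-- usual BM25 saturation, plus the weighted non-term feature scores.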
50 | -- 51 | score :: (Ix field, Bounded field, Ix feature, Bounded feature) => 52 | Context term field feature -> 53 | Doc term field feature -> [term] -> Float 54 | score ctx doc terms = 55 | sum (map (weightedTermScore ctx doc) terms) 56 | + sum (map (weightedNonTermScore ctx doc) features) 57 | 58 | where 59 | features = range (minBound, maxBound) 60 | 61 | 62 | weightedTermScore :: (Ix field, Bounded field) => 63 | Context term field feature -> 64 | Doc term field feature -> term -> Float 65 | weightedTermScore ctx doc t = 66 | weightIDF ctx t * tf' 67 | / (k1 + tf') 68 | where 69 | tf' = weightedDocTermFrequency ctx doc t 70 | k1 = paramK1 ctx 71 | 72 | 73 | weightIDF :: Context term field feature -> term -> Float 74 | weightIDF ctx t = 75 | log ((n - n_t + 0.5) / (n_t + 0.5)) 76 | where 77 | n = fromIntegral (numDocsTotal ctx) 78 | n_t = fromIntegral (numDocsWithTerm ctx t) 79 | 80 | 81 | weightedDocTermFrequency :: (Ix field, Bounded field) => 82 | Context term field feature -> 83 | Doc term field feature -> term -> Float 84 | weightedDocTermFrequency ctx doc t = 85 | sum [ w_f * tf_f / _B_f 86 | | field <- range (minBound, maxBound) 87 | , let w_f = fieldWeight ctx field 88 | tf_f = fromIntegral (docFieldTermFrequency doc field t) 89 | _B_f = lengthNorm ctx doc field 90 | , not (isNaN _B_f) 91 | ] 92 | -- When the avgFieldLength is 0 we have a field which is empty for all 93 | -- documents. Unfortunately it leads to a NaN because the 94 | -- docFieldTermFrequency will also be 0 so we get 0/0. What we want to 95 | -- do in this situation is have that field contribute nothing to the 96 | -- score. The simplest way to achieve that is to skip if _B_f is NaN. 97 | -- So I think this is fine and not an ugly hack. 98 | 99 | lengthNorm :: Context term field feature -> 100 | Doc term field feature -> field -> Float 101 | lengthNorm ctx doc field = 102 | (1-b_f) + b_f * sl_f / avgsl_f 103 | where 104 | b_f = paramB ctx field 105 | sl_f = fromIntegral (docFieldLength doc field) 106 | avgsl_f = avgFieldLength ctx field 107 | 108 | 109 | weightedNonTermScore :: (Ix feature, Bounded feature) => 110 | Context term field feature -> 111 | Doc term field feature -> feature -> Float 112 | weightedNonTermScore ctx doc feature = 113 | w_f * _V_f f_f 114 | where 115 | w_f = featureWeight ctx feature 116 | _V_f = applyFeatureFunction (featureFunction ctx feature) 117 | f_f = docFeatureValue doc feature 118 | 119 | 120 | data FeatureFunction 121 | = LogarithmicFunction Float -- ^ @log (\lambda_i + f_i)@ 122 | | RationalFunction Float -- ^ @f_i / (\lambda_i + f_i)@ 123 | | SigmoidFunction Float Float -- ^ @1 / (\lambda + exp(-(\lambda' * f_i))@ 124 | 125 | applyFeatureFunction :: FeatureFunction -> (Float -> Float) 126 | applyFeatureFunction (LogarithmicFunction p1) = \fi -> log (p1 + fi) 127 | applyFeatureFunction (RationalFunction p1) = \fi -> fi / (p1 + fi) 128 | applyFeatureFunction (SigmoidFunction p1 p2) = \fi -> 1 / (p1 + exp (-fi * p2)) 129 | 130 | 131 | ----------------------------- 132 | -- Bulk scoring of many terms 133 | -- 134 | 135 | -- | Most of the time we want to score several different documents for the same 136 | -- set of terms, but sometimes we want to score one document for many terms 137 | -- and in that case we can save a bit of work by doing it in bulk. It lets us 138 | -- calculate once and share things that depend only on the document, and not 139 | -- the term. 
140 | -- 141 | -- To take advantage of the sharing you must partially apply and name the 142 | -- per-doc score function, e.g. 143 | -- 144 | -- > let score :: term -> (field -> Int) -> Float 145 | -- > score = scoreTermsBulk ctx doc 146 | -- > in sum [ score t (\f -> counts ! (t, f)) | t <- ts ] 147 | -- 148 | scoreTermsBulk :: forall field term feature. (Ix field, Bounded field) => 149 | Context term field feature -> 150 | Doc term field feature -> 151 | (term -> (field -> Int) -> Float) 152 | scoreTermsBulk ctx doc = 153 | -- This is just a rearrangement of weightedTermScore and 154 | -- weightedDocTermFrequency above, with the doc-constant bits hoisted out. 155 | 156 | \t tFreq -> 157 | let !tf' = sum [ w!f * tf_f / _B!f 158 | | f <- range (minBound, maxBound) 159 | , let tf_f = fromIntegral (tFreq f) 160 | _B_f = _B!f 161 | , not (isNaN _B_f) 162 | ] 163 | 164 | in weightIDF ctx t * tf' 165 | / (k1 + tf') 166 | where 167 | -- So long as the caller does the partial application thing then these 168 | -- values can all be shared between many calls with different terms. 169 | 170 | !k1 = paramK1 ctx 171 | w, _B :: UArray field Float 172 | !w = array (minBound, maxBound) 173 | [ (field, fieldWeight ctx field) 174 | | field <- range (minBound, maxBound) ] 175 | !_B = array (minBound, maxBound) 176 | [ (field, lengthNorm ctx doc field) 177 | | field <- range (minBound, maxBound) ] 178 | 179 | 180 | ------------------ 181 | -- Explanation 182 | -- 183 | 184 | -- | A breakdown of the BM25F score, to explain somewhat how it relates to 185 | -- the inputs, and so you can compare the scores of different documents. 186 | -- 187 | data Explanation field feature term = Explanation { 188 | -- | The overall score is the sum of the 'termScores', 'positionScore' 189 | -- and 'nonTermScores' 190 | overallScore :: Float, 191 | 192 | -- | There is a score contribution from each query term. This is the 193 | -- score for the term across all fields in the document (but see 194 | -- 'termFieldScores'). 195 | termScores :: [(term, Float)], 196 | {- 197 | -- | There is a score contribution for positional information. Terms 198 | -- appearing in the document close together give a bonus. 199 | positionScore :: [(field, Float)], 200 | -} 201 | -- | The document can have an innate bonus score independent of the terms 202 | -- in the query. For example this might be a popularity score. 203 | nonTermScores :: [(feature, Float)], 204 | 205 | -- | This does /not/ contribute to the 'overallScore'. It is an 206 | -- indication of how the 'termScores' relates to per-field scores. 207 | -- Note however that the term score for all fields is /not/ simply the 208 | -- sum of the per-field scores. The point of the BM25F scoring function 209 | -- is that a linear combination of per-field scores is wrong, and BM25F 210 | -- does a more cunning non-linear combination. 211 | -- 212 | -- However, it is still useful as an indication to see scores for each 213 | -- field for a term, to see how they compare. 
214 | -- 215 | termFieldScores :: [(term, [(field, Float)])] 216 | } 217 | deriving Show 218 | 219 | instance Functor (Explanation field feature) where 220 | fmap f e@Explanation{..} = 221 | e { 222 | termScores = [ (f t, s) | (t, s) <- termScores ], 223 | termFieldScores = [ (f t, fs) | (t, fs) <- termFieldScores ] 224 | } 225 | 226 | explain :: (Ix field, Bounded field, Ix feature, Bounded feature) => 227 | Context term field feature -> 228 | Doc term field feature -> [term] -> Explanation field feature term 229 | explain ctx doc ts = 230 | Explanation {..} 231 | where 232 | overallScore = sum (map snd termScores) 233 | -- + sum (map snd positionScore) 234 | + sum (map snd nonTermScores) 235 | termScores = [ (t, weightedTermScore ctx doc t) | t <- ts ] 236 | -- positionScore = [ (f, 0) | f <- range (minBound, maxBound) ] 237 | nonTermScores = [ (feature, weightedNonTermScore ctx doc feature) 238 | | feature <- range (minBound, maxBound) ] 239 | 240 | termFieldScores = 241 | [ (t, fieldScores) 242 | | t <- ts 243 | , let fieldScores = 244 | [ (f, weightedTermScore ctx' doc t) 245 | | f <- range (minBound, maxBound) 246 | , let ctx' = ctx { fieldWeight = fieldWeightOnly f } 247 | ] 248 | ] 249 | fieldWeightOnly f f' | sameField f f' = fieldWeight ctx f' 250 | | otherwise = 0 251 | 252 | sameField f f' = index (minBound, maxBound) f 253 | == index (minBound, maxBound) f' 254 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/DocFeatVals.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving #-} 2 | module Data.SearchEngine.DocFeatVals ( 3 | DocFeatVals, 4 | featureValue, 5 | create, 6 | ) where 7 | 8 | import Data.SearchEngine.DocTermIds (vecIndexIx, vecCreateIx) 9 | import Data.Vector (Vector) 10 | import Data.Ix (Ix) 11 | 12 | 13 | -- | Storage for the non-term feature values i a document. 
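--
-- A small usage sketch (the 'Feature' type here is hypothetical):
--
-- > data Feature = Stars | Forks deriving (Eq, Ord, Enum, Bounded, Ix, Show)
-- >
-- > fv :: DocFeatVals Feature
-- > fv = create (\f -> case f of Stars -> 120; Forks -> 7)
-- >
-- > featureValue fv Forks   -- 7.0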
14 | -- 15 | newtype DocFeatVals feature = DocFeatVals (Vector Float) 16 | deriving (Show) 17 | 18 | featureValue :: (Ix feature, Bounded feature) => DocFeatVals feature -> feature -> Float 19 | featureValue (DocFeatVals featVec) = vecIndexIx featVec 20 | 21 | create :: (Ix feature, Bounded feature) => 22 | (feature -> Float) -> DocFeatVals feature 23 | create docFeatVals = 24 | DocFeatVals (vecCreateIx docFeatVals) 25 | 26 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/DocIdSet.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving, MultiParamTypeClasses, 2 | TypeFamilies #-} 3 | module Data.SearchEngine.DocIdSet ( 4 | DocId(DocId), 5 | DocIdSet(..), 6 | null, 7 | size, 8 | empty, 9 | singleton, 10 | fromList, 11 | toList, 12 | insert, 13 | delete, 14 | member, 15 | union, 16 | unions, 17 | intersection, 18 | invariant, 19 | ) where 20 | 21 | import Data.Word 22 | import qualified Data.Vector.Unboxed as Vec 23 | import qualified Data.Vector.Unboxed.Mutable as MVec 24 | import qualified Data.Vector.Generic as GVec 25 | import qualified Data.Vector.Generic.Mutable as GMVec 26 | import Control.Monad.ST 27 | import Control.Monad (liftM) 28 | import qualified Data.Set as Set 29 | import qualified Data.List as List 30 | import Data.Function (on) 31 | 32 | import Prelude hiding (null) 33 | 34 | 35 | newtype DocId = DocId { unDocId :: Word32 } 36 | deriving (Eq, Ord, Show, Enum, Bounded) 37 | 38 | newtype DocIdSet = DocIdSet (Vec.Vector DocId) 39 | deriving (Eq, Show) 40 | 41 | -- represented as a sorted sequence of ids 42 | invariant :: DocIdSet -> Bool 43 | invariant (DocIdSet vec) = 44 | strictlyAscending (Vec.toList vec) 45 | where 46 | strictlyAscending (a:xs@(b:_)) = a < b && strictlyAscending xs 47 | strictlyAscending _ = True 48 | 49 | 50 | size :: DocIdSet -> Int 51 | size (DocIdSet vec) = Vec.length vec 52 | 53 | null :: DocIdSet -> Bool 54 | null (DocIdSet vec) = Vec.null vec 55 | 56 | empty :: DocIdSet 57 | empty = DocIdSet Vec.empty 58 | 59 | singleton :: DocId -> DocIdSet 60 | singleton = DocIdSet . Vec.singleton 61 | 62 | fromList :: [DocId] -> DocIdSet 63 | fromList = DocIdSet . Vec.fromList . Set.toAscList . Set.fromList 64 | 65 | toList :: DocIdSet -> [DocId] 66 | toList (DocIdSet vec) = Vec.toList vec 67 | 68 | insert :: DocId -> DocIdSet -> DocIdSet 69 | insert x (DocIdSet vec) = 70 | case binarySearch vec 0 (Vec.length vec - 1) x of 71 | (_, True) -> DocIdSet vec 72 | (i, False) -> case Vec.splitAt i vec of 73 | (before, after) -> 74 | DocIdSet (Vec.concat [before, Vec.singleton x, after]) 75 | 76 | delete :: DocId -> DocIdSet -> DocIdSet 77 | delete x (DocIdSet vec) = 78 | case binarySearch vec 0 (Vec.length vec - 1) x of 79 | (_, False) -> DocIdSet vec 80 | (i, True) -> case Vec.splitAt i vec of 81 | (before, after) -> 82 | DocIdSet (before Vec.++ Vec.tail after) 83 | 84 | member :: DocId -> DocIdSet -> Bool 85 | member x (DocIdSet vec) = snd (binarySearch vec 0 (Vec.length vec - 1) x) 86 | 87 | binarySearch :: Vec.Vector DocId -> Int -> Int -> DocId -> (Int, Bool) 88 | binarySearch vec !a !b !key 89 | | a > b = (a, False) 90 | | otherwise = 91 | let mid = (a + b) `div` 2 92 | in case compare key (vec Vec.! 
mid) of 93 | LT -> binarySearch vec a (mid-1) key 94 | EQ -> (mid, True) 95 | GT -> binarySearch vec (mid+1) b key 96 | 97 | unions :: [DocIdSet] -> DocIdSet 98 | unions = List.foldl' union empty 99 | -- a bit more effecient if we merge small ones first 100 | . List.sortBy (compare `on` size) 101 | 102 | union :: DocIdSet -> DocIdSet -> DocIdSet 103 | union x y | null x = y 104 | | null y = x 105 | union (DocIdSet xs) (DocIdSet ys) = 106 | DocIdSet (Vec.create (MVec.new sizeBound >>= writeMergedUnion xs ys)) 107 | where 108 | sizeBound = Vec.length xs + Vec.length ys 109 | 110 | writeMergedUnion :: Vec.Vector DocId -> Vec.Vector DocId -> 111 | MVec.MVector s DocId -> ST s (MVec.MVector s DocId) 112 | writeMergedUnion xs0 ys0 !out = do 113 | i <- go xs0 ys0 0 114 | return $! MVec.take i out 115 | where 116 | go !xs !ys !i 117 | | Vec.null xs = do Vec.copy (MVec.slice i (Vec.length ys) out) ys 118 | return (i + Vec.length ys) 119 | | Vec.null ys = do Vec.copy (MVec.slice i (Vec.length xs) out) xs 120 | return (i + Vec.length xs) 121 | | otherwise = let x = Vec.head xs; y = Vec.head ys 122 | in case compare x y of 123 | GT -> do MVec.write out i y 124 | go xs (Vec.tail ys) (i+1) 125 | EQ -> do MVec.write out i x 126 | go (Vec.tail xs) (Vec.tail ys) (i+1) 127 | LT -> do MVec.write out i x 128 | go (Vec.tail xs) ys (i+1) 129 | 130 | intersection :: DocIdSet -> DocIdSet -> DocIdSet 131 | intersection x y | null x = empty 132 | | null y = empty 133 | intersection (DocIdSet xs) (DocIdSet ys) = 134 | DocIdSet (Vec.create (MVec.new sizeBound >>= writeMergedIntersection xs ys)) 135 | where 136 | sizeBound = max (Vec.length xs) (Vec.length ys) 137 | 138 | writeMergedIntersection :: Vec.Vector DocId -> Vec.Vector DocId -> 139 | MVec.MVector s DocId -> ST s (MVec.MVector s DocId) 140 | writeMergedIntersection xs0 ys0 !out = do 141 | i <- go xs0 ys0 0 142 | return $! 
MVec.take i out 143 | where 144 | go !xs !ys !i 145 | | Vec.null xs = return i 146 | | Vec.null ys = return i 147 | | otherwise = let x = Vec.head xs; y = Vec.head ys 148 | in case compare x y of 149 | GT -> go xs (Vec.tail ys) i 150 | EQ -> do MVec.write out i x 151 | go (Vec.tail xs) (Vec.tail ys) (i+1) 152 | LT -> go (Vec.tail xs) ys i 153 | 154 | ------------------------------------------------------------------------------ 155 | -- verbose Unbox instances 156 | -- 157 | 158 | instance MVec.Unbox DocId 159 | 160 | newtype instance MVec.MVector s DocId = MV_DocId (MVec.MVector s Word32) 161 | 162 | instance GMVec.MVector MVec.MVector DocId where 163 | basicLength (MV_DocId v) = GMVec.basicLength v 164 | basicUnsafeSlice i l (MV_DocId v) = MV_DocId (GMVec.basicUnsafeSlice i l v) 165 | basicUnsafeNew l = MV_DocId `liftM` GMVec.basicUnsafeNew l 166 | basicInitialize (MV_DocId v) = GMVec.basicInitialize v 167 | basicUnsafeReplicate l x = MV_DocId `liftM` GMVec.basicUnsafeReplicate l (unDocId x) 168 | basicUnsafeRead (MV_DocId v) i = DocId `liftM` GMVec.basicUnsafeRead v i 169 | basicUnsafeWrite (MV_DocId v) i x = GMVec.basicUnsafeWrite v i (unDocId x) 170 | basicClear (MV_DocId v) = GMVec.basicClear v 171 | basicSet (MV_DocId v) x = GMVec.basicSet v (unDocId x) 172 | basicUnsafeGrow (MV_DocId v) l = MV_DocId `liftM` GMVec.basicUnsafeGrow v l 173 | basicUnsafeCopy (MV_DocId v) (MV_DocId v') = GMVec.basicUnsafeCopy v v' 174 | basicUnsafeMove (MV_DocId v) (MV_DocId v') = GMVec.basicUnsafeMove v v' 175 | basicOverlaps (MV_DocId v) (MV_DocId v') = GMVec.basicOverlaps v v' 176 | {-# INLINE basicLength #-} 177 | {-# INLINE basicUnsafeSlice #-} 178 | {-# INLINE basicOverlaps #-} 179 | {-# INLINE basicUnsafeNew #-} 180 | {-# INLINE basicInitialize #-} 181 | {-# INLINE basicUnsafeReplicate #-} 182 | {-# INLINE basicUnsafeRead #-} 183 | {-# INLINE basicUnsafeWrite #-} 184 | {-# INLINE basicClear #-} 185 | {-# INLINE basicSet #-} 186 | {-# INLINE basicUnsafeCopy #-} 187 | {-# INLINE basicUnsafeMove #-} 188 | {-# INLINE basicUnsafeGrow #-} 189 | 190 | newtype instance Vec.Vector DocId = V_DocId (Vec.Vector Word32) 191 | 192 | instance GVec.Vector Vec.Vector DocId where 193 | basicUnsafeFreeze (MV_DocId mv) = V_DocId `liftM` GVec.basicUnsafeFreeze mv 194 | basicUnsafeThaw (V_DocId v) = MV_DocId `liftM` GVec.basicUnsafeThaw v 195 | basicLength (V_DocId v) = GVec.basicLength v 196 | basicUnsafeSlice i l (V_DocId v) = V_DocId (GVec.basicUnsafeSlice i l v) 197 | basicUnsafeIndexM (V_DocId v) i = DocId `liftM` GVec.basicUnsafeIndexM v i 198 | basicUnsafeCopy (MV_DocId mv) 199 | (V_DocId v) = GVec.basicUnsafeCopy mv v 200 | elemseq (V_DocId v) x = GVec.elemseq v (unDocId x) 201 | {-# INLINE basicUnsafeFreeze #-} 202 | {-# INLINE basicUnsafeThaw #-} 203 | {-# INLINE basicLength #-} 204 | {-# INLINE basicUnsafeSlice #-} 205 | {-# INLINE basicUnsafeIndexM #-} 206 | {-# INLINE basicUnsafeCopy #-} 207 | {-# INLINE elemseq #-} 208 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/DocTermIds.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving #-} 2 | module Data.SearchEngine.DocTermIds ( 3 | DocTermIds, 4 | TermId, 5 | fieldLength, 6 | fieldTermCount, 7 | fieldElems, 8 | create, 9 | denseTable, 10 | vecIndexIx, 11 | vecCreateIx, 12 | ) where 13 | 14 | import Data.SearchEngine.TermBag (TermBag, TermId) 15 | import qualified Data.SearchEngine.TermBag as TermBag 16 | 17 | 
import Data.Vector (Vector, (!)) 18 | import qualified Data.Vector as Vec 19 | import qualified Data.Vector.Unboxed as UVec 20 | import Data.Ix (Ix) 21 | import qualified Data.Ix as Ix 22 | 23 | 24 | -- | The 'TermId's for the 'Term's that occur in a document. Documents may have 25 | -- multiple fields and the 'DocTerms' type holds them separately for each field. 26 | -- 27 | newtype DocTermIds field = DocTermIds (Vector TermBag) 28 | deriving (Show) 29 | 30 | getField :: (Ix field, Bounded field) => DocTermIds field -> field -> TermBag 31 | getField (DocTermIds fieldVec) = vecIndexIx fieldVec 32 | 33 | create :: (Ix field, Bounded field) => 34 | (field -> [TermId]) -> DocTermIds field 35 | create docTermIds = 36 | DocTermIds (vecCreateIx (TermBag.fromList . docTermIds)) 37 | 38 | -- | The number of terms in a field within the document. 39 | fieldLength :: (Ix field, Bounded field) => DocTermIds field -> field -> Int 40 | fieldLength docterms field = 41 | TermBag.size (getField docterms field) 42 | 43 | -- | /O(log n)/ The frequency of a particular term in a field within the document. 44 | -- 45 | fieldTermCount :: (Ix field, Bounded field) => 46 | DocTermIds field -> field -> TermId -> Int 47 | fieldTermCount docterms field termid = 48 | fromIntegral (TermBag.termCount (getField docterms field) termid) 49 | 50 | fieldElems :: (Ix field, Bounded field) => DocTermIds field -> field -> [TermId] 51 | fieldElems docterms field = 52 | TermBag.elems (getField docterms field) 53 | 54 | -- | The 'DocTermIds' is really a sparse 2d array, and doing lookups with 55 | -- 'fieldTermCount' has a O(log n) cost. This function converts to a dense 56 | -- tabular representation which then enables linear scans. 57 | -- 58 | denseTable :: (Ix field, Bounded field) => DocTermIds field -> 59 | (Int, Int -> TermId, Int -> field -> Int) 60 | denseTable (DocTermIds fieldVec) = 61 | let (!termids, !termcounts) = TermBag.denseTable (Vec.toList fieldVec) 62 | !numTerms = UVec.length termids 63 | in ( numTerms 64 | , \i -> termids UVec.! i 65 | , \i ix -> let j = Ix.index (minBound, maxBound) ix 66 | in fromIntegral (termcounts UVec.! (j * numTerms + i)) 67 | ) 68 | 69 | --------------------------------- 70 | -- Vector indexed by Ix Bounded 71 | -- 72 | 73 | vecIndexIx :: (Ix ix, Bounded ix) => Vector a -> ix -> a 74 | vecIndexIx vec ix = vec ! 
Ix.index (minBound, maxBound) ix 75 | 76 | vecCreateIx :: (Ix ix, Bounded ix) => (ix -> a) -> Vector a 77 | vecCreateIx f = Vec.fromListN (Ix.rangeSize bounds) 78 | [ y | ix <- Ix.range bounds, let !y = f ix ] 79 | where 80 | bounds = (minBound, maxBound) 81 | 82 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/Query.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, NamedFieldPuns, RecordWildCards #-} 2 | 3 | module Data.SearchEngine.Query ( 4 | 5 | -- * Querying 6 | query, 7 | ResultsFilter(..), 8 | 9 | -- * Explain mode for query result rankings 10 | queryExplain, 11 | BM25F.Explanation(..), 12 | setRankParams, 13 | 14 | -- ** Utils used by autosuggest 15 | relevanceScore, 16 | indexDocToBM25Doc, 17 | expandTransformedQueryTerm, 18 | ) where 19 | 20 | import Data.SearchEngine.Types 21 | import qualified Data.SearchEngine.SearchIndex as SI 22 | import qualified Data.SearchEngine.DocIdSet as DocIdSet 23 | import qualified Data.SearchEngine.DocTermIds as DocTermIds 24 | import qualified Data.SearchEngine.DocFeatVals as DocFeatVals 25 | import qualified Data.SearchEngine.BM25F as BM25F 26 | 27 | import Data.Ix 28 | import Data.List 29 | import Data.Function 30 | import Data.Maybe 31 | 32 | 33 | -- | Execute a normal query. Find the documents in which one or more of 34 | -- the search terms appear and return them in ranked order. 35 | -- 36 | -- The number of documents returned is limited by the 'paramResultsetSoftLimit' 37 | -- and 'paramResultsetHardLimit' paramaters. This also limits the cost of the 38 | -- query (which is primarily the cost of scoring each document). 39 | -- 40 | -- The given terms are all assumed to be complete (as opposed to prefixes 41 | -- like with 'queryAutosuggest'). 42 | -- 43 | query :: (Ix field, Bounded field, Ix feature, Bounded feature) => 44 | SearchEngine doc key field feature -> 45 | [Term] -> [key] 46 | query se@SearchEngine{ searchIndex, 47 | searchRankParams = SearchRankParameters{..} } 48 | terms = 49 | 50 | let -- Start by transforming/normalising all the query terms. 51 | -- This can be done differently for each field we search by. 52 | lookupTerms :: [Term] 53 | lookupTerms = concatMap (expandTransformedQueryTerm se) terms 54 | 55 | -- Then we look up all the normalised terms in the index. 56 | rawresults :: [Maybe (TermId, DocIdSet)] 57 | rawresults = map (SI.lookupTerm searchIndex) lookupTerms 58 | 59 | -- For the terms that occur in the index, this gives us the term's id 60 | -- and the set of documents that the term occurs in. 61 | termids :: [TermId] 62 | docidsets :: [DocIdSet] 63 | (termids, docidsets) = unzip (catMaybes rawresults) 64 | 65 | -- We looked up the documents that *any* of the term occur in (not all) 66 | -- so this could be rather a lot of docs if the user uses a few common 67 | -- terms. Scoring these result docs is a non-trivial cost so we want to 68 | -- limit the number that we have to score. The standard trick is to 69 | -- consider the doc sets in the order of size, smallest to biggest. Once 70 | -- we have gone over a certain threshold of docs then don't bother with 71 | -- the doc sets for the remaining terms. This tends to work because the 72 | -- scoring gives lower weight to terms that occur in many documents. 
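        -- For example (the limits here are made up): with a soft limit of 200
        -- and a hard limit of 400, doc sets of sizes 10, 50, 300 and 5000
        -- (assumed disjoint) would be merged as 10, then 10+50 = 60 (still
        -- within the soft limit), then 60+300 = 360 (over the soft limit, so
        -- it is added and we stop); the 5000-document set is never considered.
        -- See 'pruneRelevantResults' below.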
73 | unrankedResults :: DocIdSet 74 | unrankedResults = pruneRelevantResults 75 | paramResultsetSoftLimit 76 | paramResultsetHardLimit 77 | docidsets 78 | 79 | --TODO: technically this isn't quite correct. Because each field can 80 | -- be normalised differently, we can end up with different termids for 81 | -- the same original search term, and then we score those as if they 82 | -- were different terms, which makes a difference when the term appears 83 | -- in multiple fields (exactly the case BM25F is supposed to deal with). 84 | -- What we ought to have instead is an Array (Int, field) TermId, and 85 | -- make the scoring use the appropriate termid for each field, but to 86 | -- consider them the "same" term. 87 | in rankResults se termids (DocIdSet.toList unrankedResults) 88 | 89 | -- | Before looking up a term in the main index we need to normalise it 90 | -- using the 'transformQueryTerm'. Of course the transform can be different 91 | -- for different fields, so we have to collect all the forms (eliminating 92 | -- duplicates). 93 | -- 94 | expandTransformedQueryTerm :: (Ix field, Bounded field) => 95 | SearchEngine doc key field feature -> 96 | Term -> [Term] 97 | expandTransformedQueryTerm SearchEngine{searchConfig} term = 98 | nub [ transformForField field 99 | | let transformForField = transformQueryTerm searchConfig term 100 | , field <- range (minBound, maxBound) ] 101 | 102 | 103 | rankResults :: (Ix field, Bounded field, Ix feature, Bounded feature) => 104 | SearchEngine doc key field feature -> 105 | [TermId] -> [DocId] -> [key] 106 | rankResults se@SearchEngine{searchIndex} queryTerms docids = 107 | map snd 108 | $ sortBy (flip compare `on` fst) 109 | [ (relevanceScore se queryTerms doctermids docfeatvals, dockey) 110 | | docid <- docids 111 | , let (dockey, doctermids, docfeatvals) = SI.lookupDocId searchIndex docid ] 112 | 113 | relevanceScore :: (Ix field, Bounded field, Ix feature, Bounded feature) => 114 | SearchEngine doc key field feature -> 115 | [TermId] -> DocTermIds field -> DocFeatVals feature -> Float 116 | relevanceScore SearchEngine{bm25Context} queryTerms doctermids docfeatvals = 117 | BM25F.score bm25Context doc queryTerms 118 | where 119 | doc = indexDocToBM25Doc doctermids docfeatvals 120 | 121 | indexDocToBM25Doc :: (Ix field, Bounded field, Ix feature, Bounded feature) => 122 | DocTermIds field -> 123 | DocFeatVals feature -> 124 | BM25F.Doc TermId field feature 125 | indexDocToBM25Doc doctermids docfeatvals = 126 | BM25F.Doc { 127 | BM25F.docFieldLength = DocTermIds.fieldLength doctermids, 128 | BM25F.docFieldTermFrequency = DocTermIds.fieldTermCount doctermids, 129 | BM25F.docFeatureValue = DocFeatVals.featureValue docfeatvals 130 | } 131 | 132 | pruneRelevantResults :: Int -> Int -> [DocIdSet] -> DocIdSet 133 | pruneRelevantResults softLimit hardLimit = 134 | -- Look at the docsets starting with the smallest ones. Smaller docsets 135 | -- correspond to the rarer terms, which are the ones that score most highly. 136 | go DocIdSet.empty . sortBy (compare `on` DocIdSet.size) 137 | where 138 | go !acc [] = acc 139 | go !acc (d:ds) 140 | -- If this is the first one, we add it anyway, otherwise we're in 141 | -- danger of returning no results at all. 142 | | DocIdSet.null acc = go d ds 143 | -- We consider the size our docset would be if we add this extra one... 144 | -- If it puts us over the hard limit then stop. 
145 | | size > hardLimit = acc 146 | -- If it puts us over soft limit then we add it and stop 147 | | size > softLimit = DocIdSet.union acc d 148 | -- Otherwise we can add it and carry on to consider the remainder 149 | | otherwise = go (DocIdSet.union acc d) ds 150 | where 151 | size = DocIdSet.size acc + DocIdSet.size d 152 | 153 | 154 | -------------------------------- 155 | -- Normal query with explanation 156 | -- 157 | 158 | queryExplain :: (Ix field, Bounded field, Ix feature, Bounded feature) => 159 | SearchEngine doc key field feature -> 160 | [Term] -> [(BM25F.Explanation field feature Term, key)] 161 | queryExplain se@SearchEngine{ searchIndex, 162 | searchConfig = SearchConfig{transformQueryTerm}, 163 | searchRankParams = SearchRankParameters{..} } 164 | terms = 165 | 166 | -- See 'query' above for explanation. Really we ought to combine them. 167 | let lookupTerms :: [Term] 168 | lookupTerms = [ term' 169 | | term <- terms 170 | , let transformForField = transformQueryTerm term 171 | , term' <- nub [ transformForField field 172 | | field <- range (minBound, maxBound) ] 173 | ] 174 | 175 | rawresults :: [Maybe (TermId, DocIdSet)] 176 | rawresults = map (SI.lookupTerm searchIndex) lookupTerms 177 | 178 | termids :: [TermId] 179 | docidsets :: [DocIdSet] 180 | (termids, docidsets) = unzip (catMaybes rawresults) 181 | 182 | unrankedResults :: DocIdSet 183 | unrankedResults = pruneRelevantResults 184 | paramResultsetSoftLimit 185 | paramResultsetHardLimit 186 | docidsets 187 | 188 | in rankExplainResults se termids (DocIdSet.toList unrankedResults) 189 | 190 | rankExplainResults :: (Ix field, Bounded field, Ix feature, Bounded feature) => 191 | SearchEngine doc key field feature -> 192 | [TermId] -> 193 | [DocId] -> 194 | [(BM25F.Explanation field feature Term, key)] 195 | rankExplainResults se@SearchEngine{searchIndex} queryTerms docids = 196 | sortBy (flip compare `on` (BM25F.overallScore . fst)) 197 | [ (explainRelevanceScore se queryTerms doctermids docfeatvals, dockey) 198 | | docid <- docids 199 | , let (dockey, doctermids, docfeatvals) = SI.lookupDocId searchIndex docid ] 200 | 201 | 202 | explainRelevanceScore :: (Ix field, Bounded field, Ix feature, Bounded feature) => 203 | SearchEngine doc key field feature -> 204 | [TermId] -> 205 | DocTermIds field -> 206 | DocFeatVals feature -> 207 | BM25F.Explanation field feature Term 208 | explainRelevanceScore SearchEngine{bm25Context, searchIndex} 209 | queryTerms doctermids docfeatvals = 210 | fmap (SI.getTerm searchIndex) (BM25F.explain bm25Context doc queryTerms) 211 | where 212 | doc = indexDocToBM25Doc doctermids docfeatvals 213 | 214 | 215 | setRankParams :: SearchRankParameters field feature -> 216 | SearchEngine doc key field feature -> 217 | SearchEngine doc key field feature 218 | setRankParams params@SearchRankParameters{..} se = 219 | se { 220 | searchRankParams = params, 221 | bm25Context = (bm25Context se) { 222 | BM25F.paramK1 = paramK1, 223 | BM25F.paramB = paramB, 224 | BM25F.fieldWeight = paramFieldWeights, 225 | BM25F.featureWeight = paramFeatureWeights, 226 | BM25F.featureFunction = paramFeatureFunctions 227 | } 228 | } 229 | 230 | 231 | -------------------------------- 232 | -- Results filter 233 | -- 234 | 235 | -- | In some applications it is necessary to enforce some security or 236 | -- visibility rule about the query results (e.g. in a typical DB-based 237 | -- application different users can see different data items). 
Typically 238 | -- it would be too expensive to build different search indexes for the 239 | -- different contexts and so the strategy is to use one index containing 240 | -- everything and filter for visibility in the results. This means the 241 | -- filter condition is different for different queries (e.g. performed 242 | -- on behalf of different users). 243 | -- 244 | -- Filtering the results after a query is possible but not the most efficient 245 | -- thing to do because we've had to score all the not-visible documents. 246 | -- The better thing to do is to filter as part of the query, this way we can 247 | -- filter before the expensive scoring. 248 | -- 249 | -- We provide one further optimisation: bulk predicates. In some applications 250 | -- it can be quicker to check the security\/visibility of a whole bunch of 251 | -- results all in one go. 252 | -- 253 | data ResultsFilter key = NoFilter 254 | | FilterPredicate (key -> Bool) 255 | | FilterBulkPredicate ([key] -> [Bool]) 256 | --TODO: allow filtering & non-feature score lookup in one bulk op 257 | 258 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/SearchIndex.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, NamedFieldPuns #-} 2 | 3 | module Data.SearchEngine.SearchIndex ( 4 | SearchIndex, 5 | Term, 6 | TermId, 7 | DocId, 8 | 9 | emptySearchIndex, 10 | insertDoc, 11 | deleteDoc, 12 | 13 | docCount, 14 | lookupTerm, 15 | lookupTermsByPrefix, 16 | lookupTermId, 17 | lookupDocId, 18 | lookupDocKey, 19 | lookupDocKeyDocId, 20 | 21 | getTerm, 22 | getDocKey, 23 | 24 | invariant, 25 | ) where 26 | 27 | import Data.SearchEngine.DocIdSet (DocIdSet, DocId) 28 | import qualified Data.SearchEngine.DocIdSet as DocIdSet 29 | import Data.SearchEngine.DocTermIds (DocTermIds, TermId, vecIndexIx, vecCreateIx) 30 | import qualified Data.SearchEngine.DocTermIds as DocTermIds 31 | import Data.SearchEngine.DocFeatVals (DocFeatVals) 32 | import qualified Data.SearchEngine.DocFeatVals as DocFeatVals 33 | 34 | import Data.Ix (Ix) 35 | import qualified Data.Ix as Ix 36 | import Data.Map (Map) 37 | import qualified Data.Map as Map 38 | import Data.IntMap (IntMap) 39 | import qualified Data.IntMap as IntMap 40 | import qualified Data.Set as Set 41 | import Data.Text (Text) 42 | import qualified Data.Text as T 43 | import qualified Data.List as List 44 | 45 | import Control.Exception (assert) 46 | 47 | -- | Terms are short strings, usually whole words. 48 | -- 49 | type Term = Text 50 | 51 | -- | The search index is essentially a many-to-many mapping between documents 52 | -- and terms. Each document contains many terms and each term occurs in many 53 | -- documents. It is a bidirectional mapping as we need to support lookups in 54 | -- both directions. 55 | -- 56 | -- Documents are identified by a key (in Ord) while terms are text values. 57 | -- Inside the index however we assign compact numeric ids to both documents and 58 | -- terms. The advantage of this is a much more compact in-memory representation 59 | -- and the disadvantage is greater complexity. In particular it means we have 60 | -- to manage bidirectional mappings between document keys and ids, and between 61 | -- terms and term ids. 
62 | -- 63 | -- So the mappings we maintain can be depicted as: 64 | -- 65 | -- > Term <-- 1:1 --> TermId 66 | -- > \ ^ 67 | -- > \ | 68 | -- > 1:many many:many 69 | -- > \ | 70 | -- > \-> v 71 | -- > DocKey <-- 1:1 --> DocId 72 | -- 73 | -- For efficiency, these details are exposed in the interface. In particular 74 | -- the mapping from TermId to many DocIds is exposed via a 'DocIdSet', 75 | -- and the mapping from DocIds to TermIds is exposed via 'DocTermIds'. 76 | -- 77 | -- The main reason we need to keep the DocId -> TermId is to allow for 78 | -- efficient incremental updates. 79 | -- 80 | data SearchIndex key field feature = SearchIndex { 81 | -- the indexes 82 | termMap :: !(Map Term TermInfo), 83 | termIdMap :: !(IntMap TermIdInfo), 84 | docIdMap :: !(IntMap (DocInfo key field feature)), 85 | docKeyMap :: !(Map key DocId), 86 | 87 | -- auto-increment key counters 88 | nextTermId :: TermId, 89 | nextDocId :: DocId 90 | } 91 | deriving Show 92 | 93 | data TermInfo = TermInfo !TermId !DocIdSet 94 | deriving Show 95 | 96 | data TermIdInfo = TermIdInfo !Term !DocIdSet 97 | deriving (Show, Eq) 98 | 99 | data DocInfo key field feature = DocInfo !key !(DocTermIds field) 100 | !(DocFeatVals feature) 101 | deriving Show 102 | 103 | 104 | ----------------------- 105 | -- SearchIndex basics 106 | -- 107 | 108 | emptySearchIndex :: SearchIndex key field feature 109 | emptySearchIndex = 110 | SearchIndex 111 | Map.empty 112 | IntMap.empty 113 | IntMap.empty 114 | Map.empty 115 | minBound 116 | minBound 117 | 118 | checkInvariant :: (Ord key, Ix field, Bounded field) => 119 | SearchIndex key field feature -> SearchIndex key field feature 120 | checkInvariant si = assert (invariant si) si 121 | 122 | invariant :: (Ord key, Ix field, Bounded field) => 123 | SearchIndex key field feature -> Bool 124 | invariant SearchIndex{termMap, termIdMap, docKeyMap, docIdMap} = 125 | and [ IntMap.lookup (fromEnum termId) termIdMap 126 | == Just (TermIdInfo term docidset) 127 | | (term, (TermInfo termId docidset)) <- Map.assocs termMap ] 128 | && and [ case Map.lookup term termMap of 129 | Just (TermInfo termId' docidset') -> toEnum termId == termId' 130 | && docidset == docidset' 131 | Nothing -> False 132 | | (termId, (TermIdInfo term docidset)) <- IntMap.assocs termIdMap ] 133 | && and [ case IntMap.lookup (fromEnum docId) docIdMap of 134 | Just (DocInfo docKey' _ _) -> docKey == docKey' 135 | Nothing -> False 136 | | (docKey, docId) <- Map.assocs docKeyMap ] 137 | && and [ Map.lookup docKey docKeyMap == Just (toEnum docId) 138 | | (docId, DocInfo docKey _ _) <- IntMap.assocs docIdMap ] 139 | && and [ DocIdSet.invariant docIdSet 140 | | (_term, (TermInfo _ docIdSet)) <- Map.assocs termMap ] 141 | && and [ any (\field -> DocTermIds.fieldTermCount docterms field termId > 0) fields 142 | | (_term, (TermInfo termId docIdSet)) <- Map.assocs termMap 143 | , docId <- DocIdSet.toList docIdSet 144 | , let DocInfo _ docterms _ = docIdMap IntMap.! 
fromEnum docId ] 145 | && and [ IntMap.member (fromEnum termid) termIdMap 146 | | (_docId, DocInfo _ docTerms _) <- IntMap.assocs docIdMap 147 | , field <- fields 148 | , termid <- DocTermIds.fieldElems docTerms field ] 149 | where 150 | fields = Ix.range (minBound, maxBound) 151 | 152 | 153 | ------------------- 154 | -- Lookups 155 | -- 156 | 157 | docCount :: SearchIndex key field feature -> Int 158 | docCount SearchIndex{docIdMap} = IntMap.size docIdMap 159 | 160 | lookupTerm :: SearchIndex key field feature -> Term -> Maybe (TermId, DocIdSet) 161 | lookupTerm SearchIndex{termMap} term = 162 | case Map.lookup term termMap of 163 | Nothing -> Nothing 164 | Just (TermInfo termid docidset) -> Just (termid, docidset) 165 | 166 | lookupTermsByPrefix :: SearchIndex key field feature -> 167 | Term -> [(TermId, DocIdSet)] 168 | lookupTermsByPrefix SearchIndex{termMap} term = 169 | [ (termid, docidset) 170 | | (TermInfo termid docidset) <- lookupPrefix term termMap ] 171 | 172 | lookupTermId :: SearchIndex key field feature -> TermId -> DocIdSet 173 | lookupTermId SearchIndex{termIdMap} termid = 174 | case IntMap.lookup (fromEnum termid) termIdMap of 175 | Nothing -> error $ "lookupTermId: not found " ++ show termid 176 | Just (TermIdInfo _ docidset) -> docidset 177 | 178 | lookupDocId :: SearchIndex key field feature -> 179 | DocId -> (key, DocTermIds field, DocFeatVals feature) 180 | lookupDocId SearchIndex{docIdMap} docid = 181 | case IntMap.lookup (fromEnum docid) docIdMap of 182 | Nothing -> errNotFound 183 | Just (DocInfo key doctermids docfeatvals) -> (key, doctermids, docfeatvals) 184 | where 185 | errNotFound = error $ "lookupDocId: not found " ++ show docid 186 | 187 | lookupDocKey :: Ord key => SearchIndex key field feature -> 188 | key -> Maybe (DocTermIds field) 189 | lookupDocKey SearchIndex{docKeyMap, docIdMap} key = do 190 | case Map.lookup key docKeyMap of 191 | Nothing -> Nothing 192 | Just docid -> 193 | case IntMap.lookup (fromEnum docid) docIdMap of 194 | Nothing -> error "lookupDocKey: internal error" 195 | Just (DocInfo _key doctermids _) -> Just doctermids 196 | 197 | lookupDocKeyDocId :: Ord key => SearchIndex key field feature -> key -> Maybe DocId 198 | lookupDocKeyDocId SearchIndex{docKeyMap} key = Map.lookup key docKeyMap 199 | 200 | 201 | getTerm :: SearchIndex key field feature -> TermId -> Term 202 | getTerm SearchIndex{termIdMap} termId = 203 | case termIdMap IntMap.! fromEnum termId of TermIdInfo term _ -> term 204 | 205 | getTermId :: SearchIndex key field feature -> Term -> TermId 206 | getTermId SearchIndex{termMap} term = 207 | case termMap Map.! term of TermInfo termid _ -> termid 208 | 209 | getDocKey :: SearchIndex key field feature -> DocId -> key 210 | getDocKey SearchIndex{docIdMap} docid = 211 | case docIdMap IntMap.! fromEnum docid of 212 | DocInfo dockey _ _ -> dockey 213 | 214 | getDocTermIds :: SearchIndex key field feature -> DocId -> DocTermIds field 215 | getDocTermIds SearchIndex{docIdMap} docid = 216 | case docIdMap IntMap.! fromEnum docid of 217 | DocInfo _ doctermids _ -> doctermids 218 | 219 | -------------------- 220 | -- Insert & delete 221 | -- 222 | 223 | -- Procedure for adding a new doc... 224 | -- (key, field -> [Term]) 225 | -- alloc docid for key 226 | -- add term occurences for docid (include rev map for termid) 227 | -- construct indexdoc now that we have all the term -> termid entries 228 | -- insert indexdoc 229 | 230 | -- Procedure for updating a doc... 
231 | -- (key, field -> [Term]) 232 | -- find docid for key 233 | -- lookup old terms for docid (using termid rev map) 234 | -- calc term occurrences to add, term occurrences to delete 235 | -- add new term occurrences, delete old term occurrences 236 | -- construct indexdoc now that we have all the term -> termid entries 237 | -- insert indexdoc 238 | 239 | -- Procedure for deleting a doc... 240 | -- (key, field -> [Term]) 241 | -- find docid for key 242 | -- lookup old terms for docid (using termid rev map) 243 | -- delete old term occurrences 244 | -- delete indexdoc 245 | 246 | -- | This is the representation for documents to be added to the index. 247 | -- Documents may 248 | -- 249 | type DocTerms field = field -> [Term] 250 | type DocFeatureValues feature = feature -> Float 251 | 252 | insertDoc :: (Ord key, Ix field, Bounded field, Ix feature, Bounded feature) => 253 | key -> DocTerms field -> DocFeatureValues feature -> 254 | SearchIndex key field feature -> SearchIndex key field feature 255 | insertDoc key userDocTerms userDocFeats si@SearchIndex{docKeyMap} 256 | | Just docid <- Map.lookup key docKeyMap 257 | = -- Some older version of the doc is already present in the index, 258 | -- So we keep its docid. Now have to update the doc itself 259 | -- and update the terms by removing old ones and adding new ones. 260 | let oldTermsIds = getDocTermIds si docid 261 | userDocTerms' = memoiseDocTerms userDocTerms 262 | newTerms = docTermSet userDocTerms' 263 | oldTerms = docTermIdsTermSet si oldTermsIds 264 | -- We optimise for the typical case of significant overlap between 265 | -- the terms in the old and new versions of the document. 266 | delTerms = oldTerms `Set.difference` newTerms 267 | addTerms = newTerms `Set.difference` oldTerms 268 | 269 | -- Note: adding the doc relies on all the terms being in the termMap 270 | -- already, so we first add all the term occurences for the docid. 271 | in checkInvariant 272 | . insertDocIdToDocEntry docid key userDocTerms' userDocFeats 273 | . insertTermToDocIdEntries (Set.toList addTerms) docid 274 | . deleteTermToDocIdEntries (Set.toList delTerms) docid 275 | $ si 276 | 277 | | otherwise 278 | = -- We're dealing with a new doc, so allocate a docid for the key 279 | let (si', docid) = allocFreshDocId si 280 | userDocTerms' = memoiseDocTerms userDocTerms 281 | addTerms = docTermSet userDocTerms' 282 | 283 | -- Note: adding the doc relies on all the terms being in the termMap 284 | -- already, so we first add all the term occurences for the docid. 285 | in checkInvariant 286 | . insertDocIdToDocEntry docid key userDocTerms' userDocFeats 287 | . insertDocKeyToIdEntry key docid 288 | . insertTermToDocIdEntries (Set.toList addTerms) docid 289 | $ si' 290 | 291 | deleteDoc :: (Ord key, Ix field, Bounded field) => 292 | key -> 293 | SearchIndex key field feature -> SearchIndex key field feature 294 | deleteDoc key si@SearchIndex{docKeyMap} 295 | | Just docid <- Map.lookup key docKeyMap 296 | = let oldTermsIds = getDocTermIds si docid 297 | oldTerms = docTermIdsTermSet si oldTermsIds 298 | in checkInvariant 299 | . deleteDocEntry docid key 300 | . 
deleteTermToDocIdEntries (Set.toList oldTerms) docid 301 | $ si 302 | 303 | | otherwise = si 304 | 305 | 306 | ---------------------------------- 307 | -- Insert & delete support utils 308 | -- 309 | 310 | 311 | memoiseDocTerms :: (Ix field, Bounded field) => DocTerms field -> DocTerms field 312 | memoiseDocTerms docTermsFn = 313 | \field -> vecIndexIx vec field 314 | where 315 | vec = vecCreateIx docTermsFn 316 | 317 | docTermSet :: (Bounded t, Ix t) => DocTerms t -> Set.Set Term 318 | docTermSet docterms = 319 | Set.unions [ Set.fromList (docterms field) 320 | | field <- Ix.range (minBound, maxBound) ] 321 | 322 | docTermIdsTermSet :: (Bounded field, Ix field) => 323 | SearchIndex key field feature -> 324 | DocTermIds field -> Set.Set Term 325 | docTermIdsTermSet si doctermids = 326 | Set.unions [ Set.fromList terms 327 | | field <- Ix.range (minBound, maxBound) 328 | , let termids = DocTermIds.fieldElems doctermids field 329 | terms = map (getTerm si) termids ] 330 | 331 | -- 332 | -- The Term <-> DocId mapping 333 | -- 334 | 335 | -- | Add an entry into the 'Term' to 'DocId' mapping. 336 | insertTermToDocIdEntry :: Term -> DocId -> 337 | SearchIndex key field feature -> 338 | SearchIndex key field feature 339 | insertTermToDocIdEntry term !docid si@SearchIndex{termMap, termIdMap, nextTermId} = 340 | case Map.lookup term termMap of 341 | Nothing -> 342 | let docIdSet' = DocIdSet.singleton docid 343 | !termInfo' = TermInfo nextTermId docIdSet' 344 | !termIdInfo' = TermIdInfo term docIdSet' 345 | in si { termMap = Map.insert term termInfo' termMap 346 | , termIdMap = IntMap.insert (fromEnum nextTermId) 347 | termIdInfo' termIdMap 348 | , nextTermId = succ nextTermId } 349 | 350 | Just (TermInfo termId docIdSet) -> 351 | let docIdSet' = DocIdSet.insert docid docIdSet 352 | !termInfo' = TermInfo termId docIdSet' 353 | !termIdInfo' = TermIdInfo term docIdSet' 354 | in si { termMap = Map.insert term termInfo' termMap 355 | , termIdMap = IntMap.insert (fromEnum termId) 356 | termIdInfo' termIdMap 357 | } 358 | 359 | -- | Add multiple entries into the 'Term' to 'DocId' mapping: many terms that 360 | -- map to the same document. 361 | insertTermToDocIdEntries :: [Term] -> DocId -> 362 | SearchIndex key field feature -> 363 | SearchIndex key field feature 364 | insertTermToDocIdEntries terms !docid si = 365 | List.foldl' (\si' term -> insertTermToDocIdEntry term docid si') si terms 366 | 367 | -- | Delete an entry from the 'Term' to 'DocId' mapping. 368 | deleteTermToDocIdEntry :: Term -> DocId -> 369 | SearchIndex key field feature -> 370 | SearchIndex key field feature 371 | deleteTermToDocIdEntry term !docid si@SearchIndex{termMap, termIdMap} = 372 | case Map.lookup term termMap of 373 | Nothing -> si 374 | Just (TermInfo termId docIdSet) -> 375 | let docIdSet' = DocIdSet.delete docid docIdSet 376 | !termInfo' = TermInfo termId docIdSet' 377 | !termIdInfo' = TermIdInfo term docIdSet' 378 | in if DocIdSet.null docIdSet' 379 | then si { termMap = Map.delete term termMap 380 | , termIdMap = IntMap.delete (fromEnum termId) termIdMap } 381 | else si { termMap = Map.insert term termInfo' termMap 382 | , termIdMap = IntMap.insert (fromEnum termId) 383 | termIdInfo' termIdMap 384 | } 385 | 386 | -- | Delete multiple entries from the 'Term' to 'DocId' mapping: many terms 387 | -- that map to the same document. 
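--
-- This is a strict left fold of 'deleteTermToDocIdEntry' (above); when a
-- term's last remaining document is removed, the term itself is dropped from
-- both 'termMap' and 'termIdMap', so deleting a document cannot leave empty
-- 'DocIdSet's behind.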
388 | deleteTermToDocIdEntries :: [Term] -> DocId -> 389 | SearchIndex key field feature -> 390 | SearchIndex key field feature 391 | deleteTermToDocIdEntries terms !docid si = 392 | List.foldl' (\si' term -> deleteTermToDocIdEntry term docid si') si terms 393 | 394 | -- 395 | -- The DocId <-> Doc mapping 396 | -- 397 | 398 | allocFreshDocId :: SearchIndex key field feature -> 399 | (SearchIndex key field feature, DocId) 400 | allocFreshDocId si@SearchIndex{nextDocId} = 401 | let !si' = si { nextDocId = succ nextDocId } 402 | in (si', nextDocId) 403 | 404 | insertDocKeyToIdEntry :: Ord key => key -> DocId -> 405 | SearchIndex key field feature -> 406 | SearchIndex key field feature 407 | insertDocKeyToIdEntry dockey !docid si@SearchIndex{docKeyMap} = 408 | si { docKeyMap = Map.insert dockey docid docKeyMap } 409 | 410 | insertDocIdToDocEntry :: (Ix field, Bounded field, 411 | Ix feature, Bounded feature) => 412 | DocId -> key -> 413 | DocTerms field -> 414 | DocFeatureValues feature -> 415 | SearchIndex key field feature -> 416 | SearchIndex key field feature 417 | insertDocIdToDocEntry !docid dockey userdocterms userdocfeats 418 | si@SearchIndex{docIdMap} = 419 | let doctermids = DocTermIds.create (map (getTermId si) . userdocterms) 420 | docfeatvals= DocFeatVals.create userdocfeats 421 | !docinfo = DocInfo dockey doctermids docfeatvals 422 | in si { docIdMap = IntMap.insert (fromEnum docid) docinfo docIdMap } 423 | 424 | deleteDocEntry :: Ord key => DocId -> key -> 425 | SearchIndex key field feature -> SearchIndex key field feature 426 | deleteDocEntry docid key si@SearchIndex{docIdMap, docKeyMap} = 427 | si { docIdMap = IntMap.delete (fromEnum docid) docIdMap 428 | , docKeyMap = Map.delete key docKeyMap } 429 | 430 | -- 431 | -- Data.Map utils 432 | -- 433 | 434 | -- Data.Map does not support prefix lookups directly (unlike a trie) 435 | -- but we can implement it reasonably efficiently using split: 436 | 437 | -- | Lookup values for a range of keys (inclusive lower bound and exclusive 438 | -- upper bound) 439 | -- 440 | lookupRange :: Ord k => (k, k) -> Map k v -> [v] 441 | lookupRange (lb, ub) m = 442 | let (_, mv, gt) = Map.splitLookup lb m 443 | (between, _) = Map.split ub gt 444 | in case mv of 445 | Just v -> v : Map.elems between 446 | Nothing -> Map.elems between 447 | 448 | lookupPrefix :: Text -> Map Text v -> [v] 449 | lookupPrefix t _ | T.null t = [] 450 | lookupPrefix t m = lookupRange (t, prefixUpperBound t) m 451 | 452 | prefixUpperBound :: Text -> Text 453 | prefixUpperBound = succLast . 
T.dropWhileEnd (== maxBound) 454 | where 455 | succLast t = T.init t `T.snoc` succ (T.last t) 456 | 457 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/TermBag.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, GeneralizedNewtypeDeriving, MultiParamTypeClasses, 2 | TypeFamilies #-} 3 | {-# LANGUAGE CPP #-} 4 | #if __GLASGOW_HASKELL__ >= 908 5 | {-# OPTIONS_GHC -Wno-x-partial #-} 6 | #endif 7 | 8 | module Data.SearchEngine.TermBag ( 9 | TermId(TermId), TermCount, 10 | TermBag, 11 | size, 12 | fromList, 13 | toList, 14 | elems, 15 | termCount, 16 | denseTable, 17 | invariant 18 | ) where 19 | 20 | import qualified Data.Vector.Unboxed as Vec 21 | import qualified Data.Vector.Unboxed.Mutable as MVec 22 | import qualified Data.Vector.Generic as GVec 23 | import qualified Data.Vector.Generic.Mutable as GMVec 24 | import Control.Monad.ST 25 | import Control.Monad (liftM) 26 | import qualified Data.Map as Map 27 | import Data.Word (Word32, Word8) 28 | import Data.Bits 29 | import qualified Data.List as List 30 | import Data.Function (on) 31 | 32 | newtype TermId = TermId { unTermId :: Word32 } 33 | deriving (Eq, Ord, Show, Enum) 34 | 35 | instance Bounded TermId where 36 | minBound = TermId 0 37 | maxBound = TermId 0x00FFFFFF 38 | 39 | data TermBag = TermBag !Int !(Vec.Vector TermIdAndCount) 40 | deriving Show 41 | 42 | -- We sneakily stuff both the TermId and the bag count into one 32bit word 43 | type TermIdAndCount = Word32 44 | type TermCount = Word8 45 | 46 | -- Bottom 24 bits is the TermId, top 8 bits is the bag count 47 | termIdAndCount :: TermId -> Int -> TermIdAndCount 48 | termIdAndCount (TermId termid) freq = 49 | (min (fromIntegral freq) 255 `shiftL` 24) 50 | .|. (termid .&. 0x00FFFFFF) 51 | 52 | getTermId :: TermIdAndCount -> TermId 53 | getTermId word = TermId (word .&. 0x00FFFFFF) 54 | 55 | getTermCount :: TermIdAndCount -> TermCount 56 | getTermCount word = fromIntegral (word `shiftR` 24) 57 | 58 | invariant :: TermBag -> Bool 59 | invariant (TermBag _ vec) = 60 | strictlyAscending (Vec.toList vec) 61 | where 62 | strictlyAscending (a:xs@(b:_)) = getTermId a < getTermId b 63 | && strictlyAscending xs 64 | strictlyAscending _ = True 65 | 66 | size :: TermBag -> Int 67 | size (TermBag sz _) = sz 68 | 69 | elems :: TermBag -> [TermId] 70 | elems (TermBag _ vec) = map getTermId (Vec.toList vec) 71 | 72 | toList :: TermBag -> [(TermId, TermCount)] 73 | toList (TermBag _ vec) = [ (getTermId x, getTermCount x) 74 | | x <- Vec.toList vec ] 75 | 76 | termCount :: TermBag -> TermId -> TermCount 77 | termCount (TermBag _ vec) = 78 | binarySearch 0 (Vec.length vec - 1) 79 | where 80 | binarySearch :: Int -> Int -> TermId -> TermCount 81 | binarySearch !a !b !key 82 | | a > b = 0 83 | | otherwise = 84 | let mid = (a + b) `div` 2 85 | tidAndCount = vec Vec.! mid 86 | in case compare key (getTermId tidAndCount) of 87 | LT -> binarySearch a (mid-1) key 88 | EQ -> getTermCount tidAndCount 89 | GT -> binarySearch (mid+1) b key 90 | 91 | fromList :: [TermId] -> TermBag 92 | fromList termids = 93 | let bag = Map.fromListWith (+) [ (t, 1) | t <- termids ] 94 | sz = Map.foldl' (+) 0 bag 95 | vec = Vec.fromListN (Map.size bag) 96 | [ termIdAndCount termid freq 97 | | (termid, freq) <- Map.toAscList bag ] 98 | in TermBag sz vec 99 | 100 | -- | Given a bunch of term bags, merge them into a table for easier subsequent 101 | -- processing. This is bascially a sparse to dense conversion. 
Missing entries 102 | -- are filled in with 0. We represent the table as one vector for the 103 | -- term ids and a 2d array for the counts. 104 | -- 105 | -- Unfortunately vector does not directly support 2d arrays and array does 106 | -- not make it easy to trim arrays. 107 | -- 108 | denseTable :: [TermBag] -> (Vec.Vector TermId, Vec.Vector TermCount) 109 | denseTable termbags = 110 | (tids, tcts) 111 | where 112 | -- First merge the TermIds into one array 113 | -- then make a linear pass to create the counts array 114 | -- filling in 0s or the counts as we find them 115 | !numBags = length termbags 116 | !tids = unionsTermId termbags 117 | !numTerms = Vec.length tids 118 | !numCounts = numTerms * numBags 119 | !tcts = Vec.create (do 120 | out <- MVec.new numCounts 121 | sequence_ 122 | [ writeMergedTermCounts tids bag out i 123 | | (n, TermBag _ bag) <- zip [0..] termbags 124 | , let i = n * numTerms ] 125 | return out 126 | ) 127 | 128 | writeMergedTermCounts :: Vec.Vector TermId -> Vec.Vector TermIdAndCount -> 129 | MVec.MVector s TermCount -> Int -> ST s () 130 | writeMergedTermCounts xs0 ys0 !out i0 = 131 | -- assume xs & ys are sorted, and ys contains a subset of xs 132 | go xs0 ys0 i0 133 | where 134 | go !xs !ys !i 135 | | Vec.null ys = MVec.set (MVec.slice i (Vec.length xs) out) 0 136 | | Vec.null xs = return () 137 | | otherwise = let x = Vec.head xs 138 | ytc = Vec.head ys 139 | y = getTermId ytc 140 | c = getTermCount ytc 141 | in case x == y of 142 | True -> do MVec.write out i c 143 | go (Vec.tail xs) (Vec.tail ys) (i+1) 144 | False -> do MVec.write out i 0 145 | go (Vec.tail xs) ys (i+1) 146 | 147 | -- | Given a set of term bags, form the set of TermIds 148 | -- 149 | unionsTermId :: [TermBag] -> Vec.Vector TermId 150 | unionsTermId tbs = 151 | case List.sortBy (compare `on` bagVecLength) tbs of 152 | [] -> Vec.empty 153 | [TermBag _ xs] -> (Vec.map getTermId xs) 154 | (x0:x1:xs) -> List.foldl' union3 (union2 x0 x1) xs 155 | where 156 | bagVecLength (TermBag _ vec) = Vec.length vec 157 | 158 | union2 :: TermBag -> TermBag -> Vec.Vector TermId 159 | union2 (TermBag _ xs) (TermBag _ ys) = 160 | Vec.create (MVec.new sizeBound >>= writeMergedUnion2 xs ys) 161 | where 162 | sizeBound = Vec.length xs + Vec.length ys 163 | 164 | writeMergedUnion2 :: Vec.Vector TermIdAndCount -> Vec.Vector TermIdAndCount -> 165 | MVec.MVector s TermId -> ST s (MVec.MVector s TermId) 166 | writeMergedUnion2 xs0 ys0 !out = do 167 | i <- go xs0 ys0 0 168 | return $! 
MVec.take i out 169 | where 170 | go !xs !ys !i 171 | | Vec.null xs = do Vec.copy (MVec.slice i (Vec.length ys) out) 172 | (Vec.map getTermId ys) 173 | return (i + Vec.length ys) 174 | | Vec.null ys = do Vec.copy (MVec.slice i (Vec.length xs) out) 175 | (Vec.map getTermId xs) 176 | return (i + Vec.length xs) 177 | | otherwise = let x = getTermId (Vec.head xs) 178 | y = getTermId (Vec.head ys) 179 | in case compare x y of 180 | GT -> do MVec.write out i y 181 | go xs (Vec.tail ys) (i+1) 182 | EQ -> do MVec.write out i x 183 | go (Vec.tail xs) (Vec.tail ys) (i+1) 184 | LT -> do MVec.write out i x 185 | go (Vec.tail xs) ys (i+1) 186 | 187 | union3 :: Vec.Vector TermId -> TermBag -> Vec.Vector TermId 188 | union3 xs (TermBag _ ys) = 189 | Vec.create (MVec.new sizeBound >>= writeMergedUnion3 xs ys) 190 | where 191 | sizeBound = Vec.length xs + Vec.length ys 192 | 193 | writeMergedUnion3 :: Vec.Vector TermId -> Vec.Vector TermIdAndCount -> 194 | MVec.MVector s TermId -> ST s (MVec.MVector s TermId) 195 | writeMergedUnion3 xs0 ys0 !out = do 196 | i <- go xs0 ys0 0 197 | return $! MVec.take i out 198 | where 199 | go !xs !ys !i 200 | | Vec.null xs = do Vec.copy (MVec.slice i (Vec.length ys) out) 201 | (Vec.map getTermId ys) 202 | return (i + Vec.length ys) 203 | | Vec.null ys = do Vec.copy (MVec.slice i (Vec.length xs) out) xs 204 | return (i + Vec.length xs) 205 | | otherwise = let x = Vec.head xs 206 | y = getTermId (Vec.head ys) 207 | in case compare x y of 208 | GT -> do MVec.write out i y 209 | go xs (Vec.tail ys) (i+1) 210 | EQ -> do MVec.write out i x 211 | go (Vec.tail xs) (Vec.tail ys) (i+1) 212 | LT -> do MVec.write out i x 213 | go (Vec.tail xs) ys (i+1) 214 | 215 | ------------------------------------------------------------------------------ 216 | -- verbose Unbox instances 217 | -- 218 | 219 | instance MVec.Unbox TermId 220 | 221 | newtype instance MVec.MVector s TermId = MV_TermId (MVec.MVector s Word32) 222 | 223 | instance GMVec.MVector MVec.MVector TermId where 224 | basicLength (MV_TermId v) = GMVec.basicLength v 225 | basicUnsafeSlice i l (MV_TermId v) = MV_TermId (GMVec.basicUnsafeSlice i l v) 226 | basicUnsafeNew l = MV_TermId `liftM` GMVec.basicUnsafeNew l 227 | basicInitialize (MV_TermId v) = GMVec.basicInitialize v 228 | basicUnsafeReplicate l x = MV_TermId `liftM` GMVec.basicUnsafeReplicate l (unTermId x) 229 | basicUnsafeRead (MV_TermId v) i = TermId `liftM` GMVec.basicUnsafeRead v i 230 | basicUnsafeWrite (MV_TermId v) i x = GMVec.basicUnsafeWrite v i (unTermId x) 231 | basicClear (MV_TermId v) = GMVec.basicClear v 232 | basicSet (MV_TermId v) x = GMVec.basicSet v (unTermId x) 233 | basicUnsafeGrow (MV_TermId v) l = MV_TermId `liftM` GMVec.basicUnsafeGrow v l 234 | basicUnsafeCopy (MV_TermId v) (MV_TermId v') = GMVec.basicUnsafeCopy v v' 235 | basicUnsafeMove (MV_TermId v) (MV_TermId v') = GMVec.basicUnsafeMove v v' 236 | basicOverlaps (MV_TermId v) (MV_TermId v') = GMVec.basicOverlaps v v' 237 | {-# INLINE basicLength #-} 238 | {-# INLINE basicUnsafeSlice #-} 239 | {-# INLINE basicOverlaps #-} 240 | {-# INLINE basicUnsafeNew #-} 241 | {-# INLINE basicInitialize #-} 242 | {-# INLINE basicUnsafeReplicate #-} 243 | {-# INLINE basicUnsafeRead #-} 244 | {-# INLINE basicUnsafeWrite #-} 245 | {-# INLINE basicClear #-} 246 | {-# INLINE basicSet #-} 247 | {-# INLINE basicUnsafeCopy #-} 248 | {-# INLINE basicUnsafeMove #-} 249 | {-# INLINE basicUnsafeGrow #-} 250 | 251 | newtype instance Vec.Vector TermId = V_TermId (Vec.Vector Word32) 252 | 253 | instance GVec.Vector Vec.Vector 
TermId where 254 | basicUnsafeFreeze (MV_TermId mv) = V_TermId `liftM` GVec.basicUnsafeFreeze mv 255 | basicUnsafeThaw (V_TermId v) = MV_TermId `liftM` GVec.basicUnsafeThaw v 256 | basicLength (V_TermId v) = GVec.basicLength v 257 | basicUnsafeSlice i l (V_TermId v) = V_TermId (GVec.basicUnsafeSlice i l v) 258 | basicUnsafeIndexM (V_TermId v) i = TermId `liftM` GVec.basicUnsafeIndexM v i 259 | basicUnsafeCopy (MV_TermId mv) 260 | (V_TermId v) = GVec.basicUnsafeCopy mv v 261 | elemseq (V_TermId v) x = GVec.elemseq v (unTermId x) 262 | {-# INLINE basicUnsafeFreeze #-} 263 | {-# INLINE basicUnsafeThaw #-} 264 | {-# INLINE basicLength #-} 265 | {-# INLINE basicUnsafeSlice #-} 266 | {-# INLINE basicUnsafeIndexM #-} 267 | {-# INLINE basicUnsafeCopy #-} 268 | {-# INLINE elemseq #-} 269 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE NamedFieldPuns, RecordWildCards #-} 2 | 3 | module Data.SearchEngine.Types ( 4 | -- * Search engine types and helper functions 5 | SearchEngine(..), 6 | SearchConfig(..), 7 | SearchRankParameters(..), 8 | BM25F.FeatureFunction(..), 9 | initSearchEngine, 10 | cacheBM25Context, 11 | 12 | -- ** Helper type for non-term features 13 | NoFeatures, 14 | noFeatures, 15 | 16 | -- * Re-export SearchIndex and other types 17 | SearchIndex, Term, TermId, 18 | DocIdSet, DocId, 19 | DocTermIds, DocFeatVals, 20 | 21 | -- * Internal sanity check 22 | invariant, 23 | ) where 24 | 25 | import Data.SearchEngine.SearchIndex (SearchIndex, Term, TermId) 26 | import qualified Data.SearchEngine.SearchIndex as SI 27 | import Data.SearchEngine.DocIdSet (DocIdSet, DocId) 28 | import qualified Data.SearchEngine.DocIdSet as DocIdSet 29 | import Data.SearchEngine.DocFeatVals (DocFeatVals) 30 | import Data.SearchEngine.DocTermIds (DocTermIds) 31 | import qualified Data.SearchEngine.BM25F as BM25F 32 | 33 | import Data.Ix 34 | import Data.Array.Unboxed 35 | 36 | 37 | 38 | data SearchConfig doc key field feature = SearchConfig { 39 | documentKey :: doc -> key, 40 | extractDocumentTerms :: doc -> field -> [Term], 41 | transformQueryTerm :: Term -> field -> Term, 42 | documentFeatureValue :: doc -> feature -> Float 43 | } 44 | 45 | data SearchRankParameters field feature = SearchRankParameters { 46 | paramK1 :: !Float, 47 | paramB :: field -> Float, 48 | paramFieldWeights :: field -> Float, 49 | paramFeatureWeights :: feature -> Float, 50 | paramFeatureFunctions :: feature -> BM25F.FeatureFunction, 51 | 52 | paramResultsetSoftLimit :: !Int, 53 | paramResultsetHardLimit :: !Int, 54 | paramAutosuggestPrefilterLimit :: !Int, 55 | paramAutosuggestPostfilterLimit :: !Int 56 | } 57 | 58 | data SearchEngine doc key field feature = SearchEngine { 59 | searchIndex :: !(SearchIndex key field feature), 60 | searchConfig :: !(SearchConfig doc key field feature), 61 | searchRankParams :: !(SearchRankParameters field feature), 62 | 63 | -- cached info 64 | sumFieldLengths :: !(UArray field Int), 65 | bm25Context :: BM25F.Context TermId field feature 66 | } 67 | 68 | invariant :: (Ord key, Ix field, Bounded field) => 69 | SearchEngine doc key field feature -> Bool 70 | invariant SearchEngine{searchIndex} = 71 | SI.invariant searchIndex 72 | -- && check caches 73 | 74 | initSearchEngine :: (Ix field, Bounded field, Ix feature, Bounded feature) => 75 | SearchConfig doc key field feature -> 76 | SearchRankParameters field feature -> 77 | SearchEngine doc key 
field feature 78 | initSearchEngine config params = 79 | cacheBM25Context 80 | SearchEngine { 81 | searchIndex = SI.emptySearchIndex, 82 | searchConfig = config, 83 | searchRankParams = params, 84 | sumFieldLengths = listArray (minBound, maxBound) (repeat 0), 85 | bm25Context = undefined 86 | } 87 | 88 | cacheBM25Context :: Ix field => 89 | SearchEngine doc key field feature -> 90 | SearchEngine doc key field feature 91 | cacheBM25Context 92 | se@SearchEngine { 93 | searchRankParams = SearchRankParameters{..}, 94 | searchIndex, 95 | sumFieldLengths 96 | } 97 | = se { bm25Context = bm25Context' } 98 | where 99 | bm25Context' = BM25F.Context { 100 | BM25F.numDocsTotal = SI.docCount searchIndex, 101 | BM25F.avgFieldLength = \f -> fromIntegral (sumFieldLengths ! f) 102 | / fromIntegral (SI.docCount searchIndex), 103 | BM25F.numDocsWithTerm = DocIdSet.size . SI.lookupTermId searchIndex, 104 | BM25F.paramK1 = paramK1, 105 | BM25F.paramB = paramB, 106 | BM25F.fieldWeight = paramFieldWeights, 107 | BM25F.featureWeight = paramFeatureWeights, 108 | BM25F.featureFunction = paramFeatureFunctions 109 | } 110 | 111 | 112 | ----------------------------- 113 | 114 | data NoFeatures = NoFeatures 115 | deriving (Eq, Ord, Bounded, Show) 116 | 117 | instance Ix NoFeatures where 118 | range _ = [] 119 | inRange _ _ = False 120 | index _ _ = -1 121 | 122 | noFeatures :: NoFeatures -> a 123 | noFeatures _ = error "noFeatures" 124 | 125 | -------------------------------------------------------------------------------- /src/Data/SearchEngine/Update.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, NamedFieldPuns, RecordWildCards #-} 2 | 3 | module Data.SearchEngine.Update ( 4 | 5 | -- * Managing documents to be searched 6 | insertDoc, 7 | insertDocs, 8 | deleteDoc, 9 | 10 | ) where 11 | 12 | import Data.SearchEngine.Types 13 | import qualified Data.SearchEngine.SearchIndex as SI 14 | import qualified Data.SearchEngine.DocTermIds as DocTermIds 15 | 16 | import qualified Data.List as List 17 | import Data.Ix 18 | import Data.Array.Unboxed 19 | 20 | 21 | insertDocs :: (Ord key, Ix field, Bounded field, Ix feature, Bounded feature) => 22 | [doc] -> 23 | SearchEngine doc key field feature -> 24 | SearchEngine doc key field feature 25 | insertDocs docs se = List.foldl' (\se' doc -> insertDoc doc se') se docs 26 | 27 | 28 | insertDoc :: (Ord key, Ix field, Bounded field, Ix feature, Bounded feature) => 29 | doc -> 30 | SearchEngine doc key field feature -> 31 | SearchEngine doc key field feature 32 | insertDoc doc se@SearchEngine{ searchConfig = SearchConfig { 33 | documentKey, 34 | extractDocumentTerms, 35 | documentFeatureValue 36 | } 37 | , searchIndex } = 38 | let key = documentKey doc 39 | searchIndex' = SI.insertDoc key (extractDocumentTerms doc) 40 | (documentFeatureValue doc) 41 | searchIndex 42 | oldDoc = SI.lookupDocKey searchIndex key 43 | newDoc = SI.lookupDocKey searchIndex' key 44 | 45 | in cacheBM25Context $ 46 | updateCachedFieldLengths oldDoc newDoc $ 47 | se { searchIndex = searchIndex' } 48 | 49 | 50 | deleteDoc :: (Ord key, Ix field, Bounded field) => 51 | key -> 52 | SearchEngine doc key field feature -> 53 | SearchEngine doc key field feature 54 | deleteDoc key se@SearchEngine{searchIndex} = 55 | let searchIndex' = SI.deleteDoc key searchIndex 56 | oldDoc = SI.lookupDocKey searchIndex key 57 | 58 | in cacheBM25Context $ 59 | updateCachedFieldLengths oldDoc Nothing $ 60 | se { searchIndex = searchIndex' } 61 | 62 | 63 | 
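-- A minimal usage sketch (illustrative only: the 'Pkg' document type, its
-- field accessors and the 'PkgField' enumeration below are hypothetical, and
-- the 'SearchRankParameters' value @rankParams@ is assumed to be defined
-- elsewhere):
--
-- > data PkgField = NameField | DescriptionField
-- >   deriving (Eq, Ord, Enum, Bounded, Ix, Show)
-- >
-- > config :: SearchConfig Pkg Text PkgField NoFeatures
-- > config = SearchConfig
-- >   { documentKey          = pkgName
-- >   , extractDocumentTerms = \pkg field -> case field of
-- >                              NameField        -> T.words (pkgName pkg)
-- >                              DescriptionField -> T.words (pkgDescription pkg)
-- >   , transformQueryTerm   = \term _field -> T.toLower term
-- >   , documentFeatureValue = \_pkg -> noFeatures
-- >   }
-- >
-- > engine :: SearchEngine Pkg Text PkgField NoFeatures
-- > engine = insertDocs pkgs (initSearchEngine config rankParams)
--
-- The 'insertDoc'\/'deleteDoc' operations above keep the cached per-field
-- length sums and the BM25F context in sync via 'updateCachedFieldLengths'
-- (below) and 'cacheBM25Context'.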
updateCachedFieldLengths :: (Ix field, Bounded field) => 64 | Maybe (DocTermIds field) -> Maybe (DocTermIds field) -> 65 | SearchEngine doc key field feature -> 66 | SearchEngine doc key field feature 67 | updateCachedFieldLengths Nothing (Just newDoc) se@SearchEngine{sumFieldLengths} = 68 | se { 69 | sumFieldLengths = 70 | array (bounds sumFieldLengths) 71 | [ (i, n + DocTermIds.fieldLength newDoc i) 72 | | (i, n) <- assocs sumFieldLengths ] 73 | } 74 | updateCachedFieldLengths (Just oldDoc) (Just newDoc) se@SearchEngine{sumFieldLengths} = 75 | se { 76 | sumFieldLengths = 77 | array (bounds sumFieldLengths) 78 | [ (i, n - DocTermIds.fieldLength oldDoc i 79 | + DocTermIds.fieldLength newDoc i) 80 | | (i, n) <- assocs sumFieldLengths ] 81 | } 82 | updateCachedFieldLengths (Just oldDoc) Nothing se@SearchEngine{sumFieldLengths} = 83 | se { 84 | sumFieldLengths = 85 | array (bounds sumFieldLengths) 86 | [ (i, n - DocTermIds.fieldLength oldDoc i) 87 | | (i, n) <- assocs sumFieldLengths ] 88 | } 89 | updateCachedFieldLengths Nothing Nothing se = se 90 | 91 | -------------------------------------------------------------------------------- /tests/Main.hs: -------------------------------------------------------------------------------- 1 | 2 | import qualified Test.Data.SearchEngine.DocIdSet as DocIdSet 3 | import qualified Test.Data.SearchEngine.TermBag as TermBag 4 | 5 | import Test.Tasty 6 | import Test.Tasty.QuickCheck 7 | 8 | 9 | main :: IO () 10 | main = defaultMain $ 11 | testGroup "" 12 | [ docIdSetTests 13 | , termBagTests 14 | ] 15 | 16 | docIdSetTests :: TestTree 17 | docIdSetTests = 18 | testGroup "TermIdSet" 19 | [ testProperty "prop_insert" DocIdSet.prop_insert 20 | , testProperty "prop_delete" DocIdSet.prop_delete 21 | , testProperty "prop_delete'" DocIdSet.prop_delete' 22 | , testProperty "prop_union" DocIdSet.prop_union 23 | , testProperty "prop_union'" DocIdSet.prop_union' 24 | ] 25 | 26 | termBagTests :: TestTree 27 | termBagTests = 28 | testGroup "TermBag" 29 | [ testProperty "prop_invariant" TermBag.prop_invariant 30 | , testProperty "prop_elems" TermBag.prop_elems 31 | , testProperty "prop_fromList" TermBag.prop_fromList 32 | , testProperty "prop_size" TermBag.prop_size 33 | , testProperty "prop_termCount" TermBag.prop_termCount 34 | , testProperty "prop_denseTable1" TermBag.prop_denseTable1 35 | , testProperty "prop_denseTable2" TermBag.prop_denseTable2 36 | ] 37 | 38 | 39 | -------------------------------------------------------------------------------- /tests/Test/Data/SearchEngine/DocIdSet.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -fno-warn-orphans #-} 2 | module Test.Data.SearchEngine.DocIdSet where 3 | 4 | import Data.SearchEngine.DocIdSet (DocIdSet(DocIdSet), DocId(DocId)) 5 | import qualified Data.SearchEngine.DocIdSet as DocIdSet 6 | 7 | import qualified Data.Vector.Unboxed as Vec 8 | import qualified Data.List as List 9 | import Test.QuickCheck 10 | 11 | 12 | instance Arbitrary DocIdSet where 13 | arbitrary = DocIdSet.fromList `fmap` (listOf arbitrary) 14 | 15 | instance Arbitrary DocId where 16 | arbitrary = DocId `fmap` choose (0,15) 17 | 18 | 19 | prop_insert :: DocIdSet -> DocId -> Bool 20 | prop_insert dset x = 21 | let dset' = DocIdSet.insert x dset 22 | in DocIdSet.invariant dset && DocIdSet.invariant dset' 23 | && all (`member` dset') (x : DocIdSet.toList dset) 24 | 25 | prop_delete :: DocIdSet -> DocId -> Bool 26 | prop_delete dset x = 27 | let dset' = DocIdSet.delete x dset 28 | in 
DocIdSet.invariant dset && DocIdSet.invariant dset' 29 | && all (`member` dset') (List.delete x (DocIdSet.toList dset)) 30 | && not (x `member` dset') 31 | 32 | prop_delete' :: DocIdSet -> Bool 33 | prop_delete' dset = 34 | all (prop_delete dset) (DocIdSet.toList dset) 35 | 36 | prop_union :: DocIdSet -> DocIdSet -> Bool 37 | prop_union dset1 dset2 = 38 | let dset = DocIdSet.union dset1 dset2 39 | dset' = DocIdSet.fromList (List.union (DocIdSet.toList dset1) (DocIdSet.toList dset2)) 40 | 41 | in DocIdSet.invariant dset && DocIdSet.invariant dset' 42 | && dset == dset' 43 | 44 | prop_union' :: DocIdSet -> DocIdSet -> Bool 45 | prop_union' dset1 dset2 = 46 | let dset = DocIdSet.union dset1 dset2 47 | dset' = List.foldl' (\s i -> DocIdSet.insert i s) dset1 (DocIdSet.toList dset2) 48 | dset'' = List.foldl' (\s i -> DocIdSet.insert i s) dset2 (DocIdSet.toList dset1) 49 | in DocIdSet.invariant dset && DocIdSet.invariant dset' && DocIdSet.invariant dset'' 50 | && dset == dset' 51 | && dset' == dset'' 52 | 53 | member :: DocId -> DocIdSet -> Bool 54 | member x (DocIdSet vec) = 55 | x `List.elem` Vec.toList vec 56 | 57 | -------------------------------------------------------------------------------- /tests/Test/Data/SearchEngine/TermBag.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -Wno-orphans #-} 2 | {-# LANGUAGE CPP #-} 3 | #if __GLASGOW_HASKELL__ >= 908 4 | {-# OPTIONS_GHC -Wno-x-partial #-} 5 | #endif 6 | 7 | module Test.Data.SearchEngine.TermBag where 8 | 9 | import Data.SearchEngine.TermBag 10 | 11 | import qualified Data.Vector.Unboxed as Vec 12 | import qualified Data.List as List 13 | import Test.QuickCheck 14 | 15 | 16 | instance Arbitrary TermBag where 17 | arbitrary = fromList `fmap` (listOf arbitrary) 18 | 19 | instance Arbitrary TermId where 20 | arbitrary = TermId `fmap` choose (0,5) 21 | 22 | prop_invariant :: TermBag -> Bool 23 | prop_invariant = invariant 24 | 25 | prop_elems :: [TermId] -> Bool 26 | prop_elems tids = 27 | (map head . List.group . List.sort) tids 28 | == (elems . fromList) tids 29 | 30 | prop_fromList :: [TermId] -> Bool 31 | prop_fromList tids = 32 | (map (\g -> (head g, fromIntegral (length g `min` 255))) 33 | . List.group . List.sort) tids 34 | == (toList . fromList) tids 35 | 36 | prop_size :: [TermId] -> Bool 37 | prop_size tids = 38 | (size . fromList) tids == length tids 39 | 40 | prop_termCount :: [TermId] -> Bool 41 | prop_termCount tids = 42 | and [ termCount bag tid == count 43 | | let bag = fromList tids 44 | , (tid, count) <- toList bag 45 | ] 46 | 47 | prop_denseTable1 :: [TermBag] -> Bool 48 | prop_denseTable1 bags = 49 | Vec.toList terms == (List.sort . foldr List.union [] . map elems) bags 50 | where 51 | (terms, _) = denseTable bags 52 | 53 | prop_denseTable2 :: [TermBag] -> Bool 54 | prop_denseTable2 bags = 55 | and [ termCount bag (terms Vec.! t) == counts Vec.! (b * numTerms + t) 56 | | let (terms, counts) = denseTable bags 57 | numTerms = Vec.length terms 58 | , (b, bag) <- zip [0..] bags 59 | , t <- [0..Vec.length terms - 1] 60 | ] 61 | --------------------------------------------------------------------------------
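-- Note (an illustrative aside, not part of any module above): prop_denseTable2
-- relies on denseTable laying its counts out row-major, one row of numTerms
-- counts per input bag. For example, with two bags over the merged term ids
-- [t0,t1,t2] (numTerms = 3), the count of term t2 in bag 1 sits at index
-- 1 * 3 + 2 = 5 of the counts vector, which is exactly the (b * numTerms + t)
-- index the property checks.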