├── go.mod ├── go.sum ├── README.md ├── .travis.yml ├── LICENSE ├── bm25.go └── bm25_test.go /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/go-nlp/bm25 2 | 3 | go 1.14 4 | 5 | require ( 6 | github.com/go-nlp/tfidf v1.1.0 7 | github.com/xtgo/set v1.0.0 8 | ) 9 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/go-nlp/tfidf v1.0.0 h1:usRGZjJO/MkU4Oq2Xa836MTpAgfhyb2kLyIkjnKfWY0= 2 | github.com/go-nlp/tfidf v1.0.0/go.mod h1:FHOpf09wrdELx7OnbxywpW4Cs0Q3r15QzpOc73rmUTo= 3 | github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY= 4 | github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8= 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bm25 [![PkgGoDev](https://pkg.go.dev/badge/github.com/go-nlp/bm25)](https://pkg.go.dev/github.com/go-nlp/bm25) [![Build Status](https://travis-ci.org/go-nlp/bm25.svg?branch=master)](https://travis-ci.org/go-nlp/bm25) [![Coverage Status](https://coveralls.io/repos/github/go-nlp/bm25/badge.svg?branch=master)](https://coveralls.io/github/go-nlp/bm25?branch=master) 2 | bm25 is a scoring function that helps with information retrieval 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: go 3 | branches: 4 | only: 5 | - master 6 | 7 | go: 8 | - 1.13.x 9 | - 1.14.x 10 | - tip 11 | 12 | env: 13 | global: 14 | - GOARCH=amd64 15 | - TRAVISTEST=true 16 | 17 | before_install: 18 | - go get github.com/mattn/goveralls 19 | 20 | script: 21 | - go test -run=. -coverprofile=profile.cov 22 | - $HOME/gopath/bin/goveralls -coverprofile=profile.cov -service=travis-ci 23 | 24 | matrix: 25 | allow_failures: 26 | - go: tip 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Xuanyi Chew 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /bm25.go: -------------------------------------------------------------------------------- 1 | // package bm25 is a lingo-friendly BM25 library. 2 | // BM25 is a scoring function that relies on TFIDF, and is useful for document retrieval 3 | package bm25 4 | 5 | import ( 6 | "sort" 7 | 8 | "github.com/go-nlp/tfidf" 9 | "github.com/xtgo/set" 10 | ) 11 | 12 | // DocScore is a tuple of the document ID and a score 13 | type DocScore struct { 14 | ID int 15 | Score float64 16 | } 17 | 18 | // DocScores is a list of DocScore 19 | type DocScores []DocScore 20 | 21 | func (ds DocScores) Len() int { return len(ds) } 22 | func (ds DocScores) Less(i, j int) bool { return ds[i].Score < ds[j].Score } 23 | func (ds DocScores) Swap(i, j int) { 24 | ds[i].Score, ds[j].Score = ds[j].Score, ds[i].Score 25 | ds[i].ID, ds[j].ID = ds[j].ID, ds[i].ID 26 | } 27 | 28 | // BM25 is the scoring function. 29 | // 30 | // k1 should be between 1.2 and 2. 31 | // b should be around 0.75 32 | func BM25(tf *tfidf.TFIDF, query tfidf.Document, docs []tfidf.Document, k1, b float64) DocScores { 33 | q := tfidf.BOW(query) 34 | w := make([]int, len(q)) 35 | copy(w, q) 36 | avgLen := float64(tf.Len) / float64(tf.Docs) 37 | 38 | scores := make([]float64, 0, len(docs)) 39 | for _, doc := range docs { 40 | //TF := tfidf.TF(doc) 41 | d := tfidf.BOW(doc) 42 | w = append(w, d...) 43 | size := set.Inter(sort.IntSlice(w), len(q)) 44 | n := w[:size] 45 | 46 | score := make([]float64, 0, len(n)) 47 | docLen := float64(len(d)) 48 | for _, id := range n { 49 | num := (tf.TF[id] * (k1 + 1)) 50 | denom := (tf.TF[id] + k1*(1-b+b*docLen/avgLen)) 51 | idf := tf.IDF[id] 52 | score = append(score, idf*num/denom) 53 | } 54 | scores = append(scores, sum(score)) 55 | 56 | // reset working vector 57 | copy(w, q) 58 | w = w[:len(q)] 59 | } 60 | var retVal DocScores 61 | for i := range docs { 62 | retVal = append(retVal, DocScore{i, scores[i]}) 63 | } 64 | return retVal 65 | } 66 | 67 | func sum(a []float64) float64 { 68 | var retVal float64 69 | for _, f := range a { 70 | retVal += f 71 | } 72 | return retVal 73 | } 74 | -------------------------------------------------------------------------------- /bm25_test.go: -------------------------------------------------------------------------------- 1 | package bm25 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | "strings" 7 | 8 | "github.com/go-nlp/tfidf" 9 | ) 10 | 11 | var mobydick = []string{ 12 | "Call me Ishmael .", 13 | "Some years ago -- never mind how long precisely -- having little or no money in my purse , and nothing particular to interest me on shore , I thought I would sail about a little and see the watery part of the world .", 14 | "It is a way I have of driving off the spleen and regulating the circulation .", 15 | "Whenever I find myself growing grim about the mouth ; ", 16 | "whenever it is a damp , drizzly November in my soul ; ", 17 | "whenever I find myself involuntarily pausing before coffin warehouses , and bringing up the rear of every funeral I meet ; ", 18 | "and especially whenever my hypos get such an upper hand of me , that it requires a strong moral principle to prevent me from deliberately stepping into the street , and methodically knocking people's hats off -- then , I account it high time to get to sea as soon as I can .", 19 | "This is my substitute for pistol and ball . ", 20 | "With a philosophical flourish Cato throws himself upon his sword ; ", 21 | "I quietly take to the ship . There is nothing surprising in this .", 22 | "If they but knew it , almost all men in their degree , some time or other , cherish very nearly the same feelings towards the ocean with me .", 23 | } 24 | 25 | type doc []int 26 | 27 | func (d doc) IDs() []int { return []int(d) } 28 | 29 | func makeCorpus(a []string) (map[string]int, []string) { 30 | retVal := make(map[string]int) 31 | invRetVal := make([]string, 0) 32 | var id int 33 | for _, s := range a { 34 | for _, f := range strings.Fields(s) { 35 | f = strings.ToLower(f) 36 | if _, ok := retVal[f]; !ok { 37 | retVal[f] = id 38 | invRetVal = append(invRetVal, f) 39 | id++ 40 | } 41 | } 42 | } 43 | return retVal, invRetVal 44 | } 45 | 46 | func makeDocuments(a []string, c map[string]int) []tfidf.Document { 47 | retVal := make([]tfidf.Document, 0, len(a)) 48 | for _, s := range a { 49 | var ts []int 50 | for _, f := range strings.Fields(s) { 51 | f = strings.ToLower(f) 52 | id := c[f] 53 | ts = append(ts, id) 54 | } 55 | retVal = append(retVal, doc(ts)) 56 | } 57 | return retVal 58 | } 59 | 60 | func Example_BM25() { 61 | corpus, _ := makeCorpus(mobydick) 62 | docs := makeDocuments(mobydick, corpus) 63 | tf := tfidf.New() 64 | 65 | for _, doc := range docs { 66 | tf.Add(doc) 67 | } 68 | tf.CalculateIDF() 69 | 70 | // now we search 71 | 72 | // "ishmael" is a query 73 | ishmael := doc{corpus["ishmael"]} 74 | 75 | // "whenever i find" is another query 76 | whenever := doc{corpus["whenever"]} 77 | 78 | ishmaelScores := BM25(tf, ishmael, docs, 1.5, 0.75) 79 | wheneverScores := BM25(tf, whenever, docs, 1.5, 0.75) 80 | 81 | sort.Sort(sort.Reverse(ishmaelScores)) 82 | sort.Sort(sort.Reverse(wheneverScores)) 83 | 84 | fmt.Printf("Top 3 Relevant Docs to \"Ishmael\":\n") 85 | for _, d := range ishmaelScores[:3] { 86 | fmt.Printf("\tID : %d\n\tScore: %1.3f\n\tDoc : %q\n", d.ID, d.Score, mobydick[d.ID]) 87 | } 88 | fmt.Println("") 89 | fmt.Printf("Top 3 Relevant Docs to \"whenever i find\":\n") 90 | for _, d := range wheneverScores[:3] { 91 | fmt.Printf("\tID : %d\n\tScore: %1.3f\n\tDoc : %q\n", d.ID, d.Score, mobydick[d.ID]) 92 | } 93 | // Output: 94 | // Top 3 Relevant Docs to "Ishmael": 95 | // ID : 0 96 | // Score: 3.706 97 | // Doc : "Call me Ishmael ." 98 | // ID : 1 99 | // Score: 0.000 100 | // Doc : "Some years ago -- never mind how long precisely -- having little or no money in my purse , and nothing particular to interest me on shore , I thought I would sail about a little and see the watery part of the world ." 101 | // ID : 2 102 | // Score: 0.000 103 | // Doc : "It is a way I have of driving off the spleen and regulating the circulation ." 104 | // 105 | // Top 3 Relevant Docs to "whenever i find": 106 | // ID : 3 107 | // Score: 2.031 108 | // Doc : "Whenever I find myself growing grim about the mouth ; " 109 | // ID : 4 110 | // Score: 1.982 111 | // Doc : "whenever it is a damp , drizzly November in my soul ; " 112 | // ID : 5 113 | // Score: 1.810 114 | // Doc : "whenever I find myself involuntarily pausing before coffin warehouses , and bringing up the rear of every funeral I meet ; " 115 | 116 | } 117 | --------------------------------------------------------------------------------