-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbm25.go
73 lines (63 loc) · 1.69 KB
/
bm25.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
// Package bm25 is a lingo-friendly BM25 library.
// BM25 is a scoring function that relies on TFIDF, and is useful for document retrieval.
package bm25
import (
"sort"
"github.com/go-nlp/tfidf"
"github.com/xtgo/set"
)
// DocScore is a tuple of the document ID and a score.
type DocScore struct {
	ID    int     // identifier of the scored document
	Score float64 // score assigned to that document
}

// DocScores is a list of DocScore.
//
// Its Len, Less and Swap methods have the shape required by sort.Interface
// and order entries by ascending Score.
type DocScores []DocScore

// Len returns the number of entries in ds.
func (ds DocScores) Len() int { return len(ds) }

// Less reports whether the entry at index i has a strictly lower Score than
// the entry at index j.
func (ds DocScores) Less(i, j int) bool { return ds[i].Score < ds[j].Score }

// Swap exchanges the entries at indices i and j.
//
// Whole elements are swapped rather than individual fields, so the method
// stays correct if DocScore ever gains another field.
func (ds DocScores) Swap(i, j int) { ds[i], ds[j] = ds[j], ds[i] }
// BM25 is the scoring function.
//
// It scores every document in docs against query using the statistics held in
// tf and returns one DocScore per document, in the same order as docs, with ID
// set to the document's index in docs. It returns nil when docs is empty.
//
// k1 should be between 1.2 and 2.
// b should be around 0.75
//
// For every term id present in both the query's and the document's bag of
// words, the contribution to the document's score is
//
//	tf.IDF[id] * tf.TF[id]*(k1+1) / (tf.TF[id] + k1*(1 - b + b*docLen/avgLen))
//
// where docLen is the number of entries in the document's bag of words and
// avgLen is tf.Len/tf.Docs. If tf.Docs or tf.Len is not positive the average
// length is undefined; the length ratio docLen/avgLen is then taken to be 1
// rather than dividing by zero, which would turn the scores into NaN.
//
// NOTE(review): the term frequency is read from tf.TF (state on the TFIDF
// value) rather than derived from the document being scored, and docLen counts
// the entries returned by tfidf.BOW. A commented-out `TF := tfidf.TF(doc)`
// previously left in this loop suggests per-document frequencies may have been
// intended — confirm against the tfidf package before changing the formula.
func BM25(tf *tfidf.TFIDF, query tfidf.Document, docs []tfidf.Document, k1, b float64) DocScores {
	if len(docs) == 0 {
		return nil
	}

	q := tfidf.BOW(query)

	// w is the working vector handed to set.Inter: the query's terms followed
	// by the current document's terms, with len(q) as the pivot between them.
	w := make([]int, len(q))
	copy(w, q)

	// avgLen stays 0 when it cannot be computed; see the guard in the loop.
	var avgLen float64
	if tf.Docs > 0 && tf.Len > 0 {
		avgLen = float64(tf.Len) / float64(tf.Docs)
	}

	retVal := make(DocScores, 0, len(docs))
	var terms []float64 // per-term contributions, reused across documents
	for i, doc := range docs {
		d := tfidf.BOW(doc)
		w = append(w, d...)
		size := set.Inter(sort.IntSlice(w), len(q))
		shared := w[:size] // term ids common to query and document

		// Length normalisation depends only on the document, so compute it
		// once here instead of once per shared term.
		lenRatio := 1.0
		if avgLen > 0 {
			lenRatio = float64(len(d)) / avgLen
		}
		norm := k1 * (1 - b + b*lenRatio)

		terms = terms[:0]
		for _, id := range shared {
			freq := tf.TF[id]
			terms = append(terms, tf.IDF[id]*(freq*(k1+1))/(freq+norm))
		}
		retVal = append(retVal, DocScore{ID: i, Score: sum(terms)})

		// reset working vector: set.Inter reorders w in place, so restore the
		// query terms before truncating back to just the query.
		copy(w, q)
		w = w[:len(q)]
	}
	return retVal
}
// sum adds the elements of a from first to last and returns the total.
// A nil or empty slice yields 0.
func sum(a []float64) float64 {
	total := 0.0
	for i := 0; i < len(a); i++ {
		total += a[i]
	}
	return total
}