-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexcerpt.go
137 lines (121 loc) · 4.04 KB
/
excerpt.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
package excerpt
import (
"fmt"
"index/suffixarray"
"log"
"sort"
"strings"
"time"
)
const (
PADDING_WIDTH = 5
)
type ExcerptWindow struct {
Start int
ByteLength int
CharLength int
Score float64
Text string
Matches []*Match
}
func (e *ExcerptWindow) String() string {
return fmt.Sprintf("<ExcerptWindow(charlength: %v|bytes: %v): starts at: %v score: %v>:\n%s\n", e.CharLength, e.ByteLength, e.Start, e.Score, e.Text)
}
/*
FindExcerpts searches searchterms in body and returns excerpts of a given
length that contain the terms.
The terms can be weighted and each ExcerptWindow gets a cummulative score
of the searchterms it contains. setting findHighestScore the function will
only return the ExcerptWindow with the hightest score.
If a match is at the end of a window and overlaps its boundry the window
will be extended to include the full match.
An ExcerptWindow always starts with a match. In the future an option might
be added to position/center the window around the matches.
*/
func FindExcerpts(searchterms map[string]float64, body string, length int, expand bool, findHighestScore bool) (excerptCandidates []*ExcerptWindow) {
startTime := time.Now()
b := []byte(strings.ToLower(body))
var blength int = len(b)
// log.Printf("body length (in bytes): %v\n", blength)
var offsets []int
var score, bytelength float64
index := suffixarray.New(b)
scores := make(map[int][]float64)
for term, weight := range searchterms {
termMatches := index.Lookup([]byte(strings.ToLower(term)), -1)
// use the character(rune not byte) length multiplied with the weight
// as score, we also need to know the byte length of the match in case
// we need to extend the window if the match overlaps the window boundry
score = float64(len([]rune(term))) * weight
bytelength = float64(len([]byte(term)))
for _, m := range termMatches {
scores[m] = []float64{score, bytelength}
}
offsets = append(offsets, termMatches...)
}
log.Printf("finding matches took: %v. nr match positions: %v\n", time.Since(startTime), len(offsets))
sort.Ints(offsets)
// log.Println(offsets)
// log.Println(scores)
var nextMatchIdx int
var sliceEnd int
var HighestScore float64 = 0
var HighestScoreIdx int = 0
var ew *ExcerptWindow
var r []rune
// if we have no match we just send and excerpt that starts at the beginning
if len(offsets) == 0 {
scores[0] = []float64{0, 0}
offsets = []int{0}
}
for i, offset := range offsets {
// log.Printf("o")
ew = &ExcerptWindow{
Start: offset,
CharLength: length,
Score: scores[offset][0],
}
// runes use 4 bytes max - but 1~4 depending on the character
// we want a string of character_length length so we look ahead
// 4 * length bytes, check the character (rune) length and adjust
sliceEnd = (offset + length) * 4
if sliceEnd > blength {
sliceEnd = blength
}
// startTimeCnv := time.Now()
r = []rune(body[offset:sliceEnd])
// if the window would exceed the end of body we adjust the length
if ew.CharLength >= len(r) {
ew.CharLength = len(r)
}
r = r[0:ew.CharLength]
ew.ByteLength = len([]byte(string(r)))
// log.Printf("runtime []rune/[]byte conversions: %v\n", time.Since(startTimeCnv))
nextMatchIdx = i + 1
for {
// log.Printf("n")
if nextMatchIdx >= len(offsets) || offsets[nextMatchIdx] > offset+ew.ByteLength {
break
}
ew.Score += scores[offsets[nextMatchIdx]][0]
// if the match would be cut off we extend the window
if offsets[nextMatchIdx]+int(scores[offsets[nextMatchIdx]][1]) >= ew.Start+ew.ByteLength {
ew.ByteLength = offsets[nextMatchIdx] + int(scores[offsets[nextMatchIdx]][1]) - ew.Start
break
}
nextMatchIdx += 1
}
ew.Text = strings.TrimSpace(body[ew.Start : ew.Start+ew.ByteLength])
ew.CharLength = len([]rune(ew.Text))
if ew.Score > HighestScore {
HighestScore = ew.Score
HighestScoreIdx = i
}
excerptCandidates = append(excerptCandidates, ew)
}
if findHighestScore {
excerptCandidates = []*ExcerptWindow{excerptCandidates[HighestScoreIdx]}
}
log.Printf("runtime: %v\n", time.Since(startTime))
return
}