Skip to content

Commit 45973a1

Browse files
authored
Fix bleve fuzziness search (#33078)
Close #31565
1 parent 9882917 commit 45973a1

File tree

11 files changed

+83
-52
lines changed

11 files changed

+83
-52
lines changed

Diff for: custom/conf/app.example.ini

+4
Original file line numberDiff line numberDiff line change
@@ -1485,6 +1485,10 @@ LEVEL = Info
14851485
;REPO_INDEXER_EXCLUDE =
14861486
;;
14871487
;MAX_FILE_SIZE = 1048576
1488+
;;
1489+
;; Bleve engine has performance problems with fuzzy search, so we limit the fuzziness to 0 by default to disable it.
1490+
;; If you'd like to enable it, set it to 1 or 2 (0 keeps fuzzy search disabled).
1491+
;TYPE_BLEVE_MAX_FUZZINESS = 0
14881492

14891493
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
14901494
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

Diff for: modules/indexer/code/indexer.go

+1-2
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,12 @@ func Init() {
123123
for _, indexerData := range items {
124124
log.Trace("IndexerData Process Repo: %d", indexerData.RepoID)
125125
if err := index(ctx, indexer, indexerData.RepoID); err != nil {
126-
unhandled = append(unhandled, indexerData)
127126
if !setting.IsInTesting {
128127
log.Error("Codes indexer handler: index error for repo %v: %v", indexerData.RepoID, err)
129128
}
130129
}
131130
}
132-
return unhandled
131+
return nil // do not re-queue the failed items, otherwise some broken repo will block the queue
133132
}
134133

135134
indexerQueue = queue.CreateUniqueQueue(ctx, "code_indexer", handler)

Diff for: modules/indexer/code/indexer_test.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ import (
1515
"code.gitea.io/gitea/modules/indexer/code/bleve"
1616
"code.gitea.io/gitea/modules/indexer/code/elasticsearch"
1717
"code.gitea.io/gitea/modules/indexer/code/internal"
18+
"code.gitea.io/gitea/modules/setting"
19+
"code.gitea.io/gitea/modules/test"
1820

1921
_ "code.gitea.io/gitea/models"
2022
_ "code.gitea.io/gitea/models/actions"
@@ -279,7 +281,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
279281

280282
func TestBleveIndexAndSearch(t *testing.T) {
281283
unittest.PrepareTestEnv(t)
282-
284+
defer test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)()
283285
dir := t.TempDir()
284286

285287
idx := bleve.NewIndexer(dir)

Diff for: modules/indexer/internal/bleve/util.go

+5-4
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"unicode"
1010

1111
"code.gitea.io/gitea/modules/log"
12+
"code.gitea.io/gitea/modules/setting"
1213
"code.gitea.io/gitea/modules/util"
1314

1415
"github.com/blevesearch/bleve/v2"
@@ -54,9 +55,9 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
5455
return index, 0, nil
5556
}
5657

57-
// This method test the GuessFuzzinessByKeyword method. The fuzziness is based on the levenshtein distance and determines how many chars
58-
// may be different on two string and they still be considered equivalent.
59-
// Given a phrasse, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
58+
// GuessFuzzinessByKeyword guesses fuzziness based on the levenshtein distance and determines how many chars
59+
// may differ between two strings while they are still considered equivalent.
60+
// Given a phrase, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
6061
func GuessFuzzinessByKeyword(s string) int {
6162
tokenizer := unicode_tokenizer.NewUnicodeTokenizer()
6263
tokens := tokenizer.Tokenize([]byte(s))
@@ -85,5 +86,5 @@ func guessFuzzinessByKeyword(s string) int {
8586
return 0
8687
}
8788
}
88-
return min(maxFuzziness, len(s)/4)
89+
return min(min(setting.Indexer.TypeBleveMaxFuzzniess, maxFuzziness), len(s)/4)
8990
}

Diff for: modules/indexer/internal/bleve/util_test.go

+6-1
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,15 @@ import (
77
"fmt"
88
"testing"
99

10+
"code.gitea.io/gitea/modules/setting"
11+
"code.gitea.io/gitea/modules/test"
12+
1013
"github.com/stretchr/testify/assert"
1114
)
1215

1316
func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
17+
defer test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)()
18+
1419
scenarios := []struct {
1520
Input string
1621
Fuzziness int // See util.go for the definition of fuzziness in this particular context
@@ -46,7 +51,7 @@ func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
4651
}
4752

4853
for _, scenario := range scenarios {
49-
t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
54+
t.Run(fmt.Sprintf("Fuziniess:%s=%d", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
5055
assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input))
5156
})
5257
}

Diff for: modules/setting/indexer.go

+3
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ var Indexer = struct {
3131
IncludePatterns []*GlobMatcher
3232
ExcludePatterns []*GlobMatcher
3333
ExcludeVendored bool
34+
35+
TypeBleveMaxFuzzniess int
3436
}{
3537
IssueType: "bleve",
3638
IssuePath: "indexers/issues.bleve",
@@ -88,6 +90,7 @@ func loadIndexerFrom(rootCfg ConfigProvider) {
8890
Indexer.ExcludeVendored = sec.Key("REPO_INDEXER_EXCLUDE_VENDORED").MustBool(true)
8991
Indexer.MaxIndexerFileSize = sec.Key("MAX_FILE_SIZE").MustInt64(1024 * 1024)
9092
Indexer.StartupTimeout = sec.Key("STARTUP_TIMEOUT").MustDuration(30 * time.Second)
93+
Indexer.TypeBleveMaxFuzzniess = sec.Key("TYPE_BLEVE_MAX_FUZZINESS").MustInt(0)
9194
}
9295

9396
// IndexerGlobFromString parses a comma separated list of patterns and returns a glob.Glob slice suited for repo indexing

Diff for: routers/common/codesearch.go

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// Copyright 2024 The Gitea Authors. All rights reserved.
2+
// SPDX-License-Identifier: MIT
3+
4+
package common
5+
6+
import (
7+
"code.gitea.io/gitea/modules/setting"
8+
"code.gitea.io/gitea/services/context"
9+
)
10+
11+
func PrepareCodeSearch(ctx *context.Context) (ret struct {
12+
Keyword string
13+
Language string
14+
IsFuzzy bool
15+
},
16+
) {
17+
ret.Language = ctx.FormTrim("l")
18+
ret.Keyword = ctx.FormTrim("q")
19+
20+
fuzzyDefault := setting.Indexer.RepoIndexerEnabled
21+
fuzzyAllow := true
22+
if setting.Indexer.RepoType == "bleve" && setting.Indexer.TypeBleveMaxFuzzniess == 0 {
23+
fuzzyDefault = false
24+
fuzzyAllow = false
25+
}
26+
isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(fuzzyDefault)
27+
if isFuzzy && !fuzzyAllow {
28+
ctx.Flash.Info("Fuzzy search is disabled by default due to performance reasons")
29+
isFuzzy = false
30+
}
31+
32+
ctx.Data["IsBleveFuzzyDisabled"] = true
33+
ctx.Data["Keyword"] = ret.Keyword
34+
ctx.Data["Language"] = ret.Language
35+
ctx.Data["IsFuzzy"] = isFuzzy
36+
37+
ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
38+
return ret
39+
}

Diff for: routers/web/explore/code.go

+6-13
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
code_indexer "code.gitea.io/gitea/modules/indexer/code"
1212
"code.gitea.io/gitea/modules/setting"
1313
"code.gitea.io/gitea/modules/templates"
14+
"code.gitea.io/gitea/routers/common"
1415
"code.gitea.io/gitea/services/context"
1516
)
1617

@@ -32,18 +33,10 @@ func Code(ctx *context.Context) {
3233
ctx.Data["Title"] = ctx.Tr("explore")
3334
ctx.Data["PageIsExplore"] = true
3435
ctx.Data["PageIsExploreCode"] = true
35-
36-
language := ctx.FormTrim("l")
37-
keyword := ctx.FormTrim("q")
38-
39-
isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true)
40-
41-
ctx.Data["Keyword"] = keyword
42-
ctx.Data["Language"] = language
43-
ctx.Data["IsFuzzy"] = isFuzzy
4436
ctx.Data["PageIsViewCode"] = true
4537

46-
if keyword == "" {
38+
prepareSearch := common.PrepareCodeSearch(ctx)
39+
if prepareSearch.Keyword == "" {
4740
ctx.HTML(http.StatusOK, tplExploreCode)
4841
return
4942
}
@@ -80,9 +73,9 @@ func Code(ctx *context.Context) {
8073
if (len(repoIDs) > 0) || isAdmin {
8174
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
8275
RepoIDs: repoIDs,
83-
Keyword: keyword,
84-
IsKeywordFuzzy: isFuzzy,
85-
Language: language,
76+
Keyword: prepareSearch.Keyword,
77+
IsKeywordFuzzy: prepareSearch.IsFuzzy,
78+
Language: prepareSearch.Language,
8679
Paginator: &db.ListOptions{
8780
Page: page,
8881
PageSize: setting.UI.RepoSearchPagingNum,

Diff for: routers/web/repo/search.go

+8-16
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
code_indexer "code.gitea.io/gitea/modules/indexer/code"
1313
"code.gitea.io/gitea/modules/setting"
1414
"code.gitea.io/gitea/modules/templates"
15+
"code.gitea.io/gitea/routers/common"
1516
"code.gitea.io/gitea/services/context"
1617
)
1718

@@ -29,18 +30,9 @@ func indexSettingToGitGrepPathspecList() (list []string) {
2930

3031
// Search render repository search page
3132
func Search(ctx *context.Context) {
32-
language := ctx.FormTrim("l")
33-
keyword := ctx.FormTrim("q")
34-
35-
isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true)
36-
37-
ctx.Data["Keyword"] = keyword
38-
ctx.Data["Language"] = language
39-
ctx.Data["IsFuzzy"] = isFuzzy
4033
ctx.Data["PageIsViewCode"] = true
41-
ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
42-
43-
if keyword == "" {
34+
prepareSearch := common.PrepareCodeSearch(ctx)
35+
if prepareSearch.Keyword == "" {
4436
ctx.HTML(http.StatusOK, tplSearch)
4537
return
4638
}
@@ -57,9 +49,9 @@ func Search(ctx *context.Context) {
5749
var err error
5850
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
5951
RepoIDs: []int64{ctx.Repo.Repository.ID},
60-
Keyword: keyword,
61-
IsKeywordFuzzy: isFuzzy,
62-
Language: language,
52+
Keyword: prepareSearch.Keyword,
53+
IsKeywordFuzzy: prepareSearch.IsFuzzy,
54+
Language: prepareSearch.Language,
6355
Paginator: &db.ListOptions{
6456
Page: page,
6557
PageSize: setting.UI.RepoSearchPagingNum,
@@ -75,9 +67,9 @@ func Search(ctx *context.Context) {
7567
ctx.Data["CodeIndexerUnavailable"] = !code_indexer.IsAvailable(ctx)
7668
}
7769
} else {
78-
res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, keyword, git.GrepOptions{
70+
res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, prepareSearch.Keyword, git.GrepOptions{
7971
ContextLineNumber: 1,
80-
IsFuzzy: isFuzzy,
72+
IsFuzzy: prepareSearch.IsFuzzy,
8173
RefName: git.RefNameFromBranch(ctx.Repo.BranchName).String(), // BranchName should be default branch or the first existing branch
8274
PathspecList: indexSettingToGitGrepPathspecList(),
8375
})

Diff for: routers/web/user/code.go

+6-14
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
code_indexer "code.gitea.io/gitea/modules/indexer/code"
1212
"code.gitea.io/gitea/modules/setting"
1313
"code.gitea.io/gitea/modules/templates"
14+
"code.gitea.io/gitea/routers/common"
1415
shared_user "code.gitea.io/gitea/routers/web/shared/user"
1516
"code.gitea.io/gitea/services/context"
1617
)
@@ -34,20 +35,11 @@ func CodeSearch(ctx *context.Context) {
3435
}
3536

3637
ctx.Data["IsPackageEnabled"] = setting.Packages.Enabled
37-
ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
3838
ctx.Data["Title"] = ctx.Tr("explore.code")
39-
40-
language := ctx.FormTrim("l")
41-
keyword := ctx.FormTrim("q")
42-
43-
isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true)
44-
45-
ctx.Data["Keyword"] = keyword
46-
ctx.Data["Language"] = language
47-
ctx.Data["IsFuzzy"] = isFuzzy
4839
ctx.Data["IsCodePage"] = true
4940

50-
if keyword == "" {
41+
prepareSearch := common.PrepareCodeSearch(ctx)
42+
if prepareSearch.Keyword == "" {
5143
ctx.HTML(http.StatusOK, tplUserCode)
5244
return
5345
}
@@ -77,9 +69,9 @@ func CodeSearch(ctx *context.Context) {
7769
if len(repoIDs) > 0 {
7870
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
7971
RepoIDs: repoIDs,
80-
Keyword: keyword,
81-
IsKeywordFuzzy: isFuzzy,
82-
Language: language,
72+
Keyword: prepareSearch.Keyword,
73+
IsKeywordFuzzy: prepareSearch.IsFuzzy,
74+
Language: prepareSearch.Language,
8375
Paginator: &db.ListOptions{
8476
Page: page,
8577
PageSize: setting.UI.RepoSearchPagingNum,

Diff for: templates/shared/search/code/search.tmpl

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
{{template "shared/search/combo_fuzzy" dict "Value" .Keyword "Disabled" .CodeIndexerUnavailable "IsFuzzy" .IsFuzzy "Placeholder" (ctx.Locale.Tr "search.code_kind")}}
33
</form>
44
<div class="divider"></div>
5-
<div class="ui user list">
5+
<div class="ui list">
6+
{{template "base/alert" .}}
67
{{if .CodeIndexerUnavailable}}
78
<div class="ui error message">
89
<p>{{ctx.Locale.Tr "search.code_search_unavailable"}}</p>

0 commit comments

Comments
 (0)