Skip to content

Commit dfca246

Browse files
committed
Fix bleve fuzziness search (go-gitea#33078)
1 parent d371aa3 commit dfca246

File tree

11 files changed

+86
-52
lines changed

11 files changed

+86
-52
lines changed

custom/conf/app.example.ini

+4
Original file line numberDiff line numberDiff line change
@@ -1482,6 +1482,10 @@ LEVEL = Info
14821482
;REPO_INDEXER_EXCLUDE =
14831483
;;
14841484
;MAX_FILE_SIZE = 1048576
1485+
;;
1486+
;; Bleve engine has performance problems with fuzzy search, so we limit the fuzziness to 0 by default to disable it.
1487+
;; If you'd like to enable it, you can set it to 1 or 2 (0 keeps fuzzy search disabled).
1488+
;TYPE_BLEVE_MAX_FUZZINESS = 0
14851489

14861490
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
14871491
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

modules/indexer/code/indexer.go

+1-2
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,12 @@ func Init() {
123123
for _, indexerData := range items {
124124
log.Trace("IndexerData Process Repo: %d", indexerData.RepoID)
125125
if err := index(ctx, indexer, indexerData.RepoID); err != nil {
126-
unhandled = append(unhandled, indexerData)
127126
if !setting.IsInTesting {
128127
log.Error("Codes indexer handler: index error for repo %v: %v", indexerData.RepoID, err)
129128
}
130129
}
131130
}
132-
return unhandled
131+
return nil // do not re-queue the failed items, otherwise some broken repo will block the queue
133132
}
134133

135134
indexerQueue = queue.CreateUniqueQueue(ctx, "code_indexer", handler)

modules/indexer/code/indexer_test.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ import (
1515
"code.gitea.io/gitea/modules/indexer/code/bleve"
1616
"code.gitea.io/gitea/modules/indexer/code/elasticsearch"
1717
"code.gitea.io/gitea/modules/indexer/code/internal"
18+
"code.gitea.io/gitea/modules/setting"
19+
"code.gitea.io/gitea/modules/test"
1820

1921
_ "code.gitea.io/gitea/models"
2022
_ "code.gitea.io/gitea/models/actions"
@@ -279,7 +281,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
279281

280282
func TestBleveIndexAndSearch(t *testing.T) {
281283
unittest.PrepareTestEnv(t)
282-
284+
defer test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)()
283285
dir := t.TempDir()
284286

285287
idx := bleve.NewIndexer(dir)

modules/indexer/internal/bleve/util.go

+5-4
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"unicode"
1010

1111
"code.gitea.io/gitea/modules/log"
12+
"code.gitea.io/gitea/modules/setting"
1213
"code.gitea.io/gitea/modules/util"
1314

1415
"github.com/blevesearch/bleve/v2"
@@ -54,9 +55,9 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
5455
return index, 0, nil
5556
}
5657

57-
// This method test the GuessFuzzinessByKeyword method. The fuzziness is based on the levenshtein distance and determines how many chars
58-
// may be different on two string and they still be considered equivalent.
59-
// Given a phrasse, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
58+
// GuessFuzzinessByKeyword guesses fuzziness based on the levenshtein distance and determines how many chars
59+
// may differ between two strings while they are still considered equivalent.
60+
// Given a phrase, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
6061
func GuessFuzzinessByKeyword(s string) int {
6162
tokenizer := unicode_tokenizer.NewUnicodeTokenizer()
6263
tokens := tokenizer.Tokenize([]byte(s))
@@ -85,5 +86,5 @@ func guessFuzzinessByKeyword(s string) int {
8586
return 0
8687
}
8788
}
88-
return min(maxFuzziness, len(s)/4)
89+
return min(min(setting.Indexer.TypeBleveMaxFuzzniess, maxFuzziness), len(s)/4)
8990
}

modules/indexer/internal/bleve/util_test.go

+6-1
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,15 @@ import (
77
"fmt"
88
"testing"
99

10+
"code.gitea.io/gitea/modules/setting"
11+
"code.gitea.io/gitea/modules/test"
12+
1013
"github.com/stretchr/testify/assert"
1114
)
1215

1316
func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
17+
defer test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)()
18+
1419
scenarios := []struct {
1520
Input string
1621
Fuzziness int // See util.go for the definition of fuzziness in this particular context
@@ -46,7 +51,7 @@ func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
4651
}
4752

4853
for _, scenario := range scenarios {
49-
t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
54+
t.Run(fmt.Sprintf("Fuziniess:%s=%d", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
5055
assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input))
5156
})
5257
}

modules/setting/indexer.go

+3
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ var Indexer = struct {
3131
IncludePatterns []*GlobMatcher
3232
ExcludePatterns []*GlobMatcher
3333
ExcludeVendored bool
34+
35+
TypeBleveMaxFuzzniess int
3436
}{
3537
IssueType: "bleve",
3638
IssuePath: "indexers/issues.bleve",
@@ -88,6 +90,7 @@ func loadIndexerFrom(rootCfg ConfigProvider) {
8890
Indexer.ExcludeVendored = sec.Key("REPO_INDEXER_EXCLUDE_VENDORED").MustBool(true)
8991
Indexer.MaxIndexerFileSize = sec.Key("MAX_FILE_SIZE").MustInt64(1024 * 1024)
9092
Indexer.StartupTimeout = sec.Key("STARTUP_TIMEOUT").MustDuration(30 * time.Second)
93+
Indexer.TypeBleveMaxFuzzniess = sec.Key("TYPE_BLEVE_MAX_FUZZINESS").MustInt(0)
9194
}
9295

9396
// IndexerGlobFromString parses a comma separated list of patterns and returns a glob.Glob slice suited for repo indexing

routers/common/codesearch.go

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// Copyright 2024 The Gitea Authors. All rights reserved.
2+
// SPDX-License-Identifier: MIT
3+
4+
package common
5+
6+
import (
7+
"code.gitea.io/gitea/modules/setting"
8+
"code.gitea.io/gitea/services/context"
9+
)
10+
11+
func PrepareCodeSearch(ctx *context.Context) (ret struct {
12+
Keyword string
13+
Language string
14+
IsFuzzy bool
15+
},
16+
) {
17+
ret.Language = ctx.FormTrim("l")
18+
ret.Keyword = ctx.FormTrim("q")
19+
20+
fuzzyDefault := setting.Indexer.RepoIndexerEnabled
21+
fuzzyAllow := true
22+
if setting.Indexer.RepoType == "bleve" && setting.Indexer.TypeBleveMaxFuzzniess == 0 {
23+
fuzzyDefault = false
24+
fuzzyAllow = false
25+
}
26+
isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(fuzzyDefault)
27+
if isFuzzy && !fuzzyAllow {
28+
ctx.Flash.Info("Fuzzy search is disabled by default due to performance reasons")
29+
isFuzzy = false
30+
}
31+
32+
ctx.Data["IsBleveFuzzyDisabled"] = true
33+
ctx.Data["Keyword"] = ret.Keyword
34+
ctx.Data["Language"] = ret.Language
35+
ctx.Data["IsFuzzy"] = isFuzzy
36+
37+
ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
38+
return ret
39+
}

routers/web/explore/code.go

+7-13
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import (
1111
"code.gitea.io/gitea/modules/base"
1212
code_indexer "code.gitea.io/gitea/modules/indexer/code"
1313
"code.gitea.io/gitea/modules/setting"
14+
"code.gitea.io/gitea/modules/templates"
15+
"code.gitea.io/gitea/routers/common"
1416
"code.gitea.io/gitea/services/context"
1517
)
1618

@@ -32,18 +34,10 @@ func Code(ctx *context.Context) {
3234
ctx.Data["Title"] = ctx.Tr("explore")
3335
ctx.Data["PageIsExplore"] = true
3436
ctx.Data["PageIsExploreCode"] = true
35-
36-
language := ctx.FormTrim("l")
37-
keyword := ctx.FormTrim("q")
38-
39-
isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true)
40-
41-
ctx.Data["Keyword"] = keyword
42-
ctx.Data["Language"] = language
43-
ctx.Data["IsFuzzy"] = isFuzzy
4437
ctx.Data["PageIsViewCode"] = true
4538

46-
if keyword == "" {
39+
prepareSearch := common.PrepareCodeSearch(ctx)
40+
if prepareSearch.Keyword == "" {
4741
ctx.HTML(http.StatusOK, tplExploreCode)
4842
return
4943
}
@@ -80,9 +74,9 @@ func Code(ctx *context.Context) {
8074
if (len(repoIDs) > 0) || isAdmin {
8175
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
8276
RepoIDs: repoIDs,
83-
Keyword: keyword,
84-
IsKeywordFuzzy: isFuzzy,
85-
Language: language,
77+
Keyword: prepareSearch.Keyword,
78+
IsKeywordFuzzy: prepareSearch.IsFuzzy,
79+
Language: prepareSearch.Language,
8680
Paginator: &db.ListOptions{
8781
Page: page,
8882
PageSize: setting.UI.RepoSearchPagingNum,

routers/web/repo/search.go

+9-16
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ import (
1212
"code.gitea.io/gitea/modules/git"
1313
code_indexer "code.gitea.io/gitea/modules/indexer/code"
1414
"code.gitea.io/gitea/modules/setting"
15+
"code.gitea.io/gitea/modules/templates"
16+
"code.gitea.io/gitea/routers/common"
1517
"code.gitea.io/gitea/services/context"
1618
)
1719

@@ -29,18 +31,9 @@ func indexSettingToGitGrepPathspecList() (list []string) {
2931

3032
// Search render repository search page
3133
func Search(ctx *context.Context) {
32-
language := ctx.FormTrim("l")
33-
keyword := ctx.FormTrim("q")
34-
35-
isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true)
36-
37-
ctx.Data["Keyword"] = keyword
38-
ctx.Data["Language"] = language
39-
ctx.Data["IsFuzzy"] = isFuzzy
4034
ctx.Data["PageIsViewCode"] = true
41-
ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
42-
43-
if keyword == "" {
35+
prepareSearch := common.PrepareCodeSearch(ctx)
36+
if prepareSearch.Keyword == "" {
4437
ctx.HTML(http.StatusOK, tplSearch)
4538
return
4639
}
@@ -57,9 +50,9 @@ func Search(ctx *context.Context) {
5750
var err error
5851
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
5952
RepoIDs: []int64{ctx.Repo.Repository.ID},
60-
Keyword: keyword,
61-
IsKeywordFuzzy: isFuzzy,
62-
Language: language,
53+
Keyword: prepareSearch.Keyword,
54+
IsKeywordFuzzy: prepareSearch.IsFuzzy,
55+
Language: prepareSearch.Language,
6356
Paginator: &db.ListOptions{
6457
Page: page,
6558
PageSize: setting.UI.RepoSearchPagingNum,
@@ -75,9 +68,9 @@ func Search(ctx *context.Context) {
7568
ctx.Data["CodeIndexerUnavailable"] = !code_indexer.IsAvailable(ctx)
7669
}
7770
} else {
78-
res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, keyword, git.GrepOptions{
71+
res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, prepareSearch.Keyword, git.GrepOptions{
7972
ContextLineNumber: 1,
80-
IsFuzzy: isFuzzy,
73+
IsFuzzy: prepareSearch.IsFuzzy,
8174
RefName: git.RefNameFromBranch(ctx.Repo.BranchName).String(), // BranchName should be default branch or the first existing branch
8275
PathspecList: indexSettingToGitGrepPathspecList(),
8376
})

routers/web/user/code.go

+7-14
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import (
1111
"code.gitea.io/gitea/modules/base"
1212
code_indexer "code.gitea.io/gitea/modules/indexer/code"
1313
"code.gitea.io/gitea/modules/setting"
14+
"code.gitea.io/gitea/modules/templates"
15+
"code.gitea.io/gitea/routers/common"
1416
shared_user "code.gitea.io/gitea/routers/web/shared/user"
1517
"code.gitea.io/gitea/services/context"
1618
)
@@ -34,20 +36,11 @@ func CodeSearch(ctx *context.Context) {
3436
}
3537

3638
ctx.Data["IsPackageEnabled"] = setting.Packages.Enabled
37-
ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
3839
ctx.Data["Title"] = ctx.Tr("explore.code")
39-
40-
language := ctx.FormTrim("l")
41-
keyword := ctx.FormTrim("q")
42-
43-
isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true)
44-
45-
ctx.Data["Keyword"] = keyword
46-
ctx.Data["Language"] = language
47-
ctx.Data["IsFuzzy"] = isFuzzy
4840
ctx.Data["IsCodePage"] = true
4941

50-
if keyword == "" {
42+
prepareSearch := common.PrepareCodeSearch(ctx)
43+
if prepareSearch.Keyword == "" {
5144
ctx.HTML(http.StatusOK, tplUserCode)
5245
return
5346
}
@@ -77,9 +70,9 @@ func CodeSearch(ctx *context.Context) {
7770
if len(repoIDs) > 0 {
7871
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
7972
RepoIDs: repoIDs,
80-
Keyword: keyword,
81-
IsKeywordFuzzy: isFuzzy,
82-
Language: language,
73+
Keyword: prepareSearch.Keyword,
74+
IsKeywordFuzzy: prepareSearch.IsFuzzy,
75+
Language: prepareSearch.Language,
8376
Paginator: &db.ListOptions{
8477
Page: page,
8578
PageSize: setting.UI.RepoSearchPagingNum,

templates/shared/search/code/search.tmpl

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
{{template "shared/search/combo_fuzzy" dict "Value" .Keyword "Disabled" .CodeIndexerUnavailable "IsFuzzy" .IsFuzzy "Placeholder" (ctx.Locale.Tr "search.code_kind")}}
33
</form>
44
<div class="divider"></div>
5-
<div class="ui user list">
5+
<div class="ui list">
6+
{{template "base/alert" .}}
67
{{if .CodeIndexerUnavailable}}
78
<div class="ui error message">
89
<p>{{ctx.Locale.Tr "search.code_search_unavailable"}}</p>

0 commit comments

Comments
 (0)