From 97a7c04a8fc4747d32af84fca3d068425ab33768 Mon Sep 17 00:00:00 2001
From: Giteabot <teabot@gitea.io>
Date: Wed, 1 May 2024 20:59:59 +0800
Subject: [PATCH] Fix bleve fuzziness (#30799) (#30804)

Backport #30799 by wxiaoguang

Fix #30797
Fix #30317

Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
---
 modules/indexer/code/bleve/bleve.go    |  4 +---
 modules/indexer/internal/bleve/util.go | 12 ++++++++++++
 modules/indexer/issues/bleve/bleve.go  |  8 ++------
 routers/web/repo/search.go             |  2 +-
 4 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go
index bd844205a6..8056b58ec2 100644
--- a/modules/indexer/code/bleve/bleve.go
+++ b/modules/indexer/code/bleve/bleve.go
@@ -39,8 +39,6 @@ import (
 const (
 	unicodeNormalizeName = "unicodeNormalize"
 	maxBatchSize         = 16
-	// fuzzyDenominator determines the levenshtein distance per each character of a keyword
-	fuzzyDenominator = 4
 )
 
 func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
@@ -245,7 +243,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
 	phraseQuery.Analyzer = repoIndexerAnalyzer
 	keywordQuery = phraseQuery
 	if opts.IsKeywordFuzzy {
-		phraseQuery.Fuzziness = len(opts.Keyword) / fuzzyDenominator
+		phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
 	}
 
 	if len(opts.RepoIDs) > 0 {
diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go
index 43a7c3c5ec..a2265f86e6 100644
--- a/modules/indexer/internal/bleve/util.go
+++ b/modules/indexer/internal/bleve/util.go
@@ -47,3 +47,15 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
 
 	return index, 0, nil
 }
+
+func GuessFuzzinessByKeyword(s string) int {
+	// According to https://github.com/blevesearch/bleve/issues/1563, the maximum supported fuzziness is 2.
+	// The magic number 4 sets the allowed levenshtein distance: one edit per 4 characters of the keyword.
+	// BUT fuzzy matching mismatches a lot for CJK keywords (eg: `갃갃갃` `啊啊啊`), so non-ASCII input gets 0.
+	for _, r := range s {
+		if r >= 128 { // any rune outside the 7-bit ASCII range disables fuzziness
+			return 0
+		}
+	}
+	return min(2, len(s)/4) // s is ASCII-only here, so len(s) in bytes equals its character count
+}
diff --git a/modules/indexer/issues/bleve/bleve.go b/modules/indexer/issues/bleve/bleve.go
index 1f54be721b..d7957b266a 100644
--- a/modules/indexer/issues/bleve/bleve.go
+++ b/modules/indexer/issues/bleve/bleve.go
@@ -35,11 +35,7 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
 	})
 }
 
-const (
-	maxBatchSize = 16
-	// fuzzyDenominator determines the levenshtein distance per each character of a keyword
-	fuzzyDenominator = 4
-)
+const maxBatchSize = 16
 
 // IndexerData an update to the issue indexer
 type IndexerData internal.IndexerData
@@ -162,7 +158,7 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
 	if options.Keyword != "" {
 		fuzziness := 0
 		if options.IsFuzzyKeyword {
-			fuzziness = len(options.Keyword) / fuzzyDenominator
+			fuzziness = inner_bleve.GuessFuzzinessByKeyword(options.Keyword)
 		}
 
 		queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
diff --git a/routers/web/repo/search.go b/routers/web/repo/search.go
index 23cf898630..d7854b2499 100644
--- a/routers/web/repo/search.go
+++ b/routers/web/repo/search.go
@@ -28,6 +28,7 @@ func Search(ctx *context.Context) {
 	ctx.Data["Language"] = language
 	ctx.Data["IsFuzzy"] = isFuzzy
 	ctx.Data["PageIsViewCode"] = true
+	ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
 
 	if keyword == "" {
 		ctx.HTML(http.StatusOK, tplSearch)
@@ -86,7 +87,6 @@ func Search(ctx *context.Context) {
 		}
 	}
 
-	ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
 	ctx.Data["Repo"] = ctx.Repo.Repository
 	ctx.Data["SearchResults"] = searchResults
 	ctx.Data["SearchResultLanguages"] = searchResultLanguages