mirror of
https://github.com/go-gitea/gitea.git
synced 2024-12-27 19:53:52 +01:00
f64fbd9b74
This PR improves the accuracy of Gitea's code search. Currently, Gitea does not consider statements such as `console.log("hello")` as hits when the user searches for `log`. The culprit is how both ES and Bleve are tokenizing the file contents (in both cases, `console.log` is a whole token). In ES' case, we changed the tokenizer to [simple_pattern_split](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-simplepatternsplit-tokenizer.html#:~:text=The%20simple_pattern_split%20tokenizer%20uses%20a,the%20tokenization%20is%20generally%20faster.). In such a case, tokens are words formed by digits and letters. In Bleve's case, it employs a [letter](https://blevesearch.com/docs/Tokenizers/) tokenizer. Resolves #32220 --------- Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
90 lines
2.6 KiB
Go
90 lines
2.6 KiB
Go
// Copyright 2023 The Gitea Authors. All rights reserved.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
package bleve
|
|
|
|
import (
|
|
"errors"
|
|
"os"
|
|
"unicode"
|
|
|
|
"code.gitea.io/gitea/modules/log"
|
|
"code.gitea.io/gitea/modules/util"
|
|
|
|
"github.com/blevesearch/bleve/v2"
|
|
unicode_tokenizer "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
|
"github.com/blevesearch/bleve/v2/index/upsidedown"
|
|
"github.com/ethantkoenig/rupture"
|
|
)
|
|
|
|
// maxFuzziness is the upper bound applied to every fuzziness value computed
// in this file.
const maxFuzziness = 2
|
|
|
|
// openIndexer open the index at the specified path, checking for metadata
|
|
// updates and bleve version updates. If index needs to be created (or
|
|
// re-created), returns (nil, nil)
|
|
func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
|
|
_, err := os.Stat(path)
|
|
if err != nil && os.IsNotExist(err) {
|
|
return nil, 0, nil
|
|
} else if err != nil {
|
|
return nil, 0, err
|
|
}
|
|
|
|
metadata, err := rupture.ReadIndexMetadata(path)
|
|
if err != nil {
|
|
return nil, 0, err
|
|
}
|
|
if metadata.Version < latestVersion {
|
|
// the indexer is using a previous version, so we should delete it and
|
|
// re-populate
|
|
return nil, metadata.Version, util.RemoveAll(path)
|
|
}
|
|
|
|
index, err := bleve.Open(path)
|
|
if err != nil {
|
|
if errors.Is(err, upsidedown.IncompatibleVersion) {
|
|
log.Warn("Indexer was built with a previous version of bleve, deleting and rebuilding")
|
|
return nil, 0, util.RemoveAll(path)
|
|
}
|
|
return nil, 0, err
|
|
}
|
|
|
|
return index, 0, nil
|
|
}
|
|
|
|
// This method test the GuessFuzzinessByKeyword method. The fuzziness is based on the levenshtein distance and determines how many chars
|
|
// may be different on two string and they still be considered equivalent.
|
|
// Given a phrasse, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
|
|
func GuessFuzzinessByKeyword(s string) int {
|
|
tokenizer := unicode_tokenizer.NewUnicodeTokenizer()
|
|
tokens := tokenizer.Tokenize([]byte(s))
|
|
|
|
if len(tokens) > 0 {
|
|
fuzziness := maxFuzziness
|
|
|
|
for _, token := range tokens {
|
|
fuzziness = min(fuzziness, guessFuzzinessByKeyword(string(token.Term)))
|
|
}
|
|
|
|
return fuzziness
|
|
}
|
|
|
|
return 0
|
|
}
|
|
|
|
func guessFuzzinessByKeyword(s string) int {
|
|
// according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
|
|
// magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
|
|
// BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
|
|
// Likewise, queries whose terms contains characters that are *not* letters should not use fuzziness
|
|
|
|
for _, r := range s {
|
|
if r >= 128 || !unicode.IsLetter(r) {
|
|
return 0
|
|
}
|
|
}
|
|
return min(maxFuzziness, len(s)/4)
|
|
}
|