mirror of
https://github.com/go-gitea/gitea.git
synced 2024-12-28 06:43:38 +01:00
900ac62251
This is a large and complex PR, so let me explain in detail its changes. First, I had to create new index mappings for Bleve and Elasticsearch as the current ones do not support search by filename. This requires Gitea to recreate the code search indexes (I do not know if this is a breaking change, but I feel it deserves a heads-up). I've used [this approach](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/analysis-pathhierarchy-tokenizer.html) to model the filename index. It allows us to efficiently search for both the full path and the name of a file. Bleve, however, does not support this out of the box, so I had to code a brand new [token filter](https://blevesearch.com/docs/Token-Filters/) to generate the search terms. I also overhauled the `indexer_test.go` file. It now asserts the order of the expected results (this is important since matches based on the name of a file are more relevant than those based on its content). I've added new test scenarios that deal with searching by filename. They use a new repo included in the Gitea fixture. The screenshot below depicts how Gitea shows the search results. It shows results based on content in the same way as the current version does. In matches based on the filename, the first seven lines of the file contents are shown (BTW, this is how GitHub does it). ![image](https://github.com/user-attachments/assets/9d938d86-1a8d-4f89-8644-1921a473e858) Resolves #32096 --------- Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
383 lines
12 KiB
Go
383 lines
12 KiB
Go
// Copyright 2019 The Gitea Authors. All rights reserved.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
package bleve
|
|
|
|
import (
|
|
"bufio"
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
repo_model "code.gitea.io/gitea/models/repo"
|
|
"code.gitea.io/gitea/modules/analyze"
|
|
"code.gitea.io/gitea/modules/charset"
|
|
"code.gitea.io/gitea/modules/git"
|
|
"code.gitea.io/gitea/modules/gitrepo"
|
|
path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
|
|
"code.gitea.io/gitea/modules/indexer/code/internal"
|
|
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
|
|
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
|
|
"code.gitea.io/gitea/modules/setting"
|
|
"code.gitea.io/gitea/modules/timeutil"
|
|
"code.gitea.io/gitea/modules/typesniffer"
|
|
|
|
"github.com/blevesearch/bleve/v2"
|
|
analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
|
|
analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
|
|
"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
|
|
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
|
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
|
|
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
|
"github.com/blevesearch/bleve/v2/mapping"
|
|
"github.com/blevesearch/bleve/v2/search/query"
|
|
"github.com/go-enry/go-enry/v2"
|
|
)
|
|
|
|
const (
	// unicodeNormalizeName is the name under which the unicode normalization
	// token filter is registered on the index mapping.
	unicodeNormalizeName = "unicodeNormalize"

	// maxBatchSize is the size passed to every flushing batch created in this file.
	maxBatchSize = 16
)
|
|
|
|
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
|
|
return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{
|
|
"type": unicodenorm.Name,
|
|
"form": unicodenorm.NFC,
|
|
})
|
|
}
|
|
|
|
// RepoIndexerData data stored in the repo indexer
|
|
type RepoIndexerData struct {
|
|
RepoID int64
|
|
CommitID string
|
|
Content string
|
|
Filename string
|
|
Language string
|
|
UpdatedAt time.Time
|
|
}
|
|
|
|
// Type returns the document type, for bleve's mapping.Classifier interface.
|
|
func (d *RepoIndexerData) Type() string {
|
|
return repoIndexerDocType
|
|
}
|
|
|
|
const (
	// repoIndexerAnalyzer names the custom analyzer used as the index default.
	repoIndexerAnalyzer = "repoIndexerAnalyzer"
	// filenameIndexerAnalyzer names the custom analyzer applied to the Filename field.
	filenameIndexerAnalyzer = "filenameIndexerAnalyzer"
	// filenameIndexerTokenizer is a tokenizer name reserved for the filename index.
	filenameIndexerTokenizer = "filenameIndexerTokenizer"
	// repoIndexerDocType is the document type reported by RepoIndexerData.Type.
	repoIndexerDocType = "repoIndexerDocType"
	// repoIndexerLatestVersion is the index version handed to the inner bleve indexer.
	repoIndexerLatestVersion = 7
)
|
|
|
|
// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
|
|
func generateBleveIndexMapping() (mapping.IndexMapping, error) {
|
|
docMapping := bleve.NewDocumentMapping()
|
|
numericFieldMapping := bleve.NewNumericFieldMapping()
|
|
numericFieldMapping.IncludeInAll = false
|
|
docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
|
|
|
|
textFieldMapping := bleve.NewTextFieldMapping()
|
|
textFieldMapping.IncludeInAll = false
|
|
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
|
|
|
|
fileNamedMapping := bleve.NewTextFieldMapping()
|
|
fileNamedMapping.IncludeInAll = false
|
|
fileNamedMapping.Analyzer = filenameIndexerAnalyzer
|
|
docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)
|
|
|
|
termFieldMapping := bleve.NewTextFieldMapping()
|
|
termFieldMapping.IncludeInAll = false
|
|
termFieldMapping.Analyzer = analyzer_keyword.Name
|
|
docMapping.AddFieldMappingsAt("Language", termFieldMapping)
|
|
docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
|
|
|
|
timeFieldMapping := bleve.NewDateTimeFieldMapping()
|
|
timeFieldMapping.IncludeInAll = false
|
|
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
|
|
|
|
mapping := bleve.NewIndexMapping()
|
|
|
|
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
|
|
return nil, err
|
|
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
|
|
"type": analyzer_custom.Name,
|
|
"char_filters": []string{},
|
|
"tokenizer": unicode.Name,
|
|
"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
|
|
}); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
|
|
"type": analyzer_custom.Name,
|
|
"char_filters": []string{},
|
|
"tokenizer": unicode.Name,
|
|
"token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
|
|
}); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
mapping.DefaultAnalyzer = repoIndexerAnalyzer
|
|
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
|
|
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
|
|
|
|
return mapping, nil
|
|
}
|
|
|
|
var _ internal.Indexer = &Indexer{}
|
|
|
|
// Indexer represents a bleve indexer implementation
|
|
type Indexer struct {
|
|
inner *inner_bleve.Indexer
|
|
indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
|
|
}
|
|
|
|
// NewIndexer creates a new bleve local indexer
|
|
func NewIndexer(indexDir string) *Indexer {
|
|
inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping)
|
|
return &Indexer{
|
|
Indexer: inner,
|
|
inner: inner,
|
|
}
|
|
}
|
|
|
|
func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string,
|
|
update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch,
|
|
) error {
|
|
// Ignore vendored files in code search
|
|
if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
|
|
return nil
|
|
}
|
|
|
|
size := update.Size
|
|
|
|
var err error
|
|
if !update.Sized {
|
|
var stdout string
|
|
stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
|
|
return fmt.Errorf("misformatted git cat-file output: %w", err)
|
|
}
|
|
}
|
|
|
|
if size > setting.Indexer.MaxIndexerFileSize {
|
|
return b.addDelete(update.Filename, repo, batch)
|
|
}
|
|
|
|
if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
|
|
return err
|
|
}
|
|
|
|
_, _, size, err = git.ReadBatchLine(batchReader)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
|
|
if err != nil {
|
|
return err
|
|
} else if !typesniffer.DetectContentType(fileContents).IsText() {
|
|
// FIXME: UTF-16 files will probably fail here
|
|
return nil
|
|
}
|
|
|
|
if _, err = batchReader.Discard(1); err != nil {
|
|
return err
|
|
}
|
|
id := internal.FilenameIndexerID(repo.ID, update.Filename)
|
|
return batch.Index(id, &RepoIndexerData{
|
|
RepoID: repo.ID,
|
|
CommitID: commitSha,
|
|
Filename: update.Filename,
|
|
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
|
|
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
|
|
UpdatedAt: time.Now().UTC(),
|
|
})
|
|
}
|
|
|
|
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error {
|
|
id := internal.FilenameIndexerID(repo.ID, filename)
|
|
return batch.Delete(id)
|
|
}
|
|
|
|
// Index indexes the data
|
|
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
|
|
batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
|
|
if len(changes.Updates) > 0 {
|
|
r, err := gitrepo.OpenRepository(ctx, repo)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer r.Close()
|
|
gitBatch, err := r.NewBatch(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer gitBatch.Close()
|
|
|
|
for _, update := range changes.Updates {
|
|
if err := b.addUpdate(ctx, gitBatch.Writer, gitBatch.Reader, sha, update, repo, batch); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
gitBatch.Close()
|
|
}
|
|
for _, filename := range changes.RemovedFilenames {
|
|
if err := b.addDelete(filename, repo, batch); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return batch.Flush()
|
|
}
|
|
|
|
// Delete deletes indexes by ids
|
|
func (b *Indexer) Delete(_ context.Context, repoID int64) error {
|
|
query := inner_bleve.NumericEqualityQuery(repoID, "RepoID")
|
|
searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
|
|
result, err := b.inner.Indexer.Search(searchRequest)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
|
|
for _, hit := range result.Hits {
|
|
if err = batch.Delete(hit.ID); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return batch.Flush()
|
|
}
|
|
|
|
// Search searches for files in the specified repo.
|
|
// Returns the matching file-paths
|
|
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
|
|
var (
|
|
indexerQuery query.Query
|
|
keywordQuery query.Query
|
|
)
|
|
|
|
pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
|
|
pathQuery.FieldVal = "Filename"
|
|
pathQuery.SetBoost(10)
|
|
|
|
contentQuery := bleve.NewMatchQuery(opts.Keyword)
|
|
contentQuery.FieldVal = "Content"
|
|
|
|
if opts.IsKeywordFuzzy {
|
|
contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
|
|
}
|
|
|
|
keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)
|
|
|
|
if len(opts.RepoIDs) > 0 {
|
|
repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
|
|
for _, repoID := range opts.RepoIDs {
|
|
repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID"))
|
|
}
|
|
|
|
indexerQuery = bleve.NewConjunctionQuery(
|
|
bleve.NewDisjunctionQuery(repoQueries...),
|
|
keywordQuery,
|
|
)
|
|
} else {
|
|
indexerQuery = keywordQuery
|
|
}
|
|
|
|
// Save for reuse without language filter
|
|
facetQuery := indexerQuery
|
|
if len(opts.Language) > 0 {
|
|
languageQuery := bleve.NewMatchQuery(opts.Language)
|
|
languageQuery.FieldVal = "Language"
|
|
languageQuery.Analyzer = analyzer_keyword.Name
|
|
|
|
indexerQuery = bleve.NewConjunctionQuery(
|
|
indexerQuery,
|
|
languageQuery,
|
|
)
|
|
}
|
|
|
|
from, pageSize := opts.GetSkipTake()
|
|
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
|
|
searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
|
|
searchRequest.IncludeLocations = true
|
|
|
|
if len(opts.Language) == 0 {
|
|
searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
|
|
}
|
|
|
|
searchRequest.SortBy([]string{"-_score", "UpdatedAt"})
|
|
|
|
result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest)
|
|
if err != nil {
|
|
return 0, nil, nil, err
|
|
}
|
|
|
|
total := int64(result.Total)
|
|
|
|
searchResults := make([]*internal.SearchResult, len(result.Hits))
|
|
for i, hit := range result.Hits {
|
|
startIndex, endIndex := -1, -1
|
|
for _, locations := range hit.Locations["Content"] {
|
|
location := locations[0]
|
|
locationStart := int(location.Start)
|
|
locationEnd := int(location.End)
|
|
if startIndex < 0 || locationStart < startIndex {
|
|
startIndex = locationStart
|
|
}
|
|
if endIndex < 0 || locationEnd > endIndex {
|
|
endIndex = locationEnd
|
|
}
|
|
}
|
|
if len(hit.Locations["Filename"]) > 0 {
|
|
startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
|
|
}
|
|
|
|
language := hit.Fields["Language"].(string)
|
|
var updatedUnix timeutil.TimeStamp
|
|
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
|
|
updatedUnix = timeutil.TimeStamp(t.Unix())
|
|
}
|
|
searchResults[i] = &internal.SearchResult{
|
|
RepoID: int64(hit.Fields["RepoID"].(float64)),
|
|
StartIndex: startIndex,
|
|
EndIndex: endIndex,
|
|
Filename: internal.FilenameOfIndexerID(hit.ID),
|
|
Content: hit.Fields["Content"].(string),
|
|
CommitID: hit.Fields["CommitID"].(string),
|
|
UpdatedUnix: updatedUnix,
|
|
Language: language,
|
|
Color: enry.GetColor(language),
|
|
}
|
|
}
|
|
|
|
searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10)
|
|
if len(opts.Language) > 0 {
|
|
// Use separate query to go get all language counts
|
|
facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
|
|
facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
|
|
facetRequest.IncludeLocations = true
|
|
facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
|
|
|
|
if result, err = b.inner.Indexer.Search(facetRequest); err != nil {
|
|
return 0, nil, nil, err
|
|
}
|
|
}
|
|
languagesFacet := result.Facets["languages"]
|
|
for _, term := range languagesFacet.Terms.Terms() {
|
|
if len(term.Term) == 0 {
|
|
continue
|
|
}
|
|
searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
|
|
Language: term.Term,
|
|
Color: enry.GetColor(term.Term),
|
|
Count: term.Count,
|
|
})
|
|
}
|
|
return total, searchResults, searchResultLanguages, nil
|
|
}
|