Mirror of https://github.com/go-gitea/gitea.git, synced 2025-10-31 09:31:53 +01:00.
			
		
		
		
	Each "indexer" should provide the "search modes" they support by themselves. And we need to remove the "fuzzy" search for code.
		
			
				
	
	
		
			399 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			399 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| // Copyright 2019 The Gitea Authors. All rights reserved.
 | |
| // SPDX-License-Identifier: MIT
 | |
| 
 | |
| package bleve
 | |
| 
 | |
import (
	"bufio"
	"context"
	"fmt"
	"io"
	"math"
	"strconv"
	"strings"
	"time"

	repo_model "code.gitea.io/gitea/models/repo"
	"code.gitea.io/gitea/modules/analyze"
	"code.gitea.io/gitea/modules/charset"
	"code.gitea.io/gitea/modules/git"
	"code.gitea.io/gitea/modules/gitrepo"
	"code.gitea.io/gitea/modules/indexer"
	path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
	"code.gitea.io/gitea/modules/indexer/code/internal"
	indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
	inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/timeutil"
	"code.gitea.io/gitea/modules/typesniffer"

	"github.com/blevesearch/bleve/v2"
	analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
	analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
	"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
	"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
	"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
	"github.com/blevesearch/bleve/v2/mapping"
	"github.com/blevesearch/bleve/v2/search/query"
	"github.com/go-enry/go-enry/v2"
)
 | |
| 
 | |
const (
	// unicodeNormalizeName is the name under which the NFC-normalization
	// token filter is registered on the index mapping.
	unicodeNormalizeName = "unicodeNormalize"
	// maxBatchSize is the number of index operations buffered in a
	// FlushingBatch before it flushes to the underlying bleve index.
	maxBatchSize = 16
)
 | |
| 
 | |
| func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
 | |
| 	return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{
 | |
| 		"type": unicodenorm.Name,
 | |
| 		"form": unicodenorm.NFC,
 | |
| 	})
 | |
| }
 | |
| 
 | |
| // RepoIndexerData data stored in the repo indexer
 | |
| type RepoIndexerData struct {
 | |
| 	RepoID    int64
 | |
| 	CommitID  string
 | |
| 	Content   string
 | |
| 	Filename  string
 | |
| 	Language  string
 | |
| 	UpdatedAt time.Time
 | |
| }
 | |
| 
 | |
// Type returns the document type, for bleve's mapping.Classifier interface.
// All documents in this index share the single repoIndexerDocType mapping.
func (d *RepoIndexerData) Type() string {
	return repoIndexerDocType
}
 | |
| 
 | |
const (
	repoIndexerAnalyzer      = "repoIndexerAnalyzer"      // custom analyzer for file content
	filenameIndexerAnalyzer  = "filenameIndexerAnalyzer"  // custom analyzer for file paths
	filenameIndexerTokenizer = "filenameIndexerTokenizer" // tokenizer name (not referenced elsewhere in this file)
	repoIndexerDocType       = "repoIndexerDocType"       // the single document type used by this index
	// repoIndexerLatestVersion is the index schema version passed to
	// inner_bleve.NewIndexer; presumably bumping it triggers a rebuild of
	// existing indexes — confirm in inner_bleve.
	repoIndexerLatestVersion = 8
)
 | |
| 
 | |
| // generateBleveIndexMapping generates a bleve index mapping for the repo indexer
 | |
| func generateBleveIndexMapping() (mapping.IndexMapping, error) {
 | |
| 	docMapping := bleve.NewDocumentMapping()
 | |
| 	numericFieldMapping := bleve.NewNumericFieldMapping()
 | |
| 	numericFieldMapping.IncludeInAll = false
 | |
| 	docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
 | |
| 
 | |
| 	textFieldMapping := bleve.NewTextFieldMapping()
 | |
| 	textFieldMapping.IncludeInAll = false
 | |
| 	docMapping.AddFieldMappingsAt("Content", textFieldMapping)
 | |
| 
 | |
| 	fileNamedMapping := bleve.NewTextFieldMapping()
 | |
| 	fileNamedMapping.IncludeInAll = false
 | |
| 	fileNamedMapping.Analyzer = filenameIndexerAnalyzer
 | |
| 	docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)
 | |
| 
 | |
| 	termFieldMapping := bleve.NewTextFieldMapping()
 | |
| 	termFieldMapping.IncludeInAll = false
 | |
| 	termFieldMapping.Analyzer = analyzer_keyword.Name
 | |
| 	docMapping.AddFieldMappingsAt("Language", termFieldMapping)
 | |
| 	docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
 | |
| 
 | |
| 	timeFieldMapping := bleve.NewDateTimeFieldMapping()
 | |
| 	timeFieldMapping.IncludeInAll = false
 | |
| 	docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
 | |
| 
 | |
| 	mapping := bleve.NewIndexMapping()
 | |
| 
 | |
| 	if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
 | |
| 		return nil, err
 | |
| 	} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
 | |
| 		"type":          analyzer_custom.Name,
 | |
| 		"char_filters":  []string{},
 | |
| 		"tokenizer":     letter.Name,
 | |
| 		"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
 | |
| 	}); err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 
 | |
| 	if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
 | |
| 		"type":          analyzer_custom.Name,
 | |
| 		"char_filters":  []string{},
 | |
| 		"tokenizer":     unicode.Name,
 | |
| 		"token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
 | |
| 	}); err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 
 | |
| 	mapping.DefaultAnalyzer = repoIndexerAnalyzer
 | |
| 	mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
 | |
| 	mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
 | |
| 
 | |
| 	return mapping, nil
 | |
| }
 | |
| 
 | |
// Compile-time check that *Indexer satisfies the code-search indexer interface.
var _ internal.Indexer = &Indexer{}

// Indexer represents a bleve indexer implementation
type Indexer struct {
	// inner keeps a typed handle so methods can reach the raw bleve index.
	inner                    *inner_bleve.Indexer
	indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
}
 | |
| 
 | |
// SupportedSearchModes returns the search modes this indexer offers:
// exact and words matching (see indexer.SearchModesExactWords), which the
// Search method below dispatches on.
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
	return indexer.SearchModesExactWords()
}
 | |
| 
 | |
| // NewIndexer creates a new bleve local indexer
 | |
| func NewIndexer(indexDir string) *Indexer {
 | |
| 	inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping)
 | |
| 	return &Indexer{
 | |
| 		Indexer: inner,
 | |
| 		inner:   inner,
 | |
| 	}
 | |
| }
 | |
| 
 | |
// addUpdate indexes (or re-indexes) one changed file. The blob content is
// fetched through the shared git cat-file batch pipe (batchWriter/batchReader)
// and the resulting document is added to the flushing batch. Vendored files
// are skipped, oversized files are deleted from the index, and non-text
// (binary) files are silently ignored.
func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string,
	update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch,
) error {
	// Ignore vendored files in code search
	if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
		return nil
	}

	size := update.Size

	var err error
	if !update.Sized {
		// Size not supplied with the update: ask git for the blob size.
		var stdout string
		stdout, _, err = git.NewCommand("cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(ctx, &git.RunOpts{Dir: repo.RepoPath()})
		if err != nil {
			return err
		}
		if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
			return fmt.Errorf("misformatted git cat-file output: %w", err)
		}
	}

	// Files larger than the configured limit are removed from the index
	// rather than indexed.
	if size > setting.Indexer.MaxIndexerFileSize {
		return b.addDelete(update.Filename, repo, batch)
	}

	// Request the blob's content from the cat-file batch process...
	if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
		return err
	}

	// ...then read the response header, which carries the authoritative size.
	_, _, size, err = git.ReadBatchLine(batchReader)
	if err != nil {
		return err
	}

	fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
	if err != nil {
		return err
	} else if !typesniffer.DetectContentType(fileContents).IsText() {
		// FIXME: UTF-16 files will probably fail here
		return nil
	}

	// Discard the single byte terminating the batch entry (the trailing
	// newline after the blob content) so the reader is positioned for the
	// next request.
	if _, err = batchReader.Discard(1); err != nil {
		return err
	}
	id := internal.FilenameIndexerID(repo.ID, update.Filename)
	return batch.Index(id, &RepoIndexerData{
		RepoID:    repo.ID,
		CommitID:  commitSha,
		Filename:  update.Filename,
		Content:   string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
		Language:  analyze.GetCodeLanguage(update.Filename, fileContents),
		UpdatedAt: time.Now().UTC(),
	})
}
 | |
| 
 | |
| func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error {
 | |
| 	id := internal.FilenameIndexerID(repo.ID, filename)
 | |
| 	return batch.Delete(id)
 | |
| }
 | |
| 
 | |
// Index indexes the data
// sha is the commit the changes belong to. Updates are streamed through a
// single git cat-file batch; removals only touch the bleve batch. All index
// operations are buffered in one FlushingBatch and flushed at the end.
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
	batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
	if len(changes.Updates) > 0 {
		r, err := gitrepo.OpenRepository(ctx, repo)
		if err != nil {
			return err
		}
		defer r.Close()
		gitBatch, err := r.NewBatch(ctx)
		if err != nil {
			return err
		}
		defer gitBatch.Close()

		for _, update := range changes.Updates {
			if err := b.addUpdate(ctx, gitBatch.Writer, gitBatch.Reader, sha, update, repo, batch); err != nil {
				return err
			}
		}
		// Eagerly release the git batch once updates are done; the deferred
		// Close above then acts only as an error-path safety net
		// (NOTE(review): assumes gitBatch.Close is safe to call twice — confirm).
		gitBatch.Close()
	}
	for _, filename := range changes.RemovedFilenames {
		if err := b.addDelete(filename, repo, batch); err != nil {
			return err
		}
	}
	return batch.Flush()
}
 | |
| 
 | |
| // Delete deletes indexes by ids
 | |
| func (b *Indexer) Delete(_ context.Context, repoID int64) error {
 | |
| 	query := inner_bleve.NumericEqualityQuery(repoID, "RepoID")
 | |
| 	searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
 | |
| 	result, err := b.inner.Indexer.Search(searchRequest)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
 | |
| 	for _, hit := range result.Hits {
 | |
| 		if err = batch.Delete(hit.ID); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 	return batch.Flush()
 | |
| }
 | |
| 
 | |
// Search searches for files in the specified repo.
// Returns the matching file-paths
//
// The query is a disjunction of a content match (whose form depends on
// opts.SearchMode) and a boosted filename-prefix match, optionally restricted
// to a set of repositories and a language. When a language filter is active,
// language facet counts are computed with a second, unfiltered query so the
// counts reflect all languages, not just the filtered one.
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
	var (
		indexerQuery query.Query
		keywordQuery query.Query
		contentQuery query.Query
	)

	// Filename matches are boosted so path hits rank above content hits.
	pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
	pathQuery.FieldVal = "Filename"
	pathQuery.SetBoost(10)

	if opts.SearchMode == indexer.SearchModeExact {
		// Exact mode: the keyword must appear as a contiguous phrase.
		q := bleve.NewMatchPhraseQuery(opts.Keyword)
		q.FieldVal = "Content"
		contentQuery = q
	} else /* words */ {
		q := bleve.NewMatchQuery(opts.Keyword)
		q.FieldVal = "Content"
		if opts.SearchMode == indexer.SearchModeFuzzy {
			// this logic doesn't seem right, it is only used to pass the test-case `Keyword:    "dESCRIPTION"`, which doesn't seem to be a real-life use-case.
			q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
		} else {
			// Words mode: every word must match (AND, not bleve's default OR).
			q.Operator = query.MatchQueryOperatorAnd
		}
		contentQuery = q
	}

	keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)

	if len(opts.RepoIDs) > 0 {
		// Restrict to any of the given repositories.
		repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
		for _, repoID := range opts.RepoIDs {
			repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID"))
		}

		indexerQuery = bleve.NewConjunctionQuery(
			bleve.NewDisjunctionQuery(repoQueries...),
			keywordQuery,
		)
	} else {
		indexerQuery = keywordQuery
	}

	// Save for reuse without language filter
	facetQuery := indexerQuery
	if len(opts.Language) > 0 {
		languageQuery := bleve.NewMatchQuery(opts.Language)
		languageQuery.FieldVal = "Language"
		languageQuery.Analyzer = analyzer_keyword.Name

		indexerQuery = bleve.NewConjunctionQuery(
			indexerQuery,
			languageQuery,
		)
	}

	from, pageSize := opts.GetSkipTake()
	searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
	searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
	// Locations are needed to compute match highlight positions below.
	searchRequest.IncludeLocations = true

	if len(opts.Language) == 0 {
		// No language filter: facet counts can come from the main query.
		searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
	}

	searchRequest.SortBy([]string{"-_score", "UpdatedAt"})

	result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest)
	if err != nil {
		return 0, nil, nil, err
	}

	total := int64(result.Total)

	searchResults := make([]*internal.SearchResult, len(result.Hits))
	for i, hit := range result.Hits {
		// Compute the widest span covering the first location of each
		// matched content term; -1 sentinels mean "no match yet".
		startIndex, endIndex := -1, -1
		for _, locations := range hit.Locations["Content"] {
			location := locations[0]
			locationStart := int(location.Start)
			locationEnd := int(location.End)
			if startIndex < 0 || locationStart < startIndex {
				startIndex = locationStart
			}
			if endIndex < 0 || locationEnd > endIndex {
				endIndex = locationEnd
			}
		}
		if len(hit.Locations["Filename"]) > 0 {
			// Filename hit: derive the highlight span from the content instead.
			startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
		}

		language := hit.Fields["Language"].(string)
		var updatedUnix timeutil.TimeStamp
		// Parse errors are deliberately ignored; updatedUnix stays zero.
		if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
			updatedUnix = timeutil.TimeStamp(t.Unix())
		}
		searchResults[i] = &internal.SearchResult{
			RepoID:      int64(hit.Fields["RepoID"].(float64)),
			StartIndex:  startIndex,
			EndIndex:    endIndex,
			Filename:    internal.FilenameOfIndexerID(hit.ID),
			Content:     hit.Fields["Content"].(string),
			CommitID:    hit.Fields["CommitID"].(string),
			UpdatedUnix: updatedUnix,
			Language:    language,
			Color:       enry.GetColor(language),
		}
	}

	searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10)
	if len(opts.Language) > 0 {
		// Use separate query to go get all language counts
		facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
		facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
		facetRequest.IncludeLocations = true
		facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))

		if result, err = b.inner.Indexer.Search(facetRequest); err != nil {
			return 0, nil, nil, err
		}
	}
	languagesFacet := result.Facets["languages"]
	for _, term := range languagesFacet.Terms.Terms() {
		if len(term.Term) == 0 {
			continue
		}
		searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
			Language: term.Term,
			Color:    enry.GetColor(term.Term),
			Count:    term.Count,
		})
	}
	return total, searchResults, searchResultLanguages, nil
}
 |