0
0
mirror of https://github.com/go-gitea/gitea.git synced 2025-07-17 02:22:52 +02:00

Merge 9363b591eda59e4fb6394c58f77d6f564faf2414 into 6599efb3b1400ac06d06e1c8b68ae6037fbb7952

This commit is contained in:
Lunny Xiao 2025-07-12 18:20:49 +08:00 committed by GitHub
commit 74e78db5eb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 93 additions and 34 deletions

View File

@ -7,6 +7,7 @@ package languagestats
import (
"bytes"
"context"
"io"
"code.gitea.io/gitea/modules/analyze"
@ -20,8 +21,8 @@ import (
"github.com/go-git/go-git/v5/plumbing/object"
)
// GetLanguageStats calculates language stats for git repository at specified commit
func GetLanguageStats(repo *git_module.Repository, commitID string) (map[string]int64, error) {
// CalcLanguageStats calculates language stats for git repository at specified commit
func CalcLanguageStats(ctx context.Context, repo *git_module.Repository, commitID string) (map[string]int64, error) {
r, err := git.PlainOpen(repo.Path)
if err != nil {
return nil, err
@ -58,6 +59,13 @@ func GetLanguageStats(repo *git_module.Repository, commitID string) (map[string]
firstExcludedLanguageSize := int64(0)
err = tree.Files().ForEach(func(f *object.File) error {
// Stop iterating as soon as the caller's context is cancelled. The default
// branch must fall through (not return) so the file is actually analyzed.
select {
case <-ctx.Done():
	return ctx.Err()
default:
}
if f.Size == 0 {
return nil
}

View File

@ -7,6 +7,7 @@ package languagestats
import (
"bytes"
"context"
"io"
"code.gitea.io/gitea/modules/analyze"
@ -18,8 +19,8 @@ import (
"github.com/go-enry/go-enry/v2"
)
// GetLanguageStats calculates language stats for git repository at specified commit
func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64, error) {
// CalcLanguageStats calculates language stats for git repository at specified commit
func CalcLanguageStats(ctx context.Context, repo *git.Repository, commitID string) (map[string]int64, error) {
// We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary.
// so let's create a batch stdin and stdout
batchStdinWriter, batchReader, cancel, err := repo.CatFileBatch(repo.Ctx)
@ -59,11 +60,6 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
tree := commit.Tree
entries, err := tree.ListEntriesRecursiveWithSize()
if err != nil {
return nil, err
}
checker, err := attribute.NewBatchChecker(repo, commitID, attribute.LinguistAttributes)
if err != nil {
return nil, err
@ -82,18 +78,12 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
firstExcludedLanguage := ""
firstExcludedLanguageSize := int64(0)
for _, f := range entries {
select {
case <-repo.Ctx.Done():
return sizes, repo.Ctx.Err()
default:
}
if err := tree.IterateEntriesRecursive(ctx, func(ctx context.Context, f *git.TreeEntry) error {
contentBuf.Reset()
content = contentBuf.Bytes()
if f.Size() == 0 {
continue
return nil
}
isVendored := optional.None[bool]()
@ -104,19 +94,19 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
attrLinguistGenerated := optional.None[bool]()
if err == nil {
if isVendored = attrs.GetVendored(); isVendored.ValueOrDefault(false) {
continue
return nil
}
if attrLinguistGenerated = attrs.GetGenerated(); attrLinguistGenerated.ValueOrDefault(false) {
continue
return nil
}
if isDocumentation = attrs.GetDocumentation(); isDocumentation.ValueOrDefault(false) {
continue
return nil
}
if isDetectable = attrs.GetDetectable(); !isDetectable.ValueOrDefault(true) {
continue
return nil
}
if hasLanguage := attrs.GetLanguage(); hasLanguage.Value() != "" {
@ -130,7 +120,7 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
// this language will always be added to the size
sizes[language] += f.Size()
continue
return nil
}
}
@ -138,19 +128,19 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
enry.IsDotFile(f.Name()) ||
(!isDocumentation.Has() && enry.IsDocumentation(f.Name())) ||
enry.IsConfiguration(f.Name()) {
continue
return nil
}
// If content can not be read or file is too big just do detection by filename
if f.Size() <= bigFileSize {
if err := writeID(f.ID.String()); err != nil {
return nil, err
return err
}
_, _, size, err := git.ReadBatchLine(batchReader)
if err != nil {
log.Debug("Error reading blob: %s Err: %v", f.ID.String(), err)
return nil, err
return err
}
sizeToRead := size
@ -162,11 +152,11 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
_, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))
if err != nil {
return nil, err
return err
}
content = contentBuf.Bytes()
if err := git.DiscardFull(batchReader, discard); err != nil {
return nil, err
return err
}
}
@ -178,14 +168,14 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
isGenerated = enry.IsGenerated(f.Name(), content)
}
if isGenerated {
continue
return nil
}
// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
// - eg. do the all the detection tests using filename first before reading content.
language := analyze.GetCodeLanguage(f.Name(), content)
if language == "" {
continue
return nil
}
// group languages, such as Pug -> HTML; SCSS -> CSS
@ -206,6 +196,9 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
firstExcludedLanguage = language
firstExcludedLanguageSize += f.Size()
}
return nil
}, git.TrustedCmdArgs{"--long"}); err != nil {
return sizes, err
}
// If there are no included languages add the first excluded language

View File

@ -22,7 +22,7 @@ func TestRepository_GetLanguageStats(t *testing.T) {
require.NoError(t, err)
defer gitRepo.Close()
stats, err := GetLanguageStats(gitRepo, "8fee858da5796dfb37704761701bb8e800ad9ef3")
stats, err := CalcLanguageStats(t.Context(), gitRepo, "8fee858da5796dfb37704761701bb8e800ad9ef3")
require.NoError(t, err)
assert.Equal(t, map[string]int64{

View File

@ -22,6 +22,13 @@ func ParseTreeEntries(data []byte) ([]*TreeEntry, error) {
// parseTreeEntries FIXME this function's design is not right, it should not make the caller read all data into memory
// It walks data with iterateTreeEntries and collects every parsed entry into a slice.
// On error it returns nil entries together with the error.
func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
	// One entry per line; +1 covers a final line without a trailing newline.
	entries := make([]*TreeEntry, 0, bytes.Count(data, []byte{'\n'})+1)

	// Run the iteration as its own statement: the callback reassigns `entries`,
	// and reading `entries` in the same return expression as the call would
	// depend on an evaluation order the Go spec leaves unspecified.
	err := iterateTreeEntries(data, ptree, func(entry *TreeEntry) error {
		entries = append(entries, entry)
		return nil
	})
	if err != nil {
		return nil, err
	}
	return entries, nil
}
func iterateTreeEntries(data []byte, ptree *Tree, f func(entry *TreeEntry) error) error {
for pos := 0; pos < len(data); {
posEnd := bytes.IndexByte(data[pos:], '\n')
if posEnd == -1 {
@ -33,7 +40,7 @@ func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
line := data[pos:posEnd]
lsTreeLine, err := parseLsTreeLine(line)
if err != nil {
return nil, err
return err
}
entry := &TreeEntry{
ptree: ptree,
@ -44,9 +51,11 @@ func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
sized: lsTreeLine.Size.Has(),
}
pos = posEnd + 1
entries = append(entries, entry)
if err := f(entry); err != nil {
return err
}
}
return entries, nil
return nil
}
func catBatchParseTreeEntries(objectFormat ObjectFormat, ptree *Tree, rd *bufio.Reader, sz int64) ([]*TreeEntry, error) {

View File

@ -6,6 +6,8 @@
package git
import (
"bufio"
"context"
"io"
"strings"
)
@ -122,3 +124,50 @@ func (t *Tree) ListEntriesRecursiveFast() (Entries, error) {
func (t *Tree) ListEntriesRecursiveWithSize() (Entries, error) {
	// Delegate to the shared recursive lister, passing "--long" as the only extra argument.
	longFormat := TrustedCmdArgs{"--long"}
	return t.listEntriesRecursive(longFormat)
}
// IterateEntriesRecursive iterates over the entries of the current tree recursively, including all subtrees.
// It runs "git ls-tree -t -r" on the tree ID and calls f once for every entry parsed from its output,
// so the caller does not need to hold the whole listing in memory.
// extraArgs could be "-l" to get the size, which is slower.
// Iteration stops at the first error from f, from parsing, from ctx, or from the git command,
// and that error is returned.
func (t *Tree) IterateEntriesRecursive(ctx context.Context, f func(ctx context.Context, entry *TreeEntry) error, extraArgs TrustedCmdArgs) error {
	reader, writer := io.Pipe()
	// Buffered so the command goroutine can always deliver its result and exit,
	// no matter how this function returns.
	done := make(chan error, 1)

	go func() {
		runErr := NewCommand("ls-tree", "-t", "-r").
			AddArguments(extraArgs...).
			AddDynamicArguments(t.ID.String()).
			Run(ctx, &RunOpts{
				Dir:    t.repo.Path,
				Stdout: writer,
			})
		// Hand a command failure to the reading side; a nil error closes the pipe with io.EOF.
		_ = writer.CloseWithError(runErr)
		done <- runErr
	}()

	iterErr := func() error {
		scanner := bufio.NewScanner(reader)
		for scanner.Scan() {
			line := scanner.Bytes()
			if err := iterateTreeEntries(line, t, func(entry *TreeEntry) error {
				if err := f(ctx, entry); err != nil {
					return err
				}
				// ctx.Err() is nil until the context is cancelled or times out.
				return ctx.Err()
			}); err != nil {
				return err
			}
		}
		// Scanner errors (including an error passed through the pipe) are only final after the loop.
		return scanner.Err()
	}()

	// Closing the read side makes any pending or future write fail, so the command
	// cannot stay blocked on a pipe nobody is reading after an early exit.
	_ = reader.Close()
	runErr := <-done

	if iterErr != nil {
		return iterErr
	}
	return runErr
}

View File

@ -63,7 +63,7 @@ func (db *DBIndexer) Index(id int64) error {
}
// Calculate and save language statistics to database
stats, err := languagestats.GetLanguageStats(gitRepo, commitID)
stats, err := languagestats.CalcLanguageStats(ctx, gitRepo, commitID)
if err != nil {
if !setting.IsInTesting {
log.Error("Unable to get language stats for ID %s for default branch %s in %s. Error: %v", commitID, repo.DefaultBranch, repo.FullName(), err)