From 2a278b996fd6608973c3ab2a2cfb584e67d5bd8b Mon Sep 17 00:00:00 2001 From: KN4CK3R Date: Fri, 23 Feb 2024 18:24:27 +0100 Subject: [PATCH] Add support for `linguist-detectable` and `linguist-documentation` (#29267) Add support for `linguist-detectable` and `linguist-documentation` Add tests for the attributes https://github.com/github-linguist/linguist/blob/master/docs/overrides.md#detectable https://github.com/github-linguist/linguist/blob/master/docs/overrides.md#documentation --- modules/git/repo_attribute.go | 23 +- modules/git/repo_language_stats_gogit.go | 77 +++--- modules/git/repo_language_stats_nogogit.go | 77 +++--- tests/integration/linguist_test.go | 259 +++++++++++++++++++++ 4 files changed, 365 insertions(+), 71 deletions(-) create mode 100644 tests/integration/linguist_test.go diff --git a/modules/git/repo_attribute.go b/modules/git/repo_attribute.go index 2b34f117f7..44f13ddc2d 100644 --- a/modules/git/repo_attribute.go +++ b/modules/git/repo_attribute.go @@ -11,6 +11,7 @@ import ( "os" "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/optional" ) // CheckAttributeOpts represents the possible options to CheckAttribute @@ -291,7 +292,7 @@ func (repo *Repository) CheckAttributeReader(commitID string) (*CheckAttributeRe } checker := &CheckAttributeReader{ - Attributes: []string{"linguist-vendored", "linguist-generated", "linguist-language", "gitlab-language"}, + Attributes: []string{"linguist-vendored", "linguist-generated", "linguist-language", "gitlab-language", "linguist-documentation", "linguist-detectable"}, Repo: repo, IndexFile: indexFilename, WorkTree: worktree, @@ -316,3 +317,23 @@ func (repo *Repository) CheckAttributeReader(commitID string) (*CheckAttributeRe return checker, deferable } + +// true if "set"/"true", false if "unset"/"false", none otherwise +func attributeToBool(attr map[string]string, name string) optional.Option[bool] { + if value, has := attr[name]; has && value != "unspecified" { + switch value { + case 
"set", "true": + return optional.Some(true) + case "unset", "false": + return optional.Some(false) + } + } + return optional.None[bool]() +} + +func attributeToString(attr map[string]string, name string) optional.Option[string] { + if value, has := attr[name]; has && value != "unspecified" { + return optional.Some(value) + } + return optional.None[string]() +} diff --git a/modules/git/repo_language_stats_gogit.go b/modules/git/repo_language_stats_gogit.go index 4c6fbd6c7e..99c7a894d5 100644 --- a/modules/git/repo_language_stats_gogit.go +++ b/modules/git/repo_language_stats_gogit.go @@ -11,6 +11,7 @@ import ( "strings" "code.gitea.io/gitea/modules/analyze" + "code.gitea.io/gitea/modules/optional" "github.com/go-enry/go-enry/v2" "github.com/go-git/go-git/v5" @@ -57,25 +58,47 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err return nil } - notVendored := false - notGenerated := false + isVendored := optional.None[bool]() + isGenerated := optional.None[bool]() + isDocumentation := optional.None[bool]() + isDetectable := optional.None[bool]() if checker != nil { attrs, err := checker.CheckPath(f.Name) if err == nil { - if vendored, has := attrs["linguist-vendored"]; has { - if vendored == "set" || vendored == "true" { - return nil - } - notVendored = vendored == "false" + isVendored = attributeToBool(attrs, "linguist-vendored") + if isVendored.ValueOrDefault(false) { + return nil } - if generated, has := attrs["linguist-generated"]; has { - if generated == "set" || generated == "true" { - return nil - } - notGenerated = generated == "false" + + isGenerated = attributeToBool(attrs, "linguist-generated") + if isGenerated.ValueOrDefault(false) { + return nil } - if language, has := attrs["linguist-language"]; has && language != "unspecified" && language != "" { + + isDocumentation = attributeToBool(attrs, "linguist-documentation") + if isDocumentation.ValueOrDefault(false) { + return nil + } + + isDetectable = attributeToBool(attrs, 
"linguist-detectable") + if !isDetectable.ValueOrDefault(true) { + return nil + } + + hasLanguage := attributeToString(attrs, "linguist-language") + if hasLanguage.Value() == "" { + hasLanguage = attributeToString(attrs, "gitlab-language") + if hasLanguage.Has() { + language := hasLanguage.Value() + if idx := strings.IndexByte(language, '?'); idx >= 0 { + hasLanguage = optional.Some(language[:idx]) + } + } + } + if hasLanguage.Value() != "" { + language := hasLanguage.Value() + // group languages, such as Pug -> HTML; SCSS -> CSS group := enry.GetLanguageGroup(language) if len(group) != 0 { @@ -85,28 +108,14 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err // this language will always be added to the size sizes[language] += f.Size return nil - } else if language, has := attrs["gitlab-language"]; has && language != "unspecified" && language != "" { - // strip off a ? if present - if idx := strings.IndexByte(language, '?'); idx >= 0 { - language = language[:idx] - } - if len(language) != 0 { - // group languages, such as Pug -> HTML; SCSS -> CSS - group := enry.GetLanguageGroup(language) - if len(group) != 0 { - language = group - } - - // this language will always be added to the size - sizes[language] += f.Size - return nil - } } } } - if (!notVendored && analyze.IsVendor(f.Name)) || enry.IsDotFile(f.Name) || - enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) { + if (!isVendored.Has() && analyze.IsVendor(f.Name)) || + enry.IsDotFile(f.Name) || + (!isDocumentation.Has() && enry.IsDocumentation(f.Name)) || + enry.IsConfiguration(f.Name) { return nil } @@ -115,12 +124,10 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err if f.Size <= bigFileSize { content, _ = readFile(f, fileSizeLimit) } - if !notGenerated && enry.IsGenerated(f.Name, content) { + if !isGenerated.Has() && enry.IsGenerated(f.Name, content) { return nil } - // TODO: Use .gitattributes file for linguist overrides - language 
:= analyze.GetCodeLanguage(f.Name, content) if language == enry.OtherLanguage || language == "" { return nil @@ -138,7 +145,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err included = langtype == enry.Programming || langtype == enry.Markup includedLanguage[language] = included } - if included { + if included || isDetectable.ValueOrDefault(false) { sizes[language] += f.Size } else if len(sizes) == 0 && (firstExcludedLanguage == "" || firstExcludedLanguage == language) { firstExcludedLanguage = language diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go index d68d7d210a..16669924d6 100644 --- a/modules/git/repo_language_stats_nogogit.go +++ b/modules/git/repo_language_stats_nogogit.go @@ -12,6 +12,7 @@ import ( "code.gitea.io/gitea/modules/analyze" "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/optional" "github.com/go-enry/go-enry/v2" ) @@ -88,25 +89,47 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err continue } - notVendored := false - notGenerated := false + isVendored := optional.None[bool]() + isGenerated := optional.None[bool]() + isDocumentation := optional.None[bool]() + isDetectable := optional.None[bool]() if checker != nil { attrs, err := checker.CheckPath(f.Name()) if err == nil { - if vendored, has := attrs["linguist-vendored"]; has { - if vendored == "set" || vendored == "true" { - continue - } - notVendored = vendored == "false" + isVendored = attributeToBool(attrs, "linguist-vendored") + if isVendored.ValueOrDefault(false) { + continue } - if generated, has := attrs["linguist-generated"]; has { - if generated == "set" || generated == "true" { - continue - } - notGenerated = generated == "false" + + isGenerated = attributeToBool(attrs, "linguist-generated") + if isGenerated.ValueOrDefault(false) { + continue } - if language, has := attrs["linguist-language"]; has && language != "unspecified" && language != "" { + + 
isDocumentation = attributeToBool(attrs, "linguist-documentation") + if isDocumentation.ValueOrDefault(false) { + continue + } + + isDetectable = attributeToBool(attrs, "linguist-detectable") + if !isDetectable.ValueOrDefault(true) { + continue + } + + hasLanguage := attributeToString(attrs, "linguist-language") + if hasLanguage.Value() == "" { + hasLanguage = attributeToString(attrs, "gitlab-language") + if hasLanguage.Has() { + language := hasLanguage.Value() + if idx := strings.IndexByte(language, '?'); idx >= 0 { + hasLanguage = optional.Some(language[:idx]) + } + } + } + if hasLanguage.Value() != "" { + language := hasLanguage.Value() + // group languages, such as Pug -> HTML; SCSS -> CSS group := enry.GetLanguageGroup(language) if len(group) != 0 { @@ -116,29 +139,14 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err // this language will always be added to the size sizes[language] += f.Size() continue - } else if language, has := attrs["gitlab-language"]; has && language != "unspecified" && language != "" { - // strip off a ? 
if present - if idx := strings.IndexByte(language, '?'); idx >= 0 { - language = language[:idx] - } - if len(language) != 0 { - // group languages, such as Pug -> HTML; SCSS -> CSS - group := enry.GetLanguageGroup(language) - if len(group) != 0 { - language = group - } - - // this language will always be added to the size - sizes[language] += f.Size() - continue - } } - } } - if (!notVendored && analyze.IsVendor(f.Name())) || enry.IsDotFile(f.Name()) || - enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) { + if (!isVendored.Has() && analyze.IsVendor(f.Name())) || + enry.IsDotFile(f.Name()) || + (!isDocumentation.Has() && enry.IsDocumentation(f.Name())) || + enry.IsConfiguration(f.Name()) { continue } @@ -170,7 +178,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err return nil, err } } - if !notGenerated && enry.IsGenerated(f.Name(), content) { + if !isGenerated.Has() && enry.IsGenerated(f.Name(), content) { continue } @@ -193,13 +201,12 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err included = langType == enry.Programming || langType == enry.Markup includedLanguage[language] = included } - if included { + if included || isDetectable.ValueOrDefault(false) { sizes[language] += f.Size() } else if len(sizes) == 0 && (firstExcludedLanguage == "" || firstExcludedLanguage == language) { firstExcludedLanguage = language firstExcludedLanguageSize += f.Size() } - continue } // If there are no included languages add the first excluded language diff --git a/tests/integration/linguist_test.go b/tests/integration/linguist_test.go new file mode 100644 index 0000000000..e569de93a8 --- /dev/null +++ b/tests/integration/linguist_test.go @@ -0,0 +1,259 @@ +// Copyright 2024 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package integration + +import ( + "context" + "net/url" + "strings" + "testing" + "time" + + "code.gitea.io/gitea/models/db" + repo_model "code.gitea.io/gitea/models/repo" + "code.gitea.io/gitea/models/unittest" + user_model "code.gitea.io/gitea/models/user" + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/indexer/stats" + "code.gitea.io/gitea/modules/queue" + repo_service "code.gitea.io/gitea/services/repository" + files_service "code.gitea.io/gitea/services/repository/files" + + "github.com/stretchr/testify/assert" +) + +func TestLinguist(t *testing.T) { + onGiteaRun(t, func(t *testing.T, _ *url.URL) { + user := unittest.AssertExistsAndLoadBean(t, &user_model.User{ID: 2}) + + cppContent := "#include <iostream>\nint main() {\nstd::cout << \"Hello Gitea!\";\nreturn 0;\n}" + pyContent := "print(\"Hello Gitea!\")" + phpContent := "" + lockContent := "# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand." + mdContent := "markdown" + + cases := []struct { + GitAttributesContent string + FilesToAdd []*files_service.ChangeRepoFile + ExpectedLanguageOrder []string + }{ + // case 0 + { + ExpectedLanguageOrder: []string{}, + }, + // case 1 + { + FilesToAdd: []*files_service.ChangeRepoFile{ + { + TreePath: "cplusplus.cpp", + ContentReader: strings.NewReader(cppContent), + }, + { + TreePath: "python.py", + ContentReader: strings.NewReader(pyContent), + }, + { + TreePath: "php.php", + ContentReader: strings.NewReader(phpContent), + }, + }, + ExpectedLanguageOrder: []string{"C++", "PHP", "Python"}, + }, + // case 2 + { + FilesToAdd: []*files_service.ChangeRepoFile{ + { + TreePath: ".cplusplus.cpp", + ContentReader: strings.NewReader(cppContent), + }, + { + TreePath: "python.py", + ContentReader: strings.NewReader(pyContent), + }, + { + TreePath: "vendor/php.php", + ContentReader: strings.NewReader(phpContent), + }, + }, + ExpectedLanguageOrder: []string{"Python"}, + }, + // case 3 + { +
GitAttributesContent: "*.cpp linguist-language=Go", + FilesToAdd: []*files_service.ChangeRepoFile{ + { + TreePath: "cplusplus.cpp", + ContentReader: strings.NewReader(cppContent), + }, + }, + ExpectedLanguageOrder: []string{"Go"}, + }, + // case 4 + { + GitAttributesContent: "*.cpp gitlab-language=Go?parent=json", + FilesToAdd: []*files_service.ChangeRepoFile{ + { + TreePath: "cplusplus.cpp", + ContentReader: strings.NewReader(cppContent), + }, + }, + ExpectedLanguageOrder: []string{"Go"}, + }, + // case 5 + { + GitAttributesContent: "*.cpp linguist-language=HTML gitlab-language=Go?parent=json", + FilesToAdd: []*files_service.ChangeRepoFile{ + { + TreePath: "cplusplus.cpp", + ContentReader: strings.NewReader(cppContent), + }, + }, + ExpectedLanguageOrder: []string{"HTML"}, + }, + // case 6 + { + GitAttributesContent: "vendor/** linguist-vendored=false", + FilesToAdd: []*files_service.ChangeRepoFile{ + { + TreePath: "vendor/php.php", + ContentReader: strings.NewReader(phpContent), + }, + }, + ExpectedLanguageOrder: []string{"PHP"}, + }, + // case 7 + { + GitAttributesContent: "*.cpp linguist-vendored=true\n*.py linguist-vendored\nvendor/** -linguist-vendored", + FilesToAdd: []*files_service.ChangeRepoFile{ + { + TreePath: "cplusplus.cpp", + ContentReader: strings.NewReader(cppContent), + }, + { + TreePath: "python.py", + ContentReader: strings.NewReader(pyContent), + }, + { + TreePath: "vendor/php.php", + ContentReader: strings.NewReader(phpContent), + }, + }, + ExpectedLanguageOrder: []string{"PHP"}, + }, + // case 8 + { + GitAttributesContent: "poetry.lock linguist-language=Go", + FilesToAdd: []*files_service.ChangeRepoFile{ + { + TreePath: "poetry.lock", + ContentReader: strings.NewReader(lockContent), + }, + }, + ExpectedLanguageOrder: []string{"Go"}, + }, + // case 9 + { + GitAttributesContent: "poetry.lock linguist-generated=false", + FilesToAdd: []*files_service.ChangeRepoFile{ + { + TreePath: "poetry.lock", + ContentReader: strings.NewReader(lockContent), + 
}, + }, + ExpectedLanguageOrder: []string{"TOML"}, + }, + // case 10 + { + GitAttributesContent: "*.cpp -linguist-detectable", + FilesToAdd: []*files_service.ChangeRepoFile{ + { + TreePath: "cplusplus.cpp", + ContentReader: strings.NewReader(cppContent), + }, + }, + ExpectedLanguageOrder: []string{}, + }, + // case 11 + { + GitAttributesContent: "*.md linguist-detectable", + FilesToAdd: []*files_service.ChangeRepoFile{ + { + TreePath: "test.md", + ContentReader: strings.NewReader(mdContent), + }, + }, + ExpectedLanguageOrder: []string{"Markdown"}, + }, + // case 12 + { + GitAttributesContent: "test2.md linguist-detectable", + FilesToAdd: []*files_service.ChangeRepoFile{ + { + TreePath: "cplusplus.cpp", + ContentReader: strings.NewReader(cppContent), + }, + { + TreePath: "test.md", + ContentReader: strings.NewReader(mdContent), + }, + { + TreePath: "test2.md", + ContentReader: strings.NewReader(mdContent), + }, + }, + ExpectedLanguageOrder: []string{"C++", "Markdown"}, + }, + // case 13 + { + GitAttributesContent: "README.md linguist-documentation=false", + FilesToAdd: []*files_service.ChangeRepoFile{ + { + TreePath: "README.md", + ContentReader: strings.NewReader(mdContent), + }, + }, + ExpectedLanguageOrder: []string{"Markdown"}, + }, + } + + for i, c := range cases { + repo, err := repo_service.CreateRepository(db.DefaultContext, user, user, repo_service.CreateRepoOptions{ + Name: "linguist-test", + }) + assert.NoError(t, err) + + files := []*files_service.ChangeRepoFile{ + { + TreePath: ".gitattributes", + ContentReader: strings.NewReader(c.GitAttributesContent), + }, + } + files = append(files, c.FilesToAdd...) 
+ for _, f := range files { + f.Operation = "create" + } + + _, err = files_service.ChangeRepoFiles(git.DefaultContext, repo, user, &files_service.ChangeRepoFilesOptions{ + Files: files, + OldBranch: repo.DefaultBranch, + NewBranch: repo.DefaultBranch, + }) + assert.NoError(t, err) + + assert.NoError(t, stats.UpdateRepoIndexer(repo)) + assert.NoError(t, queue.GetManager().FlushAll(context.Background(), 10*time.Second)) + + stats, err := repo_model.GetTopLanguageStats(db.DefaultContext, repo, len(c.FilesToAdd)) + assert.NoError(t, err) + + languages := make([]string, 0, len(stats)) + for _, s := range stats { + languages = append(languages, s.Language) + } + assert.Equal(t, c.ExpectedLanguageOrder, languages, "case %d: unexpected language stats", i) + + assert.NoError(t, repo_service.DeleteRepository(db.DefaultContext, user, repo, false)) + } + }) +}