0
0
mirror of https://github.com/go-gitea/gitea.git synced 2026-03-14 05:36:24 +01:00

Fix chroma lexer mapping (#36629)

Fix some edge cases for ".hcl" and ".v" files, and add more tests
This commit is contained in:
wxiaoguang 2026-02-16 10:11:02 +08:00 committed by GitHub
parent 08d9845635
commit 258754f299
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 70 additions and 16 deletions

View File

@ -21,7 +21,8 @@ const mapKeyLowerPrefix = "lower/"
// chromaLexers is fully managed by us to do fast lookup for chroma lexers by file name or language name
// Don't use lexers.Get because it is very slow in many cases (iterate all rules, filepath glob match, etc.)
var chromaLexers = sync.OnceValue(func() (ret struct {
conflictingExtLangMap map[string]string
conflictingExtLangMap map[string]string
conflictingAliasLangMap map[string]string
lowerNameMap map[string]chroma.Lexer // lexer name (lang name) in lower-case
fileBaseMap map[string]chroma.Lexer
@ -36,9 +37,9 @@ var chromaLexers = sync.OnceValue(func() (ret struct {
ret.fileBaseMap = make(map[string]chroma.Lexer)
ret.fileExtMap = make(map[string]chroma.Lexer)
// Chroma has overlaps in file extension for different languages,
// Chroma has conflicts in file extension for different languages,
// When we need to do fast render, there is no way to detect the language by content,
// So we can only choose some default languages for the overlapped file extensions.
// So we can only choose some default languages for the conflicting file extensions.
ret.conflictingExtLangMap = map[string]string{
".as": "ActionScript 3", // ActionScript
".asm": "NASM", // TASM, NASM, RGBDS Assembly, Z80 Assembly
@ -71,12 +72,17 @@ var chromaLexers = sync.OnceValue(func() (ret struct {
".v": "V", // verilog
".xslt": "HTML", // XML
}
// use widely used language names as the default mapping to resolve alias name conflicts
ret.conflictingAliasLangMap = map[string]string{
"hcl": "HCL", // Terraform
"v": "V", // verilog
}
isPlainPattern := func(key string) bool {
return !strings.ContainsAny(key, "*?[]") // only support simple patterns
}
setMapWithLowerKey := func(m map[string]chroma.Lexer, key string, lexer chroma.Lexer) {
setFileNameMapWithLowerKey := func(m map[string]chroma.Lexer, key string, lexer chroma.Lexer) {
if _, conflict := m[key]; conflict {
panic("duplicate key in lexer map: " + key + ", need to add it to conflictingExtLangMap")
}
@ -87,7 +93,7 @@ var chromaLexers = sync.OnceValue(func() (ret struct {
processFileName := func(fileName string, lexer chroma.Lexer) bool {
if isPlainPattern(fileName) {
// full base name match
setMapWithLowerKey(ret.fileBaseMap, fileName, lexer)
setFileNameMapWithLowerKey(ret.fileBaseMap, fileName, lexer)
return true
}
if strings.HasPrefix(fileName, "*") {
@ -96,7 +102,7 @@ var chromaLexers = sync.OnceValue(func() (ret struct {
if isPlainPattern(fileExt) {
presetName := ret.conflictingExtLangMap[fileExt]
if presetName == "" || lexer.Config().Name == presetName {
setMapWithLowerKey(ret.fileExtMap, fileExt, lexer)
setFileNameMapWithLowerKey(ret.fileExtMap, fileExt, lexer)
}
return true
}
@ -134,13 +140,30 @@ var chromaLexers = sync.OnceValue(func() (ret struct {
return patterns
}
// add lexers to our map, for fast lookup
// processLexerNameAliases registers the lexer in ret.lowerNameMap under its lower-cased name
// and under each of its lower-cased aliases.
// It panics if the lower-cased name is already a key in the map, or if an alias is already
// owned by a lexer with a different name and conflictingAliasLangMap does not resolve it.
processLexerNameAliases := func(lexer chroma.Lexer) {
cfg := lexer.Config()
lowerName := strings.ToLower(cfg.Name)
// the map holds both names and aliases, so any existing entry under this key is rejected
if _, conflicted := ret.lowerNameMap[lowerName]; conflicted {
panic("duplicate language name in lexer map: " + lowerName)
}
ret.lowerNameMap[lowerName] = lexer
for _, name := range cfg.Aliases {
// this lowerName shadows the outer one: here it is the lower-cased alias, not the lexer name
lowerName := strings.ToLower(name)
// the alias is assigned to another language by conflictingAliasLangMap: don't register it for this lexer
if overriddenName, overridden := ret.conflictingAliasLangMap[lowerName]; overridden && overriddenName != cfg.Name {
continue
}
// an entry owned by a lexer with a different name is an unresolved conflict;
// an entry owned by a lexer with the same name is simply overwritten below
if existingLexer, conflict := ret.lowerNameMap[lowerName]; conflict && existingLexer.Config().Name != cfg.Name {
panic("duplicate alias in lexer map: " + name + ", conflict between " + existingLexer.Config().Name + " and " + cfg.Name)
}
ret.lowerNameMap[lowerName] = lexer
}
}
// the main loop: build our lookup maps for lexers
for _, lexer := range lexers.GlobalLexerRegistry.Lexers {
cfg := lexer.Config()
ret.lowerNameMap[strings.ToLower(lexer.Config().Name)] = lexer
for _, alias := range cfg.Aliases {
ret.lowerNameMap[strings.ToLower(alias)] = lexer
}
processLexerNameAliases(lexer)
for _, s := range expandGlobPatterns(cfg.Filenames) {
if !processFileName(s, lexer) {
panic("unsupported file name pattern in lexer: " + s)
@ -153,7 +176,12 @@ var chromaLexers = sync.OnceValue(func() (ret struct {
}
}
// final check: make sure the default ext-lang mapping is correct, nothing is missing
// final check: make sure the default overriding mapping is correct, nothing is missing
for lowerName, lexerName := range ret.conflictingAliasLangMap {
if lexer, ok := ret.lowerNameMap[lowerName]; !ok || lexer.Config().Name != lexerName {
panic("missing default name-lang mapping for: " + lowerName)
}
}
for ext, lexerName := range ret.conflictingExtLangMap {
if lexer, ok := ret.fileExtMap[ext]; !ok || lexer.Config().Name != lexerName {
panic("missing default ext-lang mapping for: " + ext)

View File

@ -45,7 +45,7 @@ func BenchmarkRenderCodeByLexer(b *testing.B) {
lexer := DetectChromaLexerByFileName("a.sql", "")
b.StartTimer()
for b.Loop() {
// Really slow .......
// Really slow ....... the regexp2 used by Chroma takes most of the time
// BenchmarkRenderCodeByLexer-12 22 47159038 ns/op
RenderCodeByLexer(lexer, code)
}
@ -55,13 +55,14 @@ func TestDetectChromaLexer(t *testing.T) {
globalVars().highlightMapping[".my-html"] = "HTML"
t.Cleanup(func() { delete(globalVars().highlightMapping, ".my-html") })
cases := []struct {
casesWithContent := []struct {
fileName string
language string
content string
expected string
}{
{"test.py", "", "", "Python"},
{"test.v", "", "", "V"},
{"test.v", "any-lang-name", "", "V"},
{"any-file", "javascript", "", "JavaScript"},
{"any-file", "", "/* vim: set filetype=python */", "Python"},
@ -80,11 +81,36 @@ func TestDetectChromaLexer(t *testing.T) {
{"a.sql", "", "", "SQL"},
{"dhcpd.conf", "", "", "ISCdhcpd"},
{".env.my-production", "", "", "Bash"},
{"a.hcl", "", "", "HCL"}, // not the same as Chroma, enry detects "*.hcl" as "HCL"
{"a.hcl", "HCL", "", "HCL"},
{"a.hcl", "Terraform", "", "Terraform"},
}
for _, c := range cases {
for _, c := range casesWithContent {
lexer := detectChromaLexerWithAnalyze(c.fileName, c.language, []byte(c.content))
if assert.NotNil(t, lexer, "case: %+v", c) {
assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c)
}
}
casesNameLang := []struct {
fileName string
language string
expected string
byLang bool
}{
{"a.v", "", "V", false},
{"a.v", "V", "V", true},
{"a.v", "verilog", "verilog", true},
{"a.v", "any-lang-name", "V", false},
{"a.hcl", "", "Terraform", false}, // not the same as enry
{"a.hcl", "HCL", "HCL", true},
{"a.hcl", "Terraform", "Terraform", true},
}
for _, c := range casesNameLang {
lexer, byLang := detectChromaLexerByFileName(c.fileName, c.language)
assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c)
assert.Equal(t, c.byLang, byLang, "case: %+v", c)
}
}