From 258754f2997ad4db11d198b72d9c9fe6af9fbcb3 Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Mon, 16 Feb 2026 10:11:02 +0800 Subject: [PATCH] Fix chroma lexer mapping (#36629) Fix some edge cases for ".hcl" and ".v" files, and add more tests --- modules/highlight/lexerdetect.go | 52 ++++++++++++++++++++------- modules/highlight/lexerdetect_test.go | 34 +++++++++++++++--- 2 files changed, 70 insertions(+), 16 deletions(-) diff --git a/modules/highlight/lexerdetect.go b/modules/highlight/lexerdetect.go index 5b39617566..5d98578f35 100644 --- a/modules/highlight/lexerdetect.go +++ b/modules/highlight/lexerdetect.go @@ -21,7 +21,8 @@ const mapKeyLowerPrefix = "lower/" // chromaLexers is fully managed by us to do fast lookup for chroma lexers by file name or language name // Don't use lexers.Get because it is very slow in many cases (iterate all rules, filepath glob match, etc.) var chromaLexers = sync.OnceValue(func() (ret struct { - conflictingExtLangMap map[string]string + conflictingExtLangMap map[string]string + conflictingAliasLangMap map[string]string lowerNameMap map[string]chroma.Lexer // lexer name (lang name) in lower-case fileBaseMap map[string]chroma.Lexer @@ -36,9 +37,9 @@ var chromaLexers = sync.OnceValue(func() (ret struct { ret.fileBaseMap = make(map[string]chroma.Lexer) ret.fileExtMap = make(map[string]chroma.Lexer) - // Chroma has overlaps in file extension for different languages, + // Chroma has conflicts in file extension for different languages, // When we need to do fast render, there is no way to detect the language by content, - // So we can only choose some default languages for the overlapped file extensions. + // So we can only choose some default languages for the conflicted file extensions. ret.conflictingExtLangMap = map[string]string{ ".as": "ActionScript 3", // ActionScript ".asm": "NASM", // TASM, NASM, RGBDS Assembly, Z80 Assembly @@ -71,12 +72,17 @@ var chromaLexers = sync.OnceValue(func() (ret struct { ".v": "V", // verilog ".xslt": "HTML", // XML } + // use widely used language names as the default mapping to resolve name alias conflict + ret.conflictingAliasLangMap = map[string]string{ + "hcl": "HCL", // Terraform + "v": "V", // verilog + } isPlainPattern := func(key string) bool { return !strings.ContainsAny(key, "*?[]") // only support simple patterns } - setMapWithLowerKey := func(m map[string]chroma.Lexer, key string, lexer chroma.Lexer) { + setFileNameMapWithLowerKey := func(m map[string]chroma.Lexer, key string, lexer chroma.Lexer) { if _, conflict := m[key]; conflict { panic("duplicate key in lexer map: " + key + ", need to add it to conflictingExtLangMap") } @@ -87,7 +93,7 @@ var chromaLexers = sync.OnceValue(func() (ret struct { processFileName := func(fileName string, lexer chroma.Lexer) bool { if isPlainPattern(fileName) { // full base name match - setMapWithLowerKey(ret.fileBaseMap, fileName, lexer) + setFileNameMapWithLowerKey(ret.fileBaseMap, fileName, lexer) return true } if strings.HasPrefix(fileName, "*") { @@ -96,7 +102,7 @@ var chromaLexers = sync.OnceValue(func() (ret struct { if isPlainPattern(fileExt) { presetName := ret.conflictingExtLangMap[fileExt] if presetName == "" || lexer.Config().Name == presetName { - setMapWithLowerKey(ret.fileExtMap, fileExt, lexer) + setFileNameMapWithLowerKey(ret.fileExtMap, fileExt, lexer) } return true } @@ -134,13 +140,30 @@ var chromaLexers = sync.OnceValue(func() (ret struct { return patterns } - // add lexers to our map, for fast lookup + processLexerNameAliases := func(lexer chroma.Lexer) { + cfg := lexer.Config() + lowerName := strings.ToLower(cfg.Name) + if _, conflicted := ret.lowerNameMap[lowerName]; conflicted { + panic("duplicate language name in lexer map: " + lowerName) + } + ret.lowerNameMap[lowerName] = lexer + + for _, name := range cfg.Aliases { + lowerName := strings.ToLower(name) + if overriddenName, overridden := ret.conflictingAliasLangMap[lowerName]; overridden && overriddenName != cfg.Name { + continue + } + if existingLexer, conflict := ret.lowerNameMap[lowerName]; conflict && existingLexer.Config().Name != cfg.Name { + panic("duplicate alias in lexer map: " + name + ", conflict between " + existingLexer.Config().Name + " and " + cfg.Name) + } + ret.lowerNameMap[lowerName] = lexer + } + } + + // the main loop: build our lookup maps for lexers for _, lexer := range lexers.GlobalLexerRegistry.Lexers { cfg := lexer.Config() - ret.lowerNameMap[strings.ToLower(lexer.Config().Name)] = lexer - for _, alias := range cfg.Aliases { - ret.lowerNameMap[strings.ToLower(alias)] = lexer - } + processLexerNameAliases(lexer) for _, s := range expandGlobPatterns(cfg.Filenames) { if !processFileName(s, lexer) { panic("unsupported file name pattern in lexer: " + s) @@ -153,7 +176,12 @@ var chromaLexers = sync.OnceValue(func() (ret struct { } } - // final check: make sure the default ext-lang mapping is correct, nothing is missing + // final check: make sure the default overriding mapping is correct, nothing is missing + for lowerName, lexerName := range ret.conflictingAliasLangMap { + if lexer, ok := ret.lowerNameMap[lowerName]; !ok || lexer.Config().Name != lexerName { + panic("missing default name-lang mapping for: " + lowerName) + } + } for ext, lexerName := range ret.conflictingExtLangMap { if lexer, ok := ret.fileExtMap[ext]; !ok || lexer.Config().Name != lexerName { panic("missing default ext-lang mapping for: " + ext) diff --git a/modules/highlight/lexerdetect_test.go b/modules/highlight/lexerdetect_test.go index 868e793a68..a06053be0c 100644 --- a/modules/highlight/lexerdetect_test.go +++ b/modules/highlight/lexerdetect_test.go @@ -45,7 +45,7 @@ func BenchmarkRenderCodeByLexer(b *testing.B) { lexer := DetectChromaLexerByFileName("a.sql", "") b.StartTimer() for b.Loop() { - // Really slow ....... + // Really slow ....... the regexp2 used by Chroma takes most of the time // BenchmarkRenderCodeByLexer-12 22 47159038 ns/op RenderCodeByLexer(lexer, code) } @@ -55,13 +55,14 @@ func TestDetectChromaLexer(t *testing.T) { globalVars().highlightMapping[".my-html"] = "HTML" t.Cleanup(func() { delete(globalVars().highlightMapping, ".my-html") }) - cases := []struct { + casesWithContent := []struct { fileName string language string content string expected string }{ - {"test.py", "", "", "Python"}, + {"test.v", "", "", "V"}, + {"test.v", "any-lang-name", "", "V"}, {"any-file", "javascript", "", "JavaScript"}, {"any-file", "", "/* vim: set filetype=python */", "Python"}, @@ -80,11 +81,36 @@ func TestDetectChromaLexer(t *testing.T) { {"a.sql", "", "", "SQL"}, {"dhcpd.conf", "", "", "ISCdhcpd"}, {".env.my-production", "", "", "Bash"}, + + {"a.hcl", "", "", "HCL"}, // not the same as Chroma, enry detects "*.hcl" as "HCL" + {"a.hcl", "HCL", "", "HCL"}, + {"a.hcl", "Terraform", "", "Terraform"}, } - for _, c := range cases { + for _, c := range casesWithContent { lexer := detectChromaLexerWithAnalyze(c.fileName, c.language, []byte(c.content)) if assert.NotNil(t, lexer, "case: %+v", c) { assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c) } } + + casesNameLang := []struct { + fileName string + language string + expected string + byLang bool + }{ + {"a.v", "", "V", false}, + {"a.v", "V", "V", true}, + {"a.v", "verilog", "verilog", true}, + {"a.v", "any-lang-name", "V", false}, + + {"a.hcl", "", "Terraform", false}, // not the same as enry + {"a.hcl", "HCL", "HCL", true}, + {"a.hcl", "Terraform", "Terraform", true}, + } + for _, c := range casesNameLang { + lexer, byLang := detectChromaLexerByFileName(c.fileName, c.language) + assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c) + assert.Equal(t, c.byLang, byLang, "case: %+v", c) + } }