mirror of
https://github.com/go-gitea/gitea.git
synced 2026-03-14 05:36:24 +01:00
Fix chroma lexer mapping (#36629)
Fix some edge cases for ".hcl" and ".v" files, and add more tests
This commit is contained in:
parent
08d9845635
commit
258754f299
@ -21,7 +21,8 @@ const mapKeyLowerPrefix = "lower/"
|
||||
// chromaLexers is fully managed by us to do fast lookup for chroma lexers by file name or language name
|
||||
// Don't use lexers.Get because it is very slow in many cases (iterate all rules, filepath glob match, etc.)
|
||||
var chromaLexers = sync.OnceValue(func() (ret struct {
|
||||
conflictingExtLangMap map[string]string
|
||||
conflictingExtLangMap map[string]string
|
||||
conflictingAliasLangMap map[string]string
|
||||
|
||||
lowerNameMap map[string]chroma.Lexer // lexer name (lang name) in lower-case
|
||||
fileBaseMap map[string]chroma.Lexer
|
||||
@ -36,9 +37,9 @@ var chromaLexers = sync.OnceValue(func() (ret struct {
|
||||
ret.fileBaseMap = make(map[string]chroma.Lexer)
|
||||
ret.fileExtMap = make(map[string]chroma.Lexer)
|
||||
|
||||
// Chroma has overlaps in file extension for different languages,
|
||||
// Chroma has conflicts in file extension for different languages,
|
||||
// When we need to do a fast render, there is no way to detect the language by content,
|
||||
// So we can only choose some default languages for the overlapped file extensions.
|
||||
// So we can only choose some default languages for the conflicting file extensions.
|
||||
ret.conflictingExtLangMap = map[string]string{
|
||||
".as": "ActionScript 3", // ActionScript
|
||||
".asm": "NASM", // TASM, NASM, RGBDS Assembly, Z80 Assembly
|
||||
@ -71,12 +72,17 @@ var chromaLexers = sync.OnceValue(func() (ret struct {
|
||||
".v": "V", // verilog
|
||||
".xslt": "HTML", // XML
|
||||
}
|
||||
// use widely used language names as the default mapping to resolve name alias conflicts
|
||||
ret.conflictingAliasLangMap = map[string]string{
|
||||
"hcl": "HCL", // Terraform
|
||||
"v": "V", // verilog
|
||||
}
|
||||
|
||||
isPlainPattern := func(key string) bool {
|
||||
return !strings.ContainsAny(key, "*?[]") // only support simple patterns
|
||||
}
|
||||
|
||||
setMapWithLowerKey := func(m map[string]chroma.Lexer, key string, lexer chroma.Lexer) {
|
||||
setFileNameMapWithLowerKey := func(m map[string]chroma.Lexer, key string, lexer chroma.Lexer) {
|
||||
if _, conflict := m[key]; conflict {
|
||||
panic("duplicate key in lexer map: " + key + ", need to add it to conflictingExtLangMap")
|
||||
}
|
||||
@ -87,7 +93,7 @@ var chromaLexers = sync.OnceValue(func() (ret struct {
|
||||
processFileName := func(fileName string, lexer chroma.Lexer) bool {
|
||||
if isPlainPattern(fileName) {
|
||||
// full base name match
|
||||
setMapWithLowerKey(ret.fileBaseMap, fileName, lexer)
|
||||
setFileNameMapWithLowerKey(ret.fileBaseMap, fileName, lexer)
|
||||
return true
|
||||
}
|
||||
if strings.HasPrefix(fileName, "*") {
|
||||
@ -96,7 +102,7 @@ var chromaLexers = sync.OnceValue(func() (ret struct {
|
||||
if isPlainPattern(fileExt) {
|
||||
presetName := ret.conflictingExtLangMap[fileExt]
|
||||
if presetName == "" || lexer.Config().Name == presetName {
|
||||
setMapWithLowerKey(ret.fileExtMap, fileExt, lexer)
|
||||
setFileNameMapWithLowerKey(ret.fileExtMap, fileExt, lexer)
|
||||
}
|
||||
return true
|
||||
}
|
||||
@ -134,13 +140,30 @@ var chromaLexers = sync.OnceValue(func() (ret struct {
|
||||
return patterns
|
||||
}
|
||||
|
||||
// add lexers to our map, for fast lookup
|
||||
processLexerNameAliases := func(lexer chroma.Lexer) {
|
||||
cfg := lexer.Config()
|
||||
lowerName := strings.ToLower(cfg.Name)
|
||||
if _, conflicted := ret.lowerNameMap[lowerName]; conflicted {
|
||||
panic("duplicate language name in lexer map: " + lowerName)
|
||||
}
|
||||
ret.lowerNameMap[lowerName] = lexer
|
||||
|
||||
for _, name := range cfg.Aliases {
|
||||
lowerName := strings.ToLower(name)
|
||||
if overriddenName, overridden := ret.conflictingAliasLangMap[lowerName]; overridden && overriddenName != cfg.Name {
|
||||
continue
|
||||
}
|
||||
if existingLexer, conflict := ret.lowerNameMap[lowerName]; conflict && existingLexer.Config().Name != cfg.Name {
|
||||
panic("duplicate alias in lexer map: " + name + ", conflict between " + existingLexer.Config().Name + " and " + cfg.Name)
|
||||
}
|
||||
ret.lowerNameMap[lowerName] = lexer
|
||||
}
|
||||
}
|
||||
|
||||
// the main loop: build our lookup maps for lexers
|
||||
for _, lexer := range lexers.GlobalLexerRegistry.Lexers {
|
||||
cfg := lexer.Config()
|
||||
ret.lowerNameMap[strings.ToLower(lexer.Config().Name)] = lexer
|
||||
for _, alias := range cfg.Aliases {
|
||||
ret.lowerNameMap[strings.ToLower(alias)] = lexer
|
||||
}
|
||||
processLexerNameAliases(lexer)
|
||||
for _, s := range expandGlobPatterns(cfg.Filenames) {
|
||||
if !processFileName(s, lexer) {
|
||||
panic("unsupported file name pattern in lexer: " + s)
|
||||
@ -153,7 +176,12 @@ var chromaLexers = sync.OnceValue(func() (ret struct {
|
||||
}
|
||||
}
|
||||
|
||||
// final check: make sure the default ext-lang mapping is correct, nothing is missing
|
||||
// final check: make sure the default overriding mapping is correct, nothing is missing
|
||||
for lowerName, lexerName := range ret.conflictingAliasLangMap {
|
||||
if lexer, ok := ret.lowerNameMap[lowerName]; !ok || lexer.Config().Name != lexerName {
|
||||
panic("missing default name-lang mapping for: " + lowerName)
|
||||
}
|
||||
}
|
||||
for ext, lexerName := range ret.conflictingExtLangMap {
|
||||
if lexer, ok := ret.fileExtMap[ext]; !ok || lexer.Config().Name != lexerName {
|
||||
panic("missing default ext-lang mapping for: " + ext)
|
||||
|
||||
@ -45,7 +45,7 @@ func BenchmarkRenderCodeByLexer(b *testing.B) {
|
||||
lexer := DetectChromaLexerByFileName("a.sql", "")
|
||||
b.StartTimer()
|
||||
for b.Loop() {
|
||||
// Really slow .......
|
||||
// Really slow ....... the regexp2 used by Chroma takes most of the time
|
||||
// BenchmarkRenderCodeByLexer-12 22 47159038 ns/op
|
||||
RenderCodeByLexer(lexer, code)
|
||||
}
|
||||
@ -55,13 +55,14 @@ func TestDetectChromaLexer(t *testing.T) {
|
||||
globalVars().highlightMapping[".my-html"] = "HTML"
|
||||
t.Cleanup(func() { delete(globalVars().highlightMapping, ".my-html") })
|
||||
|
||||
cases := []struct {
|
||||
casesWithContent := []struct {
|
||||
fileName string
|
||||
language string
|
||||
content string
|
||||
expected string
|
||||
}{
|
||||
{"test.py", "", "", "Python"},
|
||||
{"test.v", "", "", "V"},
|
||||
{"test.v", "any-lang-name", "", "V"},
|
||||
|
||||
{"any-file", "javascript", "", "JavaScript"},
|
||||
{"any-file", "", "/* vim: set filetype=python */", "Python"},
|
||||
@ -80,11 +81,36 @@ func TestDetectChromaLexer(t *testing.T) {
|
||||
{"a.sql", "", "", "SQL"},
|
||||
{"dhcpd.conf", "", "", "ISCdhcpd"},
|
||||
{".env.my-production", "", "", "Bash"},
|
||||
|
||||
{"a.hcl", "", "", "HCL"}, // not the same as Chroma, enry detects "*.hcl" as "HCL"
|
||||
{"a.hcl", "HCL", "", "HCL"},
|
||||
{"a.hcl", "Terraform", "", "Terraform"},
|
||||
}
|
||||
for _, c := range cases {
|
||||
for _, c := range casesWithContent {
|
||||
lexer := detectChromaLexerWithAnalyze(c.fileName, c.language, []byte(c.content))
|
||||
if assert.NotNil(t, lexer, "case: %+v", c) {
|
||||
assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c)
|
||||
}
|
||||
}
|
||||
|
||||
casesNameLang := []struct {
|
||||
fileName string
|
||||
language string
|
||||
expected string
|
||||
byLang bool
|
||||
}{
|
||||
{"a.v", "", "V", false},
|
||||
{"a.v", "V", "V", true},
|
||||
{"a.v", "verilog", "verilog", true},
|
||||
{"a.v", "any-lang-name", "V", false},
|
||||
|
||||
{"a.hcl", "", "Terraform", false}, // not the same as enry
|
||||
{"a.hcl", "HCL", "HCL", true},
|
||||
{"a.hcl", "Terraform", "Terraform", true},
|
||||
}
|
||||
for _, c := range casesNameLang {
|
||||
lexer, byLang := detectChromaLexerByFileName(c.fileName, c.language)
|
||||
assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c)
|
||||
assert.Equal(t, c.byLang, byLang, "case: %+v", c)
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user