diff --git a/modules/highlight/highlight.go b/modules/highlight/highlight.go
index c7416c7a10..addc372f85 100644
--- a/modules/highlight/highlight.go
+++ b/modules/highlight/highlight.go
@@ -5,13 +5,9 @@
package highlight
import (
- "bufio"
"bytes"
- "fmt"
- gohtml "html"
"html/template"
- "io"
- "strings"
+ "slices"
"sync"
"code.gitea.io/gitea/modules/log"
@@ -23,12 +19,14 @@ import (
"github.com/alecthomas/chroma/v2/styles"
)
-// don't index files larger than this many bytes for performance purposes
+// don't highlight files larger than this many bytes for performance purposes
const sizeLimit = 1024 * 1024
type globalVarsType struct {
highlightMapping map[string]string
githubStyles *chroma.Style
+ escapeFull []template.HTML
+ escCtrlCharsMap []template.HTML
}
var (
@@ -44,10 +42,69 @@ func globalVars() *globalVarsType {
globalVarsPtr = &globalVarsType{}
globalVarsPtr.githubStyles = styles.Get("github")
globalVarsPtr.highlightMapping = setting.GetHighlightMapping()
+ globalVarsPtr.escCtrlCharsMap = make([]template.HTML, 256)
+ // ASCII Table 0x00 - 0x1F
+ controlCharNames := []string{
+ "NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL",
+ "BS", "HT", "LF", "VT", "FF", "CR", "SO", "SI",
+ "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB",
+ "CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US",
+ }
+ // Uncomment the line below to debug the layout without creating a special file; Space (0x20) will then also be escaped.
+ // Don't worry: even if you forget to comment it out again and push it to the git repo, the CI tests will catch it and fail.
+ // controlCharNames = append(controlCharNames, "SP")
+ for i, s := range controlCharNames {
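+ globalVarsPtr.escCtrlCharsMap[i] = template.HTML(`<span class="broken-code-point" data-escaped="` + s + `"><span class="char">` + string(byte(i)) + `</span></span>`)
+ }
+ globalVarsPtr.escCtrlCharsMap[0x7f] = template.HTML(`<span class="broken-code-point" data-escaped="DEL"><span class="char">` + string(byte(0x7f)) + `</span></span>`)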
+ globalVarsPtr.escCtrlCharsMap['\t'] = ""
+ globalVarsPtr.escCtrlCharsMap['\n'] = ""
+ globalVarsPtr.escCtrlCharsMap['\r'] = ""
+
+ globalVarsPtr.escapeFull = slices.Clone(globalVarsPtr.escCtrlCharsMap)
+ // exactly the same as Golang's html.EscapeString
+ globalVarsPtr.escapeFull['&'] = "&amp;"
+ globalVarsPtr.escapeFull['\''] = "&#39;"
+ globalVarsPtr.escapeFull['<'] = "&lt;"
+ globalVarsPtr.escapeFull['>'] = "&gt;"
+ globalVarsPtr.escapeFull['"'] = "&#34;"
}
return globalVarsPtr
}
+func escapeByMap(code []byte, escapeMap []template.HTML) template.HTML {
+ firstEscapePos := -1
+ for i, c := range code {
+ if escapeMap[c] != "" {
+ firstEscapePos = i
+ break
+ }
+ }
+ if firstEscapePos == -1 {
+ return template.HTML(util.UnsafeBytesToString(code))
+ }
+
+ buf := make([]byte, firstEscapePos, len(code)*2)
+ copy(buf[:firstEscapePos], code[:firstEscapePos])
+ for i := firstEscapePos; i < len(code); i++ {
+ c := code[i]
+ if esc := escapeMap[c]; esc != "" {
+ buf = append(buf, esc...)
+ } else {
+ buf = append(buf, c)
+ }
+ }
+ return template.HTML(util.UnsafeBytesToString(buf))
+}
+
+func escapeFullString(code string) template.HTML {
+ return escapeByMap(util.UnsafeStringToBytes(code), globalVars().escapeFull)
+}
+
+func escapeControlChars(code []byte) template.HTML {
+ return escapeByMap(code, globalVars().escCtrlCharsMap)
+}
+
// UnsafeSplitHighlightedLines splits highlighted code into lines preserving HTML tags
// It always includes '\n', '\n' can appear at the end of each line or in the middle of HTML tags
// The '\n' is necessary for copying code from web UI to preserve original code lines
@@ -90,7 +147,7 @@ func RenderCodeSlowGuess(fileName, language, code string) (output template.HTML,
}
if len(code) > sizeLimit {
- return template.HTML(template.HTMLEscapeString(code)), nil, ""
+ return escapeFullString(code), nil, ""
}
lexer = detectChromaLexerWithAnalyze(fileName, language, util.UnsafeStringToBytes(code)) // it is also slow
@@ -104,86 +161,66 @@ func RenderCodeByLexer(lexer chroma.Lexer, code string) template.HTML {
html.PreventSurroundingPre(true),
)
- htmlbuf := bytes.Buffer{}
- htmlw := bufio.NewWriter(&htmlbuf)
-
iterator, err := lexer.Tokenise(nil, code)
if err != nil {
log.Error("Can't tokenize code: %v", err)
- return template.HTML(template.HTMLEscapeString(code))
- }
- // style not used for live site but need to pass something
- err = formatter.Format(htmlw, globalVars().githubStyles, iterator)
- if err != nil {
- log.Error("Can't format code: %v", err)
- return template.HTML(template.HTMLEscapeString(code))
+ return escapeFullString(code)
}
- _ = htmlw.Flush()
- // Chroma will add newlines for certain lexers in order to highlight them properly
- // Once highlighted, strip them here, so they don't cause copy/paste trouble in HTML output
- return template.HTML(strings.TrimSuffix(htmlbuf.String(), "\n"))
+ htmlBuf := &bytes.Buffer{}
+ // style not used for live site but need to pass something
+ err = formatter.Format(htmlBuf, globalVars().githubStyles, iterator)
+ if err != nil {
+ log.Error("Can't format code: %v", err)
+ return escapeFullString(code)
+ }
+
+ // At the moment, we do not escape control chars here (unlike RenderFullFile, which does escape them).
+ // The reason: it is very rare for a text file to contain control chars.
+ // This function is mostly used for highlighting diff and blame, and it is unclear whether escaping would have side effects there.
+ // If users report problems with this, we can reconsider the various edge cases.
+ return template.HTML(htmlBuf.String())
}
// RenderFullFile returns a slice of chroma syntax highlighted HTML lines of code and the matched lexer name
-func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, string, error) {
- if len(code) > sizeLimit {
- return RenderPlainText(code), "", nil
+func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, string) {
+ if language == LanguagePlaintext || len(code) > sizeLimit {
+ return renderPlainText(code), formatLexerName(LanguagePlaintext)
}
-
- formatter := html.New(html.WithClasses(true),
- html.WithLineNumbers(false),
- html.PreventSurroundingPre(true),
- )
-
lexer := detectChromaLexerWithAnalyze(fileName, language, code)
lexerName := formatLexerName(lexer.Config().Name)
-
- iterator, err := lexer.Tokenise(nil, string(code))
- if err != nil {
- return nil, "", fmt.Errorf("can't tokenize code: %w", err)
+ rendered := RenderCodeByLexer(lexer, util.UnsafeBytesToString(code))
+ unsafeLines := UnsafeSplitHighlightedLines(rendered)
+ lines := make([]template.HTML, 0, len(unsafeLines))
+ for _, lineBytes := range unsafeLines {
+ line := escapeControlChars(lineBytes)
+ lines = append(lines, line)
}
-
- tokensLines := chroma.SplitTokensIntoLines(iterator.Tokens())
- htmlBuf := &bytes.Buffer{}
-
- lines := make([]template.HTML, 0, len(tokensLines))
- for _, tokens := range tokensLines {
- iterator = chroma.Literator(tokens...)
- err = formatter.Format(htmlBuf, globalVars().githubStyles, iterator)
- if err != nil {
- return nil, "", fmt.Errorf("can't format code: %w", err)
- }
- lines = append(lines, template.HTML(htmlBuf.String()))
- htmlBuf.Reset()
- }
-
- return lines, lexerName, nil
+ return lines, lexerName
}
-// RenderPlainText returns non-highlighted HTML for code
-func RenderPlainText(code []byte) []template.HTML {
- r := bufio.NewReader(bytes.NewReader(code))
- m := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1)
- for {
- content, err := r.ReadString('\n')
- if err != nil && err != io.EOF {
- log.Error("failed to read string from buffer: %v", err)
- break
+// renderPlainText returns non-highlighted HTML for code
+func renderPlainText(code []byte) []template.HTML {
+ lines := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1)
+ pos := 0
+ for pos < len(code) {
+ var content []byte
+ nextPos := bytes.IndexByte(code[pos:], '\n')
+ if nextPos == -1 {
+ content = code[pos:]
+ pos = len(code)
+ } else {
+ content = code[pos : pos+nextPos+1]
+ pos += nextPos + 1
}
- if content == "" && err == io.EOF {
- break
- }
- s := template.HTML(gohtml.EscapeString(content))
- m = append(m, s)
+ lines = append(lines, escapeFullString(util.UnsafeBytesToString(content)))
}
- return m
+ return lines
}
func formatLexerName(name string) string {
- if name == "fallback" {
+ if name == LanguagePlaintext || name == chromaLexerFallback {
return "Plaintext"
}
-
return util.ToTitleCaseNoLower(name)
}
diff --git a/modules/highlight/highlight_test.go b/modules/highlight/highlight_test.go
index d026210475..cad22ba9bb 100644
--- a/modules/highlight/highlight_test.go
+++ b/modules/highlight/highlight_test.go
@@ -118,8 +118,7 @@ c=2
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- out, lexerName, err := RenderFullFile(tt.name, "", []byte(tt.code))
- assert.NoError(t, err)
+ out, lexerName := RenderFullFile(tt.name, "", []byte(tt.code))
assert.Equal(t, tt.want, out)
assert.Equal(t, tt.lexerName, lexerName)
})
@@ -182,7 +181,7 @@ c=2`),
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- out := RenderPlainText([]byte(tt.code))
+ out := renderPlainText([]byte(tt.code))
assert.Equal(t, tt.want, out)
})
}
@@ -205,3 +204,14 @@ func TestUnsafeSplitHighlightedLines(t *testing.T) {
assert.Equal(t, "a\n", string(ret[0]))
assert.Equal(t, "b\n", string(ret[1]))
}
+
+func TestEscape(t *testing.T) {
+ assert.Equal(t, template.HTML("\t\r\n\x00\x1f&'\"<>"), escapeControlChars([]byte("\t\r\n\x00\x1f&'\"<>")))
+ assert.Equal(t, template.HTML("\x00\x1f&'"<>\t\r\n"), escapeFullString("\x00\x1f&'\"<>\t\r\n"))
+
+ out, _ := RenderFullFile("a.py", "", []byte("# \x7f<>"))
+ assert.Equal(t, template.HTML(`# `+string(byte(0x7f))+`<>`), out[0])
+
+ out = renderPlainText([]byte("# \x7f<>"))
+ assert.Equal(t, template.HTML(`# `+string(byte(0x7f))+`<>`), out[0])
+}
diff --git a/modules/highlight/lexerdetect.go b/modules/highlight/lexerdetect.go
index 5d98578f35..fe430f463f 100644
--- a/modules/highlight/lexerdetect.go
+++ b/modules/highlight/lexerdetect.go
@@ -16,7 +16,11 @@ import (
"github.com/go-enry/go-enry/v2"
)
-const mapKeyLowerPrefix = "lower/"
+const (
+ mapKeyLowerPrefix = "lower/"
+ LanguagePlaintext = "plaintext"
+ chromaLexerFallback = "fallback"
+)
// chromaLexers is fully managed by us to do fast lookup for chroma lexers by file name or language name
// Don't use lexers.Get because it is very slow in many cases (iterate all rules, filepath glob match, etc.)
diff --git a/routers/web/repo/view_file.go b/routers/web/repo/view_file.go
index 44bc8543b0..3ae0dab25b 100644
--- a/routers/web/repo/view_file.go
+++ b/routers/web/repo/view_file.go
@@ -119,12 +119,8 @@ func handleFileViewRenderSource(ctx *context.Context, attrs *attribute.Attribute
}
language := attrs.GetLanguage().Value()
- fileContent, lexerName, err := highlight.RenderFullFile(filename, language, buf)
+ fileContent, lexerName := highlight.RenderFullFile(filename, language, buf)
ctx.Data["LexerName"] = lexerName
- if err != nil {
- log.Error("highlight.RenderFullFile failed, fallback to plain text: %v", err)
- fileContent = highlight.RenderPlainText(buf)
- }
status := &charset.EscapeStatus{}
statuses := make([]*charset.EscapeStatus, len(fileContent))
for i, line := range fileContent {
diff --git a/services/gitdiff/gitdiff_test.go b/services/gitdiff/gitdiff_test.go
index cfd99544cc..e1b358215f 100644
--- a/services/gitdiff/gitdiff_test.go
+++ b/services/gitdiff/gitdiff_test.go
@@ -1140,7 +1140,7 @@ func TestHighlightCodeLines(t *testing.T) {
ret := highlightCodeLinesForDiffFile(diffFile, true, []byte("a\nb\n"))
assert.Equal(t, map[int]template.HTML{
0: `a` + nl,
- 1: `b`,
+ 1: `b` + nl,
}, ret)
})
}
diff --git a/web_src/css/index.css b/web_src/css/index.css
index f44a5d41ed..c23e3e1c19 100644
--- a/web_src/css/index.css
+++ b/web_src/css/index.css
@@ -33,6 +33,7 @@
@import "./modules/flexcontainer.css";
@import "./modules/codeeditor.css";
@import "./modules/chroma.css";
+@import "./modules/charescape.css";
@import "./shared/flex-list.css";
@import "./shared/milestone.css";
diff --git a/web_src/css/modules/charescape.css b/web_src/css/modules/charescape.css
new file mode 100644
index 0000000000..0c9cbb55b5
--- /dev/null
+++ b/web_src/css/modules/charescape.css
@@ -0,0 +1,48 @@
+/*
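+Show the escaped and hide the real char:
+  <span class="broken-code-point" data-escaped="DEL"><span class="char">{real-char}</span></span>
+Only show the real-char:
+  <span class="broken-code-point">{real-char}</span>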
+*/
+.broken-code-point:not([data-escaped]),
+.broken-code-point[data-escaped]::before {
+ border-radius: 4px;
+ padding: 0 2px;
+ color: var(--color-body);
+ background: var(--color-text-light-1);
+}
+
+.broken-code-point[data-escaped]::before {
+ visibility: visible;
+ content: attr(data-escaped);
+}
+.broken-code-point[data-escaped] .char {
+ /* make it copyable by selecting the text (AI suggestion, no other solution) */
+ position: absolute;
+ opacity: 0;
+ pointer-events: none;
+}
+
+/*
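+Show the escaped and hide the real-char:
+  <span class="unicode-escaped">
+    <span class="escaped-code-point" data-escaped="[U+1F600]"><span class="char">{real-char}</span></span>
+  </span>
+Hide the escaped and show the real-char:
+  <span>
+    <span class="escaped-code-point" data-escaped="[U+1F600]"><span class="char">{real-char}</span></span>
+  </span>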
+*/
+.unicode-escaped .escaped-code-point[data-escaped]::before {
+ visibility: visible;
+ content: attr(data-escaped);
+ color: var(--color-red);
+}
+
+.unicode-escaped .escaped-code-point .char {
+ display: none;
+}
+
+.unicode-escaped .ambiguous-code-point {
+ border: 1px var(--color-yellow) solid;
+}
diff --git a/web_src/css/repo.css b/web_src/css/repo.css
index 1dd5301338..95d6ca2169 100644
--- a/web_src/css/repo.css
+++ b/web_src/css/repo.css
@@ -8,26 +8,6 @@
min-width: 40% !important;
}
-.repository .unicode-escaped .escaped-code-point[data-escaped]::before {
- visibility: visible;
- content: attr(data-escaped);
- font-family: var(--fonts-monospace);
- color: var(--color-red);
-}
-
-.repository .unicode-escaped .escaped-code-point .char {
- display: none;
-}
-
-.repository .broken-code-point {
- font-family: var(--fonts-monospace);
- color: var(--color-blue);
-}
-
-.repository .unicode-escaped .ambiguous-code-point {
- border: 1px var(--color-yellow) solid;
-}
-
.issue-content {
display: flex;
align-items: flex-start;