diff --git a/modules/highlight/highlight.go b/modules/highlight/highlight.go index c7416c7a10..addc372f85 100644 --- a/modules/highlight/highlight.go +++ b/modules/highlight/highlight.go @@ -5,13 +5,9 @@ package highlight import ( - "bufio" "bytes" - "fmt" - gohtml "html" "html/template" - "io" - "strings" + "slices" "sync" "code.gitea.io/gitea/modules/log" @@ -23,12 +19,14 @@ import ( "github.com/alecthomas/chroma/v2/styles" ) -// don't index files larger than this many bytes for performance purposes +// don't highlight files larger than this many bytes for performance purposes const sizeLimit = 1024 * 1024 type globalVarsType struct { highlightMapping map[string]string githubStyles *chroma.Style + escapeFull []template.HTML + escCtrlCharsMap []template.HTML } var ( @@ -44,10 +42,69 @@ func globalVars() *globalVarsType { globalVarsPtr = &globalVarsType{} globalVarsPtr.githubStyles = styles.Get("github") globalVarsPtr.highlightMapping = setting.GetHighlightMapping() + globalVarsPtr.escCtrlCharsMap = make([]template.HTML, 256) + // ASCII Table 0x00 - 0x1F + controlCharNames := []string{ + "NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL", + "BS", "HT", "LF", "VT", "FF", "CR", "SO", "SI", + "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB", + "CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US", + } + // Uncomment this line if you'd debug the layout without creating a special file, then Space (0x20) will also be escaped. + // Don't worry, even if you forget to comment it out and push it to git repo, the CI tests will catch it and fail. + // controlCharNames = append(controlCharNames, "SP") + for i, s := range controlCharNames { + globalVarsPtr.escCtrlCharsMap[i] = template.HTML(`` + string(byte(i)) + ``) + } + globalVarsPtr.escCtrlCharsMap[0x7f] = template.HTML(`` + string(byte(0x7f)) + ``) + globalVarsPtr.escCtrlCharsMap['\t'] = "" + globalVarsPtr.escCtrlCharsMap['\n'] = "" + globalVarsPtr.escCtrlCharsMap['\r'] = "" + + globalVarsPtr.escapeFull = slices.Clone(globalVarsPtr.escCtrlCharsMap) + // exactly the same as Golang's html.EscapeString + globalVarsPtr.escapeFull['&'] = "&" + globalVarsPtr.escapeFull['\''] = "'" + globalVarsPtr.escapeFull['<'] = "<" + globalVarsPtr.escapeFull['>'] = ">" + globalVarsPtr.escapeFull['"'] = """ } return globalVarsPtr } +func escapeByMap(code []byte, escapeMap []template.HTML) template.HTML { + firstEscapePos := -1 + for i, c := range code { + if escapeMap[c] != "" { + firstEscapePos = i + break + } + } + if firstEscapePos == -1 { + return template.HTML(util.UnsafeBytesToString(code)) + } + + buf := make([]byte, firstEscapePos, len(code)*2) + copy(buf[:firstEscapePos], code[:firstEscapePos]) + for i := firstEscapePos; i < len(code); i++ { + c := code[i] + if esc := escapeMap[c]; esc != "" { + buf = append(buf, esc...) + } else { + buf = append(buf, c) + } + } + return template.HTML(util.UnsafeBytesToString(buf)) +} + +func escapeFullString(code string) template.HTML { + return escapeByMap(util.UnsafeStringToBytes(code), globalVars().escapeFull) +} + +func escapeControlChars(code []byte) template.HTML { + return escapeByMap(code, globalVars().escCtrlCharsMap) +} + // UnsafeSplitHighlightedLines splits highlighted code into lines preserving HTML tags // It always includes '\n', '\n' can appear at the end of each line or in the middle of HTML tags // The '\n' is necessary for copying code from web UI to preserve original code lines @@ -90,7 +147,7 @@ func RenderCodeSlowGuess(fileName, language, code string) (output template.HTML, } if len(code) > sizeLimit { - return template.HTML(template.HTMLEscapeString(code)), nil, "" + return escapeFullString(code), nil, "" } lexer = detectChromaLexerWithAnalyze(fileName, language, util.UnsafeStringToBytes(code)) // it is also slow @@ -104,86 +161,66 @@ func RenderCodeByLexer(lexer chroma.Lexer, code string) template.HTML { html.PreventSurroundingPre(true), ) - htmlbuf := bytes.Buffer{} - htmlw := bufio.NewWriter(&htmlbuf) - iterator, err := lexer.Tokenise(nil, code) if err != nil { log.Error("Can't tokenize code: %v", err) - return template.HTML(template.HTMLEscapeString(code)) - } - // style not used for live site but need to pass something - err = formatter.Format(htmlw, globalVars().githubStyles, iterator) - if err != nil { - log.Error("Can't format code: %v", err) - return template.HTML(template.HTMLEscapeString(code)) + return escapeFullString(code) } - _ = htmlw.Flush() - // Chroma will add newlines for certain lexers in order to highlight them properly - // Once highlighted, strip them here, so they don't cause copy/paste trouble in HTML output - return template.HTML(strings.TrimSuffix(htmlbuf.String(), "\n")) + htmlBuf := &bytes.Buffer{} + // style not used for live site but need to pass something + err = formatter.Format(htmlBuf, globalVars().githubStyles, iterator) + if err != nil { + log.Error("Can't format code: %v", err) + return escapeFullString(code) + } + + // At the moment, we do not escape control chars here (unlike RenderFullFile which escapes control chars). + // The reason is: it is a very rare case that a text file contains control chars. + // This function is usually used by highlight diff and blame, not quite sure whether there will be side effects. + // If there would be new user feedback about this, we can re-consider about various edge cases. + return template.HTML(htmlBuf.String()) } // RenderFullFile returns a slice of chroma syntax highlighted HTML lines of code and the matched lexer name -func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, string, error) { - if len(code) > sizeLimit { - return RenderPlainText(code), "", nil +func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, string) { + if language == LanguagePlaintext || len(code) > sizeLimit { + return renderPlainText(code), formatLexerName(LanguagePlaintext) } - - formatter := html.New(html.WithClasses(true), - html.WithLineNumbers(false), - html.PreventSurroundingPre(true), - ) - lexer := detectChromaLexerWithAnalyze(fileName, language, code) lexerName := formatLexerName(lexer.Config().Name) - - iterator, err := lexer.Tokenise(nil, string(code)) - if err != nil { - return nil, "", fmt.Errorf("can't tokenize code: %w", err) + rendered := RenderCodeByLexer(lexer, util.UnsafeBytesToString(code)) + unsafeLines := UnsafeSplitHighlightedLines(rendered) + lines := make([]template.HTML, 0, len(unsafeLines)) + for _, lineBytes := range unsafeLines { + line := escapeControlChars(lineBytes) + lines = append(lines, line) } - - tokensLines := chroma.SplitTokensIntoLines(iterator.Tokens()) - htmlBuf := &bytes.Buffer{} - - lines := make([]template.HTML, 0, len(tokensLines)) - for _, tokens := range tokensLines { - iterator = chroma.Literator(tokens...) - err = formatter.Format(htmlBuf, globalVars().githubStyles, iterator) - if err != nil { - return nil, "", fmt.Errorf("can't format code: %w", err) - } - lines = append(lines, template.HTML(htmlBuf.String())) - htmlBuf.Reset() - } - - return lines, lexerName, nil + return lines, lexerName } -// RenderPlainText returns non-highlighted HTML for code -func RenderPlainText(code []byte) []template.HTML { - r := bufio.NewReader(bytes.NewReader(code)) - m := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1) - for { - content, err := r.ReadString('\n') - if err != nil && err != io.EOF { - log.Error("failed to read string from buffer: %v", err) - break +// renderPlainText returns non-highlighted HTML for code +func renderPlainText(code []byte) []template.HTML { + lines := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1) + pos := 0 + for pos < len(code) { + var content []byte + nextPos := bytes.IndexByte(code[pos:], '\n') + if nextPos == -1 { + content = code[pos:] + pos = len(code) + } else { + content = code[pos : pos+nextPos+1] + pos += nextPos + 1 } - if content == "" && err == io.EOF { - break - } - s := template.HTML(gohtml.EscapeString(content)) - m = append(m, s) + lines = append(lines, escapeFullString(util.UnsafeBytesToString(content))) } - return m + return lines } func formatLexerName(name string) string { - if name == "fallback" { + if name == LanguagePlaintext || name == chromaLexerFallback { return "Plaintext" } - return util.ToTitleCaseNoLower(name) } diff --git a/modules/highlight/highlight_test.go b/modules/highlight/highlight_test.go index d026210475..cad22ba9bb 100644 --- a/modules/highlight/highlight_test.go +++ b/modules/highlight/highlight_test.go @@ -118,8 +118,7 @@ c=2 for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - out, lexerName, err := RenderFullFile(tt.name, "", []byte(tt.code)) - assert.NoError(t, err) + out, lexerName := RenderFullFile(tt.name, "", []byte(tt.code)) assert.Equal(t, tt.want, out) assert.Equal(t, tt.lexerName, lexerName) }) @@ -182,7 +181,7 @@ c=2`), for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - out := RenderPlainText([]byte(tt.code)) + out := renderPlainText([]byte(tt.code)) assert.Equal(t, tt.want, out) }) } @@ -205,3 +204,14 @@ func TestUnsafeSplitHighlightedLines(t *testing.T) { assert.Equal(t, "a\n", string(ret[0])) assert.Equal(t, "b\n", string(ret[1])) } + +func TestEscape(t *testing.T) { + assert.Equal(t, template.HTML("\t\r\n\x00\x1f&'\"<>"), escapeControlChars([]byte("\t\r\n\x00\x1f&'\"<>"))) + assert.Equal(t, template.HTML("\x00\x1f&'"<>\t\r\n"), escapeFullString("\x00\x1f&'\"<>\t\r\n")) + + out, _ := RenderFullFile("a.py", "", []byte("# \x7f<>")) + assert.Equal(t, template.HTML(`# `+string(byte(0x7f))+`<>`), out[0]) + + out = renderPlainText([]byte("# \x7f<>")) + assert.Equal(t, template.HTML(`# `+string(byte(0x7f))+`<>`), out[0]) +} diff --git a/modules/highlight/lexerdetect.go b/modules/highlight/lexerdetect.go index 5d98578f35..fe430f463f 100644 --- a/modules/highlight/lexerdetect.go +++ b/modules/highlight/lexerdetect.go @@ -16,7 +16,11 @@ import ( "github.com/go-enry/go-enry/v2" ) -const mapKeyLowerPrefix = "lower/" +const ( + mapKeyLowerPrefix = "lower/" + LanguagePlaintext = "plaintext" + chromaLexerFallback = "fallback" +) // chromaLexers is fully managed by us to do fast lookup for chroma lexers by file name or language name // Don't use lexers.Get because it is very slow in many cases (iterate all rules, filepath glob match, etc.) diff --git a/routers/web/repo/view_file.go b/routers/web/repo/view_file.go index 44bc8543b0..3ae0dab25b 100644 --- a/routers/web/repo/view_file.go +++ b/routers/web/repo/view_file.go @@ -119,12 +119,8 @@ func handleFileViewRenderSource(ctx *context.Context, attrs *attribute.Attribute } language := attrs.GetLanguage().Value() - fileContent, lexerName, err := highlight.RenderFullFile(filename, language, buf) + fileContent, lexerName := highlight.RenderFullFile(filename, language, buf) ctx.Data["LexerName"] = lexerName - if err != nil { - log.Error("highlight.RenderFullFile failed, fallback to plain text: %v", err) - fileContent = highlight.RenderPlainText(buf) - } status := &charset.EscapeStatus{} statuses := make([]*charset.EscapeStatus, len(fileContent)) for i, line := range fileContent { diff --git a/services/gitdiff/gitdiff_test.go b/services/gitdiff/gitdiff_test.go index cfd99544cc..e1b358215f 100644 --- a/services/gitdiff/gitdiff_test.go +++ b/services/gitdiff/gitdiff_test.go @@ -1140,7 +1140,7 @@ func TestHighlightCodeLines(t *testing.T) { ret := highlightCodeLinesForDiffFile(diffFile, true, []byte("a\nb\n")) assert.Equal(t, map[int]template.HTML{ 0: `a` + nl, - 1: `b`, + 1: `b` + nl, }, ret) }) } diff --git a/web_src/css/index.css b/web_src/css/index.css index f44a5d41ed..c23e3e1c19 100644 --- a/web_src/css/index.css +++ b/web_src/css/index.css @@ -33,6 +33,7 @@ @import "./modules/flexcontainer.css"; @import "./modules/codeeditor.css"; @import "./modules/chroma.css"; +@import "./modules/charescape.css"; @import "./shared/flex-list.css"; @import "./shared/milestone.css"; diff --git a/web_src/css/modules/charescape.css b/web_src/css/modules/charescape.css new file mode 100644 index 0000000000..0c9cbb55b5 --- /dev/null +++ b/web_src/css/modules/charescape.css @@ -0,0 +1,48 @@ +/* +Show the escaped and hide the real char: + {real-char} +Only show the real-char: + {real-char} +*/ +.broken-code-point:not([data-escaped]), +.broken-code-point[data-escaped]::before { + border-radius: 4px; + padding: 0 2px; + color: var(--color-body); + background: var(--color-text-light-1); +} + +.broken-code-point[data-escaped]::before { + visibility: visible; + content: attr(data-escaped); +} +.broken-code-point[data-escaped] .char { + /* make it copyable by selecting the text (AI suggestion, no other solution) */ + position: absolute; + opacity: 0; + pointer-events: none; +} + +/* +Show the escaped and hide the real-char: + + {real-char} + +Hide the escaped and show the real-char: + + {real-char} + +*/ +.unicode-escaped .escaped-code-point[data-escaped]::before { + visibility: visible; + content: attr(data-escaped); + color: var(--color-red); +} + +.unicode-escaped .escaped-code-point .char { + display: none; +} + +.unicode-escaped .ambiguous-code-point { + border: 1px var(--color-yellow) solid; +} diff --git a/web_src/css/repo.css b/web_src/css/repo.css index 1dd5301338..95d6ca2169 100644 --- a/web_src/css/repo.css +++ b/web_src/css/repo.css @@ -8,26 +8,6 @@ min-width: 40% !important; } -.repository .unicode-escaped .escaped-code-point[data-escaped]::before { - visibility: visible; - content: attr(data-escaped); - font-family: var(--fonts-monospace); - color: var(--color-red); -} - -.repository .unicode-escaped .escaped-code-point .char { - display: none; -} - -.repository .broken-code-point { - font-family: var(--fonts-monospace); - color: var(--color-blue); -} - -.repository .unicode-escaped .ambiguous-code-point { - border: 1px var(--color-yellow) solid; -} - .issue-content { display: flex; align-items: flex-start;