0
0
mirror of https://github.com/go-gitea/gitea.git synced 2025-09-09 04:44:46 +02:00
gitea/modules/csv/csv.go
Luke H-W 5fe3296055
Disable Field count validation of CSV viewer (#35228)
Default behaviour rejected all rows (Records) with more or fewer columns
(Fields) than the first row, preventing them from parsing at all and
silently hiding them. While RFC4180 section 2.4 says each line _should_
contain the same number of fields, enforcing this on the viewer is
unhelpful.
This pull request disables that validation, allowing the viewer to
render lines with fewer columns than the maximum number within the file.
As it's a simple HTML table, this works without additional changes (i.e.
no need to manually determine the maximum number of columns), but the
default appearance of rows with fewer columns may be undesirable to some
people, especially when using CSS that has `td {border-right: none}`.
<img width="1408" height="156" alt="Screenshot without cell right
borders"
src="https://github.com/user-attachments/assets/d4c19bbc-3fd2-4fd1-83a6-1125e953e95b"
/>
<img width="1397" height="158" alt="Screenshot with cell right borders"
src="https://github.com/user-attachments/assets/86aaafcb-d7e8-4228-99a8-7527c823a07c"
/>

Fixes #16559, #30358.

Unfortunately, retaining empty lines is less trivial, so the line
numbers on the leftmost column will still not match the source file
whenever those are present, though a future PR could address that.
2025-09-04 09:54:58 -07:00

152 lines
5.4 KiB
Go

// Copyright 2021 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package csv
import (
	"bytes"
	stdcsv "encoding/csv"
	"errors"
	"io"
	"path"
	"regexp"
	"strings"

	"code.gitea.io/gitea/modules/markup"
	"code.gitea.io/gitea/modules/translation"
	"code.gitea.io/gitea/modules/util"
)
// Tuning constants for delimiter guessing.
const (
// maxLines is the maximum number of lines guessDelimiter keeps from the sample
// before trial-parsing it with each candidate delimiter.
maxLines = 10
// guessSampleSize is the length, in bytes, of the buffer that
// CreateReaderAndDetermineDelimiter fills for delimiter detection. guessDelimiter
// also compares the sample length against it to decide whether the last line
// may have been cut off.
guessSampleSize = 1e4 // 10k
)
// CreateReader returns a csv.Reader over input that splits fields on delimiter.
//
// The reader accepts records with differing numbers of fields. Leading
// whitespace in a field is trimmed unless the delimiter itself is a tab or a
// space.
func CreateReader(input io.Reader, delimiter rune) *stdcsv.Reader {
	reader := stdcsv.NewReader(input)
	reader.Comma = delimiter

	// A negative FieldsPerRecord turns off the check that every row has as
	// many entries as the first one.
	reader.FieldsPerRecord = -1

	// With a tab or space delimiter an empty column looks like leading
	// whitespace: trimming would collapse `\t\t` into `\t`, or two spaces into
	// one, and silently drop the empty value.
	whitespaceDelimiter := delimiter == '\t' || delimiter == ' '
	reader.TrimLeadingSpace = !whitespaceDelimiter

	return reader
}
// CreateReaderAndDetermineDelimiter samples the start of rd, picks a field
// delimiter from that sample (or from the file extension in ctx, see
// determineDelimiter), and returns a csv.Reader configured with it.
//
// The sample buffer is guessSampleSize bytes long, so no more than that is
// consumed for detection. The returned reader still yields the whole input,
// because the sampled bytes are replayed ahead of the remainder of rd.
// An error from reading the sample is returned as-is with a nil reader.
func CreateReaderAndDetermineDelimiter(ctx *markup.RenderContext, rd io.Reader) (*stdcsv.Reader, error) {
	buf := make([]byte, guessSampleSize)
	n, err := util.ReadAtMost(rd, buf)
	if err != nil {
		return nil, err
	}

	sample := buf[:n]
	delimiter := determineDelimiter(ctx, sample)

	// Stitch the already-consumed sample back in front of the unread rest.
	combined := io.MultiReader(bytes.NewReader(sample), rd)
	return CreateReader(combined, delimiter), nil
}
// determineDelimiter picks the field delimiter for data.
//
// When ctx is non-nil, the lower-cased extension of its RelativePath decides:
// ".tsv" selects a tab and ".psv" selects a pipe. For any other extension, and
// whenever ctx is nil (treated as ".csv"), the delimiter is guessed from data
// via guessDelimiter.
func determineDelimiter(ctx *markup.RenderContext, data []byte) rune {
	ext := ".csv"
	if ctx != nil {
		ext = strings.ToLower(path.Ext(ctx.RenderOptions.RelativePath))
	}

	switch ext {
	case ".tsv":
		return '\t'
	case ".psv":
		return '|'
	}
	return guessDelimiter(data)
}
// quoteRegexp matches an opening double quote, any run of characters that are
// not double quotes (newlines included), and the closing double quote.
// RFC 4180 (https://www.ietf.org/rfc/rfc4180.txt) escapes a double quote inside
// a quoted field by doubling it, so a field such as "a""b" shows up as two
// adjacent matches rather than one.
var quoteRegexp = regexp.MustCompile(`"[^"]*"`)

// removeQuotedString returns text with every quoteRegexp match deleted.
// Quoted values frequently contain line breaks; dropping them leaves each
// record on a single line. A quote without a closing partner is left in place.
func removeQuotedString(text string) string {
	// The replacement is empty, so there is nothing for template expansion to act on.
	return quoteRegexp.ReplaceAllString(text, "")
}
// guessDelimiter infers the field delimiter of the CSV sample in data.
//
// A delimiter found directly next to a quoted value (see
// guessFromBeforeAfterQuotes) wins outright. Otherwise quoted values are
// stripped, at most maxLines lines are kept, and each candidate delimiter is
// used to parse that text. Among the candidates that parse without error, the
// one giving the first row the most columns is returned; ties go to the
// earlier candidate, and ',' is the answer when no candidate parses.
func guessDelimiter(data []byte) rune {
	if adjacent := guessFromBeforeAfterQuotes(data); adjacent != 0 {
		return adjacent
	}

	// Strip quoted values so that line breaks inside them cannot split a row.
	stripped := removeQuotedString(string(data))

	// Ask for one piece more than maxLines: when there are more lines than
	// that, the final piece carries everything that is left over.
	rows := strings.SplitN(stripped, "\n", maxLines+1)
	switch {
	case len(rows) > maxLines:
		// Enough lines are available; discard the leftover piece.
		rows = rows[:maxLines]
	case len(rows) > 1 && len(data) >= guessSampleSize:
		// The sample buffer was filled without reaching maxLines+1 lines, so
		// the lines are very long and the last one is probably cut off.
		// Drop it, unless it is the only line.
		rows = rows[:len(rows)-1]
	}
	sample := strings.Join(rows, "\n")

	best, bestCols := ',', 0
	for _, candidate := range []rune{',', '\t', ';', '|', '@'} {
		parser := stdcsv.NewReader(strings.NewReader(sample))
		parser.Comma = candidate

		records, err := parser.ReadAll()
		if err != nil || len(records) == 0 {
			continue
		}
		if cols := len(records[0]); cols > bestCols {
			best, bestCols = candidate, cols
		}
	}
	return best
}
// FormatError converts csv errors into readable messages.
//
// If err is, or wraps, a *csv.ParseError, the result is a message translated
// through locale and a nil error: a field-count mismatch yields
// "repo.error.csv.invalid_field_count" with the line number, and any other
// parse failure yields "repo.error.csv.unexpected" with line and column.
// Every other error (including nil) is handed back unchanged together with an
// empty message.
func FormatError(err error, locale translation.Locale) (string, error) {
	// errors.As rather than a direct type assertion, so a ParseError that a
	// caller wrapped with extra context is still recognized.
	var perr *stdcsv.ParseError
	if !errors.As(err, &perr) {
		return "", err
	}

	if errors.Is(perr.Err, stdcsv.ErrFieldCount) {
		return locale.TrString("repo.error.csv.invalid_field_count", perr.Line), nil
	}
	return locale.TrString("repo.error.csv.unexpected", perr.Line, perr.Column), nil
}
// beforeAfterQuotes matches one or more back-to-back double-quoted strings.
// Capture group 1 is an optional candidate delimiter in front of them (spaces
// may sit between it and the opening quote); capture group 2 is an optional
// candidate delimiter immediately after the closing quote.
var beforeAfterQuotes = regexp.MustCompile(`([,@\t;|]{0,1}) *(?:"[^"]*")+([,@\t;|]{0,1})`)

// guessFromBeforeAfterQuotes inspects the first quoted value in data and
// returns the candidate delimiter next to it, preferring the one on the left
// over the one on the right. It returns 0 when data holds no quoted value or
// when the first quoted value has no candidate delimiter on either side.
func guessFromBeforeAfterQuotes(data []byte) rune {
	groups := beforeAfterQuotes.FindSubmatch(data) // first match only; nil when there is none
	if groups == nil {
		return 0
	}

	// groups[1] is the delimiter to the left, groups[2] the one to the right.
	for _, side := range groups[1:] {
		if len(side) > 0 {
			return rune(side[0])
		}
	}
	return 0
}