mirror of
https://github.com/go-gitea/gitea.git
synced 2025-09-09 04:44:46 +02:00
Default behaviour rejected all rows (Records) with more or fewer columns (Fields) than the first row, preventing them from parsing at all and silently hiding them. While RFC4180 section 2.4 says each line _should_ contain the same number of fields, enforcing this on the viewer is unhelpful. This pull request disables that validation, allowing the viewer to render lines with fewer columns than the maximum number within the file. As it's a simple HTML table, this works without additional changes (i.e. no need to manually determine the maximum number of columns), but the default appearance of rows with fewer columns may be undesirable to some people, especially when using CSS that has `td {border-right: none}`. <img width="1408" height="156" alt="Screenshot without cell right borders" src="https://github.com/user-attachments/assets/d4c19bbc-3fd2-4fd1-83a6-1125e953e95b" /> <img width="1397" height="158" alt="Screenshot with cell right borders" src="https://github.com/user-attachments/assets/86aaafcb-d7e8-4228-99a8-7527c823a07c" /> Fixes #16559, #30358. Unfortunately, retaining empty lines is less trivial, so the line numbers on the leftmost column will still not match the source file whenever those are present, though a future PR could address that.
152 lines
5.4 KiB
Go
152 lines
5.4 KiB
Go
// Copyright 2021 The Gitea Authors. All rights reserved.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
package csv
|
|
|
|
import (
	"bytes"
	stdcsv "encoding/csv"
	"errors"
	"io"
	"path"
	"regexp"
	"strings"

	"code.gitea.io/gitea/modules/markup"
	"code.gitea.io/gitea/modules/translation"
	"code.gitea.io/gitea/modules/util"
)
|
|
|
|
const (
	// maxLines is the maximum number of leading lines of the sample that
	// guessDelimiter inspects when trying candidate delimiters.
	maxLines = 10
	// guessSampleSize is the length, in bytes, of the buffer used to sample
	// the input for delimiter detection.
	guessSampleSize = 1e4 // 10k
)
|
|
|
|
// CreateReader creates a csv.Reader with the given delimiter.
//
// The returned reader does not require every record to have the same number
// of fields, so rows that are shorter or longer than the first row are still
// returned instead of producing an error.
func CreateReader(input io.Reader, delimiter rune) *stdcsv.Reader {
	rd := stdcsv.NewReader(input)
	rd.Comma = delimiter
	if delimiter != '\t' && delimiter != ' ' {
		// TrimLeadingSpace can't be true when the delimiter is a tab or a space, as the value for a column might be empty;
		// trimming would change `\t\t` to just `\t` or `  ` (two spaces) to just ` ` (single space).
		rd.TrimLeadingSpace = true
	}
	// Don't force validation of every row to have the same number of entries as the first row.
	rd.FieldsPerRecord = -1
	return rd
}
|
|
|
|
// CreateReaderAndDetermineDelimiter tries to guess the field delimiter from the content and creates a csv.Reader.
|
|
// Reads at most guessSampleSize bytes.
|
|
func CreateReaderAndDetermineDelimiter(ctx *markup.RenderContext, rd io.Reader) (*stdcsv.Reader, error) {
|
|
data := make([]byte, guessSampleSize)
|
|
size, err := util.ReadAtMost(rd, data)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return CreateReader(
|
|
io.MultiReader(bytes.NewReader(data[:size]), rd),
|
|
determineDelimiter(ctx, data[:size]),
|
|
), nil
|
|
}
|
|
|
|
// determineDelimiter takes a RenderContext and if it isn't nil and the Filename has an extension that specifies the delimiter,
|
|
// it is used as the delimiter. Otherwise we call guessDelimiter with the data passed
|
|
func determineDelimiter(ctx *markup.RenderContext, data []byte) rune {
|
|
extension := ".csv"
|
|
if ctx != nil {
|
|
extension = strings.ToLower(path.Ext(ctx.RenderOptions.RelativePath))
|
|
}
|
|
|
|
var delimiter rune
|
|
switch extension {
|
|
case ".tsv":
|
|
delimiter = '\t'
|
|
case ".psv":
|
|
delimiter = '|'
|
|
default:
|
|
delimiter = guessDelimiter(data)
|
|
}
|
|
|
|
return delimiter
|
|
}
|
|
|
|
// quoteRegexp matches a double-quoted run that contains no double quote itself.
// Per the RFC-4180 CSV standard, a double quote inside a quoted field is escaped
// by preceding it with another double quote (https://www.ietf.org/rfc/rfc4180.txt),
// so a field with escaped quotes such as `"a""b"` is matched as consecutive runs.
var quoteRegexp = regexp.MustCompile(`"[^"]*"`)

// removeQuotedString uses quoteRegexp to remove all quoted strings so that we can reliably have each row on one line
// (quoted strings often have new lines within the string).
func removeQuotedString(text string) string {
	return quoteRegexp.ReplaceAllLiteralString(text, "")
}
|
|
|
|
// guessDelimiter takes up to maxLines of the CSV text, iterates through the possible delimiters, and sees if the CSV Reader reads it without throwing any errors.
|
|
// If more than one delimiter passes, the delimiter that results in the most columns is returned.
|
|
func guessDelimiter(data []byte) rune {
|
|
delimiter := guessFromBeforeAfterQuotes(data)
|
|
if delimiter != 0 {
|
|
return delimiter
|
|
}
|
|
|
|
// Removes quoted values so we don't have columns with new lines in them
|
|
text := removeQuotedString(string(data))
|
|
|
|
// Make the text just be maxLines or less, ignoring truncated lines
|
|
lines := strings.SplitN(text, "\n", maxLines+1) // Will contain at least one line, and if there are more than MaxLines, the last item holds the rest of the lines
|
|
if len(lines) > maxLines {
|
|
// If the length of lines is > maxLines we know we have the max number of lines, trim it to maxLines
|
|
lines = lines[:maxLines]
|
|
} else if len(lines) > 1 && len(data) >= guessSampleSize {
|
|
// Even with data >= guessSampleSize, we don't have maxLines + 1 (no extra lines, must have really long lines)
|
|
// thus the last line is probably have a truncated line. Drop the last line if len(lines) > 1
|
|
lines = lines[:len(lines)-1]
|
|
}
|
|
|
|
// Put lines back together as a string
|
|
text = strings.Join(lines, "\n")
|
|
|
|
delimiters := []rune{',', '\t', ';', '|', '@'}
|
|
validDelim := delimiters[0]
|
|
validDelimColCount := 0
|
|
for _, delim := range delimiters {
|
|
csvReader := stdcsv.NewReader(strings.NewReader(text))
|
|
csvReader.Comma = delim
|
|
if rows, err := csvReader.ReadAll(); err == nil && len(rows) > 0 && len(rows[0]) > validDelimColCount {
|
|
validDelim = delim
|
|
validDelimColCount = len(rows[0])
|
|
}
|
|
}
|
|
return validDelim
|
|
}
|
|
|
|
// FormatError converts csv errors into readable messages.
|
|
func FormatError(err error, locale translation.Locale) (string, error) {
|
|
if perr, ok := err.(*stdcsv.ParseError); ok {
|
|
if perr.Err == stdcsv.ErrFieldCount {
|
|
return locale.TrString("repo.error.csv.invalid_field_count", perr.Line), nil
|
|
}
|
|
return locale.TrString("repo.error.csv.unexpected", perr.Line, perr.Column), nil
|
|
}
|
|
|
|
return "", err
|
|
}
|
|
|
|
// beforeAfterQuotes looks for possible delimiters right before or after (with spaces after the former) double quotes with closing quotes.
// Submatch 1 is the optional delimiter before the quoted string, submatch 2 the optional delimiter after it.
var beforeAfterQuotes = regexp.MustCompile(`([,@\t;|]{0,1}) *(?:"[^"]*")+([,@\t;|]{0,1})`)

// guessFromBeforeAfterQuotes guesses the delimiter by finding a double quote that has a valid delimiter before it and a closing quote,
// or a double quote with a closing quote and a valid delimiter after it.
//
// Only the first match in data is considered; a delimiter on its left is
// preferred over one on its right. It returns 0 when there is no match or the
// first match has no adjacent delimiter.
func guessFromBeforeAfterQuotes(data []byte) rune {
	// Match on the bytes directly; every candidate delimiter is a single ASCII byte.
	rs := beforeAfterQuotes.FindSubmatch(data) // returns first match, or nil if none
	if rs == nil {
		return 0 // no match found
	}

	switch {
	case len(rs[1]) > 0:
		return rune(rs[1][0]) // delimiter found left of quoted string
	case len(rs[2]) > 0:
		return rune(rs[2][0]) // delimiter found right of quoted string
	}
	return 0 // quoted string without an adjacent delimiter
}
|