mirror of
				https://github.com/go-gitea/gitea.git
				synced 2025-10-26 15:01:19 +01:00 
			
		
		
		
	Fix chardet test and add ordering option (#11621)
* Fix chardet test and add ordering option Signed-off-by: Andrew Thornton <art27@cantab.net> * minor fixes Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log2 Signed-off-by: Andrew Thornton <art27@cantab.net> * only iterate through top results Signed-off-by: Andrew Thornton <art27@cantab.net> * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io>
This commit is contained in:
		
							parent
							
								
									fe2cacf5ea
								
							
						
					
					
						commit
						a1ad188326
					
				| @ -14,7 +14,12 @@ RUN_MODE = dev | |||||||
| [repository] | [repository] | ||||||
| ROOT = | ROOT = | ||||||
| SCRIPT_TYPE = bash | SCRIPT_TYPE = bash | ||||||
| ; Default ANSI charset | ; DETECTED_CHARSETS_ORDER tie-break order for detected charsets. | ||||||
|  | ; If the charsets have equal confidence, tie-breaking will be done by order in this list | ||||||
|  | ; with charsets earlier in the list chosen in preference to those later. | ||||||
|  | ; Adding "defaults" will place the unused charsets at that position.  | ||||||
|  | DETECTED_CHARSETS_ORDER=UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr | ||||||
|  | ; Default ANSI charset to override non-UTF-8 charsets to | ||||||
| ANSI_CHARSET = | ANSI_CHARSET = | ||||||
| ; Force every new repository to be private | ; Force every new repository to be private | ||||||
| FORCE_PRIVATE = false | FORCE_PRIVATE = false | ||||||
|  | |||||||
| @ -46,7 +46,8 @@ Values containing `#` or `;` must be quoted using `` ` `` or `"""`. | |||||||
|    an absolute path. |    an absolute path. | ||||||
| - `SCRIPT_TYPE`: **bash**: The script type this server supports. Usually this is `bash`, | - `SCRIPT_TYPE`: **bash**: The script type this server supports. Usually this is `bash`, | ||||||
|    but some users report that only `sh` is available. |    but some users report that only `sh` is available. | ||||||
| - `ANSI_CHARSET`: **\<empty\>**: The default charset for an unrecognized charset. | - `DETECTED_CHARSETS_ORDER`: **UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr**: Tie-break order of detected charsets - if the detected charsets have equal confidence, charsets earlier in the list will be chosen in preference to those later. Adding `defaults` will place the unnamed charsets at that point. | ||||||
|  | - `ANSI_CHARSET`: **\<empty\>**: Default ANSI charset to override non-UTF-8 charsets to. | ||||||
| - `FORCE_PRIVATE`: **false**: Force every new repository to be private. | - `FORCE_PRIVATE`: **false**: Force every new repository to be private. | ||||||
| - `DEFAULT_PRIVATE`: **last**: Default private when creating a new repository. | - `DEFAULT_PRIVATE`: **last**: Default private when creating a new repository. | ||||||
|    \[last, private, public\] |    \[last, private, public\] | ||||||
|  | |||||||
| @ -7,6 +7,7 @@ package charset | |||||||
| import ( | import ( | ||||||
| 	"bytes" | 	"bytes" | ||||||
| 	"fmt" | 	"fmt" | ||||||
|  | 	"strings" | ||||||
| 	"unicode/utf8" | 	"unicode/utf8" | ||||||
| 
 | 
 | ||||||
| 	"code.gitea.io/gitea/modules/log" | 	"code.gitea.io/gitea/modules/log" | ||||||
| @ -137,16 +138,42 @@ func DetectEncoding(content []byte) (string, error) { | |||||||
| 	} else { | 	} else { | ||||||
| 		detectContent = content | 		detectContent = content | ||||||
| 	} | 	} | ||||||
| 	result, err := textDetector.DetectBest(detectContent) | 
 | ||||||
|  | 	// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break | ||||||
|  | 	results, err := textDetector.DetectAll(detectContent) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
|  | 		if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 { | ||||||
|  | 			log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) | ||||||
|  | 			return setting.Repository.AnsiCharset, nil | ||||||
|  | 		} | ||||||
| 		return "", err | 		return "", err | ||||||
| 	} | 	} | ||||||
|  | 
 | ||||||
|  | 	topConfidence := results[0].Confidence | ||||||
|  | 	topResult := results[0] | ||||||
|  | 	priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))] | ||||||
|  | 	for _, result := range results { | ||||||
|  | 		// As results are sorted in confidence order - if we have a different confidence | ||||||
|  | 		// we know it's less than the current confidence and can break out of the loop early | ||||||
|  | 		if result.Confidence != topConfidence { | ||||||
|  | 			break | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		// Otherwise check if this result is earlier in the DetectedCharsetsOrder than our current top guess | ||||||
|  | 		resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))] | ||||||
|  | 		if resultHas && (!has || resultPriority < priority) { | ||||||
|  | 			topResult = result | ||||||
|  | 			priority = resultPriority | ||||||
|  | 			has = true | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument | 	// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument | ||||||
| 	if result.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { | 	if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { | ||||||
| 		log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) | 		log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) | ||||||
| 		return setting.Repository.AnsiCharset, err | 		return setting.Repository.AnsiCharset, err | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	log.Debug("Detected encoding: %s", result.Charset) | 	log.Debug("Detected encoding: %s", topResult.Charset) | ||||||
| 	return result.Charset, err | 	return topResult.Charset, err | ||||||
| } | } | ||||||
|  | |||||||
| @ -230,7 +230,11 @@ func TestDetectEncoding(t *testing.T) { | |||||||
| 	// we accept either. | 	// we accept either. | ||||||
| 	assert.Contains(t, encoding, "ISO-8859") | 	assert.Contains(t, encoding, "ISO-8859") | ||||||
| 
 | 
 | ||||||
|  | 	old := setting.Repository.AnsiCharset | ||||||
| 	setting.Repository.AnsiCharset = "placeholder" | 	setting.Repository.AnsiCharset = "placeholder" | ||||||
|  | 	defer func() { | ||||||
|  | 		setting.Repository.AnsiCharset = old | ||||||
|  | 	}() | ||||||
| 	testSuccess(b, "placeholder") | 	testSuccess(b, "placeholder") | ||||||
| 
 | 
 | ||||||
| 	// invalid bytes | 	// invalid bytes | ||||||
|  | |||||||
| @ -24,6 +24,8 @@ const ( | |||||||
| // Repository settings | // Repository settings | ||||||
| var ( | var ( | ||||||
| 	Repository = struct { | 	Repository = struct { | ||||||
|  | 		DetectedCharsetsOrder                   []string | ||||||
|  | 		DetectedCharsetScore                    map[string]int `ini:"-"` | ||||||
| 		AnsiCharset                             string | 		AnsiCharset                             string | ||||||
| 		ForcePrivate                            bool | 		ForcePrivate                            bool | ||||||
| 		DefaultPrivate                          string | 		DefaultPrivate                          string | ||||||
| @ -88,6 +90,42 @@ var ( | |||||||
| 			Wiki          []string | 			Wiki          []string | ||||||
| 		} `ini:"repository.signing"` | 		} `ini:"repository.signing"` | ||||||
| 	}{ | 	}{ | ||||||
|  | 		DetectedCharsetsOrder: []string{ | ||||||
|  | 			"UTF-8", | ||||||
|  | 			"UTF-16BE", | ||||||
|  | 			"UTF-16LE", | ||||||
|  | 			"UTF-32BE", | ||||||
|  | 			"UTF-32LE", | ||||||
|  | 			"ISO-8859-1", | ||||||
|  | 			"windows-1252", | ||||||
|  | 			"ISO-8859-2", | ||||||
|  | 			"windows-1250", | ||||||
|  | 			"ISO-8859-5", | ||||||
|  | 			"ISO-8859-6", | ||||||
|  | 			"ISO-8859-7", | ||||||
|  | 			"windows-1253", | ||||||
|  | 			"ISO-8859-8-I", | ||||||
|  | 			"windows-1255", | ||||||
|  | 			"ISO-8859-8", | ||||||
|  | 			"windows-1251", | ||||||
|  | 			"windows-1256", | ||||||
|  | 			"KOI8-R", | ||||||
|  | 			"ISO-8859-9", | ||||||
|  | 			"windows-1254", | ||||||
|  | 			"Shift_JIS", | ||||||
|  | 			"GB18030", | ||||||
|  | 			"EUC-JP", | ||||||
|  | 			"EUC-KR", | ||||||
|  | 			"Big5", | ||||||
|  | 			"ISO-2022-JP", | ||||||
|  | 			"ISO-2022-KR", | ||||||
|  | 			"ISO-2022-CN", | ||||||
|  | 			"IBM424_rtl", | ||||||
|  | 			"IBM424_ltr", | ||||||
|  | 			"IBM420_rtl", | ||||||
|  | 			"IBM420_ltr", | ||||||
|  | 		}, | ||||||
|  | 		DetectedCharsetScore:                    map[string]int{}, | ||||||
| 		AnsiCharset:                             "", | 		AnsiCharset:                             "", | ||||||
| 		ForcePrivate:                            false, | 		ForcePrivate:                            false, | ||||||
| 		DefaultPrivate:                          RepoCreatingLastUserVisibility, | 		DefaultPrivate:                          RepoCreatingLastUserVisibility, | ||||||
| @ -208,6 +246,10 @@ func newRepository() { | |||||||
| 	} else { | 	} else { | ||||||
| 		RepoRootPath = filepath.Clean(RepoRootPath) | 		RepoRootPath = filepath.Clean(RepoRootPath) | ||||||
| 	} | 	} | ||||||
|  | 	defaultDetectedCharsetsOrder := make([]string, 0, len(Repository.DetectedCharsetsOrder)) | ||||||
|  | 	for _, charset := range Repository.DetectedCharsetsOrder { | ||||||
|  | 		defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder, strings.ToLower(strings.TrimSpace(charset))) | ||||||
|  | 	} | ||||||
| 	ScriptType = sec.Key("SCRIPT_TYPE").MustString("bash") | 	ScriptType = sec.Key("SCRIPT_TYPE").MustString("bash") | ||||||
| 
 | 
 | ||||||
| 	if err = Cfg.Section("repository").MapTo(&Repository); err != nil { | 	if err = Cfg.Section("repository").MapTo(&Repository); err != nil { | ||||||
| @ -222,6 +264,38 @@ func newRepository() { | |||||||
| 		log.Fatal("Failed to map Repository.PullRequest settings: %v", err) | 		log.Fatal("Failed to map Repository.PullRequest settings: %v", err) | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	preferred := make([]string, 0, len(Repository.DetectedCharsetsOrder)) | ||||||
|  | 	for _, charset := range Repository.DetectedCharsetsOrder { | ||||||
|  | 		canonicalCharset := strings.ToLower(strings.TrimSpace(charset)) | ||||||
|  | 		preferred = append(preferred, canonicalCharset) | ||||||
|  | 		// remove it from the defaults | ||||||
|  | 		for i, charset := range defaultDetectedCharsetsOrder { | ||||||
|  | 			if charset == canonicalCharset { | ||||||
|  | 				defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder[:i], defaultDetectedCharsetsOrder[i+1:]...) | ||||||
|  | 				break | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	i := 0 | ||||||
|  | 	for _, charset := range preferred { | ||||||
|  | 		// Add the defaults | ||||||
|  | 		if charset == "defaults" { | ||||||
|  | 			for _, charset := range defaultDetectedCharsetsOrder { | ||||||
|  | 				canonicalCharset := strings.ToLower(strings.TrimSpace(charset)) | ||||||
|  | 				if _, has := Repository.DetectedCharsetScore[canonicalCharset]; !has { | ||||||
|  | 					Repository.DetectedCharsetScore[canonicalCharset] = i | ||||||
|  | 					i++ | ||||||
|  | 				} | ||||||
|  | 			} | ||||||
|  | 			continue | ||||||
|  | 		} | ||||||
|  | 		if _, has := Repository.DetectedCharsetScore[charset]; !has { | ||||||
|  | 			Repository.DetectedCharsetScore[charset] = i | ||||||
|  | 			i++ | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	if !filepath.IsAbs(Repository.Upload.TempPath) { | 	if !filepath.IsAbs(Repository.Upload.TempPath) { | ||||||
| 		Repository.Upload.TempPath = path.Join(AppWorkPath, Repository.Upload.TempPath) | 		Repository.Upload.TempPath = path.Join(AppWorkPath, Repository.Upload.TempPath) | ||||||
| 	} | 	} | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user