// Copyright 2019 The Gitea Authors. All rights reserved. // SPDX-License-Identifier: MIT package references import ( "bytes" "net/url" "regexp" "strconv" "strings" "sync" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/markup/mdstripper" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/util" ) var ( // validNamePattern performs only the most basic validation for user or repository names // Repository name should contain only alphanumeric, dash ('-'), underscore ('_') and dot ('.') characters. validNamePattern = regexp.MustCompile(`^[a-z0-9_.-]+$`) // NOTE: All below regex matching do not perform any extra validation. // Thus a link is produced even if the linked entity does not exist. // While fast, this is also incorrect and lead to false positives. // TODO: fix invalid linking issue // mentionPattern matches all mentions in the form of "@user" or "@org/team" mentionPattern = regexp.MustCompile(`(?:\s|^|\(|\[)(@[-\w][-.\w]*?|@[-\w][-.\w]*?/[-\w][-.\w]*?)(?:\s|$|[:,;.?!](\s|$)|'|\)|\])`) // issueNumericPattern matches string that references to a numeric issue, e.g. #1287 issueNumericPattern = regexp.MustCompile(`(?:\s|^|\(|\[|\'|\")([#!][0-9]+)(?:\s|$|\)|\]|\'|\"|[:;,.?!]\s|[:;,.?!]$)`) // issueAlphanumericPattern matches string that references to an alphanumeric issue, e.g. ABC-1234 issueAlphanumericPattern = regexp.MustCompile(`(?:\s|^|\(|\[|\"|\')([A-Z]{1,10}-[1-9][0-9]*)(?:\s|$|\)|\]|:|\.(\s|$)|\"|\'|,)`) // crossReferenceIssueNumericPattern matches string that references a numeric issue in a different repository // e.g. org/repo#12345 crossReferenceIssueNumericPattern = regexp.MustCompile(`(?:\s|^|\(|\[)([0-9a-zA-Z-_\.]+/[0-9a-zA-Z-_\.]+[#!][0-9]+)(?:\s|$|\)|\]|[:;,.?!]\s|[:;,.?!]$)`) // crossReferenceCommitPattern matches a string that references a commit in a different repository // e.g. go-gitea/gitea@d8a994ef, go-gitea/gitea@d8a994ef243349f321568f9e36d5c3f444b99cae (7-40 characters) crossReferenceCommitPattern = regexp.MustCompile(`(?:\s|^|\(|\[)([0-9a-zA-Z-_\.]+)/([0-9a-zA-Z-_\.]+)@([0-9a-f]{7,64})(?:\s|$|\)|\]|[:;,.?!]\s|[:;,.?!]$)`) // spaceTrimmedPattern let's find the trailing space spaceTrimmedPattern = regexp.MustCompile(`(?:.*[0-9a-zA-Z-_])\s`) // timeLogPattern matches string for time tracking timeLogPattern = regexp.MustCompile(`(?:\s|^|\(|\[)(@([0-9]+([\.,][0-9]+)?(w|d|m|h))+)(?:\s|$|\)|\]|[:;,.?!]\s|[:;,.?!]$)`) issueCloseKeywordsPat, issueReopenKeywordsPat *regexp.Regexp issueKeywordsOnce sync.Once giteaHostInit sync.Once giteaHost string giteaIssuePullPattern *regexp.Regexp actionStrings = []string{ "none", "closes", "reopens", "neutered", } ) // XRefAction represents the kind of effect a cross reference has once is resolved type XRefAction int64 const ( // XRefActionNone means the cross-reference is simply a comment XRefActionNone XRefAction = iota // 0 // XRefActionCloses means the cross-reference should close an issue if it is resolved XRefActionCloses // 1 // XRefActionReopens means the cross-reference should reopen an issue if it is resolved XRefActionReopens // 2 // XRefActionNeutered means the cross-reference will no longer affect the source XRefActionNeutered // 3 ) func (a XRefAction) String() string { return actionStrings[a] } // IssueReference contains an unverified cross-reference to a local issue or pull request type IssueReference struct { Index int64 Owner string Name string Action XRefAction TimeLog string } // RenderizableReference contains an unverified cross-reference to with rendering information // The IsPull member means that a `!num` reference was used instead of `#num`. // This kind of reference is used to make pulls available when an external issue tracker // is used. Otherwise, `#` and `!` are completely interchangeable. type RenderizableReference struct { Issue string Owner string Name string CommitSha string IsPull bool RefLocation *RefSpan Action XRefAction ActionLocation *RefSpan } type rawReference struct { index int64 owner string name string isPull bool action XRefAction issue string refLocation *RefSpan actionLocation *RefSpan timeLog string } func rawToIssueReferenceList(reflist []*rawReference) []IssueReference { refarr := make([]IssueReference, len(reflist)) for i, r := range reflist { refarr[i] = IssueReference{ Index: r.index, Owner: r.owner, Name: r.name, Action: r.action, TimeLog: r.timeLog, } } return refarr } // RefSpan is the position where the reference was found within the parsed text type RefSpan struct { Start int End int } func makeKeywordsPat(words []string) *regexp.Regexp { acceptedWords := parseKeywords(words) if len(acceptedWords) == 0 { // Never match return nil } return regexp.MustCompile(`(?i)(?:\s|^|\(|\[)(` + strings.Join(acceptedWords, `|`) + `):? $`) } func parseKeywords(words []string) []string { acceptedWords := make([]string, 0, 5) wordPat := regexp.MustCompile(`^[\pL]+$`) for _, word := range words { word = strings.ToLower(strings.TrimSpace(word)) // Accept Unicode letter class runes (a-z, á, à, ä, ) if wordPat.MatchString(word) { acceptedWords = append(acceptedWords, word) } else { log.Info("Invalid keyword: %s", word) } } return acceptedWords } func newKeywords() { issueKeywordsOnce.Do(func() { // Delay initialization until after the settings module is initialized doNewKeywords(setting.Repository.PullRequest.CloseKeywords, setting.Repository.PullRequest.ReopenKeywords) }) } func doNewKeywords(closeKeywords, reopenKeywords []string) { issueCloseKeywordsPat = makeKeywordsPat(closeKeywords) issueReopenKeywordsPat = makeKeywordsPat(reopenKeywords) } // getGiteaHostName returns a normalized string with the local host name, with no scheme or port information func getGiteaHostName() string { giteaHostInit.Do(func() { if uapp, err := url.Parse(setting.AppURL); err == nil { giteaHost = strings.ToLower(uapp.Host) giteaIssuePullPattern = regexp.MustCompile( `(\s|^|\(|\[)` + regexp.QuoteMeta(strings.TrimSpace(setting.AppURL)) + `([0-9a-zA-Z-_\.]+/[0-9a-zA-Z-_\.]+)/` + `((?:issues)|(?:pulls))/([0-9]+)(?:\s|$|\)|\]|[:;,.?!]\s|[:;,.?!]$)`) } else { giteaHost = "" giteaIssuePullPattern = nil } }) return giteaHost } // getGiteaIssuePullPattern func getGiteaIssuePullPattern() *regexp.Regexp { getGiteaHostName() return giteaIssuePullPattern } // FindAllMentionsMarkdown matches mention patterns in given content and // returns a list of found unvalidated user names **not including** the @ prefix. func FindAllMentionsMarkdown(content string) []string { bcontent, _ := mdstripper.StripMarkdownBytes([]byte(content)) locations := FindAllMentionsBytes(bcontent) mentions := make([]string, len(locations)) for i, val := range locations { mentions[i] = string(bcontent[val.Start+1 : val.End]) } return mentions } // FindAllMentionsBytes matches mention patterns in given content // and returns a list of locations for the unvalidated user names, including the @ prefix. func FindAllMentionsBytes(content []byte) []RefSpan { // Sadly we can't use FindAllSubmatchIndex because our pattern checks for starting and // trailing spaces (\s@mention,\s), so if we get two consecutive references, the space // from the second reference will be "eaten" by the first one: // ...\s@mention1\s@mention2\s... --> ...`\s@mention1\s`, (not) `@mention2,\s...` ret := make([]RefSpan, 0, 5) pos := 0 for { match := mentionPattern.FindSubmatchIndex(content[pos:]) if match == nil { break } ret = append(ret, RefSpan{Start: match[2] + pos, End: match[3] + pos}) notrail := spaceTrimmedPattern.FindSubmatchIndex(content[match[2]+pos : match[3]+pos]) if notrail == nil { pos = match[3] + pos } else { pos = match[3] + pos + notrail[1] - notrail[3] } } return ret } // FindFirstMentionBytes matches the first mention in then given content // and returns the location of the unvalidated user name, including the @ prefix. func FindFirstMentionBytes(content []byte) (bool, RefSpan) { mention := mentionPattern.FindSubmatchIndex(content) if mention == nil { return false, RefSpan{} } return true, RefSpan{Start: mention[2], End: mention[3]} } // FindAllIssueReferencesMarkdown strips content from markdown markup // and returns a list of unvalidated references found in it. func FindAllIssueReferencesMarkdown(content string) []IssueReference { return rawToIssueReferenceList(findAllIssueReferencesMarkdown(content)) } func findAllIssueReferencesMarkdown(content string) []*rawReference { bcontent, links := mdstripper.StripMarkdownBytes([]byte(content)) return findAllIssueReferencesBytes(bcontent, links) } func convertFullHTMLReferencesToShortRefs(re *regexp.Regexp, contentBytes *[]byte) { // We will iterate through the content, rewrite and simplify full references. // // We want to transform something like: // // this is a https://ourgitea.com/git/owner/repo/issues/123456789, foo // https://ourgitea.com/git/owner/repo/pulls/123456789 // // Into something like: // // this is a #123456789, foo // !123456789 pos := 0 for { // re looks for something like: (\s|^|\(|\[)https://ourgitea.com/git/(owner/repo)/(issues)/(123456789)(?:\s|$|\)|\]|[:;,.?!]\s|[:;,.?!]$) match := re.FindSubmatchIndex((*contentBytes)[pos:]) if match == nil { break } // match is a bunch of indices into the content from pos onwards so // to simplify things let's just add pos to all of the indices in match for i := range match { match[i] += pos } // match[0]-match[1] is whole string // match[2]-match[3] is preamble // move the position to the end of the preamble pos = match[3] // match[4]-match[5] is owner/repo // now copy the owner/repo to end of the preamble endPos := pos + match[5] - match[4] copy((*contentBytes)[pos:endPos], (*contentBytes)[match[4]:match[5]]) // move the current position to the end of the newly copied owner/repo pos = endPos // Now set the issue/pull marker: // // match[6]-match[7] == 'issues' (*contentBytes)[pos] = '#' if string((*contentBytes)[match[6]:match[7]]) == "pulls" { (*contentBytes)[pos] = '!' } pos++ // Then add the issue/pull number // // match[8]-match[9] is the number endPos = pos + match[9] - match[8] copy((*contentBytes)[pos:endPos], (*contentBytes)[match[8]:match[9]]) // Now copy what's left at the end of the string to the new end position copy((*contentBytes)[endPos:], (*contentBytes)[match[9]:]) // now we reset the length // our new section has length endPos - match[3] // our old section has length match[9] - match[3] *contentBytes = (*contentBytes)[:len(*contentBytes)-match[9]+endPos] pos = endPos } } // FindAllIssueReferences returns a list of unvalidated references found in a string. func FindAllIssueReferences(content string) []IssueReference { // Need to convert fully qualified html references to local system to #/! short codes contentBytes := []byte(content) if re := getGiteaIssuePullPattern(); re != nil { convertFullHTMLReferencesToShortRefs(re, &contentBytes) } else { log.Debug("No GiteaIssuePullPattern pattern") } return rawToIssueReferenceList(findAllIssueReferencesBytes(contentBytes, []string{})) } // FindRenderizableReferenceNumeric returns the first unvalidated reference found in a string. func FindRenderizableReferenceNumeric(content string, prOnly, crossLinkOnly bool) (bool, *RenderizableReference) { var match []int if !crossLinkOnly { match = issueNumericPattern.FindStringSubmatchIndex(content) } if match == nil { if match = crossReferenceIssueNumericPattern.FindStringSubmatchIndex(content); match == nil { return false, nil } } r := getCrossReference(util.UnsafeStringToBytes(content), match[2], match[3], false, prOnly) if r == nil { return false, nil } return true, &RenderizableReference{ Issue: r.issue, Owner: r.owner, Name: r.name, IsPull: r.isPull, RefLocation: r.refLocation, Action: r.action, ActionLocation: r.actionLocation, } } // FindRenderizableCommitCrossReference returns the first unvalidated commit cross reference found in a string. func FindRenderizableCommitCrossReference(content string) (bool, *RenderizableReference) { m := crossReferenceCommitPattern.FindStringSubmatchIndex(content) if len(m) < 8 { return false, nil } return true, &RenderizableReference{ Owner: content[m[2]:m[3]], Name: content[m[4]:m[5]], CommitSha: content[m[6]:m[7]], RefLocation: &RefSpan{Start: m[2], End: m[7]}, } } // FindRenderizableReferenceRegexp returns the first regexp unvalidated references found in a string. func FindRenderizableReferenceRegexp(content string, pattern *regexp.Regexp) (bool, *RenderizableReference) { match := pattern.FindStringSubmatchIndex(content) if len(match) < 4 { return false, nil } action, location := findActionKeywords([]byte(content), match[2]) return true, &RenderizableReference{ Issue: content[match[2]:match[3]], RefLocation: &RefSpan{Start: match[0], End: match[1]}, Action: action, ActionLocation: location, IsPull: false, } } // FindRenderizableReferenceAlphanumeric returns the first alphanumeric unvalidated references found in a string. func FindRenderizableReferenceAlphanumeric(content string) (bool, *RenderizableReference) { match := issueAlphanumericPattern.FindStringSubmatchIndex(content) if match == nil { return false, nil } action, location := findActionKeywords([]byte(content), match[2]) return true, &RenderizableReference{ Issue: content[match[2]:match[3]], RefLocation: &RefSpan{Start: match[2], End: match[3]}, Action: action, ActionLocation: location, IsPull: false, } } // FindAllIssueReferencesBytes returns a list of unvalidated references found in a byte slice. func findAllIssueReferencesBytes(content []byte, links []string) []*rawReference { ret := make([]*rawReference, 0, 10) pos := 0 // Sadly we can't use FindAllSubmatchIndex because our pattern checks for starting and // trailing spaces (\s#ref,\s), so if we get two consecutive references, the space // from the second reference will be "eaten" by the first one: // ...\s#ref1\s#ref2\s... --> ...`\s#ref1\s`, (not) `#ref2,\s...` for { match := issueNumericPattern.FindSubmatchIndex(content[pos:]) if match == nil { break } if ref := getCrossReference(content, match[2]+pos, match[3]+pos, false, false); ref != nil { ret = append(ret, ref) } notrail := spaceTrimmedPattern.FindSubmatchIndex(content[match[2]+pos : match[3]+pos]) if notrail == nil { pos = match[3] + pos } else { pos = match[3] + pos + notrail[1] - notrail[3] } } pos = 0 for { match := crossReferenceIssueNumericPattern.FindSubmatchIndex(content[pos:]) if match == nil { break } if ref := getCrossReference(content, match[2]+pos, match[3]+pos, false, false); ref != nil { ret = append(ret, ref) } notrail := spaceTrimmedPattern.FindSubmatchIndex(content[match[2]+pos : match[3]+pos]) if notrail == nil { pos = match[3] + pos } else { pos = match[3] + pos + notrail[1] - notrail[3] } } localhost := getGiteaHostName() for _, link := range links { if u, err := url.Parse(link); err == nil { // Note: we're not attempting to match the URL scheme (http/https) host := strings.ToLower(u.Host) if host != "" && host != localhost { continue } parts := strings.Split(u.EscapedPath(), "/") // /user/repo/issues/3 if len(parts) != 5 || parts[0] != "" { continue } var sep string if parts[3] == "issues" { sep = "#" } else if parts[3] == "pulls" { sep = "!" } else { continue } // Note: closing/reopening keywords not supported with URLs bytes := []byte(parts[1] + "/" + parts[2] + sep + parts[4]) if ref := getCrossReference(bytes, 0, len(bytes), true, false); ref != nil { ref.refLocation = nil ret = append(ret, ref) } } } if len(ret) == 0 { return ret } pos = 0 for { match := timeLogPattern.FindSubmatchIndex(content[pos:]) if match == nil { break } timeLogEntry := string(content[match[2]+pos+1 : match[3]+pos]) var f *rawReference for _, ref := range ret { if ref.refLocation != nil && ref.refLocation.End < match[2]+pos && (f == nil || f.refLocation.End < ref.refLocation.End) { f = ref } } pos = match[1] + pos if f == nil { f = ret[0] } if len(f.timeLog) == 0 { f.timeLog = timeLogEntry } } return ret } func getCrossReference(content []byte, start, end int, fromLink, prOnly bool) *rawReference { sep := bytes.IndexAny(content[start:end], "#!") if sep < 0 { return nil } isPull := content[start+sep] == '!' if prOnly && !isPull { return nil } repo := string(content[start : start+sep]) issue := string(content[start+sep+1 : end]) index, err := strconv.ParseInt(issue, 10, 64) if err != nil { return nil } if repo == "" { if fromLink { // Markdown links must specify owner/repo return nil } action, location := findActionKeywords(content, start) return &rawReference{ index: index, action: action, issue: issue, isPull: isPull, refLocation: &RefSpan{Start: start, End: end}, actionLocation: location, } } parts := strings.Split(strings.ToLower(repo), "/") if len(parts) != 2 { return nil } owner, name := parts[0], parts[1] if !validNamePattern.MatchString(owner) || !validNamePattern.MatchString(name) { return nil } action, location := findActionKeywords(content, start) return &rawReference{ index: index, owner: owner, name: name, action: action, issue: issue, isPull: isPull, refLocation: &RefSpan{Start: start, End: end}, actionLocation: location, } } func findActionKeywords(content []byte, start int) (XRefAction, *RefSpan) { newKeywords() var m []int if issueCloseKeywordsPat != nil { m = issueCloseKeywordsPat.FindSubmatchIndex(content[:start]) if m != nil { return XRefActionCloses, &RefSpan{Start: m[2], End: m[3]} } } if issueReopenKeywordsPat != nil { m = issueReopenKeywordsPat.FindSubmatchIndex(content[:start]) if m != nil { return XRefActionReopens, &RefSpan{Start: m[2], End: m[3]} } } return XRefActionNone, nil } // IsXrefActionable returns true if the xref action is actionable (i.e. produces a result when resolved) func IsXrefActionable(ref *RenderizableReference, extTracker bool) bool { if extTracker { // External issues cannot be automatically closed return false } return ref.Action == XRefActionCloses || ref.Action == XRefActionReopens }