summaryrefslogtreecommitdiff
path: root/pkg/gitdiff/patch_header.go
diff options
context:
space:
mode:
authorAnton Medvedev <anton@medv.io>2025-11-30 12:46:34 +0100
committerAnton Medvedev <anton@medv.io>2025-11-30 12:46:34 +0100
commitf6b0f38af648d028422a7494378b5dabdc90573f (patch)
tree3c26cfc269c021300a2d1e4e02623dd440c20226 /pkg/gitdiff/patch_header.go
First commit
Diffstat (limited to 'pkg/gitdiff/patch_header.go')
-rw-r--r--pkg/gitdiff/patch_header.go470
1 files changed, 470 insertions, 0 deletions
diff --git a/pkg/gitdiff/patch_header.go b/pkg/gitdiff/patch_header.go
new file mode 100644
index 0000000..f047059
--- /dev/null
+++ b/pkg/gitdiff/patch_header.go
@@ -0,0 +1,470 @@
+package gitdiff
+
+import (
+ "bufio"
+ "errors"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "mime/quotedprintable"
+ "net/mail"
+ "strconv"
+ "strings"
+ "time"
+ "unicode"
+)
+
+const (
+ mailHeaderPrefix = "From "
+ prettyHeaderPrefix = "commit "
+ mailMinimumHeaderPrefix = "From:"
+)
+
+// PatchHeader is a parsed version of the preamble content that appears before
+// the first diff in a patch. It includes metadata about the patch, such as the
+// author and a subject.
+type PatchHeader struct {
+ // The SHA of the commit the patch was generated from. Empty if the SHA is
+ // not included in the header.
+ SHA string
+
+ // The author details of the patch. If these details are not included in
+ // the header, Author is nil and AuthorDate is the zero time.
+ Author *PatchIdentity
+ AuthorDate time.Time
+
+ // The committer details of the patch. If these details are not included in
+ // the header, Committer is nil and CommitterDate is the zero time.
+ Committer *PatchIdentity
+ CommitterDate time.Time
+
+ // The title and body of the commit message describing the changes in the
+ // patch. Empty if no message is included in the header.
+ Title string
+ Body string
+
+ // If the preamble looks like an email, ParsePatchHeader will
+ // remove prefixes such as `Re: ` and `[PATCH v3 5/17]` from the
+ // Title and place them here.
+ SubjectPrefix string
+
+ // If the preamble looks like an email, and it contains a `---`
+ // line, that line will be removed and everything after it will be
+ // placed in BodyAppendix.
+ BodyAppendix string
+}
+
+// Message returns the commit message for the header. The message consists of
+// the title and the body separated by an empty line.
+func (h *PatchHeader) Message() string {
+ var msg strings.Builder
+ if h != nil {
+ msg.WriteString(h.Title)
+ if h.Body != "" {
+ msg.WriteString("\n\n")
+ msg.WriteString(h.Body)
+ }
+ }
+ return msg.String()
+}
+
+// ParsePatchDate parses a patch date string. It returns the parsed time or an
+// error if s has an unknown format. ParsePatchDate supports the iso, rfc,
+// short, raw, unix, and default formats (with local variants) used by the
+// --date flag in Git.
+func ParsePatchDate(s string) (time.Time, error) {
+ const (
+ isoFormat = "2006-01-02 15:04:05 -0700"
+ isoStrictFormat = "2006-01-02T15:04:05-07:00"
+ rfc2822Format = "Mon, 2 Jan 2006 15:04:05 -0700"
+ shortFormat = "2006-01-02"
+ defaultFormat = "Mon Jan 2 15:04:05 2006 -0700"
+ defaultLocalFormat = "Mon Jan 2 15:04:05 2006"
+ )
+
+ if s == "" {
+ return time.Time{}, nil
+ }
+
+ for _, fmt := range []string{
+ isoFormat,
+ isoStrictFormat,
+ rfc2822Format,
+ shortFormat,
+ defaultFormat,
+ defaultLocalFormat,
+ } {
+ if t, err := time.ParseInLocation(fmt, s, time.Local); err == nil {
+ return t, nil
+ }
+ }
+
+ // unix format
+ if unix, err := strconv.ParseInt(s, 10, 64); err == nil {
+ return time.Unix(unix, 0), nil
+ }
+
+ // raw format
+ if space := strings.IndexByte(s, ' '); space > 0 {
+ unix, uerr := strconv.ParseInt(s[:space], 10, 64)
+ zone, zerr := time.Parse("-0700", s[space+1:])
+ if uerr == nil && zerr == nil {
+ return time.Unix(unix, 0).In(zone.Location()), nil
+ }
+ }
+
+ return time.Time{}, fmt.Errorf("unknown date format: %s", s)
+}
+
+// A PatchHeaderOption modifies the behavior of ParsePatchHeader.
+type PatchHeaderOption func(*patchHeaderOptions)
+
+// SubjectCleanMode controls how ParsePatchHeader cleans subject lines when
+// parsing mail-formatted patches.
+type SubjectCleanMode int
+
+const (
+ // SubjectCleanWhitespace removes leading and trailing whitespace.
+ SubjectCleanWhitespace SubjectCleanMode = iota
+
+ // SubjectCleanAll removes leading and trailing whitespace, leading "Re:",
+ // "re:", and ":" strings, and leading strings enclosed by '[' and ']'.
+ // This is the default behavior of git (see `git mailinfo`) and this
+ // package.
+ SubjectCleanAll
+
+ // SubjectCleanPatchOnly is the same as SubjectCleanAll, but only removes
+ // leading strings enclosed by '[' and ']' if they start with "PATCH".
+ SubjectCleanPatchOnly
+)
+
+// WithSubjectCleanMode sets the SubjectCleanMode for header parsing. By
+// default, uses SubjectCleanAll.
+func WithSubjectCleanMode(m SubjectCleanMode) PatchHeaderOption {
+ return func(opts *patchHeaderOptions) {
+ opts.subjectCleanMode = m
+ }
+}
+
+type patchHeaderOptions struct {
+ subjectCleanMode SubjectCleanMode
+}
+
+// ParsePatchHeader parses the preamble string returned by [Parse] into a
+// PatchHeader. Due to the variety of header formats, some fields of the parsed
+// PatchHeader may be unset after parsing.
+//
+// Supported formats are the short, medium, full, fuller, and email pretty
+// formats used by `git diff`, `git log`, and `git show` and the UNIX mailbox
+// format used by `git format-patch`.
+//
+// When parsing mail-formatted headers, ParsePatchHeader tries to remove
+// email-specific content from the title and body:
+//
+// - Based on the SubjectCleanMode, remove prefixes like reply markers and
+// "[PATCH]" strings from the subject, saving any removed content in the
+// SubjectPrefix field. Parsing always discards leading and trailing
+// whitespace from the subject line. The default mode is SubjectCleanAll.
+//
+// - If the body contains a "---" line (3 hyphens), remove that line and any
+// content after it from the body and save it in the BodyAppendix field.
+//
+// ParsePatchHeader tries to process content it does not understand wthout
+// returning errors, but will return errors if well-identified content like
+// dates or identies uses unknown or invalid formats.
+func ParsePatchHeader(header string, options ...PatchHeaderOption) (*PatchHeader, error) {
+ opts := patchHeaderOptions{
+ subjectCleanMode: SubjectCleanAll, // match git defaults
+ }
+ for _, optFn := range options {
+ optFn(&opts)
+ }
+
+ header = strings.TrimSpace(header)
+ if header == "" {
+ return &PatchHeader{}, nil
+ }
+
+ var firstLine, rest string
+ if idx := strings.IndexByte(header, '\n'); idx >= 0 {
+ firstLine = header[:idx]
+ rest = header[idx+1:]
+ } else {
+ firstLine = header
+ rest = ""
+ }
+
+ switch {
+ case strings.HasPrefix(firstLine, mailHeaderPrefix):
+ return parseHeaderMail(firstLine, strings.NewReader(rest), opts)
+
+ case strings.HasPrefix(firstLine, mailMinimumHeaderPrefix):
+ // With a minimum header, the first line is part of the actual mail
+ // content and needs to be parsed as part of the "rest"
+ return parseHeaderMail("", strings.NewReader(header), opts)
+
+ case strings.HasPrefix(firstLine, prettyHeaderPrefix):
+ return parseHeaderPretty(firstLine, strings.NewReader(rest))
+ }
+
+ return nil, errors.New("unrecognized patch header format")
+}
+
+func parseHeaderPretty(prettyLine string, r io.Reader) (*PatchHeader, error) {
+ const (
+ authorPrefix = "Author:"
+ commitPrefix = "Commit:"
+ datePrefix = "Date:"
+ authorDatePrefix = "AuthorDate:"
+ commitDatePrefix = "CommitDate:"
+ )
+
+ h := &PatchHeader{}
+
+ prettyLine = strings.TrimPrefix(prettyLine, prettyHeaderPrefix)
+ if i := strings.IndexByte(prettyLine, ' '); i > 0 {
+ h.SHA = prettyLine[:i]
+ } else {
+ h.SHA = prettyLine
+ }
+
+ s := bufio.NewScanner(r)
+ for s.Scan() {
+ line := s.Text()
+
+ // empty line marks end of fields, remaining lines are title/message
+ if strings.TrimSpace(line) == "" {
+ break
+ }
+
+ switch {
+ case strings.HasPrefix(line, authorPrefix):
+ u, err := ParsePatchIdentity(line[len(authorPrefix):])
+ if err != nil {
+ return nil, err
+ }
+ h.Author = &u
+
+ case strings.HasPrefix(line, commitPrefix):
+ u, err := ParsePatchIdentity(line[len(commitPrefix):])
+ if err != nil {
+ return nil, err
+ }
+ h.Committer = &u
+
+ case strings.HasPrefix(line, datePrefix):
+ d, err := ParsePatchDate(strings.TrimSpace(line[len(datePrefix):]))
+ if err != nil {
+ return nil, err
+ }
+ h.AuthorDate = d
+
+ case strings.HasPrefix(line, authorDatePrefix):
+ d, err := ParsePatchDate(strings.TrimSpace(line[len(authorDatePrefix):]))
+ if err != nil {
+ return nil, err
+ }
+ h.AuthorDate = d
+
+ case strings.HasPrefix(line, commitDatePrefix):
+ d, err := ParsePatchDate(strings.TrimSpace(line[len(commitDatePrefix):]))
+ if err != nil {
+ return nil, err
+ }
+ h.CommitterDate = d
+ }
+ }
+ if s.Err() != nil {
+ return nil, s.Err()
+ }
+
+ title, indent := scanMessageTitle(s)
+ if s.Err() != nil {
+ return nil, s.Err()
+ }
+ h.Title = title
+
+ if title != "" {
+ // Don't check for an appendix, pretty headers do not contain them
+ body, _ := scanMessageBody(s, indent, false)
+ if s.Err() != nil {
+ return nil, s.Err()
+ }
+ h.Body = body
+ }
+
+ return h, nil
+}
+
+func scanMessageTitle(s *bufio.Scanner) (title string, indent string) {
+ var b strings.Builder
+ for i := 0; s.Scan(); i++ {
+ line := s.Text()
+ trimLine := strings.TrimSpace(line)
+ if trimLine == "" {
+ break
+ }
+
+ if i == 0 {
+ if start := strings.IndexFunc(line, func(c rune) bool { return !unicode.IsSpace(c) }); start > 0 {
+ indent = line[:start]
+ }
+ }
+ if b.Len() > 0 {
+ b.WriteByte(' ')
+ }
+ b.WriteString(trimLine)
+ }
+ return b.String(), indent
+}
+
+func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (string, string) {
+ // Body and appendix
+ var body, appendix strings.Builder
+ c := &body
+ var empty int
+ for i := 0; s.Scan(); i++ {
+ line := s.Text()
+
+ line = strings.TrimRightFunc(line, unicode.IsSpace)
+ line = strings.TrimPrefix(line, indent)
+
+ if line == "" {
+ empty++
+ continue
+ }
+
+ // If requested, parse out "appendix" information (often added
+ // by `git format-patch` and removed by `git am`).
+ if separateAppendix && c == &body && line == "---" {
+ c = &appendix
+ continue
+ }
+
+ if c.Len() > 0 {
+ c.WriteByte('\n')
+ if empty > 0 {
+ c.WriteByte('\n')
+ }
+ }
+ empty = 0
+
+ c.WriteString(line)
+ }
+ return body.String(), appendix.String()
+}
+
+func parseHeaderMail(mailLine string, r io.Reader, opts patchHeaderOptions) (*PatchHeader, error) {
+ msg, err := mail.ReadMessage(r)
+ if err != nil {
+ return nil, err
+ }
+
+ h := &PatchHeader{}
+
+ if strings.HasPrefix(mailLine, mailHeaderPrefix) {
+ mailLine = strings.TrimPrefix(mailLine, mailHeaderPrefix)
+ if i := strings.IndexByte(mailLine, ' '); i > 0 {
+ h.SHA = mailLine[:i]
+ }
+ }
+
+ from := msg.Header.Get("From")
+ if from != "" {
+ u, err := ParsePatchIdentity(from)
+ if err != nil {
+ return nil, err
+ }
+ h.Author = &u
+ }
+
+ date := msg.Header.Get("Date")
+ if date != "" {
+ d, err := ParsePatchDate(date)
+ if err != nil {
+ return nil, err
+ }
+ h.AuthorDate = d
+ }
+
+ subject := msg.Header.Get("Subject")
+ h.SubjectPrefix, h.Title = cleanSubject(subject, opts.subjectCleanMode)
+
+ s := bufio.NewScanner(msg.Body)
+ h.Body, h.BodyAppendix = scanMessageBody(s, "", true)
+ if s.Err() != nil {
+ return nil, s.Err()
+ }
+
+ return h, nil
+}
+
+func cleanSubject(s string, mode SubjectCleanMode) (prefix string, subject string) {
+ switch mode {
+ case SubjectCleanAll, SubjectCleanPatchOnly:
+ case SubjectCleanWhitespace:
+ return "", strings.TrimSpace(decodeSubject(s))
+ default:
+ panic(fmt.Sprintf("unknown clean mode: %d", mode))
+ }
+
+ // Based on the algorithm from Git in mailinfo.c:cleanup_subject()
+ // If compatibility with `git am` drifts, go there to see if there are any updates.
+
+ at := 0
+ for at < len(s) {
+ switch s[at] {
+ case 'r', 'R':
+ // Detect re:, Re:, rE: and RE:
+ if at+2 < len(s) && (s[at+1] == 'e' || s[at+1] == 'E') && s[at+2] == ':' {
+ at += 3
+ continue
+ }
+
+ case ' ', '\t', ':':
+ // Delete whitespace and duplicate ':' characters
+ at++
+ continue
+
+ case '[':
+ if i := strings.IndexByte(s[at:], ']'); i > 0 {
+ if mode == SubjectCleanAll || strings.Contains(s[at:at+i+1], "PATCH") {
+ at += i + 1
+ continue
+ }
+ }
+ }
+
+ // Nothing was removed, end processing
+ break
+ }
+
+ prefix = strings.TrimLeftFunc(s[:at], unicode.IsSpace)
+ subject = strings.TrimRightFunc(decodeSubject(s[at:]), unicode.IsSpace)
+ return
+}
+
+// Decodes a subject line. Currently only supports quoted-printable UTF-8. This format is the result
+// of a `git format-patch` when the commit title has a non-ASCII character (i.e. an emoji).
+// See for reference: https://stackoverflow.com/questions/27695749/gmail-api-not-respecting-utf-encoding-in-subject
+func decodeSubject(encoded string) string {
+ if !strings.HasPrefix(encoded, "=?UTF-8?q?") {
+ // not UTF-8 encoded
+ return encoded
+ }
+
+ // If the subject is too long, `git format-patch` may produce a subject line across
+ // multiple lines. When parsed, this can look like the following:
+ // <UTF8-prefix><first-line> <UTF8-prefix><second-line>
+ payload := " " + encoded
+ payload = strings.ReplaceAll(payload, " =?UTF-8?q?", "")
+ payload = strings.ReplaceAll(payload, "?=", "")
+
+ decoded, err := ioutil.ReadAll(quotedprintable.NewReader(strings.NewReader(payload)))
+ if err != nil {
+ // if err, abort decoding and return original subject
+ return encoded
+ }
+
+ return string(decoded)
+}