summaryrefslogtreecommitdiff
path: root/internal/gitdiff/file_header.go
diff options
context:
space:
mode:
authormo khan <mo@mokhan.ca>2026-01-30 18:16:31 -0700
committermo khan <mo@mokhan.ca>2026-01-30 18:16:31 -0700
commitfeee7d43ef63ae607c6fd4cca88a356a93553ebe (patch)
tree2969055a894dc4e72d8d79a9ac74cc30d78aff64 /internal/gitdiff/file_header.go
parente0db8f82e96acadf6968e0cf9c805a7b22d835db (diff)
refactor: move packages to internal/
Diffstat (limited to 'internal/gitdiff/file_header.go')
-rw-r--r--internal/gitdiff/file_header.go546
1 files changed, 546 insertions, 0 deletions
diff --git a/internal/gitdiff/file_header.go b/internal/gitdiff/file_header.go
new file mode 100644
index 0000000..7ae4bc9
--- /dev/null
+++ b/internal/gitdiff/file_header.go
@@ -0,0 +1,546 @@
+package gitdiff
+
+import (
+ "fmt"
+ "io"
+ "os"
+ "strconv"
+ "strings"
+ "time"
+)
+
+const (
+ devNull = "/dev/null"
+)
+
+// ParseNextFileHeader finds and parses the next file header in the stream. If
+// a header is found, it returns a file and all input before the header. It
+// returns nil if no headers are found before the end of the input.
+func (p *parser) ParseNextFileHeader() (*File, string, error) {
+ var preamble strings.Builder
+ var file *File
+ for {
+ // check for disconnected fragment headers (corrupt patch)
+ frag, err := p.ParseTextFragmentHeader()
+ if err != nil {
+ // not a valid header, nothing to worry about
+ goto NextLine
+ }
+ if frag != nil {
+ return nil, "", p.Errorf(-1, "patch fragment without file header: %s", frag.Header())
+ }
+
+ // check for a git-generated patch
+ file, err = p.ParseGitFileHeader()
+ if err != nil {
+ return nil, "", err
+ }
+ if file != nil {
+ return file, preamble.String(), nil
+ }
+
+ // check for a "traditional" patch
+ file, err = p.ParseTraditionalFileHeader()
+ if err != nil {
+ return nil, "", err
+ }
+ if file != nil {
+ return file, preamble.String(), nil
+ }
+
+ NextLine:
+ preamble.WriteString(p.Line(0))
+ if err := p.Next(); err != nil {
+ if err == io.EOF {
+ break
+ }
+ return nil, "", err
+ }
+ }
+ return nil, preamble.String(), nil
+}
+
+func (p *parser) ParseGitFileHeader() (*File, error) {
+ const prefix = "diff --git "
+
+ if !strings.HasPrefix(p.Line(0), prefix) {
+ return nil, nil
+ }
+ header := p.Line(0)[len(prefix):]
+
+ defaultName, err := parseGitHeaderName(header)
+ if err != nil {
+ return nil, p.Errorf(0, "git file header: %v", err)
+ }
+
+ f := &File{}
+ for {
+ end, err := parseGitHeaderData(f, p.Line(1), defaultName)
+ if err != nil {
+ return nil, p.Errorf(1, "git file header: %v", err)
+ }
+
+ if err := p.Next(); err != nil {
+ if err == io.EOF {
+ break
+ }
+ return nil, err
+ }
+
+ if end {
+ break
+ }
+ }
+
+ if f.OldName == "" && f.NewName == "" {
+ if defaultName == "" {
+ return nil, p.Errorf(0, "git file header: missing filename information")
+ }
+ f.OldName = defaultName
+ f.NewName = defaultName
+ }
+
+ if (f.NewName == "" && !f.IsDelete) || (f.OldName == "" && !f.IsNew) {
+ return nil, p.Errorf(0, "git file header: missing filename information")
+ }
+
+ return f, nil
+}
+
+func (p *parser) ParseTraditionalFileHeader() (*File, error) {
+ const shortestValidFragHeader = "@@ -1 +1 @@\n"
+ const (
+ oldPrefix = "--- "
+ newPrefix = "+++ "
+ )
+
+ oldLine, newLine := p.Line(0), p.Line(1)
+
+ if !strings.HasPrefix(oldLine, oldPrefix) || !strings.HasPrefix(newLine, newPrefix) {
+ return nil, nil
+ }
+ // heuristic: only a file header if followed by a (probable) fragment header
+ if len(p.Line(2)) < len(shortestValidFragHeader) || !strings.HasPrefix(p.Line(2), "@@ -") {
+ return nil, nil
+ }
+
+ // advance past the first two lines so parser is after the header
+ // no EOF check needed because we know there are >=3 valid lines
+ if err := p.Next(); err != nil {
+ return nil, err
+ }
+ if err := p.Next(); err != nil {
+ return nil, err
+ }
+
+ oldName, _, err := parseName(oldLine[len(oldPrefix):], '\t', 0)
+ if err != nil {
+ return nil, p.Errorf(0, "file header: %v", err)
+ }
+
+ newName, _, err := parseName(newLine[len(newPrefix):], '\t', 0)
+ if err != nil {
+ return nil, p.Errorf(1, "file header: %v", err)
+ }
+
+ f := &File{}
+ switch {
+ case oldName == devNull || hasEpochTimestamp(oldLine):
+ f.IsNew = true
+ f.NewName = newName
+ case newName == devNull || hasEpochTimestamp(newLine):
+ f.IsDelete = true
+ f.OldName = oldName
+ default:
+ // if old name is a prefix of new name, use that instead
+ // this avoids picking variants like "file.bak" or "file~"
+ if strings.HasPrefix(newName, oldName) {
+ f.OldName = oldName
+ f.NewName = oldName
+ } else {
+ f.OldName = newName
+ f.NewName = newName
+ }
+ }
+
+ return f, nil
+}
+
+// parseGitHeaderName extracts a default file name from the Git file header
+// line. This is required for mode-only changes and creation/deletion of empty
+// files. Other types of patch include the file name(s) in the header data.
+// If the names in the header do not match because the patch is a rename,
+// return an empty default name.
+func parseGitHeaderName(header string) (string, error) {
+ header = strings.TrimSuffix(header, "\n")
+ if len(header) == 0 {
+ return "", nil
+ }
+
+ var err error
+ var first, second string
+
+ // there are 4 cases to account for:
+ //
+ // 1) unquoted unquoted
+ // 2) unquoted "quoted"
+ // 3) "quoted" unquoted
+ // 4) "quoted" "quoted"
+ //
+ quote := strings.IndexByte(header, '"')
+ switch {
+ case quote < 0:
+ // case 1
+ first = header
+
+ case quote > 0:
+ // case 2
+ first = header[:quote-1]
+ if !isSpace(header[quote-1]) {
+ return "", fmt.Errorf("missing separator")
+ }
+
+ second, _, err = parseQuotedName(header[quote:])
+ if err != nil {
+ return "", err
+ }
+
+ case quote == 0:
+ // case 3 or case 4
+ var n int
+ first, n, err = parseQuotedName(header)
+ if err != nil {
+ return "", err
+ }
+
+ // git accepts multiple spaces after a quoted name, but not after an
+ // unquoted name, since the name might end with one or more spaces
+ for n < len(header) && isSpace(header[n]) {
+ n++
+ }
+ if n == len(header) {
+ return "", nil
+ }
+
+ if header[n] == '"' {
+ second, _, err = parseQuotedName(header[n:])
+ if err != nil {
+ return "", err
+ }
+ } else {
+ second = header[n:]
+ }
+ }
+
+ first = trimTreePrefix(first, 1)
+ if second != "" {
+ if first == trimTreePrefix(second, 1) {
+ return first, nil
+ }
+ return "", nil
+ }
+
+ // at this point, both names are unquoted (case 1)
+ // since names may contain spaces, we can't use a known separator
+ // instead, look for a split that produces two equal names
+
+ for i := 0; i < len(first)-1; i++ {
+ if !isSpace(first[i]) {
+ continue
+ }
+ second = trimTreePrefix(first[i+1:], 1)
+ if name := first[:i]; name == second {
+ return name, nil
+ }
+ }
+ return "", nil
+}
+
+// parseGitHeaderData parses a single line of metadata from a Git file header.
+// It returns true when header parsing is complete; in that case, line was the
+// first line of non-header content.
+func parseGitHeaderData(f *File, line, defaultName string) (end bool, err error) {
+ if len(line) > 0 && line[len(line)-1] == '\n' {
+ line = line[:len(line)-1]
+ }
+
+ for _, hdr := range []struct {
+ prefix string
+ end bool
+ parse func(*File, string, string) error
+ }{
+ {"@@ -", true, nil},
+ {"--- ", false, parseGitHeaderOldName},
+ {"+++ ", false, parseGitHeaderNewName},
+ {"old mode ", false, parseGitHeaderOldMode},
+ {"new mode ", false, parseGitHeaderNewMode},
+ {"deleted file mode ", false, parseGitHeaderDeletedMode},
+ {"new file mode ", false, parseGitHeaderCreatedMode},
+ {"copy from ", false, parseGitHeaderCopyFrom},
+ {"copy to ", false, parseGitHeaderCopyTo},
+ {"rename old ", false, parseGitHeaderRenameFrom},
+ {"rename new ", false, parseGitHeaderRenameTo},
+ {"rename from ", false, parseGitHeaderRenameFrom},
+ {"rename to ", false, parseGitHeaderRenameTo},
+ {"similarity index ", false, parseGitHeaderScore},
+ {"dissimilarity index ", false, parseGitHeaderScore},
+ {"index ", false, parseGitHeaderIndex},
+ } {
+ if strings.HasPrefix(line, hdr.prefix) {
+ if hdr.parse != nil {
+ err = hdr.parse(f, line[len(hdr.prefix):], defaultName)
+ }
+ return hdr.end, err
+ }
+ }
+
+ // unknown line indicates the end of the header
+ // this usually happens if the diff is empty
+ return true, nil
+}
+
+func parseGitHeaderOldName(f *File, line, defaultName string) error {
+ name, _, err := parseName(line, '\t', 1)
+ if err != nil {
+ return err
+ }
+ if f.OldName == "" && !f.IsNew {
+ f.OldName = name
+ return nil
+ }
+ return verifyGitHeaderName(name, f.OldName, f.IsNew, "old")
+}
+
+func parseGitHeaderNewName(f *File, line, defaultName string) error {
+ name, _, err := parseName(line, '\t', 1)
+ if err != nil {
+ return err
+ }
+ if f.NewName == "" && !f.IsDelete {
+ f.NewName = name
+ return nil
+ }
+ return verifyGitHeaderName(name, f.NewName, f.IsDelete, "new")
+}
+
+func parseGitHeaderOldMode(f *File, line, defaultName string) (err error) {
+ f.OldMode, err = parseMode(strings.TrimSpace(line))
+ return
+}
+
+func parseGitHeaderNewMode(f *File, line, defaultName string) (err error) {
+ f.NewMode, err = parseMode(strings.TrimSpace(line))
+ return
+}
+
+func parseGitHeaderDeletedMode(f *File, line, defaultName string) error {
+ f.IsDelete = true
+ f.OldName = defaultName
+ return parseGitHeaderOldMode(f, line, defaultName)
+}
+
+func parseGitHeaderCreatedMode(f *File, line, defaultName string) error {
+ f.IsNew = true
+ f.NewName = defaultName
+ return parseGitHeaderNewMode(f, line, defaultName)
+}
+
+func parseGitHeaderCopyFrom(f *File, line, defaultName string) (err error) {
+ f.IsCopy = true
+ f.OldName, _, err = parseName(line, 0, 0)
+ return
+}
+
+func parseGitHeaderCopyTo(f *File, line, defaultName string) (err error) {
+ f.IsCopy = true
+ f.NewName, _, err = parseName(line, 0, 0)
+ return
+}
+
+func parseGitHeaderRenameFrom(f *File, line, defaultName string) (err error) {
+ f.IsRename = true
+ f.OldName, _, err = parseName(line, 0, 0)
+ return
+}
+
+func parseGitHeaderRenameTo(f *File, line, defaultName string) (err error) {
+ f.IsRename = true
+ f.NewName, _, err = parseName(line, 0, 0)
+ return
+}
+
+func parseGitHeaderScore(f *File, line, defaultName string) error {
+ score, err := strconv.ParseInt(strings.TrimSuffix(line, "%"), 10, 32)
+ if err != nil {
+ nerr := err.(*strconv.NumError)
+ return fmt.Errorf("invalid score line: %v", nerr.Err)
+ }
+ if score <= 100 {
+ f.Score = int(score)
+ }
+ return nil
+}
+
+func parseGitHeaderIndex(f *File, line, defaultName string) error {
+ const sep = ".."
+
+ // note that git stops parsing if the OIDs are too long to be valid
+ // checking this requires knowing if the repository uses SHA1 or SHA256
+ // hashes, which we don't know, so we just skip that check
+
+ parts := strings.SplitN(line, " ", 2)
+ oids := strings.SplitN(parts[0], sep, 2)
+
+ if len(oids) < 2 {
+ return fmt.Errorf("invalid index line: missing %q", sep)
+ }
+ f.OldOIDPrefix, f.NewOIDPrefix = oids[0], oids[1]
+
+ if len(parts) > 1 {
+ return parseGitHeaderOldMode(f, parts[1], defaultName)
+ }
+ return nil
+}
+
+func parseMode(s string) (os.FileMode, error) {
+ mode, err := strconv.ParseInt(s, 8, 32)
+ if err != nil {
+ nerr := err.(*strconv.NumError)
+ return os.FileMode(0), fmt.Errorf("invalid mode line: %v", nerr.Err)
+ }
+ return os.FileMode(mode), nil
+}
+
+// parseName extracts a file name from the start of a string and returns the
+// name and the index of the first character after the name. If the name is
+// unquoted and term is non-zero, parsing stops at the first occurrence of
+// term.
+//
+// If the name is exactly "/dev/null", no further processing occurs. Otherwise,
+// if dropPrefix is greater than zero, that number of prefix components
+// separated by forward slashes are dropped from the name and any duplicate
+// slashes are collapsed.
+func parseName(s string, term byte, dropPrefix int) (name string, n int, err error) {
+ if len(s) > 0 && s[0] == '"' {
+ name, n, err = parseQuotedName(s)
+ } else {
+ name, n, err = parseUnquotedName(s, term)
+ }
+ if err != nil {
+ return "", 0, err
+ }
+ if name == devNull {
+ return name, n, nil
+ }
+ return cleanName(name, dropPrefix), n, nil
+}
+
+func parseQuotedName(s string) (name string, n int, err error) {
+ for n = 1; n < len(s); n++ {
+ if s[n] == '"' && s[n-1] != '\\' {
+ n++
+ break
+ }
+ }
+ if n == 2 {
+ return "", 0, fmt.Errorf("missing name")
+ }
+ if name, err = strconv.Unquote(s[:n]); err != nil {
+ return "", 0, err
+ }
+ return name, n, err
+}
+
+func parseUnquotedName(s string, term byte) (name string, n int, err error) {
+ for n = 0; n < len(s); n++ {
+ if s[n] == '\n' {
+ break
+ }
+ if term > 0 && s[n] == term {
+ break
+ }
+ }
+ if n == 0 {
+ return "", 0, fmt.Errorf("missing name")
+ }
+ return s[:n], n, nil
+}
+
+// verifyGitHeaderName checks a parsed name against state set by previous lines
+func verifyGitHeaderName(parsed, existing string, isNull bool, side string) error {
+ if existing != "" {
+ if isNull {
+ return fmt.Errorf("expected %s, but filename is set to %s", devNull, existing)
+ }
+ if existing != parsed {
+ return fmt.Errorf("inconsistent %s filename", side)
+ }
+ }
+ if isNull && parsed != devNull {
+ return fmt.Errorf("expected %s", devNull)
+ }
+ return nil
+}
+
+// cleanName removes double slashes and drops prefix segments.
+func cleanName(name string, drop int) string {
+ var b strings.Builder
+ for i := 0; i < len(name); i++ {
+ if name[i] == '/' {
+ if i < len(name)-1 && name[i+1] == '/' {
+ continue
+ }
+ if drop > 0 {
+ drop--
+ b.Reset()
+ continue
+ }
+ }
+ b.WriteByte(name[i])
+ }
+ return b.String()
+}
+
+// trimTreePrefix removes up to n leading directory components from name.
+func trimTreePrefix(name string, n int) string {
+ i := 0
+ for ; i < len(name) && n > 0; i++ {
+ if name[i] == '/' {
+ n--
+ }
+ }
+ return name[i:]
+}
+
+// hasEpochTimestamp returns true if the string ends with a POSIX-formatted
+// timestamp for the UNIX epoch after a tab character. According to git, this
+// is used by GNU diff to mark creations and deletions.
+func hasEpochTimestamp(s string) bool {
+ const posixTimeLayout = "2006-01-02 15:04:05.9 -0700"
+
+ start := strings.IndexRune(s, '\t')
+ if start < 0 {
+ return false
+ }
+
+ ts := strings.TrimSuffix(s[start+1:], "\n")
+
+ // a valid timestamp can have optional ':' in zone specifier
+ // remove that if it exists so we have a single format
+ if len(ts) >= 3 && ts[len(ts)-3] == ':' {
+ ts = ts[:len(ts)-3] + ts[len(ts)-2:]
+ }
+
+ t, err := time.Parse(posixTimeLayout, ts)
+ if err != nil {
+ return false
+ }
+ if !t.Equal(time.Unix(0, 0)) {
+ return false
+ }
+ return true
+}
+
+func isSpace(c byte) bool {
+ return c == ' ' || c == '\t' || c == '\n'
+}