diff options
| author | mo khan <mo@mokhan.ca> | 2026-01-30 18:16:31 -0700 |
|---|---|---|
| committer | mo khan <mo@mokhan.ca> | 2026-01-30 18:16:31 -0700 |
| commit | feee7d43ef63ae607c6fd4cca88a356a93553ebe (patch) | |
| tree | 2969055a894dc4e72d8d79a9ac74cc30d78aff64 /internal/gitdiff/file_header.go | |
| parent | e0db8f82e96acadf6968e0cf9c805a7b22d835db (diff) | |
refactor: move packages to internal/
Diffstat (limited to 'internal/gitdiff/file_header.go')
| -rw-r--r-- | internal/gitdiff/file_header.go | 546 |
1 files changed, 546 insertions, 0 deletions
diff --git a/internal/gitdiff/file_header.go b/internal/gitdiff/file_header.go new file mode 100644 index 0000000..7ae4bc9 --- /dev/null +++ b/internal/gitdiff/file_header.go @@ -0,0 +1,546 @@ +package gitdiff + +import ( + "fmt" + "io" + "os" + "strconv" + "strings" + "time" +) + +const ( + devNull = "/dev/null" +) + +// ParseNextFileHeader finds and parses the next file header in the stream. If +// a header is found, it returns a file and all input before the header. It +// returns nil if no headers are found before the end of the input. +func (p *parser) ParseNextFileHeader() (*File, string, error) { + var preamble strings.Builder + var file *File + for { + // check for disconnected fragment headers (corrupt patch) + frag, err := p.ParseTextFragmentHeader() + if err != nil { + // not a valid header, nothing to worry about + goto NextLine + } + if frag != nil { + return nil, "", p.Errorf(-1, "patch fragment without file header: %s", frag.Header()) + } + + // check for a git-generated patch + file, err = p.ParseGitFileHeader() + if err != nil { + return nil, "", err + } + if file != nil { + return file, preamble.String(), nil + } + + // check for a "traditional" patch + file, err = p.ParseTraditionalFileHeader() + if err != nil { + return nil, "", err + } + if file != nil { + return file, preamble.String(), nil + } + + NextLine: + preamble.WriteString(p.Line(0)) + if err := p.Next(); err != nil { + if err == io.EOF { + break + } + return nil, "", err + } + } + return nil, preamble.String(), nil +} + +func (p *parser) ParseGitFileHeader() (*File, error) { + const prefix = "diff --git " + + if !strings.HasPrefix(p.Line(0), prefix) { + return nil, nil + } + header := p.Line(0)[len(prefix):] + + defaultName, err := parseGitHeaderName(header) + if err != nil { + return nil, p.Errorf(0, "git file header: %v", err) + } + + f := &File{} + for { + end, err := parseGitHeaderData(f, p.Line(1), defaultName) + if err != nil { + return nil, p.Errorf(1, "git file header: %v", err) + } + + if err := p.Next(); err != nil { + if err == io.EOF { + break + } + return nil, err + } + + if end { + break + } + } + + if f.OldName == "" && f.NewName == "" { + if defaultName == "" { + return nil, p.Errorf(0, "git file header: missing filename information") + } + f.OldName = defaultName + f.NewName = defaultName + } + + if (f.NewName == "" && !f.IsDelete) || (f.OldName == "" && !f.IsNew) { + return nil, p.Errorf(0, "git file header: missing filename information") + } + + return f, nil +} + +func (p *parser) ParseTraditionalFileHeader() (*File, error) { + const shortestValidFragHeader = "@@ -1 +1 @@\n" + const ( + oldPrefix = "--- " + newPrefix = "+++ " + ) + + oldLine, newLine := p.Line(0), p.Line(1) + + if !strings.HasPrefix(oldLine, oldPrefix) || !strings.HasPrefix(newLine, newPrefix) { + return nil, nil + } + // heuristic: only a file header if followed by a (probable) fragment header + if len(p.Line(2)) < len(shortestValidFragHeader) || !strings.HasPrefix(p.Line(2), "@@ -") { + return nil, nil + } + + // advance past the first two lines so parser is after the header + // no EOF check needed because we know there are >=3 valid lines + if err := p.Next(); err != nil { + return nil, err + } + if err := p.Next(); err != nil { + return nil, err + } + + oldName, _, err := parseName(oldLine[len(oldPrefix):], '\t', 0) + if err != nil { + return nil, p.Errorf(0, "file header: %v", err) + } + + newName, _, err := parseName(newLine[len(newPrefix):], '\t', 0) + if err != nil { + return nil, p.Errorf(1, "file header: %v", err) + } + + f := &File{} + switch { + case oldName == devNull || hasEpochTimestamp(oldLine): + f.IsNew = true + f.NewName = newName + case newName == devNull || hasEpochTimestamp(newLine): + f.IsDelete = true + f.OldName = oldName + default: + // if old name is a prefix of new name, use that instead + // this avoids picking variants like "file.bak" or "file~" + if strings.HasPrefix(newName, oldName) { + f.OldName = oldName + f.NewName = oldName + } else { + f.OldName = newName + f.NewName = newName + } + } + + return f, nil +} + +// parseGitHeaderName extracts a default file name from the Git file header +// line. This is required for mode-only changes and creation/deletion of empty +// files. Other types of patch include the file name(s) in the header data. +// If the names in the header do not match because the patch is a rename, +// return an empty default name. +func parseGitHeaderName(header string) (string, error) { + header = strings.TrimSuffix(header, "\n") + if len(header) == 0 { + return "", nil + } + + var err error + var first, second string + + // there are 4 cases to account for: + // + // 1) unquoted unquoted + // 2) unquoted "quoted" + // 3) "quoted" unquoted + // 4) "quoted" "quoted" + // + quote := strings.IndexByte(header, '"') + switch { + case quote < 0: + // case 1 + first = header + + case quote > 0: + // case 2 + first = header[:quote-1] + if !isSpace(header[quote-1]) { + return "", fmt.Errorf("missing separator") + } + + second, _, err = parseQuotedName(header[quote:]) + if err != nil { + return "", err + } + + case quote == 0: + // case 3 or case 4 + var n int + first, n, err = parseQuotedName(header) + if err != nil { + return "", err + } + + // git accepts multiple spaces after a quoted name, but not after an + // unquoted name, since the name might end with one or more spaces + for n < len(header) && isSpace(header[n]) { + n++ + } + if n == len(header) { + return "", nil + } + + if header[n] == '"' { + second, _, err = parseQuotedName(header[n:]) + if err != nil { + return "", err + } + } else { + second = header[n:] + } + } + + first = trimTreePrefix(first, 1) + if second != "" { + if first == trimTreePrefix(second, 1) { + return first, nil + } + return "", nil + } + + // at this point, both names are unquoted (case 1) + // since names may contain spaces, we can't use a known separator + // instead, look for a split that produces two equal names + + for i := 0; i < len(first)-1; i++ { + if !isSpace(first[i]) { + continue + } + second = trimTreePrefix(first[i+1:], 1) + if name := first[:i]; name == second { + return name, nil + } + } + return "", nil +} + +// parseGitHeaderData parses a single line of metadata from a Git file header. +// It returns true when header parsing is complete; in that case, line was the +// first line of non-header content. +func parseGitHeaderData(f *File, line, defaultName string) (end bool, err error) { + if len(line) > 0 && line[len(line)-1] == '\n' { + line = line[:len(line)-1] + } + + for _, hdr := range []struct { + prefix string + end bool + parse func(*File, string, string) error + }{ + {"@@ -", true, nil}, + {"--- ", false, parseGitHeaderOldName}, + {"+++ ", false, parseGitHeaderNewName}, + {"old mode ", false, parseGitHeaderOldMode}, + {"new mode ", false, parseGitHeaderNewMode}, + {"deleted file mode ", false, parseGitHeaderDeletedMode}, + {"new file mode ", false, parseGitHeaderCreatedMode}, + {"copy from ", false, parseGitHeaderCopyFrom}, + {"copy to ", false, parseGitHeaderCopyTo}, + {"rename old ", false, parseGitHeaderRenameFrom}, + {"rename new ", false, parseGitHeaderRenameTo}, + {"rename from ", false, parseGitHeaderRenameFrom}, + {"rename to ", false, parseGitHeaderRenameTo}, + {"similarity index ", false, parseGitHeaderScore}, + {"dissimilarity index ", false, parseGitHeaderScore}, + {"index ", false, parseGitHeaderIndex}, + } { + if strings.HasPrefix(line, hdr.prefix) { + if hdr.parse != nil { + err = hdr.parse(f, line[len(hdr.prefix):], defaultName) + } + return hdr.end, err + } + } + + // unknown line indicates the end of the header + // this usually happens if the diff is empty + return true, nil +} + +func parseGitHeaderOldName(f *File, line, defaultName string) error { + name, _, err := parseName(line, '\t', 1) + if err != nil { + return err + } + if f.OldName == "" && !f.IsNew { + f.OldName = name + return nil + } + return verifyGitHeaderName(name, f.OldName, f.IsNew, "old") +} + +func parseGitHeaderNewName(f *File, line, defaultName string) error { + name, _, err := parseName(line, '\t', 1) + if err != nil { + return err + } + if f.NewName == "" && !f.IsDelete { + f.NewName = name + return nil + } + return verifyGitHeaderName(name, f.NewName, f.IsDelete, "new") +} + +func parseGitHeaderOldMode(f *File, line, defaultName string) (err error) { + f.OldMode, err = parseMode(strings.TrimSpace(line)) + return +} + +func parseGitHeaderNewMode(f *File, line, defaultName string) (err error) { + f.NewMode, err = parseMode(strings.TrimSpace(line)) + return +} + +func parseGitHeaderDeletedMode(f *File, line, defaultName string) error { + f.IsDelete = true + f.OldName = defaultName + return parseGitHeaderOldMode(f, line, defaultName) +} + +func parseGitHeaderCreatedMode(f *File, line, defaultName string) error { + f.IsNew = true + f.NewName = defaultName + return parseGitHeaderNewMode(f, line, defaultName) +} + +func parseGitHeaderCopyFrom(f *File, line, defaultName string) (err error) { + f.IsCopy = true + f.OldName, _, err = parseName(line, 0, 0) + return +} + +func parseGitHeaderCopyTo(f *File, line, defaultName string) (err error) { + f.IsCopy = true + f.NewName, _, err = parseName(line, 0, 0) + return +} + +func parseGitHeaderRenameFrom(f *File, line, defaultName string) (err error) { + f.IsRename = true + f.OldName, _, err = parseName(line, 0, 0) + return +} + +func parseGitHeaderRenameTo(f *File, line, defaultName string) (err error) { + f.IsRename = true + f.NewName, _, err = parseName(line, 0, 0) + return +} + +func parseGitHeaderScore(f *File, line, defaultName string) error { + score, err := strconv.ParseInt(strings.TrimSuffix(line, "%"), 10, 32) + if err != nil { + nerr := err.(*strconv.NumError) + return fmt.Errorf("invalid score line: %v", nerr.Err) + } + if score <= 100 { + f.Score = int(score) + } + return nil +} + +func parseGitHeaderIndex(f *File, line, defaultName string) error { + const sep = ".." + + // note that git stops parsing if the OIDs are too long to be valid + // checking this requires knowing if the repository uses SHA1 or SHA256 + // hashes, which we don't know, so we just skip that check + + parts := strings.SplitN(line, " ", 2) + oids := strings.SplitN(parts[0], sep, 2) + + if len(oids) < 2 { + return fmt.Errorf("invalid index line: missing %q", sep) + } + f.OldOIDPrefix, f.NewOIDPrefix = oids[0], oids[1] + + if len(parts) > 1 { + return parseGitHeaderOldMode(f, parts[1], defaultName) + } + return nil +} + +func parseMode(s string) (os.FileMode, error) { + mode, err := strconv.ParseInt(s, 8, 32) + if err != nil { + nerr := err.(*strconv.NumError) + return os.FileMode(0), fmt.Errorf("invalid mode line: %v", nerr.Err) + } + return os.FileMode(mode), nil +} + +// parseName extracts a file name from the start of a string and returns the +// name and the index of the first character after the name. If the name is +// unquoted and term is non-zero, parsing stops at the first occurrence of +// term. +// +// If the name is exactly "/dev/null", no further processing occurs. Otherwise, +// if dropPrefix is greater than zero, that number of prefix components +// separated by forward slashes are dropped from the name and any duplicate +// slashes are collapsed. +func parseName(s string, term byte, dropPrefix int) (name string, n int, err error) { + if len(s) > 0 && s[0] == '"' { + name, n, err = parseQuotedName(s) + } else { + name, n, err = parseUnquotedName(s, term) + } + if err != nil { + return "", 0, err + } + if name == devNull { + return name, n, nil + } + return cleanName(name, dropPrefix), n, nil +} + +func parseQuotedName(s string) (name string, n int, err error) { + for n = 1; n < len(s); n++ { + if s[n] == '"' && s[n-1] != '\\' { + n++ + break + } + } + if n == 2 { + return "", 0, fmt.Errorf("missing name") + } + if name, err = strconv.Unquote(s[:n]); err != nil { + return "", 0, err + } + return name, n, err +} + +func parseUnquotedName(s string, term byte) (name string, n int, err error) { + for n = 0; n < len(s); n++ { + if s[n] == '\n' { + break + } + if term > 0 && s[n] == term { + break + } + } + if n == 0 { + return "", 0, fmt.Errorf("missing name") + } + return s[:n], n, nil +} + +// verifyGitHeaderName checks a parsed name against state set by previous lines +func verifyGitHeaderName(parsed, existing string, isNull bool, side string) error { + if existing != "" { + if isNull { + return fmt.Errorf("expected %s, but filename is set to %s", devNull, existing) + } + if existing != parsed { + return fmt.Errorf("inconsistent %s filename", side) + } + } + if isNull && parsed != devNull { + return fmt.Errorf("expected %s", devNull) + } + return nil +} + +// cleanName removes double slashes and drops prefix segments. +func cleanName(name string, drop int) string { + var b strings.Builder + for i := 0; i < len(name); i++ { + if name[i] == '/' { + if i < len(name)-1 && name[i+1] == '/' { + continue + } + if drop > 0 { + drop-- + b.Reset() + continue + } + } + b.WriteByte(name[i]) + } + return b.String() +} + +// trimTreePrefix removes up to n leading directory components from name. +func trimTreePrefix(name string, n int) string { + i := 0 + for ; i < len(name) && n > 0; i++ { + if name[i] == '/' { + n-- + } + } + return name[i:] +} + +// hasEpochTimestamp returns true if the string ends with a POSIX-formatted +// timestamp for the UNIX epoch after a tab character. According to git, this +// is used by GNU diff to mark creations and deletions. +func hasEpochTimestamp(s string) bool { + const posixTimeLayout = "2006-01-02 15:04:05.9 -0700" + + start := strings.IndexRune(s, '\t') + if start < 0 { + return false + } + + ts := strings.TrimSuffix(s[start+1:], "\n") + + // a valid timestamp can have optional ':' in zone specifier + // remove that if it exists so we have a single format + if len(ts) >= 3 && ts[len(ts)-3] == ':' { + ts = ts[:len(ts)-3] + ts[len(ts)-2:] + } + + t, err := time.Parse(posixTimeLayout, ts) + if err != nil { + return false + } + if !t.Equal(time.Unix(0, 0)) { + return false + } + return true +} + +func isSpace(c byte) bool { + return c == ' ' || c == '\t' || c == '\n' +} |
