Skip to content

Commit

Permalink
feat: simplify git walking
Browse files Browse the repository at this point in the history
Signed-off-by: Brian McGee <[email protected]>
  • Loading branch information
brianmcgee committed Oct 12, 2024
1 parent 51cd11a commit d48855f
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 249 deletions.
62 changes: 0 additions & 62 deletions walk/filetree.go

This file was deleted.

31 changes: 0 additions & 31 deletions walk/filetree_test.go

This file was deleted.

232 changes: 79 additions & 153 deletions walk/git.go
Original file line number Diff line number Diff line change
@@ -1,200 +1,126 @@
package walk

import (
"bufio"
"context"
"fmt"
"github.com/charmbracelet/log"
"github.com/numtide/treefmt/stats"
"golang.org/x/sync/errgroup"
"io"
"io/fs"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"

"github.com/charmbracelet/log"
"github.com/go-git/go-git/v5"
"github.com/go-git/go-git/v5/plumbing/filemode"
"github.com/numtide/treefmt/stats"
"golang.org/x/sync/errgroup"
)

// GitReader holds the state used to enumerate files under root/path in a git
// repository.
//
// NOTE(review): this listing comes from a rendered diff in which the removed
// and added field lines are interleaved without +/- markers; that is why root,
// path, log, stats and eg each appear twice below.
type GitReader struct {
root string
path string
stats *stats.Stats
batchSize int
root string
path string
// args are the arguments handed to the git executable when the listing
// command is created.
args []string

log *log.Logger
repo *git.Repository
log *log.Logger
stats *stats.Stats

// filesCh carries *File values; it is closed by a deferred function and
// received from in a select case elsewhere in this listing.
filesCh chan *File

eg *errgroup.Group
eg *errgroup.Group
// scanner reads the git command output line by line; it is nil until the
// command has been started.
scanner *bufio.Scanner
}

// NOTE(review): the lines below come from a rendered diff. The header and body
// of the process method and the header and body of Read are interleaved
// without +/- markers, so this span does not parse as a single Go function.
// The comments added here describe individual statements only.
func (g *GitReader) process() error {
func (g *GitReader) Read(ctx context.Context, files []*File) (n int, err error) {
// ensure we record how many files we traversed
defer func() {
close(g.filesCh)
g.stats.Add(stats.Traversed, int32(n))
}()

gitIndex, err := g.repo.Storer.Index()
if err != nil {
return fmt.Errorf("failed to open git index: %w", err)
}

// if we need to walk a path that is not the root of the repository, we will read the directory structure of the
// git index into memory for faster lookups
var idxCache *filetree

path := filepath.Clean(filepath.Join(g.root, g.path))
if !strings.HasPrefix(path, g.root) {
return fmt.Errorf("path '%s' is outside of the root '%s'", path, g.root)
}

switch path {

case g.root:

// we can just iterate the index entries
for _, entry := range gitIndex.Entries {

// we only want regular files, not directories or symlinks
if entry.Mode == filemode.Dir || entry.Mode == filemode.Symlink {
continue
}

// stat the file
path := filepath.Join(g.root, entry.Name)

info, err := os.Lstat(path)
if os.IsNotExist(err) {
// the underlying file might have been removed without the change being staged yet
g.log.Warnf("Path %s is in the index but appears to have been removed from the filesystem", path)
continue
} else if err != nil {
return fmt.Errorf("failed to stat %s: %w", path, err)
}

// determine a relative path
relPath, err := filepath.Rel(g.root, path)
if err != nil {
return fmt.Errorf("failed to determine a relative path for %s: %w", path, err)
}

file := File{
Path: path,
RelPath: relPath,
Info: info,
}

g.stats.Add(stats.Traversed, 1)
g.filesCh <- &file
}

default:

// read the git index into memory if it hasn't already
if idxCache == nil {
idxCache = &filetree{name: ""}
idxCache.readIndex(gitIndex)
}

// git index entries are relative to the repository root, so we need to determine a relative path for the
// one we are currently processing before checking if it exists within the git index
relPath, err := filepath.Rel(g.root, path)
if err != nil {
return fmt.Errorf("failed to find root relative path for %v: %w", path, err)
}

if !idxCache.hasPath(relPath) {
log.Debugf("path %s not found in git index, skipping", relPath)
return nil
}

err = filepath.Walk(path, func(path string, info fs.FileInfo, _ error) error {
// skip directories
if info.IsDir() {
return nil
}

// determine a path relative to g.root before checking presence in the git index
relPath, err := filepath.Rel(g.root, path)
if err != nil {
return fmt.Errorf("failed to determine a relative path for %s: %w", path, err)
}

if !idxCache.hasPath(relPath) {
log.Debugf("path %v not found in git index, skipping", relPath)
return nil
}
// the git command is started only while g.scanner is still nil; the scanner
// is assigned at the end of this setup
if g.scanner == nil {
// create a pipe to capture the command output
r, w := io.Pipe()

file := File{
Path: path,
RelPath: relPath,
Info: info,
}
// create a command which will execute from the specified sub path within root
cmd := exec.Command("git", g.args...)
cmd.Dir = filepath.Join(g.root, g.path)
cmd.Stdout = w

g.stats.Add(stats.Traversed, 1)
g.filesCh <- &file
return nil
// execute the command in the background
// closing the write end with the result of cmd.Run hands the command's
// error to the reading side (a nil error is seen as io.EOF).
// NOTE(review): the command is created with exec.Command, not
// exec.CommandContext, so ctx does not stop it; if its output is never
// fully consumed it may stay blocked writing to the pipe — confirm.
g.eg.Go(func() error {
return w.CloseWithError(cmd.Run())
})
if err != nil {
return fmt.Errorf("failed to walk %s: %w", path, err)
}
}

return nil
}

func (g *GitReader) Read(ctx context.Context, files []*File) (n int, err error) {
idx := 0
// create a new scanner for reading the output
g.scanner = bufio.NewScanner(r)
}

LOOP:
for idx < len(files) {

for n < len(files) {
select {

// exit early if the context was cancelled
case <-ctx.Done():
return 0, ctx.Err()
case file, ok := <-g.filesCh:
if !ok {
return n, ctx.Err()

default:
// read the next file
// NOTE(review): when Scan returns false the code below reports io.EOF;
// g.scanner.Err() is not consulted anywhere in the visible lines.
if g.scanner.Scan() {
// each scanned line is joined onto root/path to form the full path
path := filepath.Join(g.root, g.path, g.scanner.Text())

g.log.Debugf("processing file: %s", path)

info, err := os.Stat(path)
if os.IsNotExist(err) {
// the underlying file might have been removed
g.log.Warnf(
"Path %s is in the worktree but appears to have been removed from the filesystem", path,
)
continue
} else if err != nil {
return n, fmt.Errorf("failed to stat %s: %w", path, err)
}

// store the entry in the caller's slice and advance the count
files[n] = &File{
Path: path,
RelPath: filepath.Join(g.path, g.scanner.Text()),
Info: info,
}
n++

} else {
// nothing more to read
err = io.EOF
break LOOP
}
files[idx] = file
idx++
}
}

return idx, err
return n, err
}

// Close blocks until every goroutine started on g.eg has returned and yields
// the first non-nil error reported by any of them, or nil when there was none.
//
// NOTE(review): if a background command is still blocked writing output that
// nobody reads, this wait may not return — confirm against the callers.
func (g *GitReader) Close() error {
	waitErr := g.eg.Wait()
	return waitErr
}

// NOTE(review): the lines below come from a rendered diff. The NewGitReader and
// NewGitWorktreeReader headers and their bodies are interleaved without +/-
// markers, so this span does not parse as a single Go function. The comments
// added here describe individual statements only.
func NewGitReader(
func NewGitWorktreeReader(
root string,
path string,
statz *stats.Stats,
batchSize int,
) (*GitReader, error) {
repo, err := git.PlainOpen(root)
if err != nil {
return nil, fmt.Errorf("failed to open git repository: %w", err)
}

eg := &errgroup.Group{}

r := &GitReader{
root: root,
path: path,
stats: statz,
batchSize: batchSize,
log: log.WithPrefix("walk[git]"),
repo: repo,
filesCh: make(chan *File, batchSize*runtime.NumCPU()),
eg: eg,
}
// check if the root is a git repository
cmd := exec.Command("git", "rev-parse", "--is-inside-work-tree")
cmd.Dir = root

eg.Go(r.process)
// a failure to run the command and any output other than "true" (after
// stripping newlines) are both reported as errors
if out, err := cmd.Output(); err != nil {
return nil, fmt.Errorf("failed to check if git repository is inside work tree: %w", err)
} else if strings.Trim(string(out), "\n") != "true" {
return nil, fmt.Errorf("git repository is not inside work tree")
}

return r, nil
// the reader is configured with the "ls-files" git argument and a fresh
// errgroup; scanner is left unset here
return &GitReader{
root: root,
path: path,
args: []string{"ls-files"},
stats: statz,
eg: &errgroup.Group{},
log: log.WithPrefix("walk[git]"),
}, nil
}
4 changes: 2 additions & 2 deletions walk/git_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import (
"github.com/stretchr/testify/require"
)

func TestGitReader(t *testing.T) {
func TestGitWorktreeReader(t *testing.T) {
as := require.New(t)

tempDir := test.TempExamples(t)
Expand All @@ -40,7 +40,7 @@ func TestGitReader(t *testing.T) {

statz := stats.New()

reader, err := walk.NewGitReader(tempDir, "", &statz, 1024)
reader, err := walk.NewGitWorktreeReader(tempDir, "", &statz)
as.NoError(err)

count := 0
Expand Down
Loading

0 comments on commit d48855f

Please sign in to comment.