From eb76e5d3e8c1e53dc6ad95412d49f9786f4d1601 Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Fri, 27 Jan 2023 16:04:58 +0100 Subject: [PATCH] Move git.FileSet to libs/fileset and make it aware of gitignores (#184) This moves `git.FileSet` to `libs/fileset` and decouples it from the Git package. It is made aware of gitignore rules in parent directories up to the repository root as well as gitignore files in underlying directories through the `fileset.Ignorer` interface. The recursive directory walker is reimplemented with [filepath.WalkDir]. Follow up to #182. --- git/fileset.go | 146 ++++------------------------ git/fileset_test.go | 91 ++++------------- git/testdata/a/a.ignore | 0 git/testdata/a/b/b.ignore | 0 git/testdata/a/b/world.txt | 0 git/testdata/a/hello.txt | 0 git/testdata/databricks.yml | 0 git/testdata/ignorethis/ignored.txt | 0 git/view.go | 21 ++++ libs/fileset/file.go | 20 ++++ libs/fileset/fileset.go | 72 ++++++++++++++ libs/fileset/ignorer.go | 19 ++++ libs/sync/snapshot.go | 4 +- libs/sync/snapshot_test.go | 14 ++- libs/sync/sync.go | 10 +- project/project.go | 16 +-- 16 files changed, 196 insertions(+), 217 deletions(-) create mode 100644 git/testdata/a/a.ignore create mode 100644 git/testdata/a/b/b.ignore create mode 100644 git/testdata/a/b/world.txt create mode 100644 git/testdata/a/hello.txt create mode 100644 git/testdata/databricks.yml create mode 100644 git/testdata/ignorethis/ignored.txt create mode 100644 libs/fileset/file.go create mode 100644 libs/fileset/fileset.go create mode 100644 libs/fileset/ignorer.go diff --git a/git/fileset.go b/git/fileset.go index fa9d5dabd..98e81a7bf 100644 --- a/git/fileset.go +++ b/git/fileset.go @@ -1,82 +1,31 @@ package git import ( - "fmt" - "io/fs" "os" "path/filepath" - "strings" - "time" - ignore "github.com/sabhiram/go-gitignore" - "golang.org/x/exp/slices" + "github.com/databricks/bricks/libs/fileset" ) -type File struct { - fs.DirEntry - Absolute, Relative string -} - -func (f File) Modified() (ts time.Time) { - info, err := f.Info() - if err != nil { - // return default time, beginning of epoch - return ts - } - return info.ModTime() -} - -// FileSet facilitates fast recursive tracked file listing -// with respect to patterns defined in `.gitignore` file -// -// root: Root of the git repository -// ignore: List of patterns defined in `.gitignore`. -// -// We do not sync files that match this pattern -type FileSet struct { - root string - ignore *ignore.GitIgnore -} - // Retuns FileSet for the git repo located at `root` -func NewFileSet(root string) *FileSet { - syncIgnoreLines := append(getGitIgnoreLines(root), getSyncIgnoreLines()...) - return &FileSet{ - root: root, - ignore: ignore.CompileIgnoreLines(syncIgnoreLines...), +func NewFileSet(root string) (*fileset.FileSet, error) { + w := fileset.New(root) + v, err := NewView(root) + if err != nil { + return nil, err } -} - -func getGitIgnoreLines(root string) []string { - gitIgnoreLines := []string{} - gitIgnorePath := fmt.Sprintf("%s/.gitignore", root) - rawIgnore, err := os.ReadFile(gitIgnorePath) - - if err == nil { - // add entries from .gitignore if the file exists (did read correctly) - for _, line := range strings.Split(string(rawIgnore), "\n") { - // underlying library doesn't behave well with Rule 5 of .gitignore, - // hence this workaround - gitIgnoreLines = append(gitIgnoreLines, strings.Trim(line, "/")) - } - } - return gitIgnoreLines -} - -// This contains additional files/dirs to be ignored while syncing that are -// not mentioned in .gitignore -func getSyncIgnoreLines() []string { - return []string{".git"} + w.SetIgnorer(v) + return w, nil } // Only call this function for a bricks project root // since it will create a .gitignore file if missing -func (w *FileSet) EnsureValidGitIgnoreExists() error { - gitIgnoreLines := getGitIgnoreLines(w.root) - gitIgnorePath := fmt.Sprintf("%s/.gitignore", w.root) - if slices.Contains(gitIgnoreLines, ".databricks") { +func EnsureValidGitIgnoreExists(w *fileset.FileSet) error { + if w.Ignorer().IgnoreDirectory(".databricks") { return nil } + + gitIgnorePath := filepath.Join(w.Root(), ".gitignore") f, err := os.OpenFile(gitIgnorePath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0644) if err != nil { return err @@ -86,69 +35,12 @@ func (w *FileSet) EnsureValidGitIgnoreExists() error { if err != nil { return err } - gitIgnoreLines = append(gitIgnoreLines, ".databricks") - w.ignore = ignore.CompileIgnoreLines(append(gitIgnoreLines, getSyncIgnoreLines()...)...) + + // Reload view to update ignore rules. + v, err := NewView(w.Root()) + if err != nil { + return err + } + w.SetIgnorer(v) return nil } - -// Return root for fileset. -func (w *FileSet) Root() string { - return w.root -} - -// Return all tracked files for Repo -func (w *FileSet) All() ([]File, error) { - return w.RecursiveListFiles(w.root) -} - -func (w *FileSet) IsGitIgnored(pattern string) bool { - return w.ignore.MatchesPath(pattern) -} - -// Recursively traverses dir in a depth first manner and returns a list of all files -// that are being tracked in the FileSet (ie not being ignored for matching one of the -// patterns in w.ignore) -func (w *FileSet) RecursiveListFiles(dir string) (fileList []File, err error) { - queue, err := readDir(dir, w.root) - if err != nil { - return nil, err - } - for len(queue) > 0 { - current := queue[0] - queue = queue[1:] - if w.ignore.MatchesPath(current.Relative) { - continue - } - if !current.IsDir() { - fileList = append(fileList, current) - continue - } - children, err := readDir(current.Absolute, w.root) - if err != nil { - return nil, err - } - queue = append(queue, children...) - } - return fileList, nil -} - -func readDir(dir, root string) (queue []File, err error) { - f, err := os.Open(dir) - if err != nil { - return - } - defer f.Close() - dirs, err := f.ReadDir(-1) - if err != nil { - return - } - for _, v := range dirs { - absolute := filepath.Join(dir, v.Name()) - relative, err := filepath.Rel(root, absolute) - if err != nil { - return nil, err - } - queue = append(queue, File{v, absolute, relative}) - } - return -} diff --git a/git/fileset_test.go b/git/fileset_test.go index 5c43d6bf5..f8c21573e 100644 --- a/git/fileset_test.go +++ b/git/fileset_test.go @@ -1,91 +1,34 @@ package git import ( - "os" "path/filepath" - "runtime" - "strings" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) -func TestRecusiveListFile(t *testing.T) { - // Create .gitignore and ignore .gitignore and any files in - // ignored_dir - projectDir := t.TempDir() - f3, err := os.Create(filepath.Join(projectDir, ".gitignore")) - assert.NoError(t, err) - defer f3.Close() - f3.WriteString(".gitignore\nignored_dir") - - // create config file - f4, err := os.Create(filepath.Join(projectDir, "databricks.yml")) - assert.NoError(t, err) - defer f4.Close() - - // config file is returned - // .gitignore is not because we explictly ignore it in .gitignore - fileSet := NewFileSet(projectDir) - files, err := fileSet.RecursiveListFiles(projectDir) - assert.NoError(t, err) - assert.Len(t, files, 1) - assert.Equal(t, "databricks.yml", files[0].Relative) - - helloTxtRelativePath := filepath.Join("a/b/c", "hello.txt") - - // Check that newly added files not in .gitignore - // are being tracked - dir1 := filepath.Join(projectDir, "a", "b", "c") - dir2 := filepath.Join(projectDir, "ignored_dir", "e") - err = os.MkdirAll(dir2, 0o755) - assert.NoError(t, err) - err = os.MkdirAll(dir1, 0o755) - assert.NoError(t, err) - f1, err := os.Create(filepath.Join(projectDir, helloTxtRelativePath)) - assert.NoError(t, err) - defer f1.Close() - f2, err := os.Create(filepath.Join(projectDir, "ignored_dir/e/world.txt")) - assert.NoError(t, err) - defer f2.Close() - - files, err = fileSet.RecursiveListFiles(projectDir) - assert.NoError(t, err) - assert.Len(t, files, 2) - var fileNames []string - for _, v := range files { - fileNames = append(fileNames, v.Relative) - } - assert.Contains(t, fileNames, "databricks.yml") - assert.Contains(t, fileNames, helloTxtRelativePath) +func TestFileSetRecursiveListFiles(t *testing.T) { + fileSet, err := NewFileSet("./testdata") + require.NoError(t, err) + files, err := fileSet.RecursiveListFiles("./testdata") + require.NoError(t, err) + require.Len(t, files, 6) + assert.Equal(t, filepath.Join(".gitignore"), files[0].Relative) + assert.Equal(t, filepath.Join("a", ".gitignore"), files[1].Relative) + assert.Equal(t, filepath.Join("a", "b", ".gitignore"), files[2].Relative) + assert.Equal(t, filepath.Join("a", "b", "world.txt"), files[3].Relative) + assert.Equal(t, filepath.Join("a", "hello.txt"), files[4].Relative) + assert.Equal(t, filepath.Join("databricks.yml"), files[5].Relative) } func TestFileSetNonCleanRoot(t *testing.T) { // Test what happens if the root directory can be simplified. // Path simplification is done by most filepath functions. - - // remove this once equivalent tests for windows have been set up - // or this test has been fixed for windows - // date: 28 Nov 2022 - if runtime.GOOS == "windows" { - t.Skip("skipping temperorilty to make windows unit tests green") - } - - root := "./../git" - require.NotEqual(t, root, filepath.Clean(root)) - fs := NewFileSet(root) - files, err := fs.All() + // This should yield the same result as above test. + fileSet, err := NewFileSet("./testdata/../testdata") require.NoError(t, err) - - found := false - for _, f := range files { - if strings.Contains(f.Relative, "fileset_test") { - assert.Equal(t, "fileset_test.go", f.Relative) - assert.Equal(t, "../git/fileset_test.go", f.Absolute) - found = true - } - } - - assert.True(t, found) + files, err := fileSet.All() + require.NoError(t, err) + assert.Len(t, files, 6) } diff --git a/git/testdata/a/a.ignore b/git/testdata/a/a.ignore new file mode 100644 index 000000000..e69de29bb diff --git a/git/testdata/a/b/b.ignore b/git/testdata/a/b/b.ignore new file mode 100644 index 000000000..e69de29bb diff --git a/git/testdata/a/b/world.txt b/git/testdata/a/b/world.txt new file mode 100644 index 000000000..e69de29bb diff --git a/git/testdata/a/hello.txt b/git/testdata/a/hello.txt new file mode 100644 index 000000000..e69de29bb diff --git a/git/testdata/databricks.yml b/git/testdata/databricks.yml new file mode 100644 index 000000000..e69de29bb diff --git a/git/testdata/ignorethis/ignored.txt b/git/testdata/ignorethis/ignored.txt new file mode 100644 index 000000000..e69de29bb diff --git a/git/view.go b/git/view.go index 9a09806d6..1b81ac33c 100644 --- a/git/view.go +++ b/git/view.go @@ -38,6 +38,27 @@ func (v *View) Ignore(path string) bool { return v.repo.Ignore(filepath.Join(v.targetPath, path) + trailingSlash) } +// IgnoreFile returns if the gitignore rules in this fileset +// apply to the specified file path. +// +// This function is provided to implement [fileset.Ignorer]. +func (v *View) IgnoreFile(file string) bool { + return v.Ignore(file) +} + +// IgnoreDirectory returns if the gitignore rules in this fileset +// apply to the specified directory path. +// +// A gitignore rule may apply only to directories if it uses +// a trailing slash. Therefore this function checks the gitignore +// rules for the specified directory path first without and then +// with a trailing slash. +// +// This function is provided to implement [fileset.Ignorer]. +func (v *View) IgnoreDirectory(dir string) bool { + return v.Ignore(dir) || v.Ignore(dir+"/") +} + func NewView(path string) (*View, error) { path, err := filepath.Abs(path) if err != nil { diff --git a/libs/fileset/file.go b/libs/fileset/file.go new file mode 100644 index 000000000..6594de4ed --- /dev/null +++ b/libs/fileset/file.go @@ -0,0 +1,20 @@ +package fileset + +import ( + "io/fs" + "time" +) + +type File struct { + fs.DirEntry + Absolute, Relative string +} + +func (f File) Modified() (ts time.Time) { + info, err := f.Info() + if err != nil { + // return default time, beginning of epoch + return ts + } + return info.ModTime() +} diff --git a/libs/fileset/fileset.go b/libs/fileset/fileset.go new file mode 100644 index 000000000..98e6969d9 --- /dev/null +++ b/libs/fileset/fileset.go @@ -0,0 +1,72 @@ +package fileset + +import ( + "io/fs" + "path/filepath" +) + +// FileSet facilitates fast recursive file listing of a path. +// It optionally takes into account ignore rules through the [Ignorer] interface. +type FileSet struct { + root string + ignore Ignorer +} + +// New returns a [FileSet] for the given root path. +func New(root string) *FileSet { + return &FileSet{ + root: filepath.Clean(root), + ignore: nopIgnorer{}, + } +} + +// Ignorer returns the [FileSet]'s current ignorer. +func (w *FileSet) Ignorer() Ignorer { + return w.ignore +} + +// SetIgnorer sets the [Ignorer] interface for this [FileSet]. +func (w *FileSet) SetIgnorer(ignore Ignorer) { + w.ignore = ignore +} + +// Return root for fileset. +func (w *FileSet) Root() string { + return w.root +} + +// Return all tracked files for Repo +func (w *FileSet) All() ([]File, error) { + return w.RecursiveListFiles(w.root) +} + +// Recursively traverses dir in a depth first manner and returns a list of all files +// that are being tracked in the FileSet (ie not being ignored for matching one of the +// patterns in w.ignore) +func (w *FileSet) RecursiveListFiles(dir string) (fileList []File, err error) { + err = filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + + relPath, err := filepath.Rel(w.root, path) + if err != nil { + return err + } + + if d.IsDir() { + if w.ignore.IgnoreDirectory(relPath) { + return filepath.SkipDir + } + return nil + } + + if w.ignore.IgnoreFile(relPath) { + return nil + } + + fileList = append(fileList, File{d, path, relPath}) + return nil + }) + return +} diff --git a/libs/fileset/ignorer.go b/libs/fileset/ignorer.go new file mode 100644 index 000000000..611ce6071 --- /dev/null +++ b/libs/fileset/ignorer.go @@ -0,0 +1,19 @@ +package fileset + +// Ignorer is the interface for what determines if a path +// in the [FileSet] must be ignored or not. +type Ignorer interface { + IgnoreFile(path string) bool + IgnoreDirectory(path string) bool +} + +// nopIgnorer implements an [Ignorer] that doesn't ignore anything. +type nopIgnorer struct{} + +func (nopIgnorer) IgnoreFile(path string) bool { + return false +} + +func (nopIgnorer) IgnoreDirectory(path string) bool { + return false +} diff --git a/libs/sync/snapshot.go b/libs/sync/snapshot.go index 205070538..d77e41d23 100644 --- a/libs/sync/snapshot.go +++ b/libs/sync/snapshot.go @@ -15,7 +15,7 @@ import ( "crypto/md5" "encoding/hex" - "github.com/databricks/bricks/git" + "github.com/databricks/bricks/libs/fileset" ) // Bump it up every time a potentially breaking change is made to the snapshot schema @@ -191,7 +191,7 @@ func getNotebookDetails(path string) (isNotebook bool, typeOfNotebook string, er return false, "", nil } -func (s *Snapshot) diff(all []git.File) (change diff, err error) { +func (s *Snapshot) diff(all []fileset.File) (change diff, err error) { currentFilenames := map[string]bool{} lastModifiedTimes := s.LastUpdatedTimes remoteToLocalNames := s.RemoteToLocalNames diff --git a/libs/sync/snapshot_test.go b/libs/sync/snapshot_test.go index f02fde043..a5779ca36 100644 --- a/libs/sync/snapshot_test.go +++ b/libs/sync/snapshot_test.go @@ -26,7 +26,8 @@ func assertKeysOfMap(t *testing.T, m map[string]time.Time, expectedKeys []string func TestDiff(t *testing.T) { // Create temp project dir projectDir := t.TempDir() - fileSet := git.NewFileSet(projectDir) + fileSet, err := git.NewFileSet(projectDir) + require.NoError(t, err) state := Snapshot{ LastUpdatedTimes: make(map[string]time.Time), LocalToRemoteNames: make(map[string]string), @@ -85,14 +86,15 @@ func TestDiff(t *testing.T) { func TestFolderDiff(t *testing.T) { // Create temp project dir projectDir := t.TempDir() - fileSet := git.NewFileSet(projectDir) + fileSet, err := git.NewFileSet(projectDir) + require.NoError(t, err) state := Snapshot{ LastUpdatedTimes: make(map[string]time.Time), LocalToRemoteNames: make(map[string]string), RemoteToLocalNames: make(map[string]string), } - err := os.Mkdir(filepath.Join(projectDir, "foo"), os.ModePerm) + err = os.Mkdir(filepath.Join(projectDir, "foo"), os.ModePerm) assert.NoError(t, err) f1 := testfile.CreateFile(t, filepath.Join(projectDir, "foo", "bar.py")) defer f1.Close(t) @@ -119,7 +121,8 @@ func TestFolderDiff(t *testing.T) { func TestPythonNotebookDiff(t *testing.T) { // Create temp project dir projectDir := t.TempDir() - fileSet := git.NewFileSet(projectDir) + fileSet, err := git.NewFileSet(projectDir) + require.NoError(t, err) state := Snapshot{ LastUpdatedTimes: make(map[string]time.Time), LocalToRemoteNames: make(map[string]string), @@ -189,7 +192,8 @@ func TestPythonNotebookDiff(t *testing.T) { func TestErrorWhenIdenticalRemoteName(t *testing.T) { // Create temp project dir projectDir := t.TempDir() - fileSet := git.NewFileSet(projectDir) + fileSet, err := git.NewFileSet(projectDir) + require.NoError(t, err) state := Snapshot{ LastUpdatedTimes: make(map[string]time.Time), LocalToRemoteNames: make(map[string]string), diff --git a/libs/sync/sync.go b/libs/sync/sync.go index f11983652..bec76888e 100644 --- a/libs/sync/sync.go +++ b/libs/sync/sync.go @@ -8,6 +8,7 @@ import ( "time" "github.com/databricks/bricks/git" + "github.com/databricks/bricks/libs/fileset" "github.com/databricks/bricks/libs/sync/repofiles" "github.com/databricks/databricks-sdk-go" ) @@ -30,15 +31,18 @@ type SyncOptions struct { type Sync struct { *SyncOptions - fileSet *git.FileSet + fileSet *fileset.FileSet snapshot *Snapshot repoFiles *repofiles.RepoFiles } // New initializes and returns a new [Sync] instance. func New(ctx context.Context, opts SyncOptions) (*Sync, error) { - fileSet := git.NewFileSet(opts.LocalPath) - err := fileSet.EnsureValidGitIgnoreExists() + fileSet, err := git.NewFileSet(opts.LocalPath) + if err != nil { + return nil, err + } + err = git.EnsureValidGitIgnoreExists(fileSet) if err != nil { return nil, err } diff --git a/project/project.go b/project/project.go index 00b939350..54095bb57 100644 --- a/project/project.go +++ b/project/project.go @@ -8,6 +8,7 @@ import ( "sync" "github.com/databricks/bricks/git" + "github.com/databricks/bricks/libs/fileset" "github.com/databricks/databricks-sdk-go" "github.com/databricks/databricks-sdk-go/service/commands" "github.com/databricks/databricks-sdk-go/service/scim" @@ -26,7 +27,7 @@ type project struct { environment *Environment wsc *databricks.WorkspaceClient me *scim.User - fileSet *git.FileSet + fileSet *fileset.FileSet } // Configure is used as a PreRunE function for all commands that @@ -65,10 +66,13 @@ func Initialize(ctx context.Context, root, env string) (context.Context, error) return nil, fmt.Errorf("environment [%s] not defined", env) } - fileSet := git.NewFileSet(root) - err = fileSet.EnsureValidGitIgnoreExists() + fileSet, err := git.NewFileSet(root) if err != nil { - return ctx, nil + return nil, err + } + err = git.EnsureValidGitIgnoreExists(fileSet) + if err != nil { + return nil, err } p := project{ @@ -116,7 +120,7 @@ func (p *project) Root() string { return p.root } -func (p *project) GetFileSet() *git.FileSet { +func (p *project) GetFileSet() *fileset.FileSet { return p.fileSet } @@ -129,7 +133,7 @@ func (p *project) GetFileSet() *git.FileSet { // accidentally check into git func (p *project) CacheDir() (string, error) { // assert cache dir is present in git ignore - if !p.fileSet.IsGitIgnored(fmt.Sprintf("/%s/", CacheDirName)) { + if !p.fileSet.Ignorer().IgnoreDirectory(fmt.Sprintf("/%s/", CacheDirName)) { return "", fmt.Errorf("please add /%s/ to .gitignore", CacheDirName) }