diff --git a/git/fileset.go b/git/fileset.go index fa9d5dab..98e81a7b 100644 --- a/git/fileset.go +++ b/git/fileset.go @@ -1,82 +1,31 @@ package git import ( - "fmt" - "io/fs" "os" "path/filepath" - "strings" - "time" - ignore "github.com/sabhiram/go-gitignore" - "golang.org/x/exp/slices" + "github.com/databricks/bricks/libs/fileset" ) -type File struct { - fs.DirEntry - Absolute, Relative string -} - -func (f File) Modified() (ts time.Time) { - info, err := f.Info() - if err != nil { - // return default time, beginning of epoch - return ts - } - return info.ModTime() -} - -// FileSet facilitates fast recursive tracked file listing -// with respect to patterns defined in `.gitignore` file -// -// root: Root of the git repository -// ignore: List of patterns defined in `.gitignore`. -// -// We do not sync files that match this pattern -type FileSet struct { - root string - ignore *ignore.GitIgnore -} - // Retuns FileSet for the git repo located at `root` -func NewFileSet(root string) *FileSet { - syncIgnoreLines := append(getGitIgnoreLines(root), getSyncIgnoreLines()...) - return &FileSet{ - root: root, - ignore: ignore.CompileIgnoreLines(syncIgnoreLines...), +func NewFileSet(root string) (*fileset.FileSet, error) { + w := fileset.New(root) + v, err := NewView(root) + if err != nil { + return nil, err } -} - -func getGitIgnoreLines(root string) []string { - gitIgnoreLines := []string{} - gitIgnorePath := fmt.Sprintf("%s/.gitignore", root) - rawIgnore, err := os.ReadFile(gitIgnorePath) - - if err == nil { - // add entries from .gitignore if the file exists (did read correctly) - for _, line := range strings.Split(string(rawIgnore), "\n") { - // underlying library doesn't behave well with Rule 5 of .gitignore, - // hence this workaround - gitIgnoreLines = append(gitIgnoreLines, strings.Trim(line, "/")) - } - } - return gitIgnoreLines -} - -// This contains additional files/dirs to be ignored while syncing that are -// not mentioned in .gitignore -func getSyncIgnoreLines() []string { - return []string{".git"} + w.SetIgnorer(v) + return w, nil } // Only call this function for a bricks project root // since it will create a .gitignore file if missing -func (w *FileSet) EnsureValidGitIgnoreExists() error { - gitIgnoreLines := getGitIgnoreLines(w.root) - gitIgnorePath := fmt.Sprintf("%s/.gitignore", w.root) - if slices.Contains(gitIgnoreLines, ".databricks") { +func EnsureValidGitIgnoreExists(w *fileset.FileSet) error { + if w.Ignorer().IgnoreDirectory(".databricks") { return nil } + + gitIgnorePath := filepath.Join(w.Root(), ".gitignore") f, err := os.OpenFile(gitIgnorePath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0644) if err != nil { return err @@ -86,69 +35,12 @@ func (w *FileSet) EnsureValidGitIgnoreExists() error { if err != nil { return err } - gitIgnoreLines = append(gitIgnoreLines, ".databricks") - w.ignore = ignore.CompileIgnoreLines(append(gitIgnoreLines, getSyncIgnoreLines()...)...) + + // Reload view to update ignore rules. + v, err := NewView(w.Root()) + if err != nil { + return err + } + w.SetIgnorer(v) return nil } - -// Return root for fileset. -func (w *FileSet) Root() string { - return w.root -} - -// Return all tracked files for Repo -func (w *FileSet) All() ([]File, error) { - return w.RecursiveListFiles(w.root) -} - -func (w *FileSet) IsGitIgnored(pattern string) bool { - return w.ignore.MatchesPath(pattern) -} - -// Recursively traverses dir in a depth first manner and returns a list of all files -// that are being tracked in the FileSet (ie not being ignored for matching one of the -// patterns in w.ignore) -func (w *FileSet) RecursiveListFiles(dir string) (fileList []File, err error) { - queue, err := readDir(dir, w.root) - if err != nil { - return nil, err - } - for len(queue) > 0 { - current := queue[0] - queue = queue[1:] - if w.ignore.MatchesPath(current.Relative) { - continue - } - if !current.IsDir() { - fileList = append(fileList, current) - continue - } - children, err := readDir(current.Absolute, w.root) - if err != nil { - return nil, err - } - queue = append(queue, children...) - } - return fileList, nil -} - -func readDir(dir, root string) (queue []File, err error) { - f, err := os.Open(dir) - if err != nil { - return - } - defer f.Close() - dirs, err := f.ReadDir(-1) - if err != nil { - return - } - for _, v := range dirs { - absolute := filepath.Join(dir, v.Name()) - relative, err := filepath.Rel(root, absolute) - if err != nil { - return nil, err - } - queue = append(queue, File{v, absolute, relative}) - } - return -} diff --git a/git/fileset_test.go b/git/fileset_test.go index 5c43d6bf..f8c21573 100644 --- a/git/fileset_test.go +++ b/git/fileset_test.go @@ -1,91 +1,34 @@ package git import ( - "os" "path/filepath" - "runtime" - "strings" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) -func TestRecusiveListFile(t *testing.T) { - // Create .gitignore and ignore .gitignore and any files in - // ignored_dir - projectDir := t.TempDir() - f3, err := os.Create(filepath.Join(projectDir, ".gitignore")) - assert.NoError(t, err) - defer f3.Close() - f3.WriteString(".gitignore\nignored_dir") - - // create config file - f4, err := os.Create(filepath.Join(projectDir, "databricks.yml")) - assert.NoError(t, err) - defer f4.Close() - - // config file is returned - // .gitignore is not because we explictly ignore it in .gitignore - fileSet := NewFileSet(projectDir) - files, err := fileSet.RecursiveListFiles(projectDir) - assert.NoError(t, err) - assert.Len(t, files, 1) - assert.Equal(t, "databricks.yml", files[0].Relative) - - helloTxtRelativePath := filepath.Join("a/b/c", "hello.txt") - - // Check that newly added files not in .gitignore - // are being tracked - dir1 := filepath.Join(projectDir, "a", "b", "c") - dir2 := filepath.Join(projectDir, "ignored_dir", "e") - err = os.MkdirAll(dir2, 0o755) - assert.NoError(t, err) - err = os.MkdirAll(dir1, 0o755) - assert.NoError(t, err) - f1, err := os.Create(filepath.Join(projectDir, helloTxtRelativePath)) - assert.NoError(t, err) - defer f1.Close() - f2, err := os.Create(filepath.Join(projectDir, "ignored_dir/e/world.txt")) - assert.NoError(t, err) - defer f2.Close() - - files, err = fileSet.RecursiveListFiles(projectDir) - assert.NoError(t, err) - assert.Len(t, files, 2) - var fileNames []string - for _, v := range files { - fileNames = append(fileNames, v.Relative) - } - assert.Contains(t, fileNames, "databricks.yml") - assert.Contains(t, fileNames, helloTxtRelativePath) +func TestFileSetRecursiveListFiles(t *testing.T) { + fileSet, err := NewFileSet("./testdata") + require.NoError(t, err) + files, err := fileSet.RecursiveListFiles("./testdata") + require.NoError(t, err) + require.Len(t, files, 6) + assert.Equal(t, filepath.Join(".gitignore"), files[0].Relative) + assert.Equal(t, filepath.Join("a", ".gitignore"), files[1].Relative) + assert.Equal(t, filepath.Join("a", "b", ".gitignore"), files[2].Relative) + assert.Equal(t, filepath.Join("a", "b", "world.txt"), files[3].Relative) + assert.Equal(t, filepath.Join("a", "hello.txt"), files[4].Relative) + assert.Equal(t, filepath.Join("databricks.yml"), files[5].Relative) } func TestFileSetNonCleanRoot(t *testing.T) { // Test what happens if the root directory can be simplified. // Path simplification is done by most filepath functions. - - // remove this once equivalent tests for windows have been set up - // or this test has been fixed for windows - // date: 28 Nov 2022 - if runtime.GOOS == "windows" { - t.Skip("skipping temperorilty to make windows unit tests green") - } - - root := "./../git" - require.NotEqual(t, root, filepath.Clean(root)) - fs := NewFileSet(root) - files, err := fs.All() + // This should yield the same result as above test. + fileSet, err := NewFileSet("./testdata/../testdata") require.NoError(t, err) - - found := false - for _, f := range files { - if strings.Contains(f.Relative, "fileset_test") { - assert.Equal(t, "fileset_test.go", f.Relative) - assert.Equal(t, "../git/fileset_test.go", f.Absolute) - found = true - } - } - - assert.True(t, found) + files, err := fileSet.All() + require.NoError(t, err) + assert.Len(t, files, 6) } diff --git a/git/testdata/a/a.ignore b/git/testdata/a/a.ignore new file mode 100644 index 00000000..e69de29b diff --git a/git/testdata/a/b/b.ignore b/git/testdata/a/b/b.ignore new file mode 100644 index 00000000..e69de29b diff --git a/git/testdata/a/b/world.txt b/git/testdata/a/b/world.txt new file mode 100644 index 00000000..e69de29b diff --git a/git/testdata/a/hello.txt b/git/testdata/a/hello.txt new file mode 100644 index 00000000..e69de29b diff --git a/git/testdata/databricks.yml b/git/testdata/databricks.yml new file mode 100644 index 00000000..e69de29b diff --git a/git/testdata/ignorethis/ignored.txt b/git/testdata/ignorethis/ignored.txt new file mode 100644 index 00000000..e69de29b diff --git a/git/view.go b/git/view.go index 9a09806d..1b81ac33 100644 --- a/git/view.go +++ b/git/view.go @@ -38,6 +38,27 @@ func (v *View) Ignore(path string) bool { return v.repo.Ignore(filepath.Join(v.targetPath, path) + trailingSlash) } +// IgnoreFile returns if the gitignore rules in this fileset +// apply to the specified file path. +// +// This function is provided to implement [fileset.Ignorer]. +func (v *View) IgnoreFile(file string) bool { + return v.Ignore(file) +} + +// IgnoreDirectory returns if the gitignore rules in this fileset +// apply to the specified directory path. +// +// A gitignore rule may apply only to directories if it uses +// a trailing slash. Therefore this function checks the gitignore +// rules for the specified directory path first without and then +// with a trailing slash. +// +// This function is provided to implement [fileset.Ignorer]. +func (v *View) IgnoreDirectory(dir string) bool { + return v.Ignore(dir) || v.Ignore(dir+"/") +} + func NewView(path string) (*View, error) { path, err := filepath.Abs(path) if err != nil { diff --git a/libs/fileset/file.go b/libs/fileset/file.go new file mode 100644 index 00000000..6594de4e --- /dev/null +++ b/libs/fileset/file.go @@ -0,0 +1,20 @@ +package fileset + +import ( + "io/fs" + "time" +) + +type File struct { + fs.DirEntry + Absolute, Relative string +} + +func (f File) Modified() (ts time.Time) { + info, err := f.Info() + if err != nil { + // return default time, beginning of epoch + return ts + } + return info.ModTime() +} diff --git a/libs/fileset/fileset.go b/libs/fileset/fileset.go new file mode 100644 index 00000000..98e6969d --- /dev/null +++ b/libs/fileset/fileset.go @@ -0,0 +1,72 @@ +package fileset + +import ( + "io/fs" + "path/filepath" +) + +// FileSet facilitates fast recursive file listing of a path. +// It optionally takes into account ignore rules through the [Ignorer] interface. +type FileSet struct { + root string + ignore Ignorer +} + +// New returns a [FileSet] for the given root path. +func New(root string) *FileSet { + return &FileSet{ + root: filepath.Clean(root), + ignore: nopIgnorer{}, + } +} + +// Ignorer returns the [FileSet]'s current ignorer. +func (w *FileSet) Ignorer() Ignorer { + return w.ignore +} + +// SetIgnorer sets the [Ignorer] interface for this [FileSet]. +func (w *FileSet) SetIgnorer(ignore Ignorer) { + w.ignore = ignore +} + +// Return root for fileset. +func (w *FileSet) Root() string { + return w.root +} + +// Return all tracked files for Repo +func (w *FileSet) All() ([]File, error) { + return w.RecursiveListFiles(w.root) +} + +// Recursively traverses dir in a depth first manner and returns a list of all files +// that are being tracked in the FileSet (ie not being ignored for matching one of the +// patterns in w.ignore) +func (w *FileSet) RecursiveListFiles(dir string) (fileList []File, err error) { + err = filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + + relPath, err := filepath.Rel(w.root, path) + if err != nil { + return err + } + + if d.IsDir() { + if w.ignore.IgnoreDirectory(relPath) { + return filepath.SkipDir + } + return nil + } + + if w.ignore.IgnoreFile(relPath) { + return nil + } + + fileList = append(fileList, File{d, path, relPath}) + return nil + }) + return +} diff --git a/libs/fileset/ignorer.go b/libs/fileset/ignorer.go new file mode 100644 index 00000000..611ce607 --- /dev/null +++ b/libs/fileset/ignorer.go @@ -0,0 +1,19 @@ +package fileset + +// Ignorer is the interface for what determines if a path +// in the [FileSet] must be ignored or not. +type Ignorer interface { + IgnoreFile(path string) bool + IgnoreDirectory(path string) bool +} + +// nopIgnorer implements an [Ignorer] that doesn't ignore anything. +type nopIgnorer struct{} + +func (nopIgnorer) IgnoreFile(path string) bool { + return false +} + +func (nopIgnorer) IgnoreDirectory(path string) bool { + return false +} diff --git a/libs/sync/snapshot.go b/libs/sync/snapshot.go index 20507053..d77e41d2 100644 --- a/libs/sync/snapshot.go +++ b/libs/sync/snapshot.go @@ -15,7 +15,7 @@ import ( "crypto/md5" "encoding/hex" - "github.com/databricks/bricks/git" + "github.com/databricks/bricks/libs/fileset" ) // Bump it up every time a potentially breaking change is made to the snapshot schema @@ -191,7 +191,7 @@ func getNotebookDetails(path string) (isNotebook bool, typeOfNotebook string, er return false, "", nil } -func (s *Snapshot) diff(all []git.File) (change diff, err error) { +func (s *Snapshot) diff(all []fileset.File) (change diff, err error) { currentFilenames := map[string]bool{} lastModifiedTimes := s.LastUpdatedTimes remoteToLocalNames := s.RemoteToLocalNames diff --git a/libs/sync/snapshot_test.go b/libs/sync/snapshot_test.go index f02fde04..a5779ca3 100644 --- a/libs/sync/snapshot_test.go +++ b/libs/sync/snapshot_test.go @@ -26,7 +26,8 @@ func assertKeysOfMap(t *testing.T, m map[string]time.Time, expectedKeys []string func TestDiff(t *testing.T) { // Create temp project dir projectDir := t.TempDir() - fileSet := git.NewFileSet(projectDir) + fileSet, err := git.NewFileSet(projectDir) + require.NoError(t, err) state := Snapshot{ LastUpdatedTimes: make(map[string]time.Time), LocalToRemoteNames: make(map[string]string), @@ -85,14 +86,15 @@ func TestDiff(t *testing.T) { func TestFolderDiff(t *testing.T) { // Create temp project dir projectDir := t.TempDir() - fileSet := git.NewFileSet(projectDir) + fileSet, err := git.NewFileSet(projectDir) + require.NoError(t, err) state := Snapshot{ LastUpdatedTimes: make(map[string]time.Time), LocalToRemoteNames: make(map[string]string), RemoteToLocalNames: make(map[string]string), } - err := os.Mkdir(filepath.Join(projectDir, "foo"), os.ModePerm) + err = os.Mkdir(filepath.Join(projectDir, "foo"), os.ModePerm) assert.NoError(t, err) f1 := testfile.CreateFile(t, filepath.Join(projectDir, "foo", "bar.py")) defer f1.Close(t) @@ -119,7 +121,8 @@ func TestFolderDiff(t *testing.T) { func TestPythonNotebookDiff(t *testing.T) { // Create temp project dir projectDir := t.TempDir() - fileSet := git.NewFileSet(projectDir) + fileSet, err := git.NewFileSet(projectDir) + require.NoError(t, err) state := Snapshot{ LastUpdatedTimes: make(map[string]time.Time), LocalToRemoteNames: make(map[string]string), @@ -189,7 +192,8 @@ func TestPythonNotebookDiff(t *testing.T) { func TestErrorWhenIdenticalRemoteName(t *testing.T) { // Create temp project dir projectDir := t.TempDir() - fileSet := git.NewFileSet(projectDir) + fileSet, err := git.NewFileSet(projectDir) + require.NoError(t, err) state := Snapshot{ LastUpdatedTimes: make(map[string]time.Time), LocalToRemoteNames: make(map[string]string), diff --git a/libs/sync/sync.go b/libs/sync/sync.go index f1198365..bec76888 100644 --- a/libs/sync/sync.go +++ b/libs/sync/sync.go @@ -8,6 +8,7 @@ import ( "time" "github.com/databricks/bricks/git" + "github.com/databricks/bricks/libs/fileset" "github.com/databricks/bricks/libs/sync/repofiles" "github.com/databricks/databricks-sdk-go" ) @@ -30,15 +31,18 @@ type SyncOptions struct { type Sync struct { *SyncOptions - fileSet *git.FileSet + fileSet *fileset.FileSet snapshot *Snapshot repoFiles *repofiles.RepoFiles } // New initializes and returns a new [Sync] instance. func New(ctx context.Context, opts SyncOptions) (*Sync, error) { - fileSet := git.NewFileSet(opts.LocalPath) - err := fileSet.EnsureValidGitIgnoreExists() + fileSet, err := git.NewFileSet(opts.LocalPath) + if err != nil { + return nil, err + } + err = git.EnsureValidGitIgnoreExists(fileSet) if err != nil { return nil, err } diff --git a/project/project.go b/project/project.go index 00b93935..54095bb5 100644 --- a/project/project.go +++ b/project/project.go @@ -8,6 +8,7 @@ import ( "sync" "github.com/databricks/bricks/git" + "github.com/databricks/bricks/libs/fileset" "github.com/databricks/databricks-sdk-go" "github.com/databricks/databricks-sdk-go/service/commands" "github.com/databricks/databricks-sdk-go/service/scim" @@ -26,7 +27,7 @@ type project struct { environment *Environment wsc *databricks.WorkspaceClient me *scim.User - fileSet *git.FileSet + fileSet *fileset.FileSet } // Configure is used as a PreRunE function for all commands that @@ -65,10 +66,13 @@ func Initialize(ctx context.Context, root, env string) (context.Context, error) return nil, fmt.Errorf("environment [%s] not defined", env) } - fileSet := git.NewFileSet(root) - err = fileSet.EnsureValidGitIgnoreExists() + fileSet, err := git.NewFileSet(root) if err != nil { - return ctx, nil + return nil, err + } + err = git.EnsureValidGitIgnoreExists(fileSet) + if err != nil { + return nil, err } p := project{ @@ -116,7 +120,7 @@ func (p *project) Root() string { return p.root } -func (p *project) GetFileSet() *git.FileSet { +func (p *project) GetFileSet() *fileset.FileSet { return p.fileSet } @@ -129,7 +133,7 @@ func (p *project) GetFileSet() *git.FileSet { // accidentally check into git func (p *project) CacheDir() (string, error) { // assert cache dir is present in git ignore - if !p.fileSet.IsGitIgnored(fmt.Sprintf("/%s/", CacheDirName)) { + if !p.fileSet.Ignorer().IgnoreDirectory(fmt.Sprintf("/%s/", CacheDirName)) { return "", fmt.Errorf("please add /%s/ to .gitignore", CacheDirName) }