Make fileset take optional list of paths to list (#1684)

## Changes

Before this change, the fileset library would take a single root path
and list all files in it. To support an allowlist of paths to list (much
like a Git `pathspec` without patterns; see [pathspec](pathspec)), this
change introduces an optional argument to `fileset.New` where the caller
can specify paths to list. If not specified, this argument defaults to
list `.` (i.e. list all files in the root).

The motivation for this change is that we wish to expose this pattern in
bundles. Users should be able to specify which paths to synchronize
instead of always only synchronizing the bundle root directory.

[pathspec]:
https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefpathspecapathspec

## Tests

New and existing unit tests.
This commit is contained in:
Pieter Noordhuis 2024-08-19 17:15:14 +02:00 committed by GitHub
parent ab4e8099fb
commit 7de7583b37
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 257 additions and 59 deletions

View File

@ -63,7 +63,7 @@ func checkPatterns(patterns []string, path string, rb bundle.ReadOnlyBundle) (di
return err
}
all, err := fs.All()
all, err := fs.Files()
if err != nil {
return err
}

View File

@ -18,7 +18,7 @@ func TestFromSlice(t *testing.T) {
testutil.Touch(t, tmpDir, "test2.py")
testutil.Touch(t, tmpDir, "test3.py")
files, err := fileset.All()
files, err := fileset.Files()
require.NoError(t, err)
f, err := FromSlice(files)
@ -38,7 +38,7 @@ func TestToSlice(t *testing.T) {
testutil.Touch(t, tmpDir, "test2.py")
testutil.Touch(t, tmpDir, "test3.py")
files, err := fileset.All()
files, err := fileset.Files()
require.NoError(t, err)
f, err := FromSlice(files)

View File

@ -23,7 +23,7 @@ func setupBundleForStateUpdate(t *testing.T) *bundle.Bundle {
testutil.Touch(t, tmpDir, "test1.py")
testutil.TouchNotebook(t, tmpDir, "test2.py")
files, err := fileset.New(vfs.MustNew(tmpDir)).All()
files, err := fileset.New(vfs.MustNew(tmpDir)).Files()
require.NoError(t, err)
return &bundle.Bundle{

View File

@ -3,25 +3,56 @@ package fileset
import (
"fmt"
"io/fs"
"os"
pathlib "path"
"path/filepath"
"slices"
"github.com/databricks/cli/libs/vfs"
)
// FileSet facilitates fast recursive file listing of a path.
// FileSet facilitates recursive file listing for paths rooted at a given directory.
// It optionally takes into account ignore rules through the [Ignorer] interface.
type FileSet struct {
// Root path of the fileset.
root vfs.Path
// Paths to include in the fileset.
// Files are included as-is (if not ignored) and directories are traversed recursively.
// Defaults to []string{"."} if not specified.
paths []string
// Ignorer interface to check if a file or directory should be ignored.
ignore Ignorer
}
// New returns a [FileSet] for the given root path.
func New(root vfs.Path) *FileSet {
// It optionally accepts a list of paths relative to the root to include in the fileset.
// If not specified, it defaults to including all files in the root path.
func New(root vfs.Path, args ...[]string) *FileSet {
// Default to including all files in the root path.
if len(args) == 0 {
args = [][]string{{"."}}
}
// Collect list of normalized and cleaned paths.
var paths []string
for _, arg := range args {
for _, path := range arg {
path = filepath.ToSlash(path)
path = pathlib.Clean(path)
// Skip path if it's already in the list.
if slices.Contains(paths, path) {
continue
}
paths = append(paths, path)
}
}
return &FileSet{
root: root,
paths: paths,
ignore: nopIgnorer{},
}
}
@ -36,30 +67,38 @@ func (w *FileSet) SetIgnorer(ignore Ignorer) {
w.ignore = ignore
}
// Return all tracked files for Repo
func (w *FileSet) All() ([]File, error) {
return w.recursiveListFiles()
// Files returns performs recursive listing on all configured paths and returns
// the collection of files it finds (and are not ignored).
// The returned slice does not contain duplicates.
// The order of files in the slice is stable.
func (w *FileSet) Files() (out []File, err error) {
seen := make(map[string]struct{})
for _, p := range w.paths {
files, err := w.recursiveListFiles(p, seen)
if err != nil {
return nil, err
}
out = append(out, files...)
}
return out, nil
}
// Recursively traverses dir in a depth first manner and returns a list of all files
// that are being tracked in the FileSet (ie not being ignored for matching one of the
// patterns in w.ignore)
func (w *FileSet) recursiveListFiles() (fileList []File, err error) {
err = fs.WalkDir(w.root, ".", func(name string, d fs.DirEntry, err error) error {
func (w *FileSet) recursiveListFiles(path string, seen map[string]struct{}) (out []File, err error) {
err = fs.WalkDir(w.root, path, func(name string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
// skip symlinks
info, err := d.Info()
if err != nil {
return err
}
if info.Mode()&os.ModeSymlink != 0 {
return nil
}
if d.IsDir() {
switch {
case info.Mode().IsDir():
ign, err := w.ignore.IgnoreDirectory(name)
if err != nil {
return fmt.Errorf("cannot check if %s should be ignored: %w", name, err)
@ -67,18 +106,28 @@ func (w *FileSet) recursiveListFiles() (fileList []File, err error) {
if ign {
return fs.SkipDir
}
return nil
case info.Mode().IsRegular():
ign, err := w.ignore.IgnoreFile(name)
if err != nil {
return fmt.Errorf("cannot check if %s should be ignored: %w", name, err)
}
if ign {
return nil
}
// Skip duplicates
if _, ok := seen[name]; ok {
return nil
}
seen[name] = struct{}{}
out = append(out, NewFile(w.root, d, name))
default:
// Skip non-regular files (e.g. symlinks).
}
ign, err := w.ignore.IgnoreFile(name)
if err != nil {
return fmt.Errorf("cannot check if %s should be ignored: %w", name, err)
}
if ign {
return nil
}
fileList = append(fileList, NewFile(w.root, d, name))
return nil
})
return

View File

@ -0,0 +1,144 @@
package fileset
import (
"errors"
"testing"
"github.com/databricks/cli/libs/vfs"
"github.com/stretchr/testify/assert"
)
func TestFileSet_NoPaths(t *testing.T) {
fs := New(vfs.MustNew("testdata"))
files, err := fs.Files()
if !assert.NoError(t, err) {
return
}
assert.Len(t, files, 4)
assert.Equal(t, "dir1/a", files[0].Relative)
assert.Equal(t, "dir1/b", files[1].Relative)
assert.Equal(t, "dir2/a", files[2].Relative)
assert.Equal(t, "dir2/b", files[3].Relative)
}
func TestFileSet_ParentPath(t *testing.T) {
fs := New(vfs.MustNew("testdata"), []string{".."})
_, err := fs.Files()
// It is impossible to escape the root directory.
assert.Error(t, err)
}
func TestFileSet_DuplicatePaths(t *testing.T) {
fs := New(vfs.MustNew("testdata"), []string{"dir1", "dir1"})
files, err := fs.Files()
if !assert.NoError(t, err) {
return
}
assert.Len(t, files, 2)
assert.Equal(t, "dir1/a", files[0].Relative)
assert.Equal(t, "dir1/b", files[1].Relative)
}
func TestFileSet_OverlappingPaths(t *testing.T) {
fs := New(vfs.MustNew("testdata"), []string{"dir1", "dir1/a"})
files, err := fs.Files()
if !assert.NoError(t, err) {
return
}
assert.Len(t, files, 2)
assert.Equal(t, "dir1/a", files[0].Relative)
assert.Equal(t, "dir1/b", files[1].Relative)
}
func TestFileSet_IgnoreDirError(t *testing.T) {
testError := errors.New("test error")
fs := New(vfs.MustNew("testdata"))
fs.SetIgnorer(testIgnorer{dirErr: testError})
_, err := fs.Files()
assert.ErrorIs(t, err, testError)
}
func TestFileSet_IgnoreDir(t *testing.T) {
fs := New(vfs.MustNew("testdata"))
fs.SetIgnorer(testIgnorer{dir: []string{"dir1"}})
files, err := fs.Files()
if !assert.NoError(t, err) {
return
}
assert.Len(t, files, 2)
assert.Equal(t, "dir2/a", files[0].Relative)
assert.Equal(t, "dir2/b", files[1].Relative)
}
func TestFileSet_IgnoreFileError(t *testing.T) {
testError := errors.New("test error")
fs := New(vfs.MustNew("testdata"))
fs.SetIgnorer(testIgnorer{fileErr: testError})
_, err := fs.Files()
assert.ErrorIs(t, err, testError)
}
func TestFileSet_IgnoreFile(t *testing.T) {
fs := New(vfs.MustNew("testdata"))
fs.SetIgnorer(testIgnorer{file: []string{"dir1/a"}})
files, err := fs.Files()
if !assert.NoError(t, err) {
return
}
assert.Len(t, files, 3)
assert.Equal(t, "dir1/b", files[0].Relative)
assert.Equal(t, "dir2/a", files[1].Relative)
assert.Equal(t, "dir2/b", files[2].Relative)
}
type testIgnorer struct {
// dir is a list of directories to ignore. Strings are compared verbatim.
dir []string
// dirErr is an error to return when IgnoreDirectory is called.
dirErr error
// file is a list of files to ignore. Strings are compared verbatim.
file []string
// fileErr is an error to return when IgnoreFile is called.
fileErr error
}
// IgnoreDirectory returns true if the path is in the dir list.
// If dirErr is set, it returns dirErr.
func (t testIgnorer) IgnoreDirectory(path string) (bool, error) {
if t.dirErr != nil {
return false, t.dirErr
}
for _, d := range t.dir {
if d == path {
return true, nil
}
}
return false, nil
}
// IgnoreFile returns true if the path is in the file list.
// If fileErr is set, it returns fileErr.
func (t testIgnorer) IgnoreFile(path string) (bool, error) {
if t.fileErr != nil {
return false, t.fileErr
}
for _, f := range t.file {
if f == path {
return true, nil
}
}
return false, nil
}

View File

@ -24,15 +24,19 @@ func TestGlobFileset(t *testing.T) {
entries, err := root.ReadDir(".")
require.NoError(t, err)
// Remove testdata folder from entries
entries = slices.DeleteFunc(entries, func(de fs.DirEntry) bool {
return de.Name() == "testdata"
})
g, err := NewGlobSet(root, []string{
"./*.go",
})
require.NoError(t, err)
files, err := g.All()
files, err := g.Files()
require.NoError(t, err)
// +1 as there's one folder in ../filer
require.Equal(t, len(files), len(entries))
for _, f := range files {
exists := slices.ContainsFunc(entries, func(de fs.DirEntry) bool {
@ -46,7 +50,7 @@ func TestGlobFileset(t *testing.T) {
})
require.NoError(t, err)
files, err = g.All()
files, err = g.Files()
require.NoError(t, err)
require.Equal(t, len(files), 0)
}
@ -61,7 +65,7 @@ func TestGlobFilesetWithRelativeRoot(t *testing.T) {
})
require.NoError(t, err)
files, err := g.All()
files, err := g.Files()
require.NoError(t, err)
require.Equal(t, len(files), len(entries))
}
@ -82,7 +86,7 @@ func TestGlobFilesetRecursively(t *testing.T) {
})
require.NoError(t, err)
files, err := g.All()
files, err := g.Files()
require.NoError(t, err)
require.ElementsMatch(t, entries, collectRelativePaths(files))
}
@ -103,7 +107,7 @@ func TestGlobFilesetDir(t *testing.T) {
})
require.NoError(t, err)
files, err := g.All()
files, err := g.Files()
require.NoError(t, err)
require.ElementsMatch(t, entries, collectRelativePaths(files))
}
@ -124,7 +128,7 @@ func TestGlobFilesetDoubleQuotesWithFilePatterns(t *testing.T) {
})
require.NoError(t, err)
files, err := g.All()
files, err := g.Files()
require.NoError(t, err)
require.ElementsMatch(t, entries, collectRelativePaths(files))
}

0
libs/fileset/testdata/dir1/a vendored Normal file
View File

0
libs/fileset/testdata/dir1/b vendored Normal file
View File

0
libs/fileset/testdata/dir2/a vendored Normal file
View File

0
libs/fileset/testdata/dir2/b vendored Normal file
View File

1
libs/fileset/testdata/dir3/a vendored Symbolic link
View File

@ -0,0 +1 @@
../dir1/a

View File

@ -7,15 +7,15 @@ import (
// FileSet is Git repository aware implementation of [fileset.FileSet].
// It forces checking if gitignore files have been modified every
// time a call to [FileSet.All] is made.
// time a call to [FileSet.Files] is made.
type FileSet struct {
fileset *fileset.FileSet
view *View
}
// NewFileSet returns [FileSet] for the Git repository located at `root`.
func NewFileSet(root vfs.Path) (*FileSet, error) {
fs := fileset.New(root)
func NewFileSet(root vfs.Path, paths ...[]string) (*FileSet, error) {
fs := fileset.New(root, paths...)
v, err := NewView(root)
if err != nil {
return nil, err
@ -35,9 +35,9 @@ func (f *FileSet) IgnoreDirectory(dir string) (bool, error) {
return f.view.IgnoreDirectory(dir)
}
func (f *FileSet) All() ([]fileset.File, error) {
func (f *FileSet) Files() ([]fileset.File, error) {
f.view.repo.taintIgnoreRules()
return f.fileset.All()
return f.fileset.Files()
}
func (f *FileSet) EnsureValidGitIgnoreExists() error {

View File

@ -15,7 +15,7 @@ import (
func testFileSetAll(t *testing.T, root string) {
fileSet, err := NewFileSet(vfs.MustNew(root))
require.NoError(t, err)
files, err := fileSet.All()
files, err := fileSet.Files()
require.NoError(t, err)
require.Len(t, files, 3)
assert.Equal(t, path.Join("a", "b", "world.txt"), files[0].Relative)
@ -37,7 +37,7 @@ func TestFileSetNonCleanRoot(t *testing.T) {
// This should yield the same result as above test.
fileSet, err := NewFileSet(vfs.MustNew("./testdata/../testdata"))
require.NoError(t, err)
files, err := fileSet.All()
files, err := fileSet.Files()
require.NoError(t, err)
assert.Len(t, files, 3)
}

View File

@ -13,7 +13,7 @@ import (
func TestSnapshotState(t *testing.T) {
fileSet := fileset.New(vfs.MustNew("./testdata/sync-fileset"))
files, err := fileSet.All()
files, err := fileSet.Files()
require.NoError(t, err)
// Assert initial contents of the fileset

View File

@ -47,7 +47,7 @@ func TestDiff(t *testing.T) {
defer f2.Close(t)
// New files are put
files, err := fileSet.All()
files, err := fileSet.Files()
assert.NoError(t, err)
change, err := state.diff(ctx, files)
assert.NoError(t, err)
@ -62,7 +62,7 @@ func TestDiff(t *testing.T) {
// world.txt is editted
f2.Overwrite(t, "bunnies are cute.")
assert.NoError(t, err)
files, err = fileSet.All()
files, err = fileSet.Files()
assert.NoError(t, err)
change, err = state.diff(ctx, files)
assert.NoError(t, err)
@ -77,7 +77,7 @@ func TestDiff(t *testing.T) {
// hello.txt is deleted
f1.Remove(t)
assert.NoError(t, err)
files, err = fileSet.All()
files, err = fileSet.Files()
assert.NoError(t, err)
change, err = state.diff(ctx, files)
assert.NoError(t, err)
@ -113,7 +113,7 @@ func TestSymlinkDiff(t *testing.T) {
err = os.Symlink(filepath.Join(projectDir, "foo"), filepath.Join(projectDir, "bar"))
assert.NoError(t, err)
files, err := fileSet.All()
files, err := fileSet.Files()
assert.NoError(t, err)
change, err := state.diff(ctx, files)
assert.NoError(t, err)
@ -141,7 +141,7 @@ func TestFolderDiff(t *testing.T) {
defer f1.Close(t)
f1.Overwrite(t, "# Databricks notebook source\nprint(\"abc\")")
files, err := fileSet.All()
files, err := fileSet.Files()
assert.NoError(t, err)
change, err := state.diff(ctx, files)
assert.NoError(t, err)
@ -153,7 +153,7 @@ func TestFolderDiff(t *testing.T) {
assert.Contains(t, change.put, "foo/bar.py")
f1.Remove(t)
files, err = fileSet.All()
files, err = fileSet.Files()
assert.NoError(t, err)
change, err = state.diff(ctx, files)
assert.NoError(t, err)
@ -184,7 +184,7 @@ func TestPythonNotebookDiff(t *testing.T) {
defer foo.Close(t)
// Case 1: notebook foo.py is uploaded
files, err := fileSet.All()
files, err := fileSet.Files()
assert.NoError(t, err)
foo.Overwrite(t, "# Databricks notebook source\nprint(\"abc\")")
change, err := state.diff(ctx, files)
@ -199,7 +199,7 @@ func TestPythonNotebookDiff(t *testing.T) {
// Case 2: notebook foo.py is converted to python script by removing
// magic keyword
foo.Overwrite(t, "print(\"abc\")")
files, err = fileSet.All()
files, err = fileSet.Files()
assert.NoError(t, err)
change, err = state.diff(ctx, files)
assert.NoError(t, err)
@ -213,7 +213,7 @@ func TestPythonNotebookDiff(t *testing.T) {
// Case 3: Python script foo.py is converted to a databricks notebook
foo.Overwrite(t, "# Databricks notebook source\nprint(\"def\")")
files, err = fileSet.All()
files, err = fileSet.Files()
assert.NoError(t, err)
change, err = state.diff(ctx, files)
assert.NoError(t, err)
@ -228,7 +228,7 @@ func TestPythonNotebookDiff(t *testing.T) {
// Case 4: Python notebook foo.py is deleted, and its remote name is used in change.delete
foo.Remove(t)
assert.NoError(t, err)
files, err = fileSet.All()
files, err = fileSet.Files()
assert.NoError(t, err)
change, err = state.diff(ctx, files)
assert.NoError(t, err)
@ -260,7 +260,7 @@ func TestErrorWhenIdenticalRemoteName(t *testing.T) {
defer pythonFoo.Close(t)
vanillaFoo := testfile.CreateFile(t, filepath.Join(projectDir, "foo"))
defer vanillaFoo.Close(t)
files, err := fileSet.All()
files, err := fileSet.Files()
assert.NoError(t, err)
change, err := state.diff(ctx, files)
assert.NoError(t, err)
@ -271,7 +271,7 @@ func TestErrorWhenIdenticalRemoteName(t *testing.T) {
// errors out because they point to the same destination
pythonFoo.Overwrite(t, "# Databricks notebook source\nprint(\"def\")")
files, err = fileSet.All()
files, err = fileSet.Files()
assert.NoError(t, err)
change, err = state.diff(ctx, files)
assert.ErrorContains(t, err, "both foo and foo.py point to the same remote file location foo. Please remove one of them from your local project")
@ -296,7 +296,7 @@ func TestNoErrorRenameWithIdenticalRemoteName(t *testing.T) {
pythonFoo := testfile.CreateFile(t, filepath.Join(projectDir, "foo.py"))
defer pythonFoo.Close(t)
pythonFoo.Overwrite(t, "# Databricks notebook source\n")
files, err := fileSet.All()
files, err := fileSet.Files()
assert.NoError(t, err)
change, err := state.diff(ctx, files)
assert.NoError(t, err)
@ -308,7 +308,7 @@ func TestNoErrorRenameWithIdenticalRemoteName(t *testing.T) {
sqlFoo := testfile.CreateFile(t, filepath.Join(projectDir, "foo.sql"))
defer sqlFoo.Close(t)
sqlFoo.Overwrite(t, "-- Databricks notebook source\n")
files, err = fileSet.All()
files, err = fileSet.Files()
assert.NoError(t, err)
change, err = state.diff(ctx, files)
assert.NoError(t, err)

View File

@ -195,14 +195,14 @@ func (s *Sync) GetFileList(ctx context.Context) ([]fileset.File, error) {
all := set.NewSetF(func(f fileset.File) string {
return f.Relative
})
gitFiles, err := s.fileSet.All()
gitFiles, err := s.fileSet.Files()
if err != nil {
log.Errorf(ctx, "cannot list files: %s", err)
return nil, err
}
all.Add(gitFiles...)
include, err := s.includeFileSet.All()
include, err := s.includeFileSet.Files()
if err != nil {
log.Errorf(ctx, "cannot list include files: %s", err)
return nil, err
@ -210,7 +210,7 @@ func (s *Sync) GetFileList(ctx context.Context) ([]fileset.File, error) {
all.Add(include...)
exclude, err := s.excludeFileSet.All()
exclude, err := s.excludeFileSet.Files()
if err != nil {
log.Errorf(ctx, "cannot list exclude files: %s", err)
return nil, err