Move git.FileSet to libs/fileset and make it aware of gitignores (#184)

This moves `git.FileSet` to `libs/fileset` and decouples it from the Git package.

It is made aware of gitignore rules in parent directories up to the
repository root as well as gitignore files in underlying directories
through the `fileset.Ignorer` interface.

The recursive directory walker is reimplemented with [filepath.WalkDir].

Follow up to #182.
This commit is contained in:
Pieter Noordhuis 2023-01-27 16:04:58 +01:00 committed by GitHub
parent 2eb10800a5
commit eb76e5d3e8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 196 additions and 217 deletions

View File

@ -1,82 +1,31 @@
package git package git
import ( import (
"fmt"
"io/fs"
"os" "os"
"path/filepath" "path/filepath"
"strings"
"time"
ignore "github.com/sabhiram/go-gitignore" "github.com/databricks/bricks/libs/fileset"
"golang.org/x/exp/slices"
) )
type File struct {
fs.DirEntry
Absolute, Relative string
}
func (f File) Modified() (ts time.Time) {
info, err := f.Info()
if err != nil {
// return default time, beginning of epoch
return ts
}
return info.ModTime()
}
// FileSet facilitates fast recursive tracked file listing
// with respect to patterns defined in `.gitignore` file
//
// root: Root of the git repository
// ignore: List of patterns defined in `.gitignore`.
//
// We do not sync files that match this pattern
type FileSet struct {
root string
ignore *ignore.GitIgnore
}
// Retuns FileSet for the git repo located at `root` // Retuns FileSet for the git repo located at `root`
func NewFileSet(root string) *FileSet { func NewFileSet(root string) (*fileset.FileSet, error) {
syncIgnoreLines := append(getGitIgnoreLines(root), getSyncIgnoreLines()...) w := fileset.New(root)
return &FileSet{ v, err := NewView(root)
root: root, if err != nil {
ignore: ignore.CompileIgnoreLines(syncIgnoreLines...), return nil, err
} }
} w.SetIgnorer(v)
return w, nil
func getGitIgnoreLines(root string) []string {
gitIgnoreLines := []string{}
gitIgnorePath := fmt.Sprintf("%s/.gitignore", root)
rawIgnore, err := os.ReadFile(gitIgnorePath)
if err == nil {
// add entries from .gitignore if the file exists (did read correctly)
for _, line := range strings.Split(string(rawIgnore), "\n") {
// underlying library doesn't behave well with Rule 5 of .gitignore,
// hence this workaround
gitIgnoreLines = append(gitIgnoreLines, strings.Trim(line, "/"))
}
}
return gitIgnoreLines
}
// This contains additional files/dirs to be ignored while syncing that are
// not mentioned in .gitignore
func getSyncIgnoreLines() []string {
return []string{".git"}
} }
// Only call this function for a bricks project root // Only call this function for a bricks project root
// since it will create a .gitignore file if missing // since it will create a .gitignore file if missing
func (w *FileSet) EnsureValidGitIgnoreExists() error { func EnsureValidGitIgnoreExists(w *fileset.FileSet) error {
gitIgnoreLines := getGitIgnoreLines(w.root) if w.Ignorer().IgnoreDirectory(".databricks") {
gitIgnorePath := fmt.Sprintf("%s/.gitignore", w.root)
if slices.Contains(gitIgnoreLines, ".databricks") {
return nil return nil
} }
gitIgnorePath := filepath.Join(w.Root(), ".gitignore")
f, err := os.OpenFile(gitIgnorePath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0644) f, err := os.OpenFile(gitIgnorePath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0644)
if err != nil { if err != nil {
return err return err
@ -86,69 +35,12 @@ func (w *FileSet) EnsureValidGitIgnoreExists() error {
if err != nil { if err != nil {
return err return err
} }
gitIgnoreLines = append(gitIgnoreLines, ".databricks")
w.ignore = ignore.CompileIgnoreLines(append(gitIgnoreLines, getSyncIgnoreLines()...)...) // Reload view to update ignore rules.
v, err := NewView(w.Root())
if err != nil {
return err
}
w.SetIgnorer(v)
return nil return nil
} }
// Return root for fileset.
func (w *FileSet) Root() string {
return w.root
}
// Return all tracked files for Repo
func (w *FileSet) All() ([]File, error) {
return w.RecursiveListFiles(w.root)
}
func (w *FileSet) IsGitIgnored(pattern string) bool {
return w.ignore.MatchesPath(pattern)
}
// Recursively traverses dir in a depth first manner and returns a list of all files
// that are being tracked in the FileSet (ie not being ignored for matching one of the
// patterns in w.ignore)
func (w *FileSet) RecursiveListFiles(dir string) (fileList []File, err error) {
queue, err := readDir(dir, w.root)
if err != nil {
return nil, err
}
for len(queue) > 0 {
current := queue[0]
queue = queue[1:]
if w.ignore.MatchesPath(current.Relative) {
continue
}
if !current.IsDir() {
fileList = append(fileList, current)
continue
}
children, err := readDir(current.Absolute, w.root)
if err != nil {
return nil, err
}
queue = append(queue, children...)
}
return fileList, nil
}
func readDir(dir, root string) (queue []File, err error) {
f, err := os.Open(dir)
if err != nil {
return
}
defer f.Close()
dirs, err := f.ReadDir(-1)
if err != nil {
return
}
for _, v := range dirs {
absolute := filepath.Join(dir, v.Name())
relative, err := filepath.Rel(root, absolute)
if err != nil {
return nil, err
}
queue = append(queue, File{v, absolute, relative})
}
return
}

View File

@ -1,91 +1,34 @@
package git package git
import ( import (
"os"
"path/filepath" "path/filepath"
"runtime"
"strings"
"testing" "testing"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
func TestRecusiveListFile(t *testing.T) { func TestFileSetRecursiveListFiles(t *testing.T) {
// Create .gitignore and ignore .gitignore and any files in fileSet, err := NewFileSet("./testdata")
// ignored_dir require.NoError(t, err)
projectDir := t.TempDir() files, err := fileSet.RecursiveListFiles("./testdata")
f3, err := os.Create(filepath.Join(projectDir, ".gitignore")) require.NoError(t, err)
assert.NoError(t, err) require.Len(t, files, 6)
defer f3.Close() assert.Equal(t, filepath.Join(".gitignore"), files[0].Relative)
f3.WriteString(".gitignore\nignored_dir") assert.Equal(t, filepath.Join("a", ".gitignore"), files[1].Relative)
assert.Equal(t, filepath.Join("a", "b", ".gitignore"), files[2].Relative)
// create config file assert.Equal(t, filepath.Join("a", "b", "world.txt"), files[3].Relative)
f4, err := os.Create(filepath.Join(projectDir, "databricks.yml")) assert.Equal(t, filepath.Join("a", "hello.txt"), files[4].Relative)
assert.NoError(t, err) assert.Equal(t, filepath.Join("databricks.yml"), files[5].Relative)
defer f4.Close()
// config file is returned
// .gitignore is not because we explictly ignore it in .gitignore
fileSet := NewFileSet(projectDir)
files, err := fileSet.RecursiveListFiles(projectDir)
assert.NoError(t, err)
assert.Len(t, files, 1)
assert.Equal(t, "databricks.yml", files[0].Relative)
helloTxtRelativePath := filepath.Join("a/b/c", "hello.txt")
// Check that newly added files not in .gitignore
// are being tracked
dir1 := filepath.Join(projectDir, "a", "b", "c")
dir2 := filepath.Join(projectDir, "ignored_dir", "e")
err = os.MkdirAll(dir2, 0o755)
assert.NoError(t, err)
err = os.MkdirAll(dir1, 0o755)
assert.NoError(t, err)
f1, err := os.Create(filepath.Join(projectDir, helloTxtRelativePath))
assert.NoError(t, err)
defer f1.Close()
f2, err := os.Create(filepath.Join(projectDir, "ignored_dir/e/world.txt"))
assert.NoError(t, err)
defer f2.Close()
files, err = fileSet.RecursiveListFiles(projectDir)
assert.NoError(t, err)
assert.Len(t, files, 2)
var fileNames []string
for _, v := range files {
fileNames = append(fileNames, v.Relative)
}
assert.Contains(t, fileNames, "databricks.yml")
assert.Contains(t, fileNames, helloTxtRelativePath)
} }
func TestFileSetNonCleanRoot(t *testing.T) { func TestFileSetNonCleanRoot(t *testing.T) {
// Test what happens if the root directory can be simplified. // Test what happens if the root directory can be simplified.
// Path simplification is done by most filepath functions. // Path simplification is done by most filepath functions.
// This should yield the same result as above test.
// remove this once equivalent tests for windows have been set up fileSet, err := NewFileSet("./testdata/../testdata")
// or this test has been fixed for windows
// date: 28 Nov 2022
if runtime.GOOS == "windows" {
t.Skip("skipping temperorilty to make windows unit tests green")
}
root := "./../git"
require.NotEqual(t, root, filepath.Clean(root))
fs := NewFileSet(root)
files, err := fs.All()
require.NoError(t, err) require.NoError(t, err)
files, err := fileSet.All()
found := false require.NoError(t, err)
for _, f := range files { assert.Len(t, files, 6)
if strings.Contains(f.Relative, "fileset_test") {
assert.Equal(t, "fileset_test.go", f.Relative)
assert.Equal(t, "../git/fileset_test.go", f.Absolute)
found = true
}
}
assert.True(t, found)
} }

0
git/testdata/a/a.ignore vendored Normal file
View File

0
git/testdata/a/b/b.ignore vendored Normal file
View File

0
git/testdata/a/b/world.txt vendored Normal file
View File

0
git/testdata/a/hello.txt vendored Normal file
View File

0
git/testdata/databricks.yml vendored Normal file
View File

0
git/testdata/ignorethis/ignored.txt vendored Normal file
View File

View File

@ -38,6 +38,27 @@ func (v *View) Ignore(path string) bool {
return v.repo.Ignore(filepath.Join(v.targetPath, path) + trailingSlash) return v.repo.Ignore(filepath.Join(v.targetPath, path) + trailingSlash)
} }
// IgnoreFile returns if the gitignore rules in this fileset
// apply to the specified file path.
//
// This function is provided to implement [fileset.Ignorer].
func (v *View) IgnoreFile(file string) bool {
return v.Ignore(file)
}
// IgnoreDirectory returns if the gitignore rules in this fileset
// apply to the specified directory path.
//
// A gitignore rule may apply only to directories if it uses
// a trailing slash. Therefore this function checks the gitignore
// rules for the specified directory path first without and then
// with a trailing slash.
//
// This function is provided to implement [fileset.Ignorer].
func (v *View) IgnoreDirectory(dir string) bool {
return v.Ignore(dir) || v.Ignore(dir+"/")
}
func NewView(path string) (*View, error) { func NewView(path string) (*View, error) {
path, err := filepath.Abs(path) path, err := filepath.Abs(path)
if err != nil { if err != nil {

20
libs/fileset/file.go Normal file
View File

@ -0,0 +1,20 @@
package fileset
import (
"io/fs"
"time"
)
type File struct {
fs.DirEntry
Absolute, Relative string
}
func (f File) Modified() (ts time.Time) {
info, err := f.Info()
if err != nil {
// return default time, beginning of epoch
return ts
}
return info.ModTime()
}

72
libs/fileset/fileset.go Normal file
View File

@ -0,0 +1,72 @@
package fileset
import (
"io/fs"
"path/filepath"
)
// FileSet facilitates fast recursive file listing of a path.
// It optionally takes into account ignore rules through the [Ignorer] interface.
type FileSet struct {
root string
ignore Ignorer
}
// New returns a [FileSet] for the given root path.
func New(root string) *FileSet {
return &FileSet{
root: filepath.Clean(root),
ignore: nopIgnorer{},
}
}
// Ignorer returns the [FileSet]'s current ignorer.
func (w *FileSet) Ignorer() Ignorer {
return w.ignore
}
// SetIgnorer sets the [Ignorer] interface for this [FileSet].
func (w *FileSet) SetIgnorer(ignore Ignorer) {
w.ignore = ignore
}
// Return root for fileset.
func (w *FileSet) Root() string {
return w.root
}
// Return all tracked files for Repo
func (w *FileSet) All() ([]File, error) {
return w.RecursiveListFiles(w.root)
}
// Recursively traverses dir in a depth first manner and returns a list of all files
// that are being tracked in the FileSet (ie not being ignored for matching one of the
// patterns in w.ignore)
func (w *FileSet) RecursiveListFiles(dir string) (fileList []File, err error) {
err = filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
relPath, err := filepath.Rel(w.root, path)
if err != nil {
return err
}
if d.IsDir() {
if w.ignore.IgnoreDirectory(relPath) {
return filepath.SkipDir
}
return nil
}
if w.ignore.IgnoreFile(relPath) {
return nil
}
fileList = append(fileList, File{d, path, relPath})
return nil
})
return
}

19
libs/fileset/ignorer.go Normal file
View File

@ -0,0 +1,19 @@
package fileset
// Ignorer is the interface for what determines if a path
// in the [FileSet] must be ignored or not.
type Ignorer interface {
IgnoreFile(path string) bool
IgnoreDirectory(path string) bool
}
// nopIgnorer implements an [Ignorer] that doesn't ignore anything.
type nopIgnorer struct{}
func (nopIgnorer) IgnoreFile(path string) bool {
return false
}
func (nopIgnorer) IgnoreDirectory(path string) bool {
return false
}

View File

@ -15,7 +15,7 @@ import (
"crypto/md5" "crypto/md5"
"encoding/hex" "encoding/hex"
"github.com/databricks/bricks/git" "github.com/databricks/bricks/libs/fileset"
) )
// Bump it up every time a potentially breaking change is made to the snapshot schema // Bump it up every time a potentially breaking change is made to the snapshot schema
@ -191,7 +191,7 @@ func getNotebookDetails(path string) (isNotebook bool, typeOfNotebook string, er
return false, "", nil return false, "", nil
} }
func (s *Snapshot) diff(all []git.File) (change diff, err error) { func (s *Snapshot) diff(all []fileset.File) (change diff, err error) {
currentFilenames := map[string]bool{} currentFilenames := map[string]bool{}
lastModifiedTimes := s.LastUpdatedTimes lastModifiedTimes := s.LastUpdatedTimes
remoteToLocalNames := s.RemoteToLocalNames remoteToLocalNames := s.RemoteToLocalNames

View File

@ -26,7 +26,8 @@ func assertKeysOfMap(t *testing.T, m map[string]time.Time, expectedKeys []string
func TestDiff(t *testing.T) { func TestDiff(t *testing.T) {
// Create temp project dir // Create temp project dir
projectDir := t.TempDir() projectDir := t.TempDir()
fileSet := git.NewFileSet(projectDir) fileSet, err := git.NewFileSet(projectDir)
require.NoError(t, err)
state := Snapshot{ state := Snapshot{
LastUpdatedTimes: make(map[string]time.Time), LastUpdatedTimes: make(map[string]time.Time),
LocalToRemoteNames: make(map[string]string), LocalToRemoteNames: make(map[string]string),
@ -85,14 +86,15 @@ func TestDiff(t *testing.T) {
func TestFolderDiff(t *testing.T) { func TestFolderDiff(t *testing.T) {
// Create temp project dir // Create temp project dir
projectDir := t.TempDir() projectDir := t.TempDir()
fileSet := git.NewFileSet(projectDir) fileSet, err := git.NewFileSet(projectDir)
require.NoError(t, err)
state := Snapshot{ state := Snapshot{
LastUpdatedTimes: make(map[string]time.Time), LastUpdatedTimes: make(map[string]time.Time),
LocalToRemoteNames: make(map[string]string), LocalToRemoteNames: make(map[string]string),
RemoteToLocalNames: make(map[string]string), RemoteToLocalNames: make(map[string]string),
} }
err := os.Mkdir(filepath.Join(projectDir, "foo"), os.ModePerm) err = os.Mkdir(filepath.Join(projectDir, "foo"), os.ModePerm)
assert.NoError(t, err) assert.NoError(t, err)
f1 := testfile.CreateFile(t, filepath.Join(projectDir, "foo", "bar.py")) f1 := testfile.CreateFile(t, filepath.Join(projectDir, "foo", "bar.py"))
defer f1.Close(t) defer f1.Close(t)
@ -119,7 +121,8 @@ func TestFolderDiff(t *testing.T) {
func TestPythonNotebookDiff(t *testing.T) { func TestPythonNotebookDiff(t *testing.T) {
// Create temp project dir // Create temp project dir
projectDir := t.TempDir() projectDir := t.TempDir()
fileSet := git.NewFileSet(projectDir) fileSet, err := git.NewFileSet(projectDir)
require.NoError(t, err)
state := Snapshot{ state := Snapshot{
LastUpdatedTimes: make(map[string]time.Time), LastUpdatedTimes: make(map[string]time.Time),
LocalToRemoteNames: make(map[string]string), LocalToRemoteNames: make(map[string]string),
@ -189,7 +192,8 @@ func TestPythonNotebookDiff(t *testing.T) {
func TestErrorWhenIdenticalRemoteName(t *testing.T) { func TestErrorWhenIdenticalRemoteName(t *testing.T) {
// Create temp project dir // Create temp project dir
projectDir := t.TempDir() projectDir := t.TempDir()
fileSet := git.NewFileSet(projectDir) fileSet, err := git.NewFileSet(projectDir)
require.NoError(t, err)
state := Snapshot{ state := Snapshot{
LastUpdatedTimes: make(map[string]time.Time), LastUpdatedTimes: make(map[string]time.Time),
LocalToRemoteNames: make(map[string]string), LocalToRemoteNames: make(map[string]string),

View File

@ -8,6 +8,7 @@ import (
"time" "time"
"github.com/databricks/bricks/git" "github.com/databricks/bricks/git"
"github.com/databricks/bricks/libs/fileset"
"github.com/databricks/bricks/libs/sync/repofiles" "github.com/databricks/bricks/libs/sync/repofiles"
"github.com/databricks/databricks-sdk-go" "github.com/databricks/databricks-sdk-go"
) )
@ -30,15 +31,18 @@ type SyncOptions struct {
type Sync struct { type Sync struct {
*SyncOptions *SyncOptions
fileSet *git.FileSet fileSet *fileset.FileSet
snapshot *Snapshot snapshot *Snapshot
repoFiles *repofiles.RepoFiles repoFiles *repofiles.RepoFiles
} }
// New initializes and returns a new [Sync] instance. // New initializes and returns a new [Sync] instance.
func New(ctx context.Context, opts SyncOptions) (*Sync, error) { func New(ctx context.Context, opts SyncOptions) (*Sync, error) {
fileSet := git.NewFileSet(opts.LocalPath) fileSet, err := git.NewFileSet(opts.LocalPath)
err := fileSet.EnsureValidGitIgnoreExists() if err != nil {
return nil, err
}
err = git.EnsureValidGitIgnoreExists(fileSet)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -8,6 +8,7 @@ import (
"sync" "sync"
"github.com/databricks/bricks/git" "github.com/databricks/bricks/git"
"github.com/databricks/bricks/libs/fileset"
"github.com/databricks/databricks-sdk-go" "github.com/databricks/databricks-sdk-go"
"github.com/databricks/databricks-sdk-go/service/commands" "github.com/databricks/databricks-sdk-go/service/commands"
"github.com/databricks/databricks-sdk-go/service/scim" "github.com/databricks/databricks-sdk-go/service/scim"
@ -26,7 +27,7 @@ type project struct {
environment *Environment environment *Environment
wsc *databricks.WorkspaceClient wsc *databricks.WorkspaceClient
me *scim.User me *scim.User
fileSet *git.FileSet fileSet *fileset.FileSet
} }
// Configure is used as a PreRunE function for all commands that // Configure is used as a PreRunE function for all commands that
@ -65,10 +66,13 @@ func Initialize(ctx context.Context, root, env string) (context.Context, error)
return nil, fmt.Errorf("environment [%s] not defined", env) return nil, fmt.Errorf("environment [%s] not defined", env)
} }
fileSet := git.NewFileSet(root) fileSet, err := git.NewFileSet(root)
err = fileSet.EnsureValidGitIgnoreExists()
if err != nil { if err != nil {
return ctx, nil return nil, err
}
err = git.EnsureValidGitIgnoreExists(fileSet)
if err != nil {
return nil, err
} }
p := project{ p := project{
@ -116,7 +120,7 @@ func (p *project) Root() string {
return p.root return p.root
} }
func (p *project) GetFileSet() *git.FileSet { func (p *project) GetFileSet() *fileset.FileSet {
return p.fileSet return p.fileSet
} }
@ -129,7 +133,7 @@ func (p *project) GetFileSet() *git.FileSet {
// accidentally check into git // accidentally check into git
func (p *project) CacheDir() (string, error) { func (p *project) CacheDir() (string, error) {
// assert cache dir is present in git ignore // assert cache dir is present in git ignore
if !p.fileSet.IsGitIgnored(fmt.Sprintf("/%s/", CacheDirName)) { if !p.fileSet.Ignorer().IgnoreDirectory(fmt.Sprintf("/%s/", CacheDirName)) {
return "", fmt.Errorf("please add /%s/ to .gitignore", CacheDirName) return "", fmt.Errorf("please add /%s/ to .gitignore", CacheDirName)
} }