Move git.FileSet to libs/fileset and make it aware of gitignores (#184)

This moves `git.FileSet` to `libs/fileset` and decouples it from the Git package.

It is made aware of gitignore rules in parent directories up to the
repository root as well as gitignore files in underlying directories
through the `fileset.Ignorer` interface.

The recursive directory walker is reimplemented with [filepath.WalkDir].

Follow up to #182.
This commit is contained in:
Pieter Noordhuis 2023-01-27 16:04:58 +01:00 committed by GitHub
parent 2eb10800a5
commit eb76e5d3e8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 196 additions and 217 deletions

View File

@ -1,82 +1,31 @@
package git
import (
"fmt"
"io/fs"
"os"
"path/filepath"
"strings"
"time"
ignore "github.com/sabhiram/go-gitignore"
"golang.org/x/exp/slices"
"github.com/databricks/bricks/libs/fileset"
)
type File struct {
fs.DirEntry
Absolute, Relative string
}
func (f File) Modified() (ts time.Time) {
info, err := f.Info()
if err != nil {
// return default time, beginning of epoch
return ts
}
return info.ModTime()
}
// FileSet facilitates fast recursive tracked file listing
// with respect to patterns defined in `.gitignore` file
//
// root: Root of the git repository
// ignore: List of patterns defined in `.gitignore`.
//
// We do not sync files that match this pattern
type FileSet struct {
root string
ignore *ignore.GitIgnore
}
// Retuns FileSet for the git repo located at `root`
func NewFileSet(root string) *FileSet {
syncIgnoreLines := append(getGitIgnoreLines(root), getSyncIgnoreLines()...)
return &FileSet{
root: root,
ignore: ignore.CompileIgnoreLines(syncIgnoreLines...),
func NewFileSet(root string) (*fileset.FileSet, error) {
w := fileset.New(root)
v, err := NewView(root)
if err != nil {
return nil, err
}
}
func getGitIgnoreLines(root string) []string {
gitIgnoreLines := []string{}
gitIgnorePath := fmt.Sprintf("%s/.gitignore", root)
rawIgnore, err := os.ReadFile(gitIgnorePath)
if err == nil {
// add entries from .gitignore if the file exists (did read correctly)
for _, line := range strings.Split(string(rawIgnore), "\n") {
// underlying library doesn't behave well with Rule 5 of .gitignore,
// hence this workaround
gitIgnoreLines = append(gitIgnoreLines, strings.Trim(line, "/"))
}
}
return gitIgnoreLines
}
// This contains additional files/dirs to be ignored while syncing that are
// not mentioned in .gitignore
func getSyncIgnoreLines() []string {
return []string{".git"}
w.SetIgnorer(v)
return w, nil
}
// Only call this function for a bricks project root
// since it will create a .gitignore file if missing
func (w *FileSet) EnsureValidGitIgnoreExists() error {
gitIgnoreLines := getGitIgnoreLines(w.root)
gitIgnorePath := fmt.Sprintf("%s/.gitignore", w.root)
if slices.Contains(gitIgnoreLines, ".databricks") {
func EnsureValidGitIgnoreExists(w *fileset.FileSet) error {
if w.Ignorer().IgnoreDirectory(".databricks") {
return nil
}
gitIgnorePath := filepath.Join(w.Root(), ".gitignore")
f, err := os.OpenFile(gitIgnorePath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0644)
if err != nil {
return err
@ -86,69 +35,12 @@ func (w *FileSet) EnsureValidGitIgnoreExists() error {
if err != nil {
return err
}
gitIgnoreLines = append(gitIgnoreLines, ".databricks")
w.ignore = ignore.CompileIgnoreLines(append(gitIgnoreLines, getSyncIgnoreLines()...)...)
// Reload view to update ignore rules.
v, err := NewView(w.Root())
if err != nil {
return err
}
w.SetIgnorer(v)
return nil
}
// Return root for fileset.
func (w *FileSet) Root() string {
return w.root
}
// Return all tracked files for Repo
func (w *FileSet) All() ([]File, error) {
return w.RecursiveListFiles(w.root)
}
func (w *FileSet) IsGitIgnored(pattern string) bool {
return w.ignore.MatchesPath(pattern)
}
// Recursively traverses dir in a depth first manner and returns a list of all files
// that are being tracked in the FileSet (ie not being ignored for matching one of the
// patterns in w.ignore)
func (w *FileSet) RecursiveListFiles(dir string) (fileList []File, err error) {
queue, err := readDir(dir, w.root)
if err != nil {
return nil, err
}
for len(queue) > 0 {
current := queue[0]
queue = queue[1:]
if w.ignore.MatchesPath(current.Relative) {
continue
}
if !current.IsDir() {
fileList = append(fileList, current)
continue
}
children, err := readDir(current.Absolute, w.root)
if err != nil {
return nil, err
}
queue = append(queue, children...)
}
return fileList, nil
}
func readDir(dir, root string) (queue []File, err error) {
f, err := os.Open(dir)
if err != nil {
return
}
defer f.Close()
dirs, err := f.ReadDir(-1)
if err != nil {
return
}
for _, v := range dirs {
absolute := filepath.Join(dir, v.Name())
relative, err := filepath.Rel(root, absolute)
if err != nil {
return nil, err
}
queue = append(queue, File{v, absolute, relative})
}
return
}

View File

@ -1,91 +1,34 @@
package git
import (
"os"
"path/filepath"
"runtime"
"strings"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestRecusiveListFile(t *testing.T) {
// Create .gitignore and ignore .gitignore and any files in
// ignored_dir
projectDir := t.TempDir()
f3, err := os.Create(filepath.Join(projectDir, ".gitignore"))
assert.NoError(t, err)
defer f3.Close()
f3.WriteString(".gitignore\nignored_dir")
// create config file
f4, err := os.Create(filepath.Join(projectDir, "databricks.yml"))
assert.NoError(t, err)
defer f4.Close()
// config file is returned
// .gitignore is not because we explictly ignore it in .gitignore
fileSet := NewFileSet(projectDir)
files, err := fileSet.RecursiveListFiles(projectDir)
assert.NoError(t, err)
assert.Len(t, files, 1)
assert.Equal(t, "databricks.yml", files[0].Relative)
helloTxtRelativePath := filepath.Join("a/b/c", "hello.txt")
// Check that newly added files not in .gitignore
// are being tracked
dir1 := filepath.Join(projectDir, "a", "b", "c")
dir2 := filepath.Join(projectDir, "ignored_dir", "e")
err = os.MkdirAll(dir2, 0o755)
assert.NoError(t, err)
err = os.MkdirAll(dir1, 0o755)
assert.NoError(t, err)
f1, err := os.Create(filepath.Join(projectDir, helloTxtRelativePath))
assert.NoError(t, err)
defer f1.Close()
f2, err := os.Create(filepath.Join(projectDir, "ignored_dir/e/world.txt"))
assert.NoError(t, err)
defer f2.Close()
files, err = fileSet.RecursiveListFiles(projectDir)
assert.NoError(t, err)
assert.Len(t, files, 2)
var fileNames []string
for _, v := range files {
fileNames = append(fileNames, v.Relative)
}
assert.Contains(t, fileNames, "databricks.yml")
assert.Contains(t, fileNames, helloTxtRelativePath)
func TestFileSetRecursiveListFiles(t *testing.T) {
fileSet, err := NewFileSet("./testdata")
require.NoError(t, err)
files, err := fileSet.RecursiveListFiles("./testdata")
require.NoError(t, err)
require.Len(t, files, 6)
assert.Equal(t, filepath.Join(".gitignore"), files[0].Relative)
assert.Equal(t, filepath.Join("a", ".gitignore"), files[1].Relative)
assert.Equal(t, filepath.Join("a", "b", ".gitignore"), files[2].Relative)
assert.Equal(t, filepath.Join("a", "b", "world.txt"), files[3].Relative)
assert.Equal(t, filepath.Join("a", "hello.txt"), files[4].Relative)
assert.Equal(t, filepath.Join("databricks.yml"), files[5].Relative)
}
func TestFileSetNonCleanRoot(t *testing.T) {
// Test what happens if the root directory can be simplified.
// Path simplification is done by most filepath functions.
// remove this once equivalent tests for windows have been set up
// or this test has been fixed for windows
// date: 28 Nov 2022
if runtime.GOOS == "windows" {
t.Skip("skipping temperorilty to make windows unit tests green")
}
root := "./../git"
require.NotEqual(t, root, filepath.Clean(root))
fs := NewFileSet(root)
files, err := fs.All()
// This should yield the same result as above test.
fileSet, err := NewFileSet("./testdata/../testdata")
require.NoError(t, err)
found := false
for _, f := range files {
if strings.Contains(f.Relative, "fileset_test") {
assert.Equal(t, "fileset_test.go", f.Relative)
assert.Equal(t, "../git/fileset_test.go", f.Absolute)
found = true
}
}
assert.True(t, found)
files, err := fileSet.All()
require.NoError(t, err)
assert.Len(t, files, 6)
}

0
git/testdata/a/a.ignore vendored Normal file
View File

0
git/testdata/a/b/b.ignore vendored Normal file
View File

0
git/testdata/a/b/world.txt vendored Normal file
View File

0
git/testdata/a/hello.txt vendored Normal file
View File

0
git/testdata/databricks.yml vendored Normal file
View File

0
git/testdata/ignorethis/ignored.txt vendored Normal file
View File

View File

@ -38,6 +38,27 @@ func (v *View) Ignore(path string) bool {
return v.repo.Ignore(filepath.Join(v.targetPath, path) + trailingSlash)
}
// IgnoreFile returns if the gitignore rules in this fileset
// apply to the specified file path.
//
// This function is provided to implement [fileset.Ignorer].
func (v *View) IgnoreFile(file string) bool {
return v.Ignore(file)
}
// IgnoreDirectory returns if the gitignore rules in this fileset
// apply to the specified directory path.
//
// A gitignore rule may apply only to directories if it uses
// a trailing slash. Therefore this function checks the gitignore
// rules for the specified directory path first without and then
// with a trailing slash.
//
// This function is provided to implement [fileset.Ignorer].
func (v *View) IgnoreDirectory(dir string) bool {
return v.Ignore(dir) || v.Ignore(dir+"/")
}
func NewView(path string) (*View, error) {
path, err := filepath.Abs(path)
if err != nil {

20
libs/fileset/file.go Normal file
View File

@ -0,0 +1,20 @@
package fileset
import (
"io/fs"
"time"
)
type File struct {
fs.DirEntry
Absolute, Relative string
}
func (f File) Modified() (ts time.Time) {
info, err := f.Info()
if err != nil {
// return default time, beginning of epoch
return ts
}
return info.ModTime()
}

72
libs/fileset/fileset.go Normal file
View File

@ -0,0 +1,72 @@
package fileset
import (
"io/fs"
"path/filepath"
)
// FileSet facilitates fast recursive file listing of a path.
// It optionally takes into account ignore rules through the [Ignorer] interface.
type FileSet struct {
root string
ignore Ignorer
}
// New returns a [FileSet] for the given root path.
func New(root string) *FileSet {
return &FileSet{
root: filepath.Clean(root),
ignore: nopIgnorer{},
}
}
// Ignorer returns the [FileSet]'s current ignorer.
func (w *FileSet) Ignorer() Ignorer {
return w.ignore
}
// SetIgnorer sets the [Ignorer] interface for this [FileSet].
func (w *FileSet) SetIgnorer(ignore Ignorer) {
w.ignore = ignore
}
// Return root for fileset.
func (w *FileSet) Root() string {
return w.root
}
// Return all tracked files for Repo
func (w *FileSet) All() ([]File, error) {
return w.RecursiveListFiles(w.root)
}
// Recursively traverses dir in a depth first manner and returns a list of all files
// that are being tracked in the FileSet (ie not being ignored for matching one of the
// patterns in w.ignore)
func (w *FileSet) RecursiveListFiles(dir string) (fileList []File, err error) {
err = filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
relPath, err := filepath.Rel(w.root, path)
if err != nil {
return err
}
if d.IsDir() {
if w.ignore.IgnoreDirectory(relPath) {
return filepath.SkipDir
}
return nil
}
if w.ignore.IgnoreFile(relPath) {
return nil
}
fileList = append(fileList, File{d, path, relPath})
return nil
})
return
}

19
libs/fileset/ignorer.go Normal file
View File

@ -0,0 +1,19 @@
package fileset
// Ignorer is the interface for what determines if a path
// in the [FileSet] must be ignored or not.
type Ignorer interface {
IgnoreFile(path string) bool
IgnoreDirectory(path string) bool
}
// nopIgnorer implements an [Ignorer] that doesn't ignore anything.
type nopIgnorer struct{}
func (nopIgnorer) IgnoreFile(path string) bool {
return false
}
func (nopIgnorer) IgnoreDirectory(path string) bool {
return false
}

View File

@ -15,7 +15,7 @@ import (
"crypto/md5"
"encoding/hex"
"github.com/databricks/bricks/git"
"github.com/databricks/bricks/libs/fileset"
)
// Bump it up every time a potentially breaking change is made to the snapshot schema
@ -191,7 +191,7 @@ func getNotebookDetails(path string) (isNotebook bool, typeOfNotebook string, er
return false, "", nil
}
func (s *Snapshot) diff(all []git.File) (change diff, err error) {
func (s *Snapshot) diff(all []fileset.File) (change diff, err error) {
currentFilenames := map[string]bool{}
lastModifiedTimes := s.LastUpdatedTimes
remoteToLocalNames := s.RemoteToLocalNames

View File

@ -26,7 +26,8 @@ func assertKeysOfMap(t *testing.T, m map[string]time.Time, expectedKeys []string
func TestDiff(t *testing.T) {
// Create temp project dir
projectDir := t.TempDir()
fileSet := git.NewFileSet(projectDir)
fileSet, err := git.NewFileSet(projectDir)
require.NoError(t, err)
state := Snapshot{
LastUpdatedTimes: make(map[string]time.Time),
LocalToRemoteNames: make(map[string]string),
@ -85,14 +86,15 @@ func TestDiff(t *testing.T) {
func TestFolderDiff(t *testing.T) {
// Create temp project dir
projectDir := t.TempDir()
fileSet := git.NewFileSet(projectDir)
fileSet, err := git.NewFileSet(projectDir)
require.NoError(t, err)
state := Snapshot{
LastUpdatedTimes: make(map[string]time.Time),
LocalToRemoteNames: make(map[string]string),
RemoteToLocalNames: make(map[string]string),
}
err := os.Mkdir(filepath.Join(projectDir, "foo"), os.ModePerm)
err = os.Mkdir(filepath.Join(projectDir, "foo"), os.ModePerm)
assert.NoError(t, err)
f1 := testfile.CreateFile(t, filepath.Join(projectDir, "foo", "bar.py"))
defer f1.Close(t)
@ -119,7 +121,8 @@ func TestFolderDiff(t *testing.T) {
func TestPythonNotebookDiff(t *testing.T) {
// Create temp project dir
projectDir := t.TempDir()
fileSet := git.NewFileSet(projectDir)
fileSet, err := git.NewFileSet(projectDir)
require.NoError(t, err)
state := Snapshot{
LastUpdatedTimes: make(map[string]time.Time),
LocalToRemoteNames: make(map[string]string),
@ -189,7 +192,8 @@ func TestPythonNotebookDiff(t *testing.T) {
func TestErrorWhenIdenticalRemoteName(t *testing.T) {
// Create temp project dir
projectDir := t.TempDir()
fileSet := git.NewFileSet(projectDir)
fileSet, err := git.NewFileSet(projectDir)
require.NoError(t, err)
state := Snapshot{
LastUpdatedTimes: make(map[string]time.Time),
LocalToRemoteNames: make(map[string]string),

View File

@ -8,6 +8,7 @@ import (
"time"
"github.com/databricks/bricks/git"
"github.com/databricks/bricks/libs/fileset"
"github.com/databricks/bricks/libs/sync/repofiles"
"github.com/databricks/databricks-sdk-go"
)
@ -30,15 +31,18 @@ type SyncOptions struct {
type Sync struct {
*SyncOptions
fileSet *git.FileSet
fileSet *fileset.FileSet
snapshot *Snapshot
repoFiles *repofiles.RepoFiles
}
// New initializes and returns a new [Sync] instance.
func New(ctx context.Context, opts SyncOptions) (*Sync, error) {
fileSet := git.NewFileSet(opts.LocalPath)
err := fileSet.EnsureValidGitIgnoreExists()
fileSet, err := git.NewFileSet(opts.LocalPath)
if err != nil {
return nil, err
}
err = git.EnsureValidGitIgnoreExists(fileSet)
if err != nil {
return nil, err
}

View File

@ -8,6 +8,7 @@ import (
"sync"
"github.com/databricks/bricks/git"
"github.com/databricks/bricks/libs/fileset"
"github.com/databricks/databricks-sdk-go"
"github.com/databricks/databricks-sdk-go/service/commands"
"github.com/databricks/databricks-sdk-go/service/scim"
@ -26,7 +27,7 @@ type project struct {
environment *Environment
wsc *databricks.WorkspaceClient
me *scim.User
fileSet *git.FileSet
fileSet *fileset.FileSet
}
// Configure is used as a PreRunE function for all commands that
@ -65,10 +66,13 @@ func Initialize(ctx context.Context, root, env string) (context.Context, error)
return nil, fmt.Errorf("environment [%s] not defined", env)
}
fileSet := git.NewFileSet(root)
err = fileSet.EnsureValidGitIgnoreExists()
fileSet, err := git.NewFileSet(root)
if err != nil {
return ctx, nil
return nil, err
}
err = git.EnsureValidGitIgnoreExists(fileSet)
if err != nil {
return nil, err
}
p := project{
@ -116,7 +120,7 @@ func (p *project) Root() string {
return p.root
}
func (p *project) GetFileSet() *git.FileSet {
func (p *project) GetFileSet() *fileset.FileSet {
return p.fileSet
}
@ -129,7 +133,7 @@ func (p *project) GetFileSet() *git.FileSet {
// accidentally check into git
func (p *project) CacheDir() (string, error) {
// assert cache dir is present in git ignore
if !p.fileSet.IsGitIgnored(fmt.Sprintf("/%s/", CacheDirName)) {
if !p.fileSet.Ignorer().IgnoreDirectory(fmt.Sprintf("/%s/", CacheDirName)) {
return "", fmt.Errorf("please add /%s/ to .gitignore", CacheDirName)
}