From 44e3928d6ad8f85f04041f8dd4cde25f32bfbe19 Mon Sep 17 00:00:00 2001 From: shreyas-goenka <88374338+shreyas-goenka@users.noreply.github.com> Date: Mon, 17 Jun 2024 15:18:52 +0530 Subject: [PATCH] Avoid multiple file tree traversals on bundle deploy (#1493) ## Changes To run bundle deploy from DBR we use an abstraction over the workspace import / export APIs to create a `filer.Filer` and abstract the file system. Walking the file tree in such a filer is expensive and requires multiple API calls. This PR remove the two duplicate file tree walks that happen by caching the result. --- bundle/bundle.go | 4 ++ bundle/deploy/files/upload.go | 2 +- bundle/deploy/state_update.go | 16 +---- bundle/deploy/state_update_test.go | 101 +++++++++++++---------------- cmd/bundle/sync.go | 3 +- cmd/sync/sync.go | 2 +- libs/sync/sync.go | 21 +++--- 7 files changed, 68 insertions(+), 81 deletions(-) diff --git a/bundle/bundle.go b/bundle/bundle.go index 1dc98656..482614b9 100644 --- a/bundle/bundle.go +++ b/bundle/bundle.go @@ -16,6 +16,7 @@ import ( "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/bundle/env" "github.com/databricks/cli/bundle/metadata" + "github.com/databricks/cli/libs/fileset" "github.com/databricks/cli/libs/folders" "github.com/databricks/cli/libs/git" "github.com/databricks/cli/libs/locker" @@ -50,6 +51,9 @@ type Bundle struct { clientOnce sync.Once client *databricks.WorkspaceClient + // Files that are synced to the workspace.file_path + Files []fileset.File + // Stores an initialized copy of this bundle's Terraform wrapper. Terraform *tfexec.Terraform diff --git a/bundle/deploy/files/upload.go b/bundle/deploy/files/upload.go index fa20ed4e..2c126623 100644 --- a/bundle/deploy/files/upload.go +++ b/bundle/deploy/files/upload.go @@ -23,7 +23,7 @@ func (m *upload) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { return diag.FromErr(err) } - err = sync.RunOnce(ctx) + b.Files, err = sync.RunOnce(ctx) if err != nil { return diag.FromErr(err) } diff --git a/bundle/deploy/state_update.go b/bundle/deploy/state_update.go index 6903a9f8..bfdb308c 100644 --- a/bundle/deploy/state_update.go +++ b/bundle/deploy/state_update.go @@ -11,7 +11,6 @@ import ( "time" "github.com/databricks/cli/bundle" - "github.com/databricks/cli/bundle/deploy/files" "github.com/databricks/cli/internal/build" "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/log" @@ -40,19 +39,8 @@ func (s *stateUpdate) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnost state.CliVersion = build.GetInfo().Version state.Version = DeploymentStateVersion - // Get the current file list. - sync, err := files.GetSync(ctx, bundle.ReadOnly(b)) - if err != nil { - return diag.FromErr(err) - } - - files, err := sync.GetFileList(ctx) - if err != nil { - return diag.FromErr(err) - } - - // Update the state with the current file list. - fl, err := FromSlice(files) + // Update the state with the current list of synced files. + fl, err := FromSlice(b.Files) if err != nil { return diag.FromErr(err) } diff --git a/bundle/deploy/state_update_test.go b/bundle/deploy/state_update_test.go index dd8a1336..ed72439d 100644 --- a/bundle/deploy/state_update_test.go +++ b/bundle/deploy/state_update_test.go @@ -10,19 +10,23 @@ import ( "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/internal/build" "github.com/databricks/cli/internal/testutil" - databrickscfg "github.com/databricks/databricks-sdk-go/config" - "github.com/databricks/databricks-sdk-go/experimental/mocks" + "github.com/databricks/cli/libs/fileset" + "github.com/databricks/cli/libs/vfs" "github.com/databricks/databricks-sdk-go/service/iam" - "github.com/databricks/databricks-sdk-go/service/workspace" - "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" ) -func TestStateUpdate(t *testing.T) { - s := &stateUpdate{} +func setupBundleForStateUpdate(t *testing.T) *bundle.Bundle { + tmpDir := t.TempDir() - b := &bundle.Bundle{ - RootPath: t.TempDir(), + testutil.Touch(t, tmpDir, "test1.py") + testutil.TouchNotebook(t, tmpDir, "test2.py") + + files, err := fileset.New(vfs.MustNew(tmpDir)).All() + require.NoError(t, err) + + return &bundle.Bundle{ + RootPath: tmpDir, Config: config.Root{ Bundle: config.Bundle{ Target: "default", @@ -37,22 +41,14 @@ func TestStateUpdate(t *testing.T) { }, }, }, + Files: files, } +} - testutil.Touch(t, b.RootPath, "test1.py") - testutil.Touch(t, b.RootPath, "test2.py") - - m := mocks.NewMockWorkspaceClient(t) - m.WorkspaceClient.Config = &databrickscfg.Config{ - Host: "https://test.com", - } - b.SetWorkpaceClient(m.WorkspaceClient) - - wsApi := m.GetMockWorkspaceAPI() - wsApi.EXPECT().GetStatusByPath(mock.Anything, "/files").Return(&workspace.ObjectInfo{ - ObjectType: "DIRECTORY", - }, nil) +func TestStateUpdate(t *testing.T) { + s := &stateUpdate{} + b := setupBundleForStateUpdate(t) ctx := context.Background() diags := bundle.Apply(ctx, b, s) @@ -63,7 +59,15 @@ func TestStateUpdate(t *testing.T) { require.NoError(t, err) require.Equal(t, int64(1), state.Seq) - require.Len(t, state.Files, 3) + require.Equal(t, state.Files, Filelist{ + { + LocalPath: "test1.py", + }, + { + LocalPath: "test2.py", + IsNotebook: true, + }, + }) require.Equal(t, build.GetInfo().Version, state.CliVersion) diags = bundle.Apply(ctx, b, s) @@ -74,45 +78,22 @@ func TestStateUpdate(t *testing.T) { require.NoError(t, err) require.Equal(t, int64(2), state.Seq) - require.Len(t, state.Files, 3) + require.Equal(t, state.Files, Filelist{ + { + LocalPath: "test1.py", + }, + { + LocalPath: "test2.py", + IsNotebook: true, + }, + }) require.Equal(t, build.GetInfo().Version, state.CliVersion) } func TestStateUpdateWithExistingState(t *testing.T) { s := &stateUpdate{} - b := &bundle.Bundle{ - RootPath: t.TempDir(), - Config: config.Root{ - Bundle: config.Bundle{ - Target: "default", - }, - Workspace: config.Workspace{ - StatePath: "/state", - FilePath: "/files", - CurrentUser: &config.User{ - User: &iam.User{ - UserName: "test-user", - }, - }, - }, - }, - } - - testutil.Touch(t, b.RootPath, "test1.py") - testutil.Touch(t, b.RootPath, "test2.py") - - m := mocks.NewMockWorkspaceClient(t) - m.WorkspaceClient.Config = &databrickscfg.Config{ - Host: "https://test.com", - } - b.SetWorkpaceClient(m.WorkspaceClient) - - wsApi := m.GetMockWorkspaceAPI() - wsApi.EXPECT().GetStatusByPath(mock.Anything, "/files").Return(&workspace.ObjectInfo{ - ObjectType: "DIRECTORY", - }, nil) - + b := setupBundleForStateUpdate(t) ctx := context.Background() // Create an existing state file. @@ -144,6 +125,14 @@ func TestStateUpdateWithExistingState(t *testing.T) { require.NoError(t, err) require.Equal(t, int64(11), state.Seq) - require.Len(t, state.Files, 3) + require.Equal(t, state.Files, Filelist{ + { + LocalPath: "test1.py", + }, + { + LocalPath: "test2.py", + IsNotebook: true, + }, + }) require.Equal(t, build.GetInfo().Version, state.CliVersion) } diff --git a/cmd/bundle/sync.go b/cmd/bundle/sync.go index 72ad8eb3..df3e087c 100644 --- a/cmd/bundle/sync.go +++ b/cmd/bundle/sync.go @@ -72,7 +72,8 @@ func newSyncCommand() *cobra.Command { return s.RunContinuous(ctx) } - return s.RunOnce(ctx) + _, err = s.RunOnce(ctx) + return err } return cmd diff --git a/cmd/sync/sync.go b/cmd/sync/sync.go index e5f1bfc9..bab45159 100644 --- a/cmd/sync/sync.go +++ b/cmd/sync/sync.go @@ -135,7 +135,7 @@ func New() *cobra.Command { if f.watch { err = s.RunContinuous(ctx) } else { - err = s.RunOnce(ctx) + _, err = s.RunOnce(ctx) } s.Close() diff --git a/libs/sync/sync.go b/libs/sync/sync.go index 585e8a88..12b1f1d0 100644 --- a/libs/sync/sync.go +++ b/libs/sync/sync.go @@ -152,36 +152,41 @@ func (s *Sync) notifyComplete(ctx context.Context, d diff) { s.seq++ } -func (s *Sync) RunOnce(ctx context.Context) error { +// Upload all files in the file tree rooted at the local path configured in the +// SyncOptions to the remote path configured in the SyncOptions. +// +// Returns the list of files tracked (and synchronized) by the syncer during the run, +// and an error if any occurred. +func (s *Sync) RunOnce(ctx context.Context) ([]fileset.File, error) { files, err := s.GetFileList(ctx) if err != nil { - return err + return files, err } change, err := s.snapshot.diff(ctx, files) if err != nil { - return err + return files, err } s.notifyStart(ctx, change) if change.IsEmpty() { s.notifyComplete(ctx, change) - return nil + return files, nil } err = s.applyDiff(ctx, change) if err != nil { - return err + return files, err } err = s.snapshot.Save(ctx) if err != nil { log.Errorf(ctx, "cannot store snapshot: %s", err) - return err + return files, err } s.notifyComplete(ctx, change) - return nil + return files, nil } func (s *Sync) GetFileList(ctx context.Context) ([]fileset.File, error) { @@ -231,7 +236,7 @@ func (s *Sync) RunContinuous(ctx context.Context) error { case <-ctx.Done(): return ctx.Err() case <-ticker.C: - err := s.RunOnce(ctx) + _, err := s.RunOnce(ctx) if err != nil { return err }