From 41961226be98aa66a63a5514cb309d53881f9418 Mon Sep 17 00:00:00 2001 From: Andrew Nester Date: Tue, 4 Mar 2025 15:03:51 +0000 Subject: [PATCH] Switch to use GET workspace-files/{name} instead of workspace/export for state files (#2423) ## Changes Switch to use GET workspace-files/{name} instead of workspace/export for state files. ## Why `/api/2.0/workspace-files/{name}` has a higher limit, which allows exporting state files larger than 10 MB (the current limit for `workspace/export`). We don't use the same API for reads in other places, and we don't fully replace the existing Filer, because it doesn't correctly get the file content for notebooks and returns a "File Not Found" error instead. ## Tests All existing tests pass --- acceptance/bundle/state/databricks.yml | 21 ++++++ acceptance/bundle/state/out.state.txt | 4 + acceptance/bundle/state/output.txt | 12 +++ acceptance/bundle/state/script | 4 + acceptance/bundle/state/test.py | 1 + acceptance/bundle/state/test.toml | 2 + acceptance/server_test.go | 5 ++ bundle/deploy/filer.go | 86 +++++++++++++++++++++- bundle/deploy/state_push.go | 13 ---- bundle/deploy/terraform/state_push.go | 11 --- bundle/deploy/terraform/state_push_test.go | 27 ------- libs/testserver/fake_workspace.go | 7 ++ 12 files changed, 139 insertions(+), 54 deletions(-) create mode 100644 acceptance/bundle/state/databricks.yml create mode 100644 acceptance/bundle/state/out.state.txt create mode 100644 acceptance/bundle/state/output.txt create mode 100644 acceptance/bundle/state/script create mode 100644 acceptance/bundle/state/test.py create mode 100644 acceptance/bundle/state/test.toml diff --git a/acceptance/bundle/state/databricks.yml b/acceptance/bundle/state/databricks.yml new file mode 100644 index 000000000..4775ba33d --- /dev/null +++ b/acceptance/bundle/state/databricks.yml @@ -0,0 +1,21 @@ +bundle: + name: state + +resources: + jobs: + test: + name: "test" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + 
new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + data_security_mode: SINGLE_USER + num_workers: 0 + spark_conf: + spark.master: "local[*, 4]" + spark.databricks.cluster.profile: singleNode + custom_tags: + ResourceClass: SingleNode diff --git a/acceptance/bundle/state/out.state.txt b/acceptance/bundle/state/out.state.txt new file mode 100644 index 000000000..96966812b --- /dev/null +++ b/acceptance/bundle/state/out.state.txt @@ -0,0 +1,4 @@ +{ + "method": "GET", + "path": "/api/2.0/workspace-files/Workspace/Users/[USERNAME]/.bundle/state/default/state/terraform.tfstate" +} diff --git a/acceptance/bundle/state/output.txt b/acceptance/bundle/state/output.txt new file mode 100644 index 000000000..ac13a7ba4 --- /dev/null +++ b/acceptance/bundle/state/output.txt @@ -0,0 +1,12 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/state/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/state/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/bundle/state/script b/acceptance/bundle/state/script new file mode 100644 index 000000000..9977f7d52 --- /dev/null +++ b/acceptance/bundle/state/script @@ -0,0 +1,4 @@ +trace $CLI bundle deploy +trace $CLI bundle deploy # We do 2 deploys because only 2nd deploy will pull state from remote after 1st created it +jq 'select(.path == "/api/2.0/workspace-files/Workspace/Users/[USERNAME]/.bundle/state/default/state/terraform.tfstate")' out.requests.txt > out.state.txt +rm out.requests.txt diff --git a/acceptance/bundle/state/test.py b/acceptance/bundle/state/test.py new file mode 100644 index 000000000..f1a18139c --- /dev/null +++ b/acceptance/bundle/state/test.py @@ -0,0 +1 @@ +print("Hello world!") diff --git a/acceptance/bundle/state/test.toml b/acceptance/bundle/state/test.toml new file mode 100644 index 000000000..7061886de --- /dev/null +++ b/acceptance/bundle/state/test.toml @@ -0,0 +1,2 @@ +Cloud = false +RecordRequests = true diff --git a/acceptance/server_test.go b/acceptance/server_test.go index 402e3ca5f..7ef705931 100644 --- a/acceptance/server_test.go +++ b/acceptance/server_test.go @@ -111,6 +111,11 @@ func AddHandlers(server *testserver.Server) { return "" }) + server.Handle("GET", "/api/2.0/workspace-files/{path:.*}", func(req testserver.Request) any { + path := req.Vars["path"] + return req.Workspace.WorkspaceFilesExportFile(path) + }) + server.Handle("GET", "/api/2.1/unity-catalog/current-metastore-assignment", func(req testserver.Request) any { return testMetastore }) diff --git a/bundle/deploy/filer.go b/bundle/deploy/filer.go index c0fd839ef..b65f08a67 100644 --- a/bundle/deploy/filer.go +++ b/bundle/deploy/filer.go @@ -1,14 +1,94 @@ package deploy import ( + "bytes" + "context" + "fmt" + "io" + "io/fs" + "net/http" + "net/url" + "strings" + "github.com/databricks/cli/bundle" "github.com/databricks/cli/libs/filer" + "github.com/databricks/databricks-sdk-go/client" ) // FilerFactory is a function that returns a filer.Filer. 
type FilerFactory func(b *bundle.Bundle) (filer.Filer, error) -// StateFiler returns a filer.Filer that can be used to read/write state files. -func StateFiler(b *bundle.Bundle) (filer.Filer, error) { - return filer.NewWorkspaceFilesClient(b.WorkspaceClient(), b.Config.Workspace.StatePath) +type stateFiler struct { + filer filer.Filer + + apiClient *client.DatabricksClient + root filer.WorkspaceRootPath +} + +func (s stateFiler) Delete(ctx context.Context, path string, mode ...filer.DeleteMode) error { + return s.filer.Delete(ctx, path, mode...) +} + +// Mkdir implements filer.Filer. +func (s stateFiler) Mkdir(ctx context.Context, path string) error { + return s.filer.Mkdir(ctx, path) +} + +func (s stateFiler) Read(ctx context.Context, path string) (io.ReadCloser, error) { + absPath, err := s.root.Join(path) + if err != nil { + return nil, err + } + + stat, err := s.Stat(ctx, path) + if err != nil { + return nil, err + } + if stat.IsDir() { + return nil, fmt.Errorf("not a file: %s", absPath) + } + + var buf bytes.Buffer + urlPath := "/api/2.0/workspace-files/" + url.PathEscape(strings.TrimLeft(absPath, "/")) + err = s.apiClient.Do(ctx, http.MethodGet, urlPath, nil, nil, nil, &buf) + if err != nil { + return nil, err + } + + return io.NopCloser(&buf), nil +} + +func (s stateFiler) ReadDir(ctx context.Context, path string) ([]fs.DirEntry, error) { + return s.filer.ReadDir(ctx, path) +} + +func (s stateFiler) Stat(ctx context.Context, name string) (fs.FileInfo, error) { + return s.filer.Stat(ctx, name) +} + +func (s stateFiler) Write(ctx context.Context, path string, reader io.Reader, mode ...filer.WriteMode) error { + return s.filer.Write(ctx, path, reader, mode...) +} + +// StateFiler returns a filer.Filer that can be used to read/write state files. +// We use a custom workspace filer which uses the workspace-files API to read state files. +// This API has a higher limit than the 10 MB of workspace/export and allows exporting large state files.
+// We don't use the same API for reads elsewhere because it doesn't correctly get the file content for notebooks and returns +// a "File Not Found" error instead. +func StateFiler(b *bundle.Bundle) (filer.Filer, error) { + f, err := filer.NewWorkspaceFilesClient(b.WorkspaceClient(), b.Config.Workspace.StatePath) + if err != nil { + return nil, err + } + + apiClient, err := client.New(b.WorkspaceClient().Config) + if err != nil { + return nil, fmt.Errorf("failed to create API client: %w", err) + } + + return stateFiler{ + filer: f, + root: filer.NewWorkspaceRootPath(b.Config.Workspace.StatePath), + apiClient: apiClient, + }, nil } diff --git a/bundle/deploy/state_push.go b/bundle/deploy/state_push.go index 6912414c1..176a907c8 100644 --- a/bundle/deploy/state_push.go +++ b/bundle/deploy/state_push.go @@ -10,8 +10,6 @@ import ( "github.com/databricks/cli/libs/log" ) -const MaxStateFileSize = 10 * 1024 * 1024 // 10MB - type statePush struct { filerFactory FilerFactory } @@ -37,17 +35,6 @@ func (s *statePush) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostic } defer local.Close() - if !b.Config.Bundle.Force { - state, err := local.Stat() - if err != nil { - return diag.FromErr(err) - } - - if state.Size() > MaxStateFileSize { - return diag.Errorf("Deployment state file size exceeds the maximum allowed size of %d bytes. 
Please reduce the number of resources in your bundle, split your bundle into multiple or re-run the command with --force flag.", MaxStateFileSize) - } - } - log.Infof(ctx, "Writing local deployment state file to remote state directory") err = f.Write(ctx, DeploymentStateFileName, local, filer.CreateParentDirectories, filer.OverwriteIfExists) if err != nil { diff --git a/bundle/deploy/terraform/state_push.go b/bundle/deploy/terraform/state_push.go index 84d8e7670..6cdde1371 100644 --- a/bundle/deploy/terraform/state_push.go +++ b/bundle/deploy/terraform/state_push.go @@ -47,17 +47,6 @@ func (l *statePush) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostic } defer local.Close() - if !b.Config.Bundle.Force { - state, err := local.Stat() - if err != nil { - return diag.FromErr(err) - } - - if state.Size() > deploy.MaxStateFileSize { - return diag.Errorf("Terraform state file size exceeds the maximum allowed size of %d bytes. Please reduce the number of resources in your bundle, split your bundle into multiple or re-run the command with --force flag", deploy.MaxStateFileSize) - } - } - // Upload state file from local cache directory to filer. 
cmdio.LogString(ctx, "Updating deployment state...") log.Infof(ctx, "Writing local state file to remote state directory") diff --git a/bundle/deploy/terraform/state_push_test.go b/bundle/deploy/terraform/state_push_test.go index 54e7f621c..e022dee1b 100644 --- a/bundle/deploy/terraform/state_push_test.go +++ b/bundle/deploy/terraform/state_push_test.go @@ -3,7 +3,6 @@ package terraform import ( "context" "encoding/json" - "fmt" "io" "testing" @@ -60,29 +59,3 @@ func TestStatePush(t *testing.T) { diags := bundle.Apply(ctx, b, m) assert.NoError(t, diags.Error()) } - -func TestStatePushLargeState(t *testing.T) { - mock := mockfiler.NewMockFiler(t) - m := &statePush{ - identityFiler(mock), - } - - ctx := context.Background() - b := statePushTestBundle(t) - - largeState := map[string]any{} - for i := range 1000000 { - largeState[fmt.Sprintf("field_%d", i)] = i - } - - // Write a stale local state file. - writeLocalState(t, ctx, b, largeState) - diags := bundle.Apply(ctx, b, m) - assert.ErrorContains(t, diags.Error(), "Terraform state file size exceeds the maximum allowed size of 10485760 bytes. Please reduce the number of resources in your bundle, split your bundle into multiple or re-run the command with --force flag") - - // Force the write. - b = statePushTestBundle(t) - b.Config.Bundle.Force = true - diags = bundle.Apply(ctx, b, m) - assert.NoError(t, diags.Error()) -} diff --git a/libs/testserver/fake_workspace.go b/libs/testserver/fake_workspace.go index 4e943f828..80e88941d 100644 --- a/libs/testserver/fake_workspace.go +++ b/libs/testserver/fake_workspace.go @@ -83,6 +83,13 @@ func (s *FakeWorkspace) WorkspaceFilesImportFile(path string, body []byte) { s.files[path] = body } +func (s *FakeWorkspace) WorkspaceFilesExportFile(path string) []byte { + if !strings.HasPrefix(path, "/") { + path = "/" + path + } + return s.files[path] +} + func (s *FakeWorkspace) JobsCreate(request jobs.CreateJob) Response { jobId := s.nextJobId s.nextJobId++