From 5a8cd0c5bc3f12805019b16bcede8d6edd7d5697 Mon Sep 17 00:00:00 2001 From: shreyas-goenka <88374338+shreyas-goenka@users.noreply.github.com> Date: Fri, 27 Oct 2023 14:55:43 +0200 Subject: [PATCH] Persist deployment metadata in WSFS (#845) ## Changes This PR introduces a metadata struct that stores a subset of bundle configuration that we wish to expose to other Databricks services that wish to integrate with bundles. This metadata is uploaded to the file `${bundle.workspace.state_path}/metadata.json` in the WSFS destination of the bundle deployment. Documentation for emitted metadata fields: * `version`: Version for the metadata file schema * `config.bundle.git.branch`: Name of the git branch the bundle was deployed from. * `config.bundle.git.origin_url`: URL for git remote "origin" * `config.bundle.git.bundle_root_path`: Relative path of the bundle root from the root of the git repository. Is set to "." if they are the same. * `config.bundle.git.commit`: SHA-1 commit hash of the exact commit this bundle was deployed from. Note, the deployment might not exactly match this commit version if there are changes that have not been committed to git at deploy time. * `config.workspace.file_path`: Path in workspace where we sync bundle files to. * `config.resources.jobs.[job-ref].id`: Id of the job * `config.resources.jobs.[job-ref].relative_path`: Relative path of the yaml config file from the bundle root where this job was defined. Example metadata object when bundle root and git root are the same: ```json { "version": 1, "config": { "bundle": { "lock": {}, "git": { "branch": "master", "origin_url": "www.host.com", "commit": "7af8e5d3f5dceffff9295d42d21606ccf056dce0", "bundle_root_path": "." 
} }, "workspace": { "file_path": "/Users/shreyas.goenka@databricks.com/.bundle/pipeline-progress/default/files" }, "resources": { "jobs": { "bar": { "id": "245921165354846", "relative_path": "databricks.yml" } } }, "sync": {} } } ``` Example metadata when the git root is one level above the bundle root: ```json { "version": 1, "config": { "bundle": { "lock": {}, "git": { "branch": "dev-branch", "origin_url": "www.my-repo.com", "commit": "3db46ef750998952b00a2b3e7991e31787e4b98b", "bundle_root_path": "pipeline-progress" } }, "workspace": { "file_path": "/Users/shreyas.goenka@databricks.com/.bundle/pipeline-progress/default/files" }, "resources": { "jobs": { "bar": { "id": "245921165354846", "relative_path": "databricks.yml" } } }, "sync": {} } } ``` This unblocks integration with the jobs break glass UI for bundles. ## Tests Unit tests and integration tests. --- bundle/bundle.go | 9 ++ bundle/config/bundle.go | 2 +- bundle/config/git.go | 3 + bundle/config/lock.go | 4 +- bundle/config/mutator/load_git_details.go | 13 +++ bundle/config/paths/paths.go | 4 +- bundle/deploy/metadata/compute.go | 51 +++++++++ bundle/deploy/metadata/compute_test.go | 100 +++++++++++++++++ bundle/deploy/metadata/upload.go | 36 ++++++ bundle/metadata/metadata.go | 45 ++++++++ bundle/phases/deploy.go | 8 +- .../databricks_template_schema.json | 2 +- .../databricks_template_schema.json | 16 +++ .../bundles/job_metadata/template/a/b/bar.py | 2 + .../template/a/b/resources.yml.tmpl | 12 ++ .../job_metadata/template/databricks.yml.tmpl | 21 ++++ .../bundles/job_metadata/template/foo.py | 2 + internal/bundle/job_metadata_test.go | 105 ++++++++++++++++++ libs/git/repository.go | 2 +- 19 files changed, 429 insertions(+), 8 deletions(-) create mode 100644 bundle/deploy/metadata/compute.go create mode 100644 bundle/deploy/metadata/compute_test.go create mode 100644 bundle/deploy/metadata/upload.go create mode 100644 bundle/metadata/metadata.go create mode 100644 
internal/bundle/bundles/job_metadata/databricks_template_schema.json create mode 100644 internal/bundle/bundles/job_metadata/template/a/b/bar.py create mode 100644 internal/bundle/bundles/job_metadata/template/a/b/resources.yml.tmpl create mode 100644 internal/bundle/bundles/job_metadata/template/databricks.yml.tmpl create mode 100644 internal/bundle/bundles/job_metadata/template/foo.py create mode 100644 internal/bundle/job_metadata_test.go diff --git a/bundle/bundle.go b/bundle/bundle.go index fd9c131f..a2d774bb 100644 --- a/bundle/bundle.go +++ b/bundle/bundle.go @@ -15,6 +15,7 @@ import ( "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/bundle/env" + "github.com/databricks/cli/bundle/metadata" "github.com/databricks/cli/folders" "github.com/databricks/cli/libs/git" "github.com/databricks/cli/libs/locker" @@ -31,6 +32,14 @@ const internalFolder = ".internal" type Bundle struct { Config config.Root + // Metadata about the bundle deployment. This is the interface Databricks services + // rely on to integrate with bundles when they need additional information about + // a bundle deployment. + // + // After deploy, a file containing the metadata (metadata.json) can be found + // in the WSFS location containing the bundle state. + Metadata metadata.Metadata + // Store a pointer to the workspace client. // It can be initialized on demand after loading the configuration. clientOnce sync.Once diff --git a/bundle/config/bundle.go b/bundle/config/bundle.go index d444f507..933e88bf 100644 --- a/bundle/config/bundle.go +++ b/bundle/config/bundle.go @@ -29,7 +29,7 @@ type Bundle struct { Lock Lock `json:"lock" bundle:"readonly"` // Force-override Git branch validation. - Force bool `json:"force" bundle:"readonly"` + Force bool `json:"force,omitempty" bundle:"readonly"` // Contains Git information like current commit, current branch and // origin url. 
Automatically loaded by reading .git directory if not specified diff --git a/bundle/config/git.go b/bundle/config/git.go index 760134a8..58a5d54d 100644 --- a/bundle/config/git.go +++ b/bundle/config/git.go @@ -5,6 +5,9 @@ type Git struct { OriginURL string `json:"origin_url,omitempty"` Commit string `json:"commit,omitempty" bundle:"readonly"` + // Path to bundle root relative to the git repository root. + BundleRootPath string `json:"bundle_root_path,omitempty" bundle:"readonly"` + // Inferred is set to true if the Git details were inferred and weren't set explicitly Inferred bool `json:"-" bundle:"readonly"` diff --git a/bundle/config/lock.go b/bundle/config/lock.go index 28d5a5ac..760099a9 100644 --- a/bundle/config/lock.go +++ b/bundle/config/lock.go @@ -4,11 +4,11 @@ type Lock struct { // Enabled toggles deployment lock. True by default. // Use a pointer value so that only explicitly configured values are set // and we don't merge configuration with zero-initialized values. - Enabled *bool `json:"enabled"` + Enabled *bool `json:"enabled,omitempty"` // Force acquisition of deployment lock even if it is currently held. // This may be necessary if a prior deployment failed to release the lock. - Force bool `json:"force"` + Force bool `json:"force,omitempty"` } func (lock Lock) IsEnabled() bool { diff --git a/bundle/config/mutator/load_git_details.go b/bundle/config/mutator/load_git_details.go index ab47677d..3a50d683 100644 --- a/bundle/config/mutator/load_git_details.go +++ b/bundle/config/mutator/load_git_details.go @@ -2,6 +2,7 @@ package mutator import ( "context" + "path/filepath" "github.com/databricks/cli/bundle" "github.com/databricks/cli/libs/git" @@ -52,5 +53,17 @@ func (m *loadGitDetails) Apply(ctx context.Context, b *bundle.Bundle) error { remoteUrl := repo.OriginUrl() b.Config.Bundle.Git.OriginURL = remoteUrl } + + // Compute relative path of the bundle root from the Git repo root. 
+ absBundlePath, err := filepath.Abs(b.Config.Path) + if err != nil { + return err + } + // repo.Root() returns the absolute path of the repo + relBundlePath, err := filepath.Rel(repo.Root(), absBundlePath) + if err != nil { + return err + } + b.Config.Bundle.Git.BundleRootPath = filepath.ToSlash(relBundlePath) return nil } diff --git a/bundle/config/paths/paths.go b/bundle/config/paths/paths.go index c2cbcb7d..2c9ecb8c 100644 --- a/bundle/config/paths/paths.go +++ b/bundle/config/paths/paths.go @@ -6,8 +6,8 @@ import ( ) type Paths struct { - // ConfigFilePath holds the path to the configuration file that - // described the resource that this type is embedded in. + // Absolute path on the local file system to the configuration file that holds + // the definition of this resource. ConfigFilePath string `json:"-" bundle:"readonly"` } diff --git a/bundle/deploy/metadata/compute.go b/bundle/deploy/metadata/compute.go new file mode 100644 index 00000000..9a3ae0e3 --- /dev/null +++ b/bundle/deploy/metadata/compute.go @@ -0,0 +1,51 @@ +package metadata + +import ( + "context" + "fmt" + "path/filepath" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/metadata" +) + +type compute struct{} + +func Compute() bundle.Mutator { + return &compute{} +} + +func (m *compute) Name() string { + return "metadata.Compute" +} + +func (m *compute) Apply(_ context.Context, b *bundle.Bundle) error { + b.Metadata = metadata.Metadata{ + Version: metadata.Version, + Config: metadata.Config{}, + } + + // Set git details in metadata + b.Metadata.Config.Bundle.Git = b.Config.Bundle.Git + + // Set job config paths in metadata + jobsMetadata := make(map[string]*metadata.Job) + for name, job := range b.Config.Resources.Jobs { + // Compute config file path the job is defined in, relative to the bundle + // root + relativePath, err := filepath.Rel(b.Config.Path, job.ConfigFilePath) + if err != nil { + return fmt.Errorf("failed to compute relative path for job %s: %w", name, 
err) + } + // Metadata for the job + jobsMetadata[name] = &metadata.Job{ + ID: job.ID, + RelativePath: filepath.ToSlash(relativePath), + } + } + b.Metadata.Config.Resources.Jobs = jobsMetadata + + // Set file upload destination of the bundle in metadata + b.Metadata.Config.Workspace.FilesPath = b.Config.Workspace.FilesPath + return nil +} diff --git a/bundle/deploy/metadata/compute_test.go b/bundle/deploy/metadata/compute_test.go new file mode 100644 index 00000000..9e4b475c --- /dev/null +++ b/bundle/deploy/metadata/compute_test.go @@ -0,0 +1,100 @@ +package metadata + +import ( + "context" + "testing" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/config" + "github.com/databricks/cli/bundle/config/paths" + "github.com/databricks/cli/bundle/config/resources" + "github.com/databricks/cli/bundle/metadata" + "github.com/databricks/databricks-sdk-go/service/jobs" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestComputeMetadataMutator(t *testing.T) { + b := &bundle.Bundle{ + Config: config.Root{ + Workspace: config.Workspace{ + RootPath: "/Users/shreyas.goenka@databricks.com", + ArtifactsPath: "/Users/shreyas.goenka@databricks.com/artifacts", + FilesPath: "/Users/shreyas.goenka@databricks.com/files", + }, + Bundle: config.Bundle{ + Name: "my-bundle", + Target: "development", + Git: config.Git{ + Branch: "my-branch", + OriginURL: "www.host.com", + Commit: "abcd", + BundleRootPath: "a/b/c/d", + }, + }, + Resources: config.Resources{ + Jobs: map[string]*resources.Job{ + "my-job-1": { + Paths: paths.Paths{ + ConfigFilePath: "a/b/c", + }, + ID: "1111", + JobSettings: &jobs.JobSettings{ + Name: "My Job One", + }, + }, + "my-job-2": { + Paths: paths.Paths{ + ConfigFilePath: "d/e/f", + }, + ID: "2222", + JobSettings: &jobs.JobSettings{ + Name: "My Job Two", + }, + }, + }, + Pipelines: map[string]*resources.Pipeline{ + "my-pipeline": { + Paths: paths.Paths{ + ConfigFilePath: "abc", + }, + }, + }, + }, + }, 
+ } + + expectedMetadata := metadata.Metadata{ + Version: metadata.Version, + Config: metadata.Config{ + Workspace: metadata.Workspace{ + FilesPath: "/Users/shreyas.goenka@databricks.com/files", + }, + Bundle: metadata.Bundle{ + Git: config.Git{ + Branch: "my-branch", + OriginURL: "www.host.com", + Commit: "abcd", + BundleRootPath: "a/b/c/d", + }, + }, + Resources: metadata.Resources{ + Jobs: map[string]*metadata.Job{ + "my-job-1": { + RelativePath: "a/b/c", + ID: "1111", + }, + "my-job-2": { + RelativePath: "d/e/f", + ID: "2222", + }, + }, + }, + }, + } + + err := Compute().Apply(context.Background(), b) + require.NoError(t, err) + + assert.Equal(t, expectedMetadata, b.Metadata) +} diff --git a/bundle/deploy/metadata/upload.go b/bundle/deploy/metadata/upload.go new file mode 100644 index 00000000..f550a66e --- /dev/null +++ b/bundle/deploy/metadata/upload.go @@ -0,0 +1,36 @@ +package metadata + +import ( + "bytes" + "context" + "encoding/json" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/libs/filer" +) + +const MetadataFileName = "metadata.json" + +type upload struct{} + +func Upload() bundle.Mutator { + return &upload{} +} + +func (m *upload) Name() string { + return "metadata.Upload" +} + +func (m *upload) Apply(ctx context.Context, b *bundle.Bundle) error { + f, err := filer.NewWorkspaceFilesClient(b.WorkspaceClient(), b.Config.Workspace.StatePath) + if err != nil { + return err + } + + metadata, err := json.MarshalIndent(b.Metadata, "", " ") + if err != nil { + return err + } + + return f.Write(ctx, MetadataFileName, bytes.NewReader(metadata), filer.CreateParentDirectories, filer.OverwriteIfExists) +} diff --git a/bundle/metadata/metadata.go b/bundle/metadata/metadata.go new file mode 100644 index 00000000..27edd584 --- /dev/null +++ b/bundle/metadata/metadata.go @@ -0,0 +1,45 @@ +package metadata + +import ( + "github.com/databricks/cli/bundle/config" +) + +const Version = 1 + +type Bundle struct { + Git config.Git 
`json:"git,omitempty"` +} + +type Workspace struct { + FilesPath string `json:"file_path,omitempty"` +} + +type Job struct { + ID string `json:"id,omitempty"` + + // Relative path from the bundle root to the configuration file that holds + // the definition of this resource. + RelativePath string `json:"relative_path,omitempty"` +} + +type Resources struct { + Jobs map[string]*Job `json:"jobs,omitempty"` +} + +type Config struct { + Bundle Bundle `json:"bundle,omitempty"` + Workspace Workspace `json:"workspace,omitempty"` + Resources Resources `json:"resources,omitempty"` +} + +// Metadata about the bundle deployment. This is the interface Databricks services +// rely on to integrate with bundles when they need additional information about +// a bundle deployment. +// +// After deploy, a file containing the metadata (metadata.json) can be found +// in the WSFS location containing the bundle state. +type Metadata struct { + Version int `json:"version"` + + Config Config `json:"config"` +} diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 6c75218b..805bae80 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -7,6 +7,7 @@ import ( "github.com/databricks/cli/bundle/config/mutator" "github.com/databricks/cli/bundle/deploy/files" "github.com/databricks/cli/bundle/deploy/lock" + "github.com/databricks/cli/bundle/deploy/metadata" "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/libraries" "github.com/databricks/cli/bundle/python" @@ -31,7 +32,12 @@ func Deploy() bundle.Mutator { terraform.StatePull(), bundle.Defer( terraform.Apply(), - terraform.StatePush(), + bundle.Seq( + terraform.StatePush(), + terraform.Load(), + metadata.Compute(), + metadata.Upload(), + ), ), ), lock.Release(lock.GoalDeploy), diff --git a/internal/bundle/bundles/deploy_then_remove_resources/databricks_template_schema.json b/internal/bundle/bundles/deploy_then_remove_resources/databricks_template_schema.json index 
cfed842c..8fca7a7c 100644 --- a/internal/bundle/bundles/deploy_then_remove_resources/databricks_template_schema.json +++ b/internal/bundle/bundles/deploy_then_remove_resources/databricks_template_schema.json @@ -2,7 +2,7 @@ "properties": { "unique_id": { "type": "string", - "description": "Unique ID for job name" + "description": "Unique ID for pipeline name" } } } diff --git a/internal/bundle/bundles/job_metadata/databricks_template_schema.json b/internal/bundle/bundles/job_metadata/databricks_template_schema.json new file mode 100644 index 00000000..c1c5cf12 --- /dev/null +++ b/internal/bundle/bundles/job_metadata/databricks_template_schema.json @@ -0,0 +1,16 @@ +{ + "properties": { + "unique_id": { + "type": "string", + "description": "Unique ID for job name" + }, + "spark_version": { + "type": "string", + "description": "Spark version used for job cluster" + }, + "node_type_id": { + "type": "string", + "description": "Node type id for job cluster" + } + } +} diff --git a/internal/bundle/bundles/job_metadata/template/a/b/bar.py b/internal/bundle/bundles/job_metadata/template/a/b/bar.py new file mode 100644 index 00000000..6f463767 --- /dev/null +++ b/internal/bundle/bundles/job_metadata/template/a/b/bar.py @@ -0,0 +1,2 @@ +# Databricks notebook source +print("bye") diff --git a/internal/bundle/bundles/job_metadata/template/a/b/resources.yml.tmpl b/internal/bundle/bundles/job_metadata/template/a/b/resources.yml.tmpl new file mode 100644 index 00000000..bdba05f5 --- /dev/null +++ b/internal/bundle/bundles/job_metadata/template/a/b/resources.yml.tmpl @@ -0,0 +1,12 @@ +resources: + jobs: + bar: + name: test-job-metadata-2-{{.unique_id}} + tasks: + - task_key: my_notebook_task + new_cluster: + num_workers: 1 + spark_version: "{{.spark_version}}" + node_type_id: "{{.node_type_id}}" + notebook_task: + notebook_path: "./bar.py" diff --git a/internal/bundle/bundles/job_metadata/template/databricks.yml.tmpl 
b/internal/bundle/bundles/job_metadata/template/databricks.yml.tmpl new file mode 100644 index 00000000..7aaabadd --- /dev/null +++ b/internal/bundle/bundles/job_metadata/template/databricks.yml.tmpl @@ -0,0 +1,21 @@ +bundle: + name: job-metadata + +workspace: + root_path: "~/.bundle/{{.unique_id}}" + +include: + - "a/b/*.yml" + +resources: + jobs: + foo: + name: test-job-metadata-1-{{.unique_id}} + tasks: + - task_key: my_notebook_task + new_cluster: + num_workers: 1 + spark_version: "{{.spark_version}}" + node_type_id: "{{.node_type_id}}" + notebook_task: + notebook_path: "./foo.py" diff --git a/internal/bundle/bundles/job_metadata/template/foo.py b/internal/bundle/bundles/job_metadata/template/foo.py new file mode 100644 index 00000000..4914a743 --- /dev/null +++ b/internal/bundle/bundles/job_metadata/template/foo.py @@ -0,0 +1,2 @@ +# Databricks notebook source +print("hello") diff --git a/internal/bundle/job_metadata_test.go b/internal/bundle/job_metadata_test.go new file mode 100644 index 00000000..70962c4c --- /dev/null +++ b/internal/bundle/job_metadata_test.go @@ -0,0 +1,105 @@ +package bundle + +import ( + "context" + "encoding/json" + "fmt" + "io" + "path" + "strconv" + "testing" + + "github.com/databricks/cli/bundle/config" + "github.com/databricks/cli/bundle/metadata" + "github.com/databricks/cli/internal" + "github.com/databricks/cli/libs/filer" + "github.com/databricks/databricks-sdk-go" + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestAccJobsMetadataFile(t *testing.T) { + env := internal.GetEnvOrSkipTest(t, "CLOUD_ENV") + t.Log(env) + + w, err := databricks.NewWorkspaceClient() + require.NoError(t, err) + + nodeTypeId := internal.GetNodeTypeId(env) + uniqueId := uuid.New().String() + bundleRoot, err := initTestTemplate(t, "job_metadata", map[string]any{ + "unique_id": uniqueId, + "node_type_id": nodeTypeId, + "spark_version": "13.2.x-snapshot-scala2.12", + }) + 
require.NoError(t, err) + + // deploy bundle + err = deployBundle(t, bundleRoot) + require.NoError(t, err) + + // Cleanup the deployed bundle + t.Cleanup(func() { + err = destroyBundle(t, bundleRoot) + require.NoError(t, err) + }) + + // assert job 1 is created + jobName := "test-job-metadata-1-" + uniqueId + job1, err := w.Jobs.GetBySettingsName(context.Background(), jobName) + require.NoError(t, err) + assert.Equal(t, job1.Settings.Name, jobName) + + // assert job 2 is created + jobName = "test-job-metadata-2-" + uniqueId + job2, err := w.Jobs.GetBySettingsName(context.Background(), jobName) + require.NoError(t, err) + assert.Equal(t, job2.Settings.Name, jobName) + + // Compute root path for the bundle deployment + me, err := w.CurrentUser.Me(context.Background()) + require.NoError(t, err) + root := fmt.Sprintf("/Users/%s/.bundle/%s", me.UserName, uniqueId) + f, err := filer.NewWorkspaceFilesClient(w, root) + require.NoError(t, err) + + // Read metadata object from the workspace + r, err := f.Read(context.Background(), "state/metadata.json") + require.NoError(t, err) + b, err := io.ReadAll(r) + require.NoError(t, err) + actualMetadata := metadata.Metadata{} + err = json.Unmarshal(b, &actualMetadata) + require.NoError(t, err) + + // expected value for the metadata + expectedMetadata := metadata.Metadata{ + Version: metadata.Version, + Config: metadata.Config{ + Bundle: metadata.Bundle{ + Git: config.Git{ + BundleRootPath: ".", + }, + }, + Workspace: metadata.Workspace{ + FilesPath: path.Join(root, "files"), + }, + Resources: metadata.Resources{ + Jobs: map[string]*metadata.Job{ + "foo": { + ID: strconv.FormatInt(job1.JobId, 10), + RelativePath: "databricks.yml", + }, + "bar": { + ID: strconv.FormatInt(job2.JobId, 10), + RelativePath: "a/b/resources.yml", + }, + }, + }, + }, + } + + // Assert metadata matches what we expected. 
+ assert.Equal(t, expectedMetadata, actualMetadata) +} diff --git a/libs/git/repository.go b/libs/git/repository.go index 9c847687..d1641118 100644 --- a/libs/git/repository.go +++ b/libs/git/repository.go @@ -40,7 +40,7 @@ type Repository struct { config *config } -// Root returns the repository root. +// Root returns the absolute path to the repository root. func (r *Repository) Root() string { return r.rootPath }