Add JobTaskClusterSpec validate mutator (#1784)

## Changes Add JobTaskClusterSpec validate mutator. It catches the case when tasks don't which cluster to use. For example, we can get this error with minor modifications to `default-python` template: ```yaml tasks: - task_key: python_file_task spark_python_task: python_file: ../src/my_project_10/main.py ``` ``` % databricks bundle validate Error: Missing required cluster or environment settings at resources.jobs.my_project_10_job.tasks[0] in resources/my_project_10_job.yml:17:11 Task "print_github_stars" requires a cluster or an environment to run. Specify one of the following fields: job_cluster_key, environment_key, existing_cluster_id, new_cluster. ``` We implicitly rely on "one of" validation, which does not exist. Many bundle fields can't co-exist, for instance, specifying: `JobTask.{existing_cluster_id,job_cluster_key}`, `Library.{whl,pypi}`, `JobTask.{notebook_task,python_wheel_task}`, etc. ## Tests Unit tests --------- Co-authored-by: Pieter Noordhuis <pcnoordhuis@gmail.com>
2024-09-25 13:30:14 +02:00 · 2024-09-25 13:30:14 +02:00 · 3d9decdda9
parent 490259a14a
commit 3d9decdda9
3 changed files with 365 additions and 0 deletions
--- a/bundle/config/validate/job_task_cluster_spec.go
+++ b/bundle/config/validate/job_task_cluster_spec.go
@ -0,0 +1,161 @@
 package validate
 import (
 	"context"
 	"fmt"
 	"strings"
 	"github.com/databricks/cli/bundle"
 	"github.com/databricks/cli/libs/diag"
 	"github.com/databricks/cli/libs/dyn"
 	"github.com/databricks/databricks-sdk-go/service/jobs"
 )
 // JobTaskClusterSpec validates that job tasks have cluster spec defined
 // if task requires a cluster
 func JobTaskClusterSpec() bundle.ReadOnlyMutator {
 	return &jobTaskClusterSpec{}
 }
 type jobTaskClusterSpec struct {
 }
 func (v *jobTaskClusterSpec) Name() string {
 	return "validate:job_task_cluster_spec"
 }
 func (v *jobTaskClusterSpec) Apply(ctx context.Context, rb bundle.ReadOnlyBundle) diag.Diagnostics {
 	diags := diag.Diagnostics{}
 	jobsPath := dyn.NewPath(dyn.Key("resources"), dyn.Key("jobs"))
 	for resourceName, job := range rb.Config().Resources.Jobs {
 		resourcePath := jobsPath.Append(dyn.Key(resourceName))
 		for taskIndex, task := range job.Tasks {
 			taskPath := resourcePath.Append(dyn.Key("tasks"), dyn.Index(taskIndex))
 			diags = diags.Extend(validateJobTask(rb, task, taskPath))
 		}
 	}
 	return diags
 }
 func validateJobTask(rb bundle.ReadOnlyBundle, task jobs.Task, taskPath dyn.Path) diag.Diagnostics {
 	diags := diag.Diagnostics{}
 	var specified []string
 	var unspecified []string
 	if task.JobClusterKey != "" {
 		specified = append(specified, "job_cluster_key")
 	} else {
 		unspecified = append(unspecified, "job_cluster_key")
 	}
 	if task.EnvironmentKey != "" {
 		specified = append(specified, "environment_key")
 	} else {
 		unspecified = append(unspecified, "environment_key")
 	}
 	if task.ExistingClusterId != "" {
 		specified = append(specified, "existing_cluster_id")
 	} else {
 		unspecified = append(unspecified, "existing_cluster_id")
 	}
 	if task.NewCluster != nil {
 		specified = append(specified, "new_cluster")
 	} else {
 		unspecified = append(unspecified, "new_cluster")
 	}
 	if task.ForEachTask != nil {
 		forEachTaskPath := taskPath.Append(dyn.Key("for_each_task"), dyn.Key("task"))
 		diags = diags.Extend(validateJobTask(rb, task.ForEachTask.Task, forEachTaskPath))
 	}
 	if isComputeTask(task) && len(specified) == 0 {
 		if task.NotebookTask != nil {
 			// notebook tasks without cluster spec will use notebook environment
 		} else {
 			// path might be not very helpful, adding user-specified task key clarifies the context
 			detail := fmt.Sprintf(
 				"Task %q requires a cluster or an environment to run.\nSpecify one of the following fields: %s.",
 				task.TaskKey,
 				strings.Join(unspecified, ", "),
 			)
 			diags = diags.Append(diag.Diagnostic{
 				Severity:  diag.Error,
 				Summary:   "Missing required cluster or environment settings",
 				Detail:    detail,
 				Locations: rb.Config().GetLocations(taskPath.String()),
 				Paths:     []dyn.Path{taskPath},
 			})
 		}
 	}
 	return diags
 }
 // isComputeTask returns true if the task runs on a cluster or serverless GC
 func isComputeTask(task jobs.Task) bool {
 	if task.NotebookTask != nil {
 		// if warehouse_id is set, it's SQL notebook that doesn't need cluster or serverless GC
 		if task.NotebookTask.WarehouseId != "" {
 			return false
 		} else {
 			// task settings don't require specifying a cluster/serverless GC, but task itself can run on one
 			// we handle that case separately in validateJobTask
 			return true
 		}
 	}
 	if task.PythonWheelTask != nil {
 		return true
 	}
 	if task.DbtTask != nil {
 		return true
 	}
 	if task.SparkJarTask != nil {
 		return true
 	}
 	if task.SparkSubmitTask != nil {
 		return true
 	}
 	if task.SparkPythonTask != nil {
 		return true
 	}
 	if task.SqlTask != nil {
 		return false
 	}
 	if task.PipelineTask != nil {
 		// while pipelines use clusters, pipeline tasks don't, they only trigger pipelines
 		return false
 	}
 	if task.RunJobTask != nil {
 		return false
 	}
 	if task.ConditionTask != nil {
 		return false
 	}
 	// for each task doesn't use clusters, underlying task(s) can though
 	if task.ForEachTask != nil {
 		return false
 	}
 	return false
 }
--- a/bundle/config/validate/job_task_cluster_spec_test.go
+++ b/bundle/config/validate/job_task_cluster_spec_test.go
@ -0,0 +1,203 @@
 package validate
 import (
 	"context"
 	"testing"
 	"github.com/databricks/cli/bundle"
 	"github.com/databricks/cli/bundle/config"
 	"github.com/databricks/cli/bundle/config/resources"
 	"github.com/databricks/databricks-sdk-go/service/compute"
 	"github.com/databricks/databricks-sdk-go/service/jobs"
 	"github.com/stretchr/testify/assert"
 )
 func TestJobTaskClusterSpec(t *testing.T) {
 	expectedSummary := "Missing required cluster or environment settings"
 	type testCase struct {
 		name         string
 		task         jobs.Task
 		errorPath    string
 		errorDetail  string
 		errorSummary string
 	}
 	testCases := []testCase{
 		{
 			name: "valid notebook task",
 			task: jobs.Task{
 				// while a cluster is needed, it will use notebook environment to create one
 				NotebookTask: &jobs.NotebookTask{},
 			},
 		},
 		{
 			name: "valid notebook task (job_cluster_key)",
 			task: jobs.Task{
 				JobClusterKey: "cluster1",
 				NotebookTask:  &jobs.NotebookTask{},
 			},
 		},
 		{
 			name: "valid notebook task (new_cluster)",
 			task: jobs.Task{
 				NewCluster:   &compute.ClusterSpec{},
 				NotebookTask: &jobs.NotebookTask{},
 			},
 		},
 		{
 			name: "valid notebook task (existing_cluster_id)",
 			task: jobs.Task{
 				ExistingClusterId: "cluster1",
 				NotebookTask:      &jobs.NotebookTask{},
 			},
 		},
 		{
 			name: "valid SQL notebook task",
 			task: jobs.Task{
 				NotebookTask: &jobs.NotebookTask{
 					WarehouseId: "warehouse1",
 				},
 			},
 		},
 		{
 			name: "valid python wheel task",
 			task: jobs.Task{
 				JobClusterKey:   "cluster1",
 				PythonWheelTask: &jobs.PythonWheelTask{},
 			},
 		},
 		{
 			name: "valid python wheel task (environment_key)",
 			task: jobs.Task{
 				EnvironmentKey:  "environment1",
 				PythonWheelTask: &jobs.PythonWheelTask{},
 			},
 		},
 		{
 			name: "valid dbt task",
 			task: jobs.Task{
 				JobClusterKey: "cluster1",
 				DbtTask:       &jobs.DbtTask{},
 			},
 		},
 		{
 			name: "valid spark jar task",
 			task: jobs.Task{
 				JobClusterKey: "cluster1",
 				SparkJarTask:  &jobs.SparkJarTask{},
 			},
 		},
 		{
 			name: "valid spark submit",
 			task: jobs.Task{
 				NewCluster:      &compute.ClusterSpec{},
 				SparkSubmitTask: &jobs.SparkSubmitTask{},
 			},
 		},
 		{
 			name: "valid spark python task",
 			task: jobs.Task{
 				JobClusterKey:   "cluster1",
 				SparkPythonTask: &jobs.SparkPythonTask{},
 			},
 		},
 		{
 			name: "valid SQL task",
 			task: jobs.Task{
 				SqlTask: &jobs.SqlTask{},
 			},
 		},
 		{
 			name: "valid pipeline task",
 			task: jobs.Task{
 				PipelineTask: &jobs.PipelineTask{},
 			},
 		},
 		{
 			name: "valid run job task",
 			task: jobs.Task{
 				RunJobTask: &jobs.RunJobTask{},
 			},
 		},
 		{
 			name: "valid condition task",
 			task: jobs.Task{
 				ConditionTask: &jobs.ConditionTask{},
 			},
 		},
 		{
 			name: "valid for each task",
 			task: jobs.Task{
 				ForEachTask: &jobs.ForEachTask{
 					Task: jobs.Task{
 						JobClusterKey: "cluster1",
 						NotebookTask:  &jobs.NotebookTask{},
 					},
 				},
 			},
 		},
 		{
 			name: "invalid python wheel task",
 			task: jobs.Task{
 				PythonWheelTask: &jobs.PythonWheelTask{},
 				TaskKey:         "my_task",
 			},
 			errorPath: "resources.jobs.job1.tasks[0]",
 			errorDetail: `Task "my_task" requires a cluster or an environment to run.
 Specify one of the following fields: job_cluster_key, environment_key, existing_cluster_id, new_cluster.`,
 			errorSummary: expectedSummary,
 		},
 		{
 			name: "invalid for each task",
 			task: jobs.Task{
 				ForEachTask: &jobs.ForEachTask{
 					Task: jobs.Task{
 						PythonWheelTask: &jobs.PythonWheelTask{},
 						TaskKey:         "my_task",
 					},
 				},
 			},
 			errorPath: "resources.jobs.job1.tasks[0].for_each_task.task",
 			errorDetail: `Task "my_task" requires a cluster or an environment to run.
 Specify one of the following fields: job_cluster_key, environment_key, existing_cluster_id, new_cluster.`,
 			errorSummary: expectedSummary,
 		},
 	}
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			job := &resources.Job{
 				JobSettings: &jobs.JobSettings{
 					Tasks: []jobs.Task{tc.task},
 				},
 			}
 			b := createBundle(map[string]*resources.Job{"job1": job})
 			diags := bundle.ApplyReadOnly(context.Background(), bundle.ReadOnly(b), JobTaskClusterSpec())
 			if tc.errorPath != "" || tc.errorDetail != "" || tc.errorSummary != "" {
 				assert.Len(t, diags, 1)
 				assert.Len(t, diags[0].Paths, 1)
 				diag := diags[0]
 				assert.Equal(t, tc.errorPath, diag.Paths[0].String())
 				assert.Equal(t, tc.errorSummary, diag.Summary)
 				assert.Equal(t, tc.errorDetail, diag.Detail)
 			} else {
 				assert.ElementsMatch(t, []string{}, diags)
 			}
 		})
 	}
 }
 func createBundle(jobs map[string]*resources.Job) *bundle.Bundle {
 	return &bundle.Bundle{
 		Config: config.Root{
 			Resources: config.Resources{
 				Jobs: jobs,
 			},
 		},
 	}
 }
--- a/bundle/config/validate/validate.go
+++ b/bundle/config/validate/validate.go
@ -34,6 +34,7 @@ func (v *validate) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics
 		JobClusterKeyDefined(),
 		FilesToSync(),
 		ValidateSyncPatterns(),
 		JobTaskClusterSpec(),
 	))
 }