Add JobTaskClusterSpec validate mutator (#1784)
## Changes
Add JobTaskClusterSpec validate mutator. It catches the case when tasks
don't specify which cluster to use.
For example, we can get this error with minor modifications to
`default-python` template:
```yaml
tasks:
- task_key: python_file_task
spark_python_task:
python_file: ../src/my_project_10/main.py
```
```
% databricks bundle validate
Error: Missing required cluster or environment settings
at resources.jobs.my_project_10_job.tasks[0]
in resources/my_project_10_job.yml:17:11
Task "print_github_stars" requires a cluster or an environment to run.
Specify one of the following fields: job_cluster_key, environment_key, existing_cluster_id, new_cluster.
```
We implicitly rely on "one of" validation, which does not exist. Many
bundle fields can't co-exist, for instance, specifying:
`JobTask.{existing_cluster_id,job_cluster_key}`, `Library.{whl,pypi}`,
`JobTask.{notebook_task,python_wheel_task}`, etc.
## Tests
Unit tests
---------
Co-authored-by: Pieter Noordhuis <pcnoordhuis@gmail.com>
2024-09-25 11:30:14 +00:00
|
|
|
package validate
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
|
|
|
"strings"
|
|
|
|
|
|
|
|
"github.com/databricks/cli/bundle"
|
|
|
|
"github.com/databricks/cli/libs/diag"
|
|
|
|
"github.com/databricks/cli/libs/dyn"
|
|
|
|
"github.com/databricks/databricks-sdk-go/service/jobs"
|
|
|
|
)
|
|
|
|
|
|
|
|
// JobTaskClusterSpec validates that job tasks have cluster spec defined
|
|
|
|
// if task requires a cluster
|
|
|
|
func JobTaskClusterSpec() bundle.ReadOnlyMutator {
|
|
|
|
return &jobTaskClusterSpec{}
|
|
|
|
}
|
|
|
|
|
2024-12-12 09:28:42 +00:00
|
|
|
// jobTaskClusterSpec is the mutator type returned by JobTaskClusterSpec; it carries no state.
type jobTaskClusterSpec struct{}
|
Add JobTaskClusterSpec validate mutator (#1784)
## Changes
Add JobTaskClusterSpec validate mutator. It catches the case when tasks
don't specify which cluster to use.
For example, we can get this error with minor modifications to
`default-python` template:
```yaml
tasks:
- task_key: python_file_task
spark_python_task:
python_file: ../src/my_project_10/main.py
```
```
% databricks bundle validate
Error: Missing required cluster or environment settings
at resources.jobs.my_project_10_job.tasks[0]
in resources/my_project_10_job.yml:17:11
Task "print_github_stars" requires a cluster or an environment to run.
Specify one of the following fields: job_cluster_key, environment_key, existing_cluster_id, new_cluster.
```
We implicitly rely on "one of" validation, which does not exist. Many
bundle fields can't co-exist, for instance, specifying:
`JobTask.{existing_cluster_id,job_cluster_key}`, `Library.{whl,pypi}`,
`JobTask.{notebook_task,python_wheel_task}`, etc.
## Tests
Unit tests
---------
Co-authored-by: Pieter Noordhuis <pcnoordhuis@gmail.com>
2024-09-25 11:30:14 +00:00
|
|
|
|
|
|
|
func (v *jobTaskClusterSpec) Name() string {
|
|
|
|
return "validate:job_task_cluster_spec"
|
|
|
|
}
|
|
|
|
|
|
|
|
func (v *jobTaskClusterSpec) Apply(ctx context.Context, rb bundle.ReadOnlyBundle) diag.Diagnostics {
|
|
|
|
diags := diag.Diagnostics{}
|
|
|
|
|
|
|
|
jobsPath := dyn.NewPath(dyn.Key("resources"), dyn.Key("jobs"))
|
|
|
|
|
|
|
|
for resourceName, job := range rb.Config().Resources.Jobs {
|
|
|
|
resourcePath := jobsPath.Append(dyn.Key(resourceName))
|
|
|
|
|
|
|
|
for taskIndex, task := range job.Tasks {
|
|
|
|
taskPath := resourcePath.Append(dyn.Key("tasks"), dyn.Index(taskIndex))
|
|
|
|
|
|
|
|
diags = diags.Extend(validateJobTask(rb, task, taskPath))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return diags
|
|
|
|
}
|
|
|
|
|
|
|
|
func validateJobTask(rb bundle.ReadOnlyBundle, task jobs.Task, taskPath dyn.Path) diag.Diagnostics {
|
|
|
|
diags := diag.Diagnostics{}
|
|
|
|
|
|
|
|
var specified []string
|
|
|
|
var unspecified []string
|
|
|
|
|
|
|
|
if task.JobClusterKey != "" {
|
|
|
|
specified = append(specified, "job_cluster_key")
|
|
|
|
} else {
|
|
|
|
unspecified = append(unspecified, "job_cluster_key")
|
|
|
|
}
|
|
|
|
|
|
|
|
if task.EnvironmentKey != "" {
|
|
|
|
specified = append(specified, "environment_key")
|
|
|
|
} else {
|
|
|
|
unspecified = append(unspecified, "environment_key")
|
|
|
|
}
|
|
|
|
|
|
|
|
if task.ExistingClusterId != "" {
|
|
|
|
specified = append(specified, "existing_cluster_id")
|
|
|
|
} else {
|
|
|
|
unspecified = append(unspecified, "existing_cluster_id")
|
|
|
|
}
|
|
|
|
|
|
|
|
if task.NewCluster != nil {
|
|
|
|
specified = append(specified, "new_cluster")
|
|
|
|
} else {
|
|
|
|
unspecified = append(unspecified, "new_cluster")
|
|
|
|
}
|
|
|
|
|
|
|
|
if task.ForEachTask != nil {
|
|
|
|
forEachTaskPath := taskPath.Append(dyn.Key("for_each_task"), dyn.Key("task"))
|
|
|
|
|
|
|
|
diags = diags.Extend(validateJobTask(rb, task.ForEachTask.Task, forEachTaskPath))
|
|
|
|
}
|
|
|
|
|
|
|
|
if isComputeTask(task) && len(specified) == 0 {
|
|
|
|
if task.NotebookTask != nil {
|
|
|
|
// notebook tasks without cluster spec will use notebook environment
|
|
|
|
} else {
|
|
|
|
// path might be not very helpful, adding user-specified task key clarifies the context
|
|
|
|
detail := fmt.Sprintf(
|
|
|
|
"Task %q requires a cluster or an environment to run.\nSpecify one of the following fields: %s.",
|
|
|
|
task.TaskKey,
|
|
|
|
strings.Join(unspecified, ", "),
|
|
|
|
)
|
|
|
|
|
|
|
|
diags = diags.Append(diag.Diagnostic{
|
|
|
|
Severity: diag.Error,
|
|
|
|
Summary: "Missing required cluster or environment settings",
|
|
|
|
Detail: detail,
|
|
|
|
Locations: rb.Config().GetLocations(taskPath.String()),
|
|
|
|
Paths: []dyn.Path{taskPath},
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return diags
|
|
|
|
}
|
|
|
|
|
|
|
|
// isComputeTask returns true if the task runs on a cluster or serverless GC
|
|
|
|
func isComputeTask(task jobs.Task) bool {
|
|
|
|
if task.NotebookTask != nil {
|
|
|
|
// if warehouse_id is set, it's SQL notebook that doesn't need cluster or serverless GC
|
|
|
|
if task.NotebookTask.WarehouseId != "" {
|
|
|
|
return false
|
|
|
|
} else {
|
|
|
|
// task settings don't require specifying a cluster/serverless GC, but task itself can run on one
|
|
|
|
// we handle that case separately in validateJobTask
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if task.PythonWheelTask != nil {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
if task.DbtTask != nil {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
if task.SparkJarTask != nil {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
if task.SparkSubmitTask != nil {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
if task.SparkPythonTask != nil {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
if task.SqlTask != nil {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
if task.PipelineTask != nil {
|
|
|
|
// while pipelines use clusters, pipeline tasks don't, they only trigger pipelines
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
if task.RunJobTask != nil {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
if task.ConditionTask != nil {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// for each task doesn't use clusters, underlying task(s) can though
|
|
|
|
if task.ForEachTask != nil {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
return false
|
|
|
|
}
|