Use dynamic walking to validate unique resource keys (#1614)

## Changes
This PR:
1. Uses dynamic walking (via the `dyn.MapByPattern` func) to validate no
two resources have the same resource key. This allows us to remove this
validation at merge time.
2. Modifies `dyn.Mapping` to always return a sorted slice of pairs. This
makes traversal functions like `dyn.Walk` or `dyn.MapByPattern`
deterministic.

## Tests
Unit tests. Also tested manually.
This commit is contained in:
shreyas-goenka 2024-07-29 18:34:02 +05:30 committed by GitHub
parent 383d580917
commit a52b188e99
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 304 additions and 315 deletions

View File

@ -5,6 +5,7 @@ import (
"github.com/databricks/cli/bundle/config"
"github.com/databricks/cli/bundle/config/loader"
pythonmutator "github.com/databricks/cli/bundle/config/mutator/python"
"github.com/databricks/cli/bundle/config/validate"
"github.com/databricks/cli/bundle/scripts"
)
@ -26,5 +27,9 @@ func DefaultMutators() []bundle.Mutator {
DefineDefaultTarget(),
LoadGitDetails(),
pythonmutator.PythonMutator(pythonmutator.PythonMutatorPhaseLoad),
// Note: This mutator must run before the target overrides are merged.
// See the mutator for more details.
validate.UniqueResourceKeys(),
}
}

View File

@ -20,126 +20,6 @@ type Resources struct {
QualityMonitors map[string]*resources.QualityMonitor `json:"quality_monitors,omitempty"`
}
type UniqueResourceIdTracker struct {
Type map[string]string
ConfigPath map[string]string
}
// VerifySafeMerge verifies that merging r with other is safe by checking that
// no resource key is defined in both receivers. It returns an error naming the
// duplicate key along with the resource type and config file path of each
// conflicting definition.
func (r *Resources) VerifySafeMerge(other *Resources) error {
rootTracker, err := r.VerifyUniqueResourceIdentifiers()
if err != nil {
return err
}
otherTracker, err := other.VerifyUniqueResourceIdentifiers()
if err != nil {
return err
}
// Both sides are internally unique at this point, so a conflict can only be
// a key that appears in both trackers.
for k := range otherTracker.Type {
if _, ok := rootTracker.Type[k]; ok {
return fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)",
k,
rootTracker.Type[k],
rootTracker.ConfigPath[k],
otherTracker.Type[k],
otherTracker.ConfigPath[k],
)
}
}
return nil
}
// VerifyUniqueResourceIdentifiers verifies there are no duplicate keys used
// across the resource definitions in r. It returns a tracker mapping each
// resource key to its resource type and config file path; on the first
// collision it returns the partially-populated tracker and an error naming
// both conflicting definitions.
func (r *Resources) VerifyUniqueResourceIdentifiers() (*UniqueResourceIdTracker, error) {
tracker := &UniqueResourceIdTracker{
Type: make(map[string]string),
ConfigPath: make(map[string]string),
}
// Jobs are registered first without a collision check; every subsequent
// resource type checks against the keys accumulated so far.
for k := range r.Jobs {
tracker.Type[k] = "job"
tracker.ConfigPath[k] = r.Jobs[k].ConfigFilePath
}
for k := range r.Pipelines {
if _, ok := tracker.Type[k]; ok {
return tracker, fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)",
k,
tracker.Type[k],
tracker.ConfigPath[k],
"pipeline",
r.Pipelines[k].ConfigFilePath,
)
}
tracker.Type[k] = "pipeline"
tracker.ConfigPath[k] = r.Pipelines[k].ConfigFilePath
}
for k := range r.Models {
if _, ok := tracker.Type[k]; ok {
return tracker, fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)",
k,
tracker.Type[k],
tracker.ConfigPath[k],
"mlflow_model",
r.Models[k].ConfigFilePath,
)
}
tracker.Type[k] = "mlflow_model"
tracker.ConfigPath[k] = r.Models[k].ConfigFilePath
}
for k := range r.Experiments {
if _, ok := tracker.Type[k]; ok {
return tracker, fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)",
k,
tracker.Type[k],
tracker.ConfigPath[k],
"mlflow_experiment",
r.Experiments[k].ConfigFilePath,
)
}
tracker.Type[k] = "mlflow_experiment"
tracker.ConfigPath[k] = r.Experiments[k].ConfigFilePath
}
for k := range r.ModelServingEndpoints {
if _, ok := tracker.Type[k]; ok {
return tracker, fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)",
k,
tracker.Type[k],
tracker.ConfigPath[k],
"model_serving_endpoint",
r.ModelServingEndpoints[k].ConfigFilePath,
)
}
tracker.Type[k] = "model_serving_endpoint"
tracker.ConfigPath[k] = r.ModelServingEndpoints[k].ConfigFilePath
}
for k := range r.RegisteredModels {
if _, ok := tracker.Type[k]; ok {
return tracker, fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)",
k,
tracker.Type[k],
tracker.ConfigPath[k],
"registered_model",
r.RegisteredModels[k].ConfigFilePath,
)
}
tracker.Type[k] = "registered_model"
tracker.ConfigPath[k] = r.RegisteredModels[k].ConfigFilePath
}
for k := range r.QualityMonitors {
if _, ok := tracker.Type[k]; ok {
return tracker, fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)",
k,
tracker.Type[k],
tracker.ConfigPath[k],
"quality_monitor",
r.QualityMonitors[k].ConfigFilePath,
)
}
tracker.Type[k] = "quality_monitor"
tracker.ConfigPath[k] = r.QualityMonitors[k].ConfigFilePath
}
return tracker, nil
}
type resource struct {
resource ConfigResource
resource_type string

View File

@ -5,129 +5,9 @@ import (
"reflect"
"testing"
"github.com/databricks/cli/bundle/config/paths"
"github.com/databricks/cli/bundle/config/resources"
"github.com/stretchr/testify/assert"
)
// TestVerifyUniqueResourceIdentifiers checks that reusing the key "foo" for a
// job and an mlflow experiment is reported with both types and file paths.
func TestVerifyUniqueResourceIdentifiers(t *testing.T) {
r := Resources{
Jobs: map[string]*resources.Job{
"foo": {
Paths: paths.Paths{
ConfigFilePath: "foo.yml",
},
},
},
Models: map[string]*resources.MlflowModel{
"bar": {
Paths: paths.Paths{
ConfigFilePath: "bar.yml",
},
},
},
Experiments: map[string]*resources.MlflowExperiment{
"foo": {
Paths: paths.Paths{
ConfigFilePath: "foo2.yml",
},
},
},
}
_, err := r.VerifyUniqueResourceIdentifiers()
assert.ErrorContains(t, err, "multiple resources named foo (job at foo.yml, mlflow_experiment at foo2.yml)")
}
// TestVerifySafeMerge checks that merging two Resources where the key "foo"
// is a job on one side and a pipeline on the other returns an error.
func TestVerifySafeMerge(t *testing.T) {
r := Resources{
Jobs: map[string]*resources.Job{
"foo": {
Paths: paths.Paths{
ConfigFilePath: "foo.yml",
},
},
},
Models: map[string]*resources.MlflowModel{
"bar": {
Paths: paths.Paths{
ConfigFilePath: "bar.yml",
},
},
},
}
other := Resources{
Pipelines: map[string]*resources.Pipeline{
"foo": {
Paths: paths.Paths{
ConfigFilePath: "foo2.yml",
},
},
},
}
err := r.VerifySafeMerge(&other)
assert.ErrorContains(t, err, "multiple resources named foo (job at foo.yml, pipeline at foo2.yml)")
}
// TestVerifySafeMergeForSameResourceType checks that a duplicate key is also
// detected when both definitions are of the same resource type (two jobs).
func TestVerifySafeMergeForSameResourceType(t *testing.T) {
r := Resources{
Jobs: map[string]*resources.Job{
"foo": {
Paths: paths.Paths{
ConfigFilePath: "foo.yml",
},
},
},
Models: map[string]*resources.MlflowModel{
"bar": {
Paths: paths.Paths{
ConfigFilePath: "bar.yml",
},
},
},
}
other := Resources{
Jobs: map[string]*resources.Job{
"foo": {
Paths: paths.Paths{
ConfigFilePath: "foo2.yml",
},
},
},
}
err := r.VerifySafeMerge(&other)
assert.ErrorContains(t, err, "multiple resources named foo (job at foo.yml, job at foo2.yml)")
}
// TestVerifySafeMergeForRegisteredModels checks duplicate detection for the
// registered_model resource type across two merged configurations.
func TestVerifySafeMergeForRegisteredModels(t *testing.T) {
r := Resources{
Jobs: map[string]*resources.Job{
"foo": {
Paths: paths.Paths{
ConfigFilePath: "foo.yml",
},
},
},
RegisteredModels: map[string]*resources.RegisteredModel{
"bar": {
Paths: paths.Paths{
ConfigFilePath: "bar.yml",
},
},
},
}
other := Resources{
RegisteredModels: map[string]*resources.RegisteredModel{
"bar": {
Paths: paths.Paths{
ConfigFilePath: "bar2.yml",
},
},
},
}
err := r.VerifySafeMerge(&other)
assert.ErrorContains(t, err, "multiple resources named bar (registered_model at bar.yml, registered_model at bar2.yml)")
}
// This test ensures that all resources have a custom marshaller and unmarshaller.
// This is required because DABs resources map to Databricks APIs, and they do so
// by embedding the corresponding Go SDK structs.

View File

@ -100,11 +100,6 @@ func LoadFromBytes(path string, raw []byte) (*Root, diag.Diagnostics) {
if err != nil {
return nil, diag.Errorf("failed to load %s: %v", path, err)
}
_, err = r.Resources.VerifyUniqueResourceIdentifiers()
if err != nil {
diags = diags.Extend(diag.FromErr(err))
}
return &r, diags
}
@ -281,12 +276,6 @@ func (r *Root) InitializeVariables(vars []string) error {
}
func (r *Root) Merge(other *Root) error {
// Check for safe merge, protecting against duplicate resource identifiers
err := r.Resources.VerifySafeMerge(&other.Resources)
if err != nil {
return err
}
// Merge dynamic configuration values.
return r.Mutate(func(root dyn.Value) (dyn.Value, error) {
return merge.Merge(root, other.value)

View File

@ -30,22 +30,6 @@ func TestRootLoad(t *testing.T) {
assert.Equal(t, "basic", root.Bundle.Name)
}
// TestDuplicateIdOnLoadReturnsError checks that loading a single root config
// that defines "foo" as both a job and a pipeline surfaces an error diagnostic.
func TestDuplicateIdOnLoadReturnsError(t *testing.T) {
_, diags := Load("./testdata/duplicate_resource_names_in_root/databricks.yml")
assert.ErrorContains(t, diags.Error(), "multiple resources named foo (job at ./testdata/duplicate_resource_names_in_root/databricks.yml, pipeline at ./testdata/duplicate_resource_names_in_root/databricks.yml)")
}
// TestDuplicateIdOnMergeReturnsError checks that merging a subconfiguration
// that redefines the key "foo" (as a pipeline) into the root returns an error.
func TestDuplicateIdOnMergeReturnsError(t *testing.T) {
root, diags := Load("./testdata/duplicate_resource_name_in_subconfiguration/databricks.yml")
require.NoError(t, diags.Error())
other, diags := Load("./testdata/duplicate_resource_name_in_subconfiguration/resources.yml")
require.NoError(t, diags.Error())
err := root.Merge(other)
assert.ErrorContains(t, err, "multiple resources named foo (job at ./testdata/duplicate_resource_name_in_subconfiguration/databricks.yml, pipeline at ./testdata/duplicate_resource_name_in_subconfiguration/resources.yml)")
}
func TestInitializeVariables(t *testing.T) {
fooDefault := "abc"
root := &Root{

View File

@ -0,0 +1,116 @@
package validate
import (
"context"
"fmt"
"slices"
"sort"
"github.com/databricks/cli/bundle"
"github.com/databricks/cli/libs/diag"
"github.com/databricks/cli/libs/dyn"
)
// UniqueResourceKeys returns a mutator that validates that:
//
// 1. Each resource key is unique across all resource types. No two resources
// may share the same key, even across different types, because commands like
// "bundle run" rely on the resource key alone to identify the resource to run.
// Eg: jobs.foo and pipelines.foo are not allowed simultaneously.
//
// 2. Each resource definition is contained within a single file, and is not spread
// across multiple files. Note: This is not applicable to resource configuration
// defined in a target override. That is why this mutator MUST run before the target
// overrides are merged.
func UniqueResourceKeys() bundle.Mutator {
return &uniqueResourceKeys{}
}
// uniqueResourceKeys is the bundle.Mutator returned by UniqueResourceKeys.
type uniqueResourceKeys struct{}
// Name returns the mutator's identifier, used in logs and diagnostics.
func (m *uniqueResourceKeys) Name() string {
return "validate:unique_resource_keys"
}
// Apply walks the "resources" subtree of the bundle configuration and emits
// one error diagnostic for every resource key that is defined more than once,
// listing all locations and paths at which the key appears.
func (m *uniqueResourceKeys) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics {
	diags := diag.Diagnostics{}

	// Per-key record of every path and location the key is defined at.
	type seenEntry struct {
		locations []dyn.Location
		paths     []dyn.Path
	}
	seen := map[string]*seenEntry{}

	rv := b.Config.Value().Get("resources")
	switch rv.Kind() {
	case dyn.KindInvalid, dyn.KindNil:
		// No resources defined, or the resources block is empty; nothing to do.
		return diags
	}

	// Visit every resources.<type>.<key> node and record where it lives.
	_, err := dyn.MapByPattern(
		rv,
		dyn.NewPattern(dyn.AnyKey(), dyn.AnyKey()),
		func(p dyn.Path, v dyn.Value) (dyn.Value, error) {
			// The key for the resource. Eg: "my_job" for jobs.my_job.
			key := p[1].Key()
			entry := seen[key]
			if entry == nil {
				entry = &seenEntry{}
				seen[key] = entry
			}
			// dyn.Path under the hood is a slice. The code that walks the
			// configuration tree reuses the same underlying slice to track the
			// path as it walks, so it must be cloned before being retained.
			entry.paths = append(entry.paths, slices.Clone(p))
			entry.locations = append(entry.locations, v.Locations()...)
			return v, nil
		},
	)
	if err != nil {
		return diag.FromErr(err)
	}

	for key, entry := range seen {
		// A single location means the key is defined exactly once.
		if len(entry.locations) < 2 {
			continue
		}

		// Sort the locations and paths for consistent error messages. This
		// helps with unit testing.
		sort.Slice(entry.locations, func(i, j int) bool {
			li, lj := entry.locations[i], entry.locations[j]
			if li.File != lj.File {
				return li.File < lj.File
			}
			if li.Line != lj.Line {
				return li.Line < lj.Line
			}
			return li.Column < lj.Column
		})
		sort.Slice(entry.paths, func(i, j int) bool {
			return entry.paths[i].String() < entry.paths[j].String()
		})

		// Multiple definitions share this key; report them all in one error.
		diags = append(diags, diag.Diagnostic{
			Severity:  diag.Error,
			Summary:   fmt.Sprintf("multiple resources have been defined with the same key: %s", key),
			Locations: entry.locations,
			Paths:     entry.paths,
		})
	}

	return diags
}

View File

@ -1,42 +0,0 @@
package config_tests
import (
"context"
"fmt"
"path/filepath"
"testing"
"github.com/databricks/cli/bundle"
"github.com/databricks/cli/bundle/phases"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestConflictingResourceIdsNoSubconfig checks the duplicate-key error when
// both definitions of "foo" live in the single root databricks.yml.
func TestConflictingResourceIdsNoSubconfig(t *testing.T) {
ctx := context.Background()
b, err := bundle.Load(ctx, "./conflicting_resource_ids/no_subconfigurations")
require.NoError(t, err)
diags := bundle.Apply(ctx, b, phases.Load())
bundleConfigPath := filepath.FromSlash("conflicting_resource_ids/no_subconfigurations/databricks.yml")
assert.ErrorContains(t, diags.Error(), fmt.Sprintf("multiple resources named foo (job at %s, pipeline at %s)", bundleConfigPath, bundleConfigPath))
}
// TestConflictingResourceIdsOneSubconfig checks the duplicate-key error when
// one definition of "foo" is in the root config and the other in resources.yml.
func TestConflictingResourceIdsOneSubconfig(t *testing.T) {
ctx := context.Background()
b, err := bundle.Load(ctx, "./conflicting_resource_ids/one_subconfiguration")
require.NoError(t, err)
diags := bundle.Apply(ctx, b, phases.Load())
bundleConfigPath := filepath.FromSlash("conflicting_resource_ids/one_subconfiguration/databricks.yml")
resourcesConfigPath := filepath.FromSlash("conflicting_resource_ids/one_subconfiguration/resources.yml")
assert.ErrorContains(t, diags.Error(), fmt.Sprintf("multiple resources named foo (job at %s, pipeline at %s)", bundleConfigPath, resourcesConfigPath))
}
// TestConflictingResourceIdsTwoSubconfigs checks the duplicate-key error when
// the conflicting definitions of "foo" live in two separate included files.
func TestConflictingResourceIdsTwoSubconfigs(t *testing.T) {
ctx := context.Background()
b, err := bundle.Load(ctx, "./conflicting_resource_ids/two_subconfigurations")
require.NoError(t, err)
diags := bundle.Apply(ctx, b, phases.Load())
resources1ConfigPath := filepath.FromSlash("conflicting_resource_ids/two_subconfigurations/resources1.yml")
resources2ConfigPath := filepath.FromSlash("conflicting_resource_ids/two_subconfigurations/resources2.yml")
assert.ErrorContains(t, diags.Error(), fmt.Sprintf("multiple resources named foo (job at %s, pipeline at %s)", resources1ConfigPath, resources2ConfigPath))
}

View File

@ -4,10 +4,10 @@ bundle:
workspace:
profile: test
include:
- ./*.yml
resources:
jobs:
foo:
name: job foo
pipelines:
foo:
name: pipeline foo
name: job foo 1

View File

@ -0,0 +1,8 @@
resources:
jobs:
foo:
name: job foo 3
experiments:
foo:
name: experiment foo

View File

@ -5,7 +5,7 @@ workspace:
profile: test
include:
- "*.yml"
- ./resources.yml
resources:
jobs:

View File

@ -0,0 +1,4 @@
resources:
jobs:
foo:
name: job foo 2

View File

@ -0,0 +1,18 @@
bundle:
name: test
workspace:
profile: test
resources:
jobs:
foo:
name: job foo
bar:
name: job bar
pipelines:
baz:
name: pipeline baz
experiments:
foo:
name: experiment foo

View File

@ -0,0 +1,139 @@
package config_tests
import (
"context"
"path/filepath"
"testing"
"github.com/databricks/cli/bundle"
"github.com/databricks/cli/bundle/phases"
"github.com/databricks/cli/libs/diag"
"github.com/databricks/cli/libs/dyn"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestValidateUniqueResourceIdentifiers runs the Load phase against a set of
// fixture bundles under ./validate/ and asserts the exact diagnostics emitted
// by the UniqueResourceKeys mutator, including sorted locations and paths.
func TestValidateUniqueResourceIdentifiers(t *testing.T) {
// Each case name doubles as the fixture directory under ./validate/.
tcases := []struct {
name string
diagnostics diag.Diagnostics
}{
{
name: "duplicate_resource_names_in_root_job_and_pipeline",
diagnostics: diag.Diagnostics{
{
Severity: diag.Error,
Summary: "multiple resources have been defined with the same key: foo",
Locations: []dyn.Location{
{File: filepath.FromSlash("validate/duplicate_resource_names_in_root_job_and_pipeline/databricks.yml"), Line: 10, Column: 7},
{File: filepath.FromSlash("validate/duplicate_resource_names_in_root_job_and_pipeline/databricks.yml"), Line: 13, Column: 7},
},
Paths: []dyn.Path{
dyn.MustPathFromString("jobs.foo"),
dyn.MustPathFromString("pipelines.foo"),
},
},
},
},
{
name: "duplicate_resource_names_in_root_job_and_experiment",
diagnostics: diag.Diagnostics{
{
Severity: diag.Error,
Summary: "multiple resources have been defined with the same key: foo",
Locations: []dyn.Location{
{File: filepath.FromSlash("validate/duplicate_resource_names_in_root_job_and_experiment/databricks.yml"), Line: 10, Column: 7},
{File: filepath.FromSlash("validate/duplicate_resource_names_in_root_job_and_experiment/databricks.yml"), Line: 18, Column: 7},
},
Paths: []dyn.Path{
dyn.MustPathFromString("experiments.foo"),
dyn.MustPathFromString("jobs.foo"),
},
},
},
},
{
name: "duplicate_resource_name_in_subconfiguration",
diagnostics: diag.Diagnostics{
{
Severity: diag.Error,
Summary: "multiple resources have been defined with the same key: foo",
Locations: []dyn.Location{
{File: filepath.FromSlash("validate/duplicate_resource_name_in_subconfiguration/databricks.yml"), Line: 13, Column: 7},
{File: filepath.FromSlash("validate/duplicate_resource_name_in_subconfiguration/resources.yml"), Line: 4, Column: 7},
},
Paths: []dyn.Path{
dyn.MustPathFromString("jobs.foo"),
dyn.MustPathFromString("pipelines.foo"),
},
},
},
},
{
// Same resource type in two files: only one path, two locations.
name: "duplicate_resource_name_in_subconfiguration_job_and_job",
diagnostics: diag.Diagnostics{
{
Severity: diag.Error,
Summary: "multiple resources have been defined with the same key: foo",
Locations: []dyn.Location{
{File: filepath.FromSlash("validate/duplicate_resource_name_in_subconfiguration_job_and_job/databricks.yml"), Line: 13, Column: 7},
{File: filepath.FromSlash("validate/duplicate_resource_name_in_subconfiguration_job_and_job/resources.yml"), Line: 4, Column: 7},
},
Paths: []dyn.Path{
dyn.MustPathFromString("jobs.foo"),
},
},
},
},
{
name: "duplicate_resource_names_in_different_subconfiguations",
diagnostics: diag.Diagnostics{
{
Severity: diag.Error,
Summary: "multiple resources have been defined with the same key: foo",
Locations: []dyn.Location{
{File: filepath.FromSlash("validate/duplicate_resource_names_in_different_subconfiguations/resources1.yml"), Line: 4, Column: 7},
{File: filepath.FromSlash("validate/duplicate_resource_names_in_different_subconfiguations/resources2.yml"), Line: 4, Column: 7},
},
Paths: []dyn.Path{
dyn.MustPathFromString("jobs.foo"),
dyn.MustPathFromString("pipelines.foo"),
},
},
},
},
{
name: "duplicate_resource_name_in_multiple_locations",
diagnostics: diag.Diagnostics{
{
Severity: diag.Error,
Summary: "multiple resources have been defined with the same key: foo",
Locations: []dyn.Location{
{File: filepath.FromSlash("validate/duplicate_resource_name_in_multiple_locations/databricks.yml"), Line: 13, Column: 7},
{File: filepath.FromSlash("validate/duplicate_resource_name_in_multiple_locations/resources1.yml"), Line: 4, Column: 7},
{File: filepath.FromSlash("validate/duplicate_resource_name_in_multiple_locations/resources1.yml"), Line: 8, Column: 7},
{File: filepath.FromSlash("validate/duplicate_resource_name_in_multiple_locations/resources2.yml"), Line: 4, Column: 7},
{File: filepath.FromSlash("validate/duplicate_resource_name_in_multiple_locations/resources2.yml"), Line: 8, Column: 7},
},
Paths: []dyn.Path{
dyn.MustPathFromString("experiments.foo"),
dyn.MustPathFromString("jobs.foo"),
dyn.MustPathFromString("pipelines.foo"),
},
},
},
},
}
for _, tc := range tcases {
t.Run(tc.name, func(t *testing.T) {
ctx := context.Background()
b, err := bundle.Load(ctx, "./validate/"+tc.name)
require.NoError(t, err)
// The UniqueResourceKeys mutator is run as part of the Load phase.
diags := bundle.Apply(ctx, b, phases.Load())
assert.Equal(t, tc.diagnostics, diags)
})
}
}

View File

@ -46,7 +46,8 @@ func newMappingFromGoMap(vin map[string]Value) Mapping {
return m
}
// Pairs returns all the key-value pairs in the Mapping. The pairs are sorted by
// their key in lexicographic order.
func (m Mapping) Pairs() []Pair {
return m.pairs
}