Detect duplicate identifiers in bundle config (#332)

## Changes
This PR adds checks during bundle config load and merge to error out if
there are duplicate keys for resource definitions

## Tests
Using unit tests and manually
This commit is contained in:
shreyas-goenka 2023-04-17 12:21:21 +02:00 committed by GitHub
parent eab29603fc
commit 93d57dd00f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 312 additions and 1 deletions

View File

@ -1,6 +1,8 @@
package config package config
import ( import (
"fmt"
"github.com/databricks/bricks/bundle/config/resources" "github.com/databricks/bricks/bundle/config/resources"
) )
@ -13,6 +15,87 @@ type Resources struct {
Experiments map[string]*resources.MlflowExperiment `json:"experiments,omitempty"` Experiments map[string]*resources.MlflowExperiment `json:"experiments,omitempty"`
} }
type UniqueResourceIdTracker struct {
Type map[string]string
ConfigPath map[string]string
}
// verifies merging is safe by checking no duplicate identifiers exist
func (r *Resources) VerifySafeMerge(other *Resources) error {
rootTracker, err := r.VerifyUniqueResourceIdentifiers()
if err != nil {
return err
}
otherTracker, err := other.VerifyUniqueResourceIdentifiers()
if err != nil {
return err
}
for k := range otherTracker.Type {
if _, ok := rootTracker.Type[k]; ok {
return fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)",
k,
rootTracker.Type[k],
rootTracker.ConfigPath[k],
otherTracker.Type[k],
otherTracker.ConfigPath[k],
)
}
}
return nil
}
// This function verifies there are no duplicate names used for the resource definations
func (r *Resources) VerifyUniqueResourceIdentifiers() (*UniqueResourceIdTracker, error) {
tracker := &UniqueResourceIdTracker{
Type: make(map[string]string),
ConfigPath: make(map[string]string),
}
for k := range r.Jobs {
tracker.Type[k] = "job"
tracker.ConfigPath[k] = r.Jobs[k].ConfigFilePath
}
for k := range r.Pipelines {
if _, ok := tracker.Type[k]; ok {
return tracker, fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)",
k,
tracker.Type[k],
tracker.ConfigPath[k],
"pipeline",
r.Pipelines[k].ConfigFilePath,
)
}
tracker.Type[k] = "pipeline"
tracker.ConfigPath[k] = r.Pipelines[k].ConfigFilePath
}
for k := range r.Models {
if _, ok := tracker.Type[k]; ok {
return tracker, fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)",
k,
tracker.Type[k],
tracker.ConfigPath[k],
"mlflow_model",
r.Models[k].ConfigFilePath,
)
}
tracker.Type[k] = "mlflow_model"
tracker.ConfigPath[k] = r.Models[k].ConfigFilePath
}
for k := range r.Experiments {
if _, ok := tracker.Type[k]; ok {
return tracker, fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)",
k,
tracker.Type[k],
tracker.ConfigPath[k],
"mlflow_experiment",
r.Experiments[k].ConfigFilePath,
)
}
tracker.Type[k] = "mlflow_experiment"
tracker.ConfigPath[k] = r.Experiments[k].ConfigFilePath
}
return tracker, nil
}
// SetConfigFilePath sets the specified path for all resources contained in this instance. // SetConfigFilePath sets the specified path for all resources contained in this instance.
// This property is used to correctly resolve paths relative to the path // This property is used to correctly resolve paths relative to the path
// of the configuration file they were defined in. // of the configuration file they were defined in.

View File

@ -0,0 +1,96 @@
package config
import (
"testing"
"github.com/databricks/bricks/bundle/config/resources"
"github.com/stretchr/testify/assert"
)
func TestVerifyUniqueResourceIdentifiers(t *testing.T) {
r := Resources{
Jobs: map[string]*resources.Job{
"foo": {
Paths: resources.Paths{
ConfigFilePath: "foo.yml",
},
},
},
Models: map[string]*resources.MlflowModel{
"bar": {
Paths: resources.Paths{
ConfigFilePath: "bar.yml",
},
},
},
Experiments: map[string]*resources.MlflowExperiment{
"foo": {
Paths: resources.Paths{
ConfigFilePath: "foo2.yml",
},
},
},
}
_, err := r.VerifyUniqueResourceIdentifiers()
assert.ErrorContains(t, err, "multiple resources named foo (job at foo.yml, mlflow_experiment at foo2.yml)")
}
func TestVerifySafeMerge(t *testing.T) {
r := Resources{
Jobs: map[string]*resources.Job{
"foo": {
Paths: resources.Paths{
ConfigFilePath: "foo.yml",
},
},
},
Models: map[string]*resources.MlflowModel{
"bar": {
Paths: resources.Paths{
ConfigFilePath: "bar.yml",
},
},
},
}
other := Resources{
Pipelines: map[string]*resources.Pipeline{
"foo": {
Paths: resources.Paths{
ConfigFilePath: "foo2.yml",
},
},
},
}
err := r.VerifySafeMerge(&other)
assert.ErrorContains(t, err, "multiple resources named foo (job at foo.yml, pipeline at foo2.yml)")
}
func TestVerifySafeMergeForSameResourceType(t *testing.T) {
r := Resources{
Jobs: map[string]*resources.Job{
"foo": {
Paths: resources.Paths{
ConfigFilePath: "foo.yml",
},
},
},
Models: map[string]*resources.MlflowModel{
"bar": {
Paths: resources.Paths{
ConfigFilePath: "bar.yml",
},
},
},
}
other := Resources{
Jobs: map[string]*resources.Job{
"foo": {
Paths: resources.Paths{
ConfigFilePath: "foo2.yml",
},
},
},
}
err := r.VerifySafeMerge(&other)
assert.ErrorContains(t, err, "multiple resources named foo (job at foo.yml, job at foo2.yml)")
}

View File

@ -88,15 +88,24 @@ func (r *Root) Load(path string) error {
if err != nil { if err != nil {
return err return err
} }
r.Path = filepath.Dir(path) r.Path = filepath.Dir(path)
r.SetConfigFilePath(path) r.SetConfigFilePath(path)
return nil
_, err = r.Resources.VerifyUniqueResourceIdentifiers()
return err
} }
func (r *Root) Merge(other *Root) error { func (r *Root) Merge(other *Root) error {
// TODO: when hooking into merge semantics, disallow setting path on the target instance. // TODO: when hooking into merge semantics, disallow setting path on the target instance.
other.Path = "" other.Path = ""
// Check for safe merge, protecting against duplicate resource identifiers
err := r.Resources.VerifySafeMerge(&other.Resources)
if err != nil {
return err
}
// TODO: define and test semantics for merging. // TODO: define and test semantics for merging.
return mergo.MergeWithOverwrite(r, other) return mergo.MergeWithOverwrite(r, other)
} }

View File

@ -74,3 +74,22 @@ func TestRootMergeMap(t *testing.T) {
assert.NoError(t, root.Merge(other)) assert.NoError(t, root.Merge(other))
assert.Equal(t, &Workspace{Host: "bar", Profile: "profile"}, root.Environments["development"].Workspace) assert.Equal(t, &Workspace{Host: "bar", Profile: "profile"}, root.Environments["development"].Workspace)
} }
func TestDuplicateIdOnLoadReturnsError(t *testing.T) {
root := &Root{}
err := root.Load("./testdata/duplicate_resource_names_in_root/bundle.yml")
assert.ErrorContains(t, err, "multiple resources named foo (job at ./testdata/duplicate_resource_names_in_root/bundle.yml, pipeline at ./testdata/duplicate_resource_names_in_root/bundle.yml)")
}
func TestDuplicateIdOnMergeReturnsError(t *testing.T) {
root := &Root{}
err := root.Load("./testdata/duplicate_resource_name_in_subconfiguration/bundle.yml")
require.NoError(t, err)
other := &Root{}
err = other.Load("./testdata/duplicate_resource_name_in_subconfiguration/resources.yml")
require.NoError(t, err)
err = root.Merge(other)
assert.ErrorContains(t, err, "multiple resources named foo (job at ./testdata/duplicate_resource_name_in_subconfiguration/bundle.yml, pipeline at ./testdata/duplicate_resource_name_in_subconfiguration/resources.yml)")
}

View File

@ -0,0 +1,10 @@
bundle:
name: test
workspace:
profile: test
resources:
jobs:
foo:
name: job foo

View File

@ -0,0 +1,4 @@
resources:
pipelines:
foo:
name: pipeline foo

View File

@ -0,0 +1,13 @@
bundle:
name: test
workspace:
profile: test
resources:
jobs:
foo:
name: job foo
pipelines:
foo:
name: pipeline foo

View File

@ -0,0 +1,13 @@
bundle:
name: test
workspace:
profile: test
resources:
jobs:
foo:
name: job foo
pipelines:
foo:
name: pipeline foo

View File

@ -0,0 +1,10 @@
bundle:
name: test
workspace:
profile: test
resources:
jobs:
foo:
name: job foo

View File

@ -0,0 +1,4 @@
resources:
pipelines:
foo:
name: pipeline foo

View File

@ -0,0 +1,5 @@
bundle:
name: test
workspace:
profile: test

View File

@ -0,0 +1,4 @@
resources:
jobs:
foo:
name: job foo

View File

@ -0,0 +1,4 @@
resources:
pipelines:
foo:
name: pipeline foo

View File

@ -0,0 +1,37 @@
package config_tests
import (
"context"
"fmt"
"path/filepath"
"testing"
"github.com/databricks/bricks/bundle"
"github.com/databricks/bricks/bundle/config/mutator"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestConflictingResourceIdsNoSubconfig(t *testing.T) {
_, err := bundle.Load("./conflicting_resource_ids/no_subconfigurations")
bundleConfigPath := filepath.FromSlash("conflicting_resource_ids/no_subconfigurations/bundle.yml")
assert.ErrorContains(t, err, fmt.Sprintf("multiple resources named foo (job at %s, pipeline at %s)", bundleConfigPath, bundleConfigPath))
}
func TestConflictingResourceIdsOneSubconfig(t *testing.T) {
b, err := bundle.Load("./conflicting_resource_ids/one_subconfiguration")
require.NoError(t, err)
err = bundle.Apply(context.Background(), b, mutator.DefaultMutators())
bundleConfigPath := filepath.FromSlash("conflicting_resource_ids/one_subconfiguration/bundle.yml")
resourcesConfigPath := filepath.FromSlash("conflicting_resource_ids/one_subconfiguration/resources.yml")
assert.ErrorContains(t, err, fmt.Sprintf("multiple resources named foo (job at %s, pipeline at %s)", bundleConfigPath, resourcesConfigPath))
}
func TestConflictingResourceIdsTwoSubconfigs(t *testing.T) {
b, err := bundle.Load("./conflicting_resource_ids/two_subconfigurations")
require.NoError(t, err)
err = bundle.Apply(context.Background(), b, mutator.DefaultMutators())
resources1ConfigPath := filepath.FromSlash("conflicting_resource_ids/two_subconfigurations/resources1.yml")
resources2ConfigPath := filepath.FromSlash("conflicting_resource_ids/two_subconfigurations/resources2.yml")
assert.ErrorContains(t, err, fmt.Sprintf("multiple resources named foo (job at %s, pipeline at %s)", resources1ConfigPath, resources2ConfigPath))
}