From 93d57dd00f50a1f82d3983badfa7437c44fa6453 Mon Sep 17 00:00:00 2001 From: shreyas-goenka <88374338+shreyas-goenka@users.noreply.github.com> Date: Mon, 17 Apr 2023 12:21:21 +0200 Subject: [PATCH] Detect duplicate identifiers in bundle config (#332) ## Changes This PR adds checks during bundle config load and merge to error out if there are duplicate keys for resource definitions ## Tests Using unit tests and manually --- bundle/config/resources.go | 83 ++++++++++++++++ bundle/config/resources_test.go | 96 +++++++++++++++++++ bundle/config/root.go | 11 ++- bundle/config/root_test.go | 19 ++++ .../bundle.yml | 10 ++ .../resources.yml | 4 + .../bundle.yml | 13 +++ .../no_subconfigurations/bundle.yml | 13 +++ .../one_subconfiguration/bundle.yml | 10 ++ .../one_subconfiguration/resources.yml | 4 + .../two_subconfigurations/bundle.yml | 5 + .../two_subconfigurations/resources1.yml | 4 + .../two_subconfigurations/resources2.yml | 4 + bundle/tests/conflicting_resource_ids_test.go | 37 +++++++ 14 files changed, 312 insertions(+), 1 deletion(-) create mode 100644 bundle/config/resources_test.go create mode 100644 bundle/config/testdata/duplicate_resource_name_in_subconfiguration/bundle.yml create mode 100644 bundle/config/testdata/duplicate_resource_name_in_subconfiguration/resources.yml create mode 100644 bundle/config/testdata/duplicate_resource_names_in_root/bundle.yml create mode 100644 bundle/tests/conflicting_resource_ids/no_subconfigurations/bundle.yml create mode 100644 bundle/tests/conflicting_resource_ids/one_subconfiguration/bundle.yml create mode 100644 bundle/tests/conflicting_resource_ids/one_subconfiguration/resources.yml create mode 100644 bundle/tests/conflicting_resource_ids/two_subconfigurations/bundle.yml create mode 100644 bundle/tests/conflicting_resource_ids/two_subconfigurations/resources1.yml create mode 100644 bundle/tests/conflicting_resource_ids/two_subconfigurations/resources2.yml create mode 100644 bundle/tests/conflicting_resource_ids_test.go diff --git a/bundle/config/resources.go b/bundle/config/resources.go index d11610cfc..6aed895f6 100644 --- a/bundle/config/resources.go +++ b/bundle/config/resources.go @@ -1,6 +1,8 @@ package config import ( + "fmt" + "github.com/databricks/bricks/bundle/config/resources" ) @@ -13,6 +15,87 @@ type Resources struct { Experiments map[string]*resources.MlflowExperiment `json:"experiments,omitempty"` } +type UniqueResourceIdTracker struct { + Type map[string]string + ConfigPath map[string]string +} + +// verifies merging is safe by checking no duplicate identifiers exist +func (r *Resources) VerifySafeMerge(other *Resources) error { + rootTracker, err := r.VerifyUniqueResourceIdentifiers() + if err != nil { + return err + } + otherTracker, err := other.VerifyUniqueResourceIdentifiers() + if err != nil { + return err + } + for k := range otherTracker.Type { + if _, ok := rootTracker.Type[k]; ok { + return fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)", + k, + rootTracker.Type[k], + rootTracker.ConfigPath[k], + otherTracker.Type[k], + otherTracker.ConfigPath[k], + ) + } + } + return nil +} + +// This function verifies there are no duplicate names used for the resource definations +func (r *Resources) VerifyUniqueResourceIdentifiers() (*UniqueResourceIdTracker, error) { + tracker := &UniqueResourceIdTracker{ + Type: make(map[string]string), + ConfigPath: make(map[string]string), + } + for k := range r.Jobs { + tracker.Type[k] = "job" + tracker.ConfigPath[k] = r.Jobs[k].ConfigFilePath + } + for k := range r.Pipelines { + if _, ok := tracker.Type[k]; ok { + return tracker, fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)", + k, + tracker.Type[k], + tracker.ConfigPath[k], + "pipeline", + r.Pipelines[k].ConfigFilePath, + ) + } + tracker.Type[k] = "pipeline" + tracker.ConfigPath[k] = r.Pipelines[k].ConfigFilePath + } + for k := range r.Models { + if _, ok := tracker.Type[k]; ok { + return tracker, fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)", + k, + tracker.Type[k], + tracker.ConfigPath[k], + "mlflow_model", + r.Models[k].ConfigFilePath, + ) + } + tracker.Type[k] = "mlflow_model" + tracker.ConfigPath[k] = r.Models[k].ConfigFilePath + } + for k := range r.Experiments { + if _, ok := tracker.Type[k]; ok { + return tracker, fmt.Errorf("multiple resources named %s (%s at %s, %s at %s)", + k, + tracker.Type[k], + tracker.ConfigPath[k], + "mlflow_experiment", + r.Experiments[k].ConfigFilePath, + ) + } + tracker.Type[k] = "mlflow_experiment" + tracker.ConfigPath[k] = r.Experiments[k].ConfigFilePath + } + return tracker, nil +} + // SetConfigFilePath sets the specified path for all resources contained in this instance. // This property is used to correctly resolve paths relative to the path // of the configuration file they were defined in. diff --git a/bundle/config/resources_test.go b/bundle/config/resources_test.go new file mode 100644 index 000000000..cf07e5437 --- /dev/null +++ b/bundle/config/resources_test.go @@ -0,0 +1,96 @@ +package config + +import ( + "testing" + + "github.com/databricks/bricks/bundle/config/resources" + "github.com/stretchr/testify/assert" +) + +func TestVerifyUniqueResourceIdentifiers(t *testing.T) { + r := Resources{ + Jobs: map[string]*resources.Job{ + "foo": { + Paths: resources.Paths{ + ConfigFilePath: "foo.yml", + }, + }, + }, + Models: map[string]*resources.MlflowModel{ + "bar": { + Paths: resources.Paths{ + ConfigFilePath: "bar.yml", + }, + }, + }, + Experiments: map[string]*resources.MlflowExperiment{ + "foo": { + Paths: resources.Paths{ + ConfigFilePath: "foo2.yml", + }, + }, + }, + } + _, err := r.VerifyUniqueResourceIdentifiers() + assert.ErrorContains(t, err, "multiple resources named foo (job at foo.yml, mlflow_experiment at foo2.yml)") +} + +func TestVerifySafeMerge(t *testing.T) { + r := Resources{ + Jobs: map[string]*resources.Job{ + "foo": { + Paths: resources.Paths{ + ConfigFilePath: "foo.yml", + }, + }, + }, + Models: map[string]*resources.MlflowModel{ + "bar": { + Paths: resources.Paths{ + ConfigFilePath: "bar.yml", + }, + }, + }, + } + other := Resources{ + Pipelines: map[string]*resources.Pipeline{ + "foo": { + Paths: resources.Paths{ + ConfigFilePath: "foo2.yml", + }, + }, + }, + } + err := r.VerifySafeMerge(&other) + assert.ErrorContains(t, err, "multiple resources named foo (job at foo.yml, pipeline at foo2.yml)") +} + +func TestVerifySafeMergeForSameResourceType(t *testing.T) { + r := Resources{ + Jobs: map[string]*resources.Job{ + "foo": { + Paths: resources.Paths{ + ConfigFilePath: "foo.yml", + }, + }, + }, + Models: map[string]*resources.MlflowModel{ + "bar": { + Paths: resources.Paths{ + ConfigFilePath: "bar.yml", + }, + }, + }, + } + other := Resources{ + Jobs: map[string]*resources.Job{ + "foo": { + Paths: resources.Paths{ + ConfigFilePath: "foo2.yml", + }, + }, + }, + } + err := r.VerifySafeMerge(&other) + assert.ErrorContains(t, err, "multiple resources named foo (job at foo.yml, job at foo2.yml)") +} diff --git a/bundle/config/root.go b/bundle/config/root.go index 50e223128..0e7b89e14 100644 --- a/bundle/config/root.go +++ b/bundle/config/root.go @@ -88,15 +88,24 @@ func (r *Root) Load(path string) error { if err != nil { return err } + r.Path = filepath.Dir(path) r.SetConfigFilePath(path) - return nil + + _, err = r.Resources.VerifyUniqueResourceIdentifiers() + return err } func (r *Root) Merge(other *Root) error { // TODO: when hooking into merge semantics, disallow setting path on the target instance. other.Path = "" + // Check for safe merge, protecting against duplicate resource identifiers + err := r.Resources.VerifySafeMerge(&other.Resources) + if err != nil { + return err + } + // TODO: define and test semantics for merging. return mergo.MergeWithOverwrite(r, other) } diff --git a/bundle/config/root_test.go b/bundle/config/root_test.go index 70a1e1356..7dd131817 100644 --- a/bundle/config/root_test.go +++ b/bundle/config/root_test.go @@ -74,3 +74,22 @@ func TestRootMergeMap(t *testing.T) { assert.NoError(t, root.Merge(other)) assert.Equal(t, &Workspace{Host: "bar", Profile: "profile"}, root.Environments["development"].Workspace) } + +func TestDuplicateIdOnLoadReturnsError(t *testing.T) { + root := &Root{} + err := root.Load("./testdata/duplicate_resource_names_in_root/bundle.yml") + assert.ErrorContains(t, err, "multiple resources named foo (job at ./testdata/duplicate_resource_names_in_root/bundle.yml, pipeline at ./testdata/duplicate_resource_names_in_root/bundle.yml)") +} + +func TestDuplicateIdOnMergeReturnsError(t *testing.T) { + root := &Root{} + err := root.Load("./testdata/duplicate_resource_name_in_subconfiguration/bundle.yml") + require.NoError(t, err) + + other := &Root{} + err = other.Load("./testdata/duplicate_resource_name_in_subconfiguration/resources.yml") + require.NoError(t, err) + + err = root.Merge(other) + assert.ErrorContains(t, err, "multiple resources named foo (job at ./testdata/duplicate_resource_name_in_subconfiguration/bundle.yml, pipeline at ./testdata/duplicate_resource_name_in_subconfiguration/resources.yml)") +} diff --git a/bundle/config/testdata/duplicate_resource_name_in_subconfiguration/bundle.yml b/bundle/config/testdata/duplicate_resource_name_in_subconfiguration/bundle.yml new file mode 100644 index 000000000..a81602920 --- /dev/null +++ b/bundle/config/testdata/duplicate_resource_name_in_subconfiguration/bundle.yml @@ -0,0 +1,10 @@ +bundle: + name: test + +workspace: + profile: test + +resources: + jobs: + foo: + name: job foo diff --git a/bundle/config/testdata/duplicate_resource_name_in_subconfiguration/resources.yml b/bundle/config/testdata/duplicate_resource_name_in_subconfiguration/resources.yml new file mode 100644 index 000000000..c3dcb6e2f --- /dev/null +++ b/bundle/config/testdata/duplicate_resource_name_in_subconfiguration/resources.yml @@ -0,0 +1,4 @@ +resources: + pipelines: + foo: + name: pipeline foo diff --git a/bundle/config/testdata/duplicate_resource_names_in_root/bundle.yml b/bundle/config/testdata/duplicate_resource_names_in_root/bundle.yml new file mode 100644 index 000000000..1e9aa10b1 --- /dev/null +++ b/bundle/config/testdata/duplicate_resource_names_in_root/bundle.yml @@ -0,0 +1,13 @@ +bundle: + name: test + +workspace: + profile: test + +resources: + jobs: + foo: + name: job foo + pipelines: + foo: + name: pipeline foo diff --git a/bundle/tests/conflicting_resource_ids/no_subconfigurations/bundle.yml b/bundle/tests/conflicting_resource_ids/no_subconfigurations/bundle.yml new file mode 100644 index 000000000..1e9aa10b1 --- /dev/null +++ b/bundle/tests/conflicting_resource_ids/no_subconfigurations/bundle.yml @@ -0,0 +1,13 @@ +bundle: + name: test + +workspace: + profile: test + +resources: + jobs: + foo: + name: job foo + pipelines: + foo: + name: pipeline foo diff --git a/bundle/tests/conflicting_resource_ids/one_subconfiguration/bundle.yml b/bundle/tests/conflicting_resource_ids/one_subconfiguration/bundle.yml new file mode 100644 index 000000000..a81602920 --- /dev/null +++ b/bundle/tests/conflicting_resource_ids/one_subconfiguration/bundle.yml @@ -0,0 +1,10 @@ +bundle: + name: test + +workspace: + profile: test + +resources: + jobs: + foo: + name: job foo diff --git a/bundle/tests/conflicting_resource_ids/one_subconfiguration/resources.yml b/bundle/tests/conflicting_resource_ids/one_subconfiguration/resources.yml new file mode 100644 index 000000000..c3dcb6e2f --- /dev/null +++ b/bundle/tests/conflicting_resource_ids/one_subconfiguration/resources.yml @@ -0,0 +1,4 @@ +resources: + pipelines: + foo: + name: pipeline foo diff --git a/bundle/tests/conflicting_resource_ids/two_subconfigurations/bundle.yml b/bundle/tests/conflicting_resource_ids/two_subconfigurations/bundle.yml new file mode 100644 index 000000000..f8fe99ebc --- /dev/null +++ b/bundle/tests/conflicting_resource_ids/two_subconfigurations/bundle.yml @@ -0,0 +1,5 @@ +bundle: + name: test + +workspace: + profile: test diff --git a/bundle/tests/conflicting_resource_ids/two_subconfigurations/resources1.yml b/bundle/tests/conflicting_resource_ids/two_subconfigurations/resources1.yml new file mode 100644 index 000000000..39a24ef59 --- /dev/null +++ b/bundle/tests/conflicting_resource_ids/two_subconfigurations/resources1.yml @@ -0,0 +1,4 @@ +resources: + jobs: + foo: + name: job foo diff --git a/bundle/tests/conflicting_resource_ids/two_subconfigurations/resources2.yml b/bundle/tests/conflicting_resource_ids/two_subconfigurations/resources2.yml new file mode 100644 index 000000000..c3dcb6e2f --- /dev/null +++ b/bundle/tests/conflicting_resource_ids/two_subconfigurations/resources2.yml @@ -0,0 +1,4 @@ +resources: + pipelines: + foo: + name: pipeline foo diff --git a/bundle/tests/conflicting_resource_ids_test.go b/bundle/tests/conflicting_resource_ids_test.go new file mode 100644 index 000000000..500a9d8e2 --- /dev/null +++ b/bundle/tests/conflicting_resource_ids_test.go @@ -0,0 +1,37 @@ +package config_tests + +import ( + "context" + "fmt" + "path/filepath" + "testing" + + "github.com/databricks/bricks/bundle" + "github.com/databricks/bricks/bundle/config/mutator" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestConflictingResourceIdsNoSubconfig(t *testing.T) { + _, err := bundle.Load("./conflicting_resource_ids/no_subconfigurations") + bundleConfigPath := filepath.FromSlash("conflicting_resource_ids/no_subconfigurations/bundle.yml") + assert.ErrorContains(t, err, fmt.Sprintf("multiple resources named foo (job at %s, pipeline at %s)", bundleConfigPath, bundleConfigPath)) +} + +func TestConflictingResourceIdsOneSubconfig(t *testing.T) { + b, err := bundle.Load("./conflicting_resource_ids/one_subconfiguration") + require.NoError(t, err) + err = bundle.Apply(context.Background(), b, mutator.DefaultMutators()) + bundleConfigPath := filepath.FromSlash("conflicting_resource_ids/one_subconfiguration/bundle.yml") + resourcesConfigPath := filepath.FromSlash("conflicting_resource_ids/one_subconfiguration/resources.yml") + assert.ErrorContains(t, err, fmt.Sprintf("multiple resources named foo (job at %s, pipeline at %s)", bundleConfigPath, resourcesConfigPath)) +} + +func TestConflictingResourceIdsTwoSubconfigs(t *testing.T) { + b, err := bundle.Load("./conflicting_resource_ids/two_subconfigurations") + require.NoError(t, err) + err = bundle.Apply(context.Background(), b, mutator.DefaultMutators()) + resources1ConfigPath := filepath.FromSlash("conflicting_resource_ids/two_subconfigurations/resources1.yml") + resources2ConfigPath := filepath.FromSlash("conflicting_resource_ids/two_subconfigurations/resources2.yml") + assert.ErrorContains(t, err, fmt.Sprintf("multiple resources named foo (job at %s, pipeline at %s)", resources1ConfigPath, resources2ConfigPath)) +}