Fix description memoization in bundle schema (#1409)
## Changes

Fixes https://github.com/databricks/cli/issues/559.

The CLI generation is now stable and no longer produces a diff for the `bundle_descriptions.json` file. Previously, a pointer to the schema was stored in the memo and was mutated later to include the description. This led to duplicate documentation for schema components that were used in multiple places; for example, all references to `pause_status` would end up with the same description. This PR fixes that issue.

## Tests

Added a regression test.
parent 507053ee50
commit 30215860e7
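For context on the bug described above, here is a minimal, hypothetical sketch of the failure mode and the fix. The `Schema` type and the map names are stand-ins for `jsonschema.Schema` and the reader's memo, not the CLI's actual code: memoizing pointers means a later mutation (setting a description) is visible to every place that looked up the same key, while memoizing values hands each call site its own copy.

```go
package main

import "fmt"

// Schema is a simplified stand-in for jsonschema.Schema.
type Schema struct {
	Type        string
	Description string
}

func main() {
	// Before: the memo stored *Schema. Both lookups return the same pointer,
	// so setting a description at one site mutates the other as well.
	memoPtr := map[string]*Schema{"pause_status": {Type: "string"}}
	a := memoPtr["pause_status"]
	b := memoPtr["pause_status"]
	a.Description = "Indicate whether this schedule is paused or not."
	fmt.Println(b.Description) // also "Indicate whether this schedule is paused or not."

	// After: the memo stores Schema by value. Each lookup yields a copy, so a
	// per-site description no longer leaks into other references.
	memoVal := map[string]Schema{"pause_status": {Type: "string"}}
	c := memoVal["pause_status"]
	d := memoVal["pause_status"]
	c.Description = "Indicate whether this schedule is paused or not."
	fmt.Println(d.Description) // still empty
}
```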
@@ -70,7 +70,7 @@ func UpdateBundleDescriptions(openapiSpecPath string) (*Docs, error) {
 	}
 	openapiReader := &OpenapiReader{
 		OpenapiSpec: spec,
-		Memo:        make(map[string]*jsonschema.Schema),
+		memo:        make(map[string]jsonschema.Schema),
 	}

 	// Generate descriptions for the "resources" field
@@ -756,7 +756,7 @@
       "description": "An optional periodic schedule for this job. The default behavior is that the job only runs when triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`.",
       "properties": {
         "pause_status": {
-          "description": "Indicate whether the continuous execution of the job is paused or not. Defaults to UNPAUSED."
+          "description": "Indicate whether this schedule is paused or not."
         },
         "quartz_cron_expression": {
           "description": "A Cron expression using Quartz syntax that describes the schedule for a job. See [Cron Trigger](http://www.quartz-scheduler.org/documentation/quartz-2.3.0/tutorials/crontrigger.html) for details. This field is required."
@@ -813,7 +813,7 @@
         "description": "Optional schema to write to. This parameter is only used when a warehouse_id is also provided. If not provided, the `default` schema is used."
       },
       "source": {
-        "description": "Optional location type of the notebook. When set to `WORKSPACE`, the notebook will be retrieved from the local Databricks workspace. When set to `GIT`, the notebook will be retrieved from a Git repository\ndefined in `git_source`. If the value is empty, the task will use `GIT` if `git_source` is defined and `WORKSPACE` otherwise.\n* `WORKSPACE`: Notebook is located in Databricks workspace.\n* `GIT`: Notebook is located in cloud Git provider."
+        "description": "Optional location type of the project directory. When set to `WORKSPACE`, the project will be retrieved\nfrom the local Databricks workspace. When set to `GIT`, the project will be retrieved from a Git repository\ndefined in `git_source`. If the value is empty, the task will use `GIT` if `git_source` is defined and `WORKSPACE` otherwise.\n\n* `WORKSPACE`: Project is located in Databricks workspace.\n* `GIT`: Project is located in cloud Git provider."
       },
       "warehouse_id": {
         "description": "ID of the SQL warehouse to connect to. If provided, we automatically generate and provide the profile and connection details to dbt. It can be overridden on a per-command basis by using the `--profiles-dir` command line argument."
@@ -972,7 +972,7 @@
       "description": "An optional minimal interval in milliseconds between the start of the failed run and the subsequent retry run. The default behavior is that unsuccessful runs are immediately retried."
     },
     "new_cluster": {
-      "description": "If new_cluster, a description of a cluster that is created for each task.",
+      "description": "If new_cluster, a description of a new cluster that is created for each run.",
       "properties": {
         "apply_policy_default_values": {
           "description": ""
@@ -1474,7 +1474,7 @@
         "description": "The Python file to be executed. Cloud file URIs (such as dbfs:/, s3:/, adls:/, gcs:/) and workspace paths are supported. For python files stored in the Databricks workspace, the path must be absolute and begin with `/`. For files stored in a remote repository, the path must be relative. This field is required."
       },
       "source": {
-        "description": "Optional location type of the notebook. When set to `WORKSPACE`, the notebook will be retrieved from the local Databricks workspace. When set to `GIT`, the notebook will be retrieved from a Git repository\ndefined in `git_source`. If the value is empty, the task will use `GIT` if `git_source` is defined and `WORKSPACE` otherwise.\n* `WORKSPACE`: Notebook is located in Databricks workspace.\n* `GIT`: Notebook is located in cloud Git provider."
+        "description": "Optional location type of the Python file. When set to `WORKSPACE` or not specified, the file will be retrieved from the local\nDatabricks workspace or cloud location (if the `python_file` has a URI format). When set to `GIT`,\nthe Python file will be retrieved from a Git repository defined in `git_source`.\n\n* `WORKSPACE`: The Python file is located in a Databricks workspace or at a cloud filesystem URI.\n* `GIT`: The Python file is located in a remote Git repository."
       }
     }
   },
@@ -1552,7 +1552,7 @@
         "description": "Path of the SQL file. Must be relative if the source is a remote Git repository and absolute for workspace paths."
       },
       "source": {
-        "description": "Optional location type of the notebook. When set to `WORKSPACE`, the notebook will be retrieved from the local Databricks workspace. When set to `GIT`, the notebook will be retrieved from a Git repository\ndefined in `git_source`. If the value is empty, the task will use `GIT` if `git_source` is defined and `WORKSPACE` otherwise.\n* `WORKSPACE`: Notebook is located in Databricks workspace.\n* `GIT`: Notebook is located in cloud Git provider."
+        "description": "Optional location type of the SQL file. When set to `WORKSPACE`, the SQL file will be retrieved\nfrom the local Databricks workspace. When set to `GIT`, the SQL file will be retrieved from a Git repository\ndefined in `git_source`. If the value is empty, the task will use `GIT` if `git_source` is defined and `WORKSPACE` otherwise.\n\n* `WORKSPACE`: SQL file is located in Databricks workspace.\n* `GIT`: SQL file is located in cloud Git provider."
       }
     }
   },
@@ -1654,10 +1654,10 @@
         }
       },
       "pause_status": {
-        "description": "Indicate whether the continuous execution of the job is paused or not. Defaults to UNPAUSED."
+        "description": "Whether this trigger is paused or not."
       },
       "table": {
-        "description": "",
+        "description": "Old table trigger settings name. Deprecated in favor of `table_update`.",
         "properties": {
           "condition": {
             "description": "The table(s) condition based on which to trigger a job run."
@@ -3479,7 +3479,7 @@
       "description": "An optional periodic schedule for this job. The default behavior is that the job only runs when triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`.",
       "properties": {
         "pause_status": {
-          "description": "Indicate whether the continuous execution of the job is paused or not. Defaults to UNPAUSED."
+          "description": "Indicate whether this schedule is paused or not."
         },
         "quartz_cron_expression": {
           "description": "A Cron expression using Quartz syntax that describes the schedule for a job. See [Cron Trigger](http://www.quartz-scheduler.org/documentation/quartz-2.3.0/tutorials/crontrigger.html) for details. This field is required."
@@ -3536,7 +3536,7 @@
         "description": "Optional schema to write to. This parameter is only used when a warehouse_id is also provided. If not provided, the `default` schema is used."
       },
       "source": {
-        "description": "Optional location type of the notebook. When set to `WORKSPACE`, the notebook will be retrieved from the local Databricks workspace. When set to `GIT`, the notebook will be retrieved from a Git repository\ndefined in `git_source`. If the value is empty, the task will use `GIT` if `git_source` is defined and `WORKSPACE` otherwise.\n* `WORKSPACE`: Notebook is located in Databricks workspace.\n* `GIT`: Notebook is located in cloud Git provider."
+        "description": "Optional location type of the project directory. When set to `WORKSPACE`, the project will be retrieved\nfrom the local Databricks workspace. When set to `GIT`, the project will be retrieved from a Git repository\ndefined in `git_source`. If the value is empty, the task will use `GIT` if `git_source` is defined and `WORKSPACE` otherwise.\n\n* `WORKSPACE`: Project is located in Databricks workspace.\n* `GIT`: Project is located in cloud Git provider."
       },
       "warehouse_id": {
         "description": "ID of the SQL warehouse to connect to. If provided, we automatically generate and provide the profile and connection details to dbt. It can be overridden on a per-command basis by using the `--profiles-dir` command line argument."
@@ -3695,7 +3695,7 @@
       "description": "An optional minimal interval in milliseconds between the start of the failed run and the subsequent retry run. The default behavior is that unsuccessful runs are immediately retried."
     },
     "new_cluster": {
-      "description": "If new_cluster, a description of a cluster that is created for each task.",
+      "description": "If new_cluster, a description of a new cluster that is created for each run.",
       "properties": {
         "apply_policy_default_values": {
           "description": ""
@@ -4197,7 +4197,7 @@
         "description": "The Python file to be executed. Cloud file URIs (such as dbfs:/, s3:/, adls:/, gcs:/) and workspace paths are supported. For python files stored in the Databricks workspace, the path must be absolute and begin with `/`. For files stored in a remote repository, the path must be relative. This field is required."
       },
       "source": {
-        "description": "Optional location type of the notebook. When set to `WORKSPACE`, the notebook will be retrieved from the local Databricks workspace. When set to `GIT`, the notebook will be retrieved from a Git repository\ndefined in `git_source`. If the value is empty, the task will use `GIT` if `git_source` is defined and `WORKSPACE` otherwise.\n* `WORKSPACE`: Notebook is located in Databricks workspace.\n* `GIT`: Notebook is located in cloud Git provider."
+        "description": "Optional location type of the Python file. When set to `WORKSPACE` or not specified, the file will be retrieved from the local\nDatabricks workspace or cloud location (if the `python_file` has a URI format). When set to `GIT`,\nthe Python file will be retrieved from a Git repository defined in `git_source`.\n\n* `WORKSPACE`: The Python file is located in a Databricks workspace or at a cloud filesystem URI.\n* `GIT`: The Python file is located in a remote Git repository."
       }
     }
   },
@@ -4275,7 +4275,7 @@
         "description": "Path of the SQL file. Must be relative if the source is a remote Git repository and absolute for workspace paths."
       },
       "source": {
-        "description": "Optional location type of the notebook. When set to `WORKSPACE`, the notebook will be retrieved from the local Databricks workspace. When set to `GIT`, the notebook will be retrieved from a Git repository\ndefined in `git_source`. If the value is empty, the task will use `GIT` if `git_source` is defined and `WORKSPACE` otherwise.\n* `WORKSPACE`: Notebook is located in Databricks workspace.\n* `GIT`: Notebook is located in cloud Git provider."
+        "description": "Optional location type of the SQL file. When set to `WORKSPACE`, the SQL file will be retrieved\nfrom the local Databricks workspace. When set to `GIT`, the SQL file will be retrieved from a Git repository\ndefined in `git_source`. If the value is empty, the task will use `GIT` if `git_source` is defined and `WORKSPACE` otherwise.\n\n* `WORKSPACE`: SQL file is located in Databricks workspace.\n* `GIT`: SQL file is located in cloud Git provider."
       }
     }
   },
@@ -4377,10 +4377,10 @@
         }
       },
       "pause_status": {
-        "description": "Indicate whether the continuous execution of the job is paused or not. Defaults to UNPAUSED."
+        "description": "Whether this trigger is paused or not."
       },
       "table": {
-        "description": "",
+        "description": "Old table trigger settings name. Deprecated in favor of `table_update`.",
        "properties": {
          "condition": {
            "description": "The table(s) condition based on which to trigger a job run."
@@ -10,17 +10,21 @@ import (
 )

 type OpenapiReader struct {
+	// OpenAPI spec to read schemas from.
 	OpenapiSpec *openapi.Specification
-	Memo        map[string]*jsonschema.Schema
+
+	// In-memory cache of schemas read from the OpenAPI spec.
+	memo map[string]jsonschema.Schema
 }

 const SchemaPathPrefix = "#/components/schemas/"

-func (reader *OpenapiReader) readOpenapiSchema(path string) (*jsonschema.Schema, error) {
+// Read a schema directly from the OpenAPI spec.
+func (reader *OpenapiReader) readOpenapiSchema(path string) (jsonschema.Schema, error) {
 	schemaKey := strings.TrimPrefix(path, SchemaPathPrefix)

 	// return early if we already have a computed schema
-	memoSchema, ok := reader.Memo[schemaKey]
+	memoSchema, ok := reader.memo[schemaKey]
 	if ok {
 		return memoSchema, nil
 	}
@@ -28,18 +32,18 @@ func (reader *OpenapiReader) readOpenapiSchema(path string) (*jsonschema.Schema,
 	// check path is present in openapi spec
 	openapiSchema, ok := reader.OpenapiSpec.Components.Schemas[schemaKey]
 	if !ok {
-		return nil, fmt.Errorf("schema with path %s not found in openapi spec", path)
+		return jsonschema.Schema{}, fmt.Errorf("schema with path %s not found in openapi spec", path)
 	}

 	// convert openapi schema to the native schema struct
 	bytes, err := json.Marshal(*openapiSchema)
 	if err != nil {
-		return nil, err
+		return jsonschema.Schema{}, err
 	}
-	jsonSchema := &jsonschema.Schema{}
-	err = json.Unmarshal(bytes, jsonSchema)
+	jsonSchema := jsonschema.Schema{}
+	err = json.Unmarshal(bytes, &jsonSchema)
 	if err != nil {
-		return nil, err
+		return jsonschema.Schema{}, err
 	}

 	// A hack to convert a map[string]interface{} to *Schema
@@ -49,23 +53,28 @@ func (reader *OpenapiReader) readOpenapiSchema(path string) (*jsonschema.Schema,
 	if ok {
 		b, err := json.Marshal(jsonSchema.AdditionalProperties)
 		if err != nil {
-			return nil, err
+			return jsonschema.Schema{}, err
 		}
 		additionalProperties := &jsonschema.Schema{}
 		err = json.Unmarshal(b, additionalProperties)
 		if err != nil {
-			return nil, err
+			return jsonschema.Schema{}, err
 		}
 		jsonSchema.AdditionalProperties = additionalProperties
 	}

 	// store read schema into memo
-	reader.Memo[schemaKey] = jsonSchema
+	reader.memo[schemaKey] = jsonSchema

 	return jsonSchema, nil
 }

-// safe againt loops in refs
+// Resolve all nested "$ref" references in the schema. This function unrolls a single
+// level of "$ref" in the schema and calls into traverseSchema to resolve nested references.
+// Thus this function and traverseSchema are mutually recursive.
+//
+// This function is safe against reference loops. If a reference loop is detected, an error
+// is returned.
 func (reader *OpenapiReader) safeResolveRefs(root *jsonschema.Schema, tracker *tracker) (*jsonschema.Schema, error) {
 	if root.Reference == nil {
 		return reader.traverseSchema(root, tracker)
@@ -91,12 +100,12 @@ func (reader *OpenapiReader) safeResolveRefs(root *jsonschema.Schema, tracker *t
 	// in the memo
 	root.Reference = nil

-	// unroll one level of reference
+	// unroll one level of reference.
 	selfRef, err := reader.readOpenapiSchema(ref)
 	if err != nil {
 		return nil, err
 	}
-	root = selfRef
+	root = &selfRef
 	root.Description = description

 	// traverse again to find new references
@@ -108,6 +117,8 @@ func (reader *OpenapiReader) safeResolveRefs(root *jsonschema.Schema, tracker *t
 	return root, err
 }

+// Traverse the nested properties of the schema to resolve "$ref" references. This function
+// and safeResolveRefs are mutually recursive.
 func (reader *OpenapiReader) traverseSchema(root *jsonschema.Schema, tracker *tracker) (*jsonschema.Schema, error) {
 	// case primitive (or invalid)
 	if root.Type != jsonschema.ObjectType && root.Type != jsonschema.ArrayType {
@@ -154,11 +165,11 @@ func (reader *OpenapiReader) readResolvedSchema(path string) (*jsonschema.Schema
 	}
 	tracker := newTracker()
 	tracker.push(path, path)
-	root, err = reader.safeResolveRefs(root, tracker)
+	resolvedRoot, err := reader.safeResolveRefs(&root, tracker)
 	if err != nil {
 		return nil, tracker.errWithTrace(err.Error(), "")
 	}
-	return root, nil
+	return resolvedRoot, nil
 }

 func (reader *OpenapiReader) jobsDocs() (*Docs, error) {
@@ -48,7 +48,7 @@ func TestReadSchemaForObject(t *testing.T) {
 	spec := &openapi.Specification{}
 	reader := &OpenapiReader{
 		OpenapiSpec: spec,
-		Memo:        make(map[string]*jsonschema.Schema),
+		memo:        make(map[string]jsonschema.Schema),
 	}
 	err := json.Unmarshal([]byte(specString), spec)
 	require.NoError(t, err)
@@ -106,7 +106,7 @@ func TestReadSchemaForArray(t *testing.T) {
 	spec := &openapi.Specification{}
 	reader := &OpenapiReader{
 		OpenapiSpec: spec,
-		Memo:        make(map[string]*jsonschema.Schema),
+		memo:        make(map[string]jsonschema.Schema),
 	}
 	err := json.Unmarshal([]byte(specString), spec)
 	require.NoError(t, err)
@@ -152,7 +152,7 @@ func TestReadSchemaForMap(t *testing.T) {
 	spec := &openapi.Specification{}
 	reader := &OpenapiReader{
 		OpenapiSpec: spec,
-		Memo:        make(map[string]*jsonschema.Schema),
+		memo:        make(map[string]jsonschema.Schema),
 	}
 	err := json.Unmarshal([]byte(specString), spec)
 	require.NoError(t, err)
@@ -201,7 +201,7 @@ func TestRootReferenceIsResolved(t *testing.T) {
 	spec := &openapi.Specification{}
 	reader := &OpenapiReader{
 		OpenapiSpec: spec,
-		Memo:        make(map[string]*jsonschema.Schema),
+		memo:        make(map[string]jsonschema.Schema),
 	}
 	err := json.Unmarshal([]byte(specString), spec)
 	require.NoError(t, err)
@@ -251,7 +251,7 @@ func TestSelfReferenceLoopErrors(t *testing.T) {
 	spec := &openapi.Specification{}
 	reader := &OpenapiReader{
 		OpenapiSpec: spec,
-		Memo:        make(map[string]*jsonschema.Schema),
+		memo:        make(map[string]jsonschema.Schema),
 	}
 	err := json.Unmarshal([]byte(specString), spec)
 	require.NoError(t, err)
@@ -285,7 +285,7 @@ func TestCrossReferenceLoopErrors(t *testing.T) {
 	spec := &openapi.Specification{}
 	reader := &OpenapiReader{
 		OpenapiSpec: spec,
-		Memo:        make(map[string]*jsonschema.Schema),
+		memo:        make(map[string]jsonschema.Schema),
 	}
 	err := json.Unmarshal([]byte(specString), spec)
 	require.NoError(t, err)
@@ -330,7 +330,7 @@ func TestReferenceResolutionForMapInObject(t *testing.T) {
 	spec := &openapi.Specification{}
 	reader := &OpenapiReader{
 		OpenapiSpec: spec,
-		Memo:        make(map[string]*jsonschema.Schema),
+		memo:        make(map[string]jsonschema.Schema),
 	}
 	err := json.Unmarshal([]byte(specString), spec)
 	require.NoError(t, err)
@@ -400,7 +400,7 @@ func TestReferenceResolutionForArrayInObject(t *testing.T) {
 	spec := &openapi.Specification{}
 	reader := &OpenapiReader{
 		OpenapiSpec: spec,
-		Memo:        make(map[string]*jsonschema.Schema),
+		memo:        make(map[string]jsonschema.Schema),
 	}
 	err := json.Unmarshal([]byte(specString), spec)
 	require.NoError(t, err)
@@ -434,3 +434,61 @@ func TestReferenceResolutionForArrayInObject(t *testing.T) {
 	t.Log("[DEBUG] expected: ", expected)
 	assert.Equal(t, expected, string(fruitsSchemaJson))
 }
+
+func TestReferenceResolutionDoesNotOverwriteDescriptions(t *testing.T) {
+	specString := `{
+		"components": {
+			"schemas": {
+				"foo": {
+					"type": "number"
+				},
+				"fruits": {
+					"type": "object",
+					"properties": {
+						"guava": {
+							"type": "object",
+							"description": "Guava is a fruit",
+							"$ref": "#/components/schemas/foo"
+						},
+						"mango": {
+							"type": "object",
+							"description": "What is a mango?",
+							"$ref": "#/components/schemas/foo"
+						}
+					}
+				}
+			}
+		}
+	}`
+	spec := &openapi.Specification{}
+	reader := &OpenapiReader{
+		OpenapiSpec: spec,
+		memo:        make(map[string]jsonschema.Schema),
+	}
+	err := json.Unmarshal([]byte(specString), spec)
+	require.NoError(t, err)
+
+	fruitsSchema, err := reader.readResolvedSchema("#/components/schemas/fruits")
+	require.NoError(t, err)
+
+	fruitsSchemaJson, err := json.MarshalIndent(fruitsSchema, "	", "	")
+	require.NoError(t, err)
+
+	expected := `{
+		"type": "object",
+		"properties": {
+			"guava": {
+				"type": "number",
+				"description": "Guava is a fruit"
+			},
+			"mango": {
+				"type": "number",
+				"description": "What is a mango?"
+			}
+		}
+	}`
+
+	t.Log("[DEBUG] actual: ", string(fruitsSchemaJson))
+	t.Log("[DEBUG] expected: ", expected)
+	assert.Equal(t, expected, string(fruitsSchemaJson))
+}