databricks-cli/bundle/config/mutator/python/python_locations.go

package python

import (
	"encoding/json"
	"fmt"
	"io"
	pathlib "path"
	"path/filepath"

	"github.com/databricks/cli/libs/dyn"
)

// generatedFileName is the virtual file name used for YAML generated by Python code.
//
// When mergePythonLocations finds a location in locations.json for a value, it drops
// every dyn.Location whose file base name equals generatedFileName and puts the
// location loaded from locations.json first.
const generatedFileName = "__generated_by_python__.yml"

// pythonLocations is a data structure for efficient location lookup for a given path.
//
// Locations form a tree, and we assign locations of the closest ancestor to each dyn.Value based on its path.
// We implement it as a trie (prefix tree) where keys are components of the path. With that, lookups are O(n)
// where n is the number of components in the path.
//
// For example, with locations.json:
//
//		{"path": "resources.jobs.job_0", "file": "resources/job_0.py", "line": 3, "column": 5}
//		{"path": "resources.jobs.job_0.tasks[0].task_key", "file": "resources/job_0.py", "line": 10, "column": 5}
//		{"path": "resources.jobs.job_1", "file": "resources/job_1.py", "line": 5, "column": 7}
//
//	- resources.jobs.job_0.tasks[0].task_key is located at job_0.py:10:5
//
//	- resources.jobs.job_0.tasks[0].email_notifications is located at job_0.py:3:5,
//	  because we use the location of the job as the most precise approximation.
//
// Each value of this type is a single trie node; the root node represents the empty path.
//
// See pythonLocationEntry for the structure of a single entry in locations.json.
type pythonLocations struct {
	// descendants referenced by key, e.g. '.foo'
	keys map[string]*pythonLocations

	// descendants referenced by index, e.g. '[0]'
	indexes map[int]*pythonLocations

	// location for the current node; only meaningful if exists is true
	location dyn.Location

	// if true, location is present for this node (as opposed to the node being
	// just an intermediate step on the way to a deeper path)
	exists bool
}

// pythonLocationEntry is a single entry in locations.json.
//
// Entries are decoded from JSON by parsePythonLocations.
type pythonLocationEntry struct {
	// Path is the bundle path of the value, e.g. "resources.jobs.job_0.tasks[0].task_key";
	// it is parsed with dyn.NewPathFromString.
	Path   string `json:"path"`
	// File is the source file of the value; parsePythonLocations joins it with the bundle root
	// if it is not absolute.
	File   string `json:"file"`
	// Line is copied as is into dyn.Location.Line.
	Line   int    `json:"line"`
	// Column is copied as is into dyn.Location.Column.
	Column int    `json:"column"`
}

// mergePythonLocations rewrites locations of every node in value using the trie
// loaded from locations.json.
//
// It is primarily used to combine locations.json with output.json, so that validation
// errors refer to the Python source code rather than to the generated YAML.
//
// Nodes for which the trie has neither an exact nor an ancestor location are returned unchanged.
func mergePythonLocations(value dyn.Value, locations *pythonLocations) (dyn.Value, error) {
	visit := func(p dyn.Path, v dyn.Value) (dyn.Value, error) {
		pyLocation, found := findPythonLocation(locations, p)
		if !found {
			return v, nil
		}

		// Locations that point to the virtual generated YAML file are dropped, the Python
		// location stands in for them. Any other (real) locations are kept, which covers
		// the case of a Python function modifying a resource that was defined in YAML.
		kept := removeVirtualLocations(v.Locations())

		// The Python location goes first, because the first item in the list is the "last"
		// location, and that is the one used for error reporting.
		merged := make([]dyn.Location, 0, len(kept)+1)
		merged = append(merged, pyLocation)
		merged = append(merged, kept...)

		return v.WithLocations(merged), nil
	}

	return dyn.Walk(value, visit)
}

// removeVirtualLocations returns the given locations without those whose file base name
// is generatedFileName, preserving the order of the remaining ones.
//
// The result is nil if no location remains.
func removeVirtualLocations(locations []dyn.Location) []dyn.Location {
	var kept []dyn.Location

	for i := range locations {
		if filepath.Base(locations[i].File) != generatedFileName {
			kept = append(kept, locations[i])
		}
	}

	return kept
}

// parsePythonLocations parses locations.json from the Python mutator and builds
// a lookup trie out of it.
//
// The locations file is a sequence of newline-separated JSON objects with
// the pythonLocationEntry structure.
//
// bundleRoot is used to resolve relative file names: an entry whose file is not absolute
// is joined with bundleRoot, absolute files are kept as is.
//
// An error is returned if an entry can't be decoded as JSON or if its path can't be parsed
// by dyn.NewPathFromString. The underlying error is wrapped, so it stays available to
// errors.Is and errors.As.
func parsePythonLocations(bundleRoot string, input io.Reader) (*pythonLocations, error) {
	decoder := json.NewDecoder(input)
	locations := newPythonLocations()

	for decoder.More() {
		var entry pythonLocationEntry

		if err := decoder.Decode(&entry); err != nil {
			return nil, fmt.Errorf("failed to parse python location: %w", err)
		}

		path, err := dyn.NewPathFromString(entry.Path)
		if err != nil {
			return nil, fmt.Errorf("failed to parse python location: %w", err)
		}

		// Output can contain both relative paths and absolute paths outside of bundle root.
		// Mutator pipeline expects all path to be absolute at this point, so make all paths absolute.
		//
		// A file is considered absolute if it is either a slash-rooted path (checked in an
		// OS-independent way) or absolute by the rules of the current OS (e.g. a drive-letter
		// path on Windows), otherwise joining would prepend bundleRoot to an absolute path.
		file := entry.File
		if !pathlib.IsAbs(file) && !filepath.IsAbs(file) {
			file = filepath.Join(bundleRoot, file)
		}

		location := dyn.Location{
			File:   file,
			Line:   entry.Line,
			Column: entry.Column,
		}

		putPythonLocation(locations, path, location)
	}

	return locations, nil
}

// putPythonLocation stores location in the trie under the given path.
//
// Missing intermediate nodes are created along the way. If the path already
// has a location, it is overwritten.
func putPythonLocation(trie *pythonLocations, path dyn.Path, location dyn.Location) {
	node := trie

	for _, part := range path {
		name := part.Key()

		// a component with an empty key is treated as an index
		if name == "" {
			idx := part.Index()

			child, found := node.indexes[idx]
			if !found {
				child = newPythonLocations()
				node.indexes[idx] = child
			}

			node = child
			continue
		}

		child, found := node.keys[name]
		if !found {
			child = newPythonLocations()
			node.keys[name] = child
		}

		node = child
	}

	node.exists = true
	node.location = location
}

// newPythonLocations returns an empty trie node: both child maps are
// allocated and no location is set.
func newPythonLocations() *pythonLocations {
	node := new(pythonLocations)
	node.keys = map[string]*pythonLocations{}
	node.indexes = map[int]*pythonLocations{}

	return node
}

// findPythonLocation looks up the location for the given path in the trie.
//
// It returns the location stored for the path itself or, if there is none, the location
// of its closest ancestor that has one. The second result is false if neither the path
// nor any of its ancestors has a location.
func findPythonLocation(locations *pythonLocations, path dyn.Path) (dyn.Location, bool) {
	node := locations
	best, found := locations.location, locations.exists

	for _, part := range path {
		var (
			child *pythonLocations
			ok    bool
		)

		// a component with an empty key is treated as an index
		if name := part.Key(); name != "" {
			child, ok = node.keys[name]
		} else {
			child, ok = node.indexes[part.Index()]
		}

		// nothing deeper in the trie, the best ancestor seen so far is the answer
		if !ok {
			return best, found
		}

		node = child

		if node.exists {
			best, found = node.location, true
		}
	}

	return best, found
}
PythonMutator: propagate source locations (#1783) ## Changes Add a mechanism to load Python source locations in the Python mutator. Previously, locations pointed to generated YAML. Now, they point to Python sources instead. Python process outputs "locations.json" containing locations of bundle paths, examples: ```json {"path": "resources.jobs.job_0", "file": "resources/job_0.py", "line": 3, "column": 5} {"path": "resources.jobs.job_0.tasks[0].task_key", "file": "resources/job_0.py", "line": 10, "column": 5} {"path": "resources.jobs.job_1", "file": "resources/job_1.py", "line": 5, "column": 7} ``` Such locations form a tree, and we assign locations of the closest ancestor to each `dyn.Value` based on its path. For example, `resources.jobs.job_0.tasks[0].task_key` is located at `job_0.py:10:5` and `resources.jobs.job_0.tasks[0].email_notifications` is located at `job_0.py:3:5`, because we use the location of the job as the most precise approximation. This feature is only enabled if `experimental/python` is used. Note: for now, we don't update locations with relative paths, because it has a side effect in changing how these paths are resolved ## Example ``` % databricks bundle validate Warning: job_cluster_key abc is not defined at resources.jobs.examples.tasks[0].job_cluster_key in resources/example.py:10:1 ``` ## Tests Unit tests and manually 2025-01-22 15:37:37 +00:00			`package python`

			`import (`
			`"encoding/json"`
			`"fmt"`
			`"io"`
PythonMutator: Fix relative path error (#2253) ## Changes Fix relative path errors in the Python mutator that was failing during deployment since v0.239.1. Before that: ``` % databricks bundle deploy Deploying resources... Updating deployment state... Error: failed to compute relative path for job jobs_as_code_project_job: Rel: can't make resources/jobs_as_code_project_job.py relative to /Users/$USER/jobs_as_code_project ``` As a result, the bundle was deployed, but the deployment state wasn't updated. ## Tests Unit tests, adding acceptance tests in https://github.com/databricks/cli/pull/2254 2025-01-29 13:56:57 +00:00			`pathlib "path"`
PythonMutator: propagate source locations (#1783) ## Changes Add a mechanism to load Python source locations in the Python mutator. Previously, locations pointed to generated YAML. Now, they point to Python sources instead. Python process outputs "locations.json" containing locations of bundle paths, examples: ```json {"path": "resources.jobs.job_0", "file": "resources/job_0.py", "line": 3, "column": 5} {"path": "resources.jobs.job_0.tasks[0].task_key", "file": "resources/job_0.py", "line": 10, "column": 5} {"path": "resources.jobs.job_1", "file": "resources/job_1.py", "line": 5, "column": 7} ``` Such locations form a tree, and we assign locations of the closest ancestor to each `dyn.Value` based on its path. For example, `resources.jobs.job_0.tasks[0].task_key` is located at `job_0.py:10:5` and `resources.jobs.job_0.tasks[0].email_notifications` is located at `job_0.py:3:5`, because we use the location of the job as the most precise approximation. This feature is only enabled if `experimental/python` is used. Note: for now, we don't update locations with relative paths, because it has a side effect in changing how these paths are resolved ## Example ``` % databricks bundle validate Warning: job_cluster_key abc is not defined at resources.jobs.examples.tasks[0].job_cluster_key in resources/example.py:10:1 ``` ## Tests Unit tests and manually 2025-01-22 15:37:37 +00:00			`"path/filepath"`

			`"github.com/databricks/cli/libs/dyn"`
			`)`

			`// generatedFileName is used as the virtual file name for YAML generated by Python code.`
			`//`
			`// mergePythonLocations replaces dyn.Location with generatedFileName with locations loaded`
			`// from locations.json`
			`const generatedFileName = "__generated_by_python__.yml"`

			`// pythonLocations is data structure for efficient location lookup for a given path`
			`//`
			`// Locations form a tree, and we assign locations of the closest ancestor to each dyn.Value based on its path.`
			`// We implement it as a trie (prefix tree) where keys are components of the path. With that, lookups are O(n)`
			`// where n is the number of components in the path.`
			`//`
			`// For example, with locations.json:`
			`//`
			`// {"path": "resources.jobs.job_0", "file": "resources/job_0.py", "line": 3, "column": 5}`
			`// {"path": "resources.jobs.job_0.tasks[0].task_key", "file": "resources/job_0.py", "line": 10, "column": 5}`
			`// {"path": "resources.jobs.job_1", "file": "resources/job_1.py", "line": 5, "column": 7}`
			`//`
			`// - resources.jobs.job_0.tasks[0].task_key is located at job_0.py:10:5`
			`//`
			`// - resources.jobs.job_0.tasks[0].email_notifications is located at job_0.py:3:5,`
			`// because we use the location of the job as the most precise approximation.`
			`//`
			`// See pythonLocationEntry for the structure of a single entry in locations.json`
			`type pythonLocations struct {`
			`// descendants referenced by index, e.g. '.foo'`
			`keys map[string]*pythonLocations`

			`// descendants referenced by key, e.g. '[0]'`
			`indexes map[int]*pythonLocations`

			`// location for the current node if it exists`
			`location dyn.Location`

			`// if true, location is present`
			`exists bool`
			`}`

			`// pythonLocationEntry is a single entry in locations.json`
			`type pythonLocationEntry struct {`
			Path string `json:"path"`
			File string `json:"file"`
			Line int `json:"line"`
			Column int `json:"column"`
			`}`

			`// mergePythonLocations applies locations from Python mutator into given dyn.Value`
			`//`
			`// The primary use-case is to merge locations.json with output.json, so that any`
			`// validation errors will point to Python source code instead of generated YAML.`
			`func mergePythonLocations(value dyn.Value, locations *pythonLocations) (dyn.Value, error) {`
			`return dyn.Walk(value, func(path dyn.Path, value dyn.Value) (dyn.Value, error) {`
			`newLocation, ok := findPythonLocation(locations, path)`
			`if !ok {`
			`return value, nil`
			`}`

			`// The first item in the list is the "last" location used for error reporting`
			`//`
			`// Loaded YAML uses virtual file path as location, we remove any of such references,`
			`// because they should use 'newLocation' instead.`
			`//`
			`// We preserve any previous non-virtual locations in case when Python function modified`
			`// resource defined in YAML.`
			`newLocations := append(`
			`[]dyn.Location{newLocation},`
			`removeVirtualLocations(value.Locations())...,`
			`)`

			`return value.WithLocations(newLocations), nil`
			`})`
			`}`

			`func removeVirtualLocations(locations []dyn.Location) []dyn.Location {`
			`var newLocations []dyn.Location`

			`for _, location := range locations {`
			`if filepath.Base(location.File) == generatedFileName {`
			`continue`
			`}`

			`newLocations = append(newLocations, location)`
			`}`

			`return newLocations`
			`}`

			`// parsePythonLocations parses locations.json from the Python mutator.`
			`//`
			`// locations file is newline-separated JSON objects with pythonLocationEntry structure.`
PythonMutator: Fix relative path error (#2253) ## Changes Fix relative path errors in the Python mutator that was failing during deployment since v0.239.1. Before that: ``` % databricks bundle deploy Deploying resources... Updating deployment state... Error: failed to compute relative path for job jobs_as_code_project_job: Rel: can't make resources/jobs_as_code_project_job.py relative to /Users/$USER/jobs_as_code_project ``` As a result, the bundle was deployed, but the deployment state wasn't updated. ## Tests Unit tests, adding acceptance tests in https://github.com/databricks/cli/pull/2254 2025-01-29 13:56:57 +00:00			`func parsePythonLocations(bundleRoot string, input io.Reader) (*pythonLocations, error) {`
PythonMutator: propagate source locations (#1783) ## Changes Add a mechanism to load Python source locations in the Python mutator. Previously, locations pointed to generated YAML. Now, they point to Python sources instead. Python process outputs "locations.json" containing locations of bundle paths, examples: ```json {"path": "resources.jobs.job_0", "file": "resources/job_0.py", "line": 3, "column": 5} {"path": "resources.jobs.job_0.tasks[0].task_key", "file": "resources/job_0.py", "line": 10, "column": 5} {"path": "resources.jobs.job_1", "file": "resources/job_1.py", "line": 5, "column": 7} ``` Such locations form a tree, and we assign locations of the closest ancestor to each `dyn.Value` based on its path. For example, `resources.jobs.job_0.tasks[0].task_key` is located at `job_0.py:10:5` and `resources.jobs.job_0.tasks[0].email_notifications` is located at `job_0.py:3:5`, because we use the location of the job as the most precise approximation. This feature is only enabled if `experimental/python` is used. Note: for now, we don't update locations with relative paths, because it has a side effect in changing how these paths are resolved ## Example ``` % databricks bundle validate Warning: job_cluster_key abc is not defined at resources.jobs.examples.tasks[0].job_cluster_key in resources/example.py:10:1 ``` ## Tests Unit tests and manually 2025-01-22 15:37:37 +00:00			`decoder := json.NewDecoder(input)`
			`locations := newPythonLocations()`

			`for decoder.More() {`
			`var entry pythonLocationEntry`

			`err := decoder.Decode(&entry)`
			`if err != nil {`
			`return nil, fmt.Errorf("failed to parse python location: %s", err)`
			`}`

			`path, err := dyn.NewPathFromString(entry.Path)`
			`if err != nil {`
			`return nil, fmt.Errorf("failed to parse python location: %s", err)`
			`}`

PythonMutator: Fix relative path error (#2253) ## Changes Fix relative path errors in the Python mutator that was failing during deployment since v0.239.1. Before that: ``` % databricks bundle deploy Deploying resources... Updating deployment state... Error: failed to compute relative path for job jobs_as_code_project_job: Rel: can't make resources/jobs_as_code_project_job.py relative to /Users/$USER/jobs_as_code_project ``` As a result, the bundle was deployed, but the deployment state wasn't updated. ## Tests Unit tests, adding acceptance tests in https://github.com/databricks/cli/pull/2254 2025-01-29 13:56:57 +00:00			`// Output can contain both relative paths and absolute paths outside of bundle root.`
			`// Mutator pipeline expects all path to be absolute at this point, so make all paths absolute.`
			`if !pathlib.IsAbs(entry.File) {`
			`entry.File = filepath.Join(bundleRoot, entry.File)`
			`}`

PythonMutator: propagate source locations (#1783) ## Changes Add a mechanism to load Python source locations in the Python mutator. Previously, locations pointed to generated YAML. Now, they point to Python sources instead. Python process outputs "locations.json" containing locations of bundle paths, examples: ```json {"path": "resources.jobs.job_0", "file": "resources/job_0.py", "line": 3, "column": 5} {"path": "resources.jobs.job_0.tasks[0].task_key", "file": "resources/job_0.py", "line": 10, "column": 5} {"path": "resources.jobs.job_1", "file": "resources/job_1.py", "line": 5, "column": 7} ``` Such locations form a tree, and we assign locations of the closest ancestor to each `dyn.Value` based on its path. For example, `resources.jobs.job_0.tasks[0].task_key` is located at `job_0.py:10:5` and `resources.jobs.job_0.tasks[0].email_notifications` is located at `job_0.py:3:5`, because we use the location of the job as the most precise approximation. This feature is only enabled if `experimental/python` is used. Note: for now, we don't update locations with relative paths, because it has a side effect in changing how these paths are resolved ## Example ``` % databricks bundle validate Warning: job_cluster_key abc is not defined at resources.jobs.examples.tasks[0].job_cluster_key in resources/example.py:10:1 ``` ## Tests Unit tests and manually 2025-01-22 15:37:37 +00:00			`location := dyn.Location{`
			`File: entry.File,`
			`Line: entry.Line,`
			`Column: entry.Column,`
			`}`

			`putPythonLocation(locations, path, location)`
			`}`

			`return locations, nil`
			`}`

			`// putPythonLocation puts the location to the trie for the given path`
			`func putPythonLocation(trie *pythonLocations, path dyn.Path, location dyn.Location) {`
			`currentNode := trie`

			`for _, component := range path {`
			`if key := component.Key(); key != "" {`
			`if _, ok := currentNode.keys[key]; !ok {`
			`currentNode.keys[key] = newPythonLocations()`
			`}`

			`currentNode = currentNode.keys[key]`
			`} else {`
			`index := component.Index()`
			`if _, ok := currentNode.indexes[index]; !ok {`
			`currentNode.indexes[index] = newPythonLocations()`
			`}`

			`currentNode = currentNode.indexes[index]`
			`}`
			`}`

			`currentNode.location = location`
			`currentNode.exists = true`
			`}`

			`// newPythonLocations creates a new trie node`
			`func newPythonLocations() *pythonLocations {`
			`return &pythonLocations{`
			`keys: make(map[string]*pythonLocations),`
			`indexes: make(map[int]*pythonLocations),`
			`}`
			`}`

			`// findPythonLocation finds the location or closest ancestor location in the trie for the given path`
			`// if no ancestor or exact location is found, false is returned.`
			`func findPythonLocation(locations *pythonLocations, path dyn.Path) (dyn.Location, bool) {`
			`currentNode := locations`
			`lastLocation := locations.location`
			`exists := locations.exists`

			`for _, component := range path {`
			`if key := component.Key(); key != "" {`
			`if _, ok := currentNode.keys[key]; !ok {`
			`break`
			`}`

			`currentNode = currentNode.keys[key]`
			`} else {`
			`index := component.Index()`
			`if _, ok := currentNode.indexes[index]; !ok {`
			`break`
			`}`

			`currentNode = currentNode.indexes[index]`
			`}`

			`if currentNode.exists {`
			`lastLocation = currentNode.location`
			`exists = true`
			`}`
			`}`

			`return lastLocation, exists`
			`}`