diff --git a/bundle/schema/README.md b/bundle/schema/README.md new file mode 100644 index 00000000..cd849be0 --- /dev/null +++ b/bundle/schema/README.md @@ -0,0 +1,26 @@ +### Overview + +`docs/bundle_descriptions.json` contains both autogenerated and manually written +descriptions for the JSON schema. Specifically: +1. `resources` : almost all descriptions are autogenerated from the OpenAPI spec +2. `environments` : almost all descriptions are copied over from root level entities (e.g. `bundle`, `artifacts`) +3. `bundle` : manually edited +4. `include` : manually edited +5. `workspace` : manually edited +6. `artifacts` : manually edited + +These descriptions are rendered in the inline documentation in an IDE. + +### SOP: Add schema descriptions for new fields in bundle config + +1. You can autogenerate empty descriptions for the new fields by running +`bricks bundle schema --only-docs > ~/bricks/bundle/schema/docs/bundle_descriptions.json` +2. Manually edit bundle_descriptions.json to add your descriptions +3. Again run `bricks bundle schema --only-docs > ~/bricks/bundle/schema/docs/bundle_descriptions.json` to copy over any applicable descriptions to `environments` +4. Push to repo + + +### SOP: Update descriptions in resources from a newer OpenAPI spec + +1. Run `bricks bundle schema --only-docs --openapi PATH_TO_SPEC > ~/bricks/bundle/schema/docs/bundle_descriptions.json` +2. Push to repo diff --git a/bundle/schema/bundle_config_docs.yml b/bundle/schema/bundle_config_docs.yml deleted file mode 100644 index eeb102fc..00000000 --- a/bundle/schema/bundle_config_docs.yml +++ /dev/null @@ -1,12 +0,0 @@ -documentation: Root of the bundle config -children: - bundle: - documentation: | - Bundle contains details about this bundle, such as its name, - version of the spec (TODO), default cluster, default warehouse, etc. - children: - environment: - documentation: Environment is set by the mutator that selects the environment.
- - artifacts: - documentation: Artifacts contains a description of all code artifacts in this bundle. \ No newline at end of file diff --git a/bundle/schema/docs.go b/bundle/schema/docs.go index 8522b17b..c8ad15a3 100644 --- a/bundle/schema/docs.go +++ b/bundle/schema/docs.go @@ -2,37 +2,109 @@ package schema import ( _ "embed" + "encoding/json" + "fmt" "os" + "reflect" - "gopkg.in/yaml.v3" + "github.com/databricks/bricks/bundle/config" + "github.com/databricks/databricks-sdk-go/openapi" ) +// A subset of Schema struct type Docs struct { - Documentation string `json:"documentation"` - Children map[string]Docs `json:"children"` + Description string `json:"description"` + Properties map[string]*Docs `json:"properties,omitempty"` + Items *Docs `json:"items,omitempty"` + AdditionalProperties *Docs `json:"additionalproperties,omitempty"` } -func LoadDocs(path string) (*Docs, error) { - bytes, err := os.ReadFile(path) - if err != nil { - return nil, err - } - docs := Docs{} - err = yaml.Unmarshal(bytes, &docs) - if err != nil { - return nil, err - } - return &docs, nil -} - -//go:embed bundle_config_docs.yml +//go:embed docs/bundle_descriptions.json var bundleDocs []byte -func GetBundleDocs() (*Docs, error) { - docs := Docs{} - err := yaml.Unmarshal(bundleDocs, &docs) +func BundleDocs(openapiSpecPath string) (*Docs, error) { + docs, err := initializeBundleDocs() if err != nil { return nil, err } - return &docs, nil + if openapiSpecPath != "" { + openapiSpec, err := os.ReadFile(openapiSpecPath) + if err != nil { + return nil, err + } + spec := &openapi.Specification{} + err = json.Unmarshal(openapiSpec, spec) + if err != nil { + return nil, err + } + openapiReader := &OpenapiReader{ + OpenapiSpec: spec, + Memo: make(map[string]*Schema), + } + resourcesDocs, err := openapiReader.ResourcesDocs() + if err != nil { + return nil, err + } + resourceSchema, err := New(reflect.TypeOf(config.Resources{}), resourcesDocs) + if err != nil { + return nil, err + } + 
docs.Properties["resources"] = schemaToDocs(resourceSchema) + } + docs.refreshEnvironmentsDocs() + return docs, nil +} + +func (docs *Docs) refreshEnvironmentsDocs() error { + environmentsDocs, ok := docs.Properties["environments"] + if !ok || environmentsDocs.AdditionalProperties == nil || + environmentsDocs.AdditionalProperties.Properties == nil { + return fmt.Errorf("invalid environments descriptions") + } + environmentProperties := environmentsDocs.AdditionalProperties.Properties + propertiesToCopy := []string{"artifacts", "bundle", "resources", "workspace"} + for _, p := range propertiesToCopy { + environmentProperties[p] = docs.Properties[p] + } + return nil +} + +func initializeBundleDocs() (*Docs, error) { + // load embedded descriptions + embedded := Docs{} + err := json.Unmarshal(bundleDocs, &embedded) + if err != nil { + return nil, err + } + // generate schema with the embedded descriptions + schema, err := New(reflect.TypeOf(config.Root{}), &embedded) + if err != nil { + return nil, err + } + // converting the schema back to docs. 
This creates empty descriptions + // for any properties that were missing in the embedded descriptions + docs := schemaToDocs(schema) + return docs, nil +} + +// *Docs are a subset of *Schema, this function selects that subset +func schemaToDocs(schema *Schema) *Docs { + // terminate recursion if schema is nil + if schema == nil { + return nil + } + docs := &Docs{ + Description: schema.Description, + } + if len(schema.Properties) > 0 { + docs.Properties = make(map[string]*Docs) + } + for k, v := range schema.Properties { + docs.Properties[k] = schemaToDocs(v) + } + docs.Items = schemaToDocs(schema.Items) + if additionalProperties, ok := schema.AdditionalProperties.(*Schema); ok { + docs.AdditionalProperties = schemaToDocs(additionalProperties) + } + return docs } diff --git a/bundle/schema/docs/bundle_descriptions.json b/bundle/schema/docs/bundle_descriptions.json new file mode 100644 index 00000000..ed2cb72b --- /dev/null +++ b/bundle/schema/docs/bundle_descriptions.json @@ -0,0 +1,2862 @@ +{ + "description": "Root of the bundle config", + "properties": { + "artifacts": { + "description": "Artifacts contains a description of all code artifacts in this bundle.", + "additionalproperties": { + "description": "", + "properties": { + "notebook": { + "description": "", + "properties": { + "language": { + "description": "" + }, + "local_path": { + "description": "" + }, + "path": { + "description": "" + }, + "remote_path": { + "description": "" + } + } + } + } + } + }, + "bundle": { + "description": "Bundle contains details about this bundle, such as its name,\nversion of the spec (TODO), default cluster, default warehouse, etc.\n", + "properties": { + "environment": { + "description": "Environment is set by the mutator that selects the environment." 
+ }, + "name": { + "description": "" + }, + "terraform": { + "description": "", + "properties": { + "exec_path": { + "description": "" + } + } + } + } + }, + "environments": { + "description": "", + "additionalproperties": { + "description": "", + "properties": { + "artifacts": { + "description": "Artifacts contains a description of all code artifacts in this bundle.", + "additionalproperties": { + "description": "", + "properties": { + "notebook": { + "description": "", + "properties": { + "language": { + "description": "" + }, + "local_path": { + "description": "" + }, + "path": { + "description": "" + }, + "remote_path": { + "description": "" + } + } + } + } + } + }, + "bundle": { + "description": "Bundle contains details about this bundle, such as its name,\nversion of the spec (TODO), default cluster, default warehouse, etc.\n", + "properties": { + "environment": { + "description": "Environment is set by the mutator that selects the environment." + }, + "name": { + "description": "" + }, + "terraform": { + "description": "", + "properties": { + "exec_path": { + "description": "" + } + } + } + } + }, + "default": { + "description": "" + }, + "resources": { + "description": "Specification of databricks resources to instantiate", + "properties": { + "jobs": { + "description": "List of job definations", + "additionalproperties": { + "description": "", + "properties": { + "email_notifications": { + "description": "An optional set of email addresses that is notified when runs of this job begin or complete as well as when this job is deleted. The default behavior is to not send any emails.", + "properties": { + "no_alert_for_skipped_runs": { + "description": "If true, do not send email to recipients specified in `on_failure` if the run is skipped." + }, + "on_failure": { + "description": "A list of email addresses to be notified when a run unsuccessfully completes. 
A run is considered to have completed unsuccessfully if it ends with an `INTERNAL_ERROR` `life_cycle_state` or a `SKIPPED`, `FAILED`, or `TIMED_OUT` result_state. If this is not specified on job creation, reset, or update the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_start": { + "description": "A list of email addresses to be notified when a run begins. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_success": { + "description": "A list of email addresses to be notified when a run successfully completes. A run is considered to have completed successfully if it ends with a `TERMINATED` `life_cycle_state` and a `SUCCESSFUL` result_state. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + } + } + }, + "format": { + "description": "Used to tell what is the format of the job. This field is ignored in Create/Update/Reset calls. When using the Jobs API 2.1 this value is always set to `\"MULTI_TASK\"`." + }, + "git_source": { + "description": "An optional specification for a remote repository containing the notebooks used by this job's notebook tasks.", + "properties": { + "git_branch": { + "description": "Name of the branch to be checked out and used by this job.\nThis field cannot be specified in conjunction with git_tag or git_commit.\n\nThe maximum length is 255 characters.\n" + }, + "git_commit": { + "description": "Commit to be checked out and used by this job. This field cannot be specified in conjunction with git_branch or git_tag.\nThe maximum length is 64 characters." + }, + "git_provider": { + "description": "Unique identifier of the service used to host the Git repository. The value is case insensitive." 
+ }, + "git_snapshot": { + "description": "", + "properties": { + "used_commit": { + "description": "Commit that was used to execute the run. If git_branch was specified, this points to the HEAD of the branch at the time of the run; if git_tag was specified, this points to the commit the tag points to." + } + } + }, + "git_tag": { + "description": "Name of the tag to be checked out and used by this job.\nThis field cannot be specified in conjunction with git_branch or git_commit.\n\nThe maximum length is 255 characters.\n" + }, + "git_url": { + "description": "URL of the repository to be cloned by this job.\nThe maximum length is 300 characters." + } + } + }, + "id": { + "description": "" + }, + "job_clusters": { + "description": "A list of job cluster specifications that can be shared and reused by tasks of this job. Libraries cannot be declared in a shared job cluster. You must declare dependent libraries in task settings.", + "items": { + "description": "", + "properties": { + "job_cluster_key": { + "description": "A unique name for the job cluster. This field is required and must be unique within the job.\n`JobTaskSettings` may refer to this field to determine which cluster to launch for the task execution." + }, + "new_cluster": { + "description": "If new_cluster, a description of a cluster that is created for each task.", + "properties": { + "autoscale": { + "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", + "properties": { + "max_workers": { + "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." + }, + "min_workers": { + "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." 
+ } + } + }, + "autotermination_minutes": { + "description": "Automatically terminates the cluster after it is inactive for this time in minutes. If not set,\nthis cluster will not be automatically terminated. If specified, the threshold must be between\n10 and 10000 minutes.\nUsers can also set this value to 0 to explicitly disable automatic termination." + }, + "aws_attributes": { + "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "ebs_volume_count": { + "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." + }, + "ebs_volume_iops": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_size": { + "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." 
+ }, + "ebs_volume_throughput": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_type": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "instance_profile_arn": { + "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nommitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is ommitted, we will pull in the default from the conf if it exists." + }, + "spot_bid_price_percent": { + "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. 
If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." + }, + "zone_id": { + "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nSee [[AutoAZHelper.scala]] for more details.\nThe list of available zones as well as the default value can be found by using the\n`List Zones`_ method." + } + } + }, + "azure_attributes": { + "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. 
If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "log_analytics_info": { + "description": "Defines values necessary to configure and run Azure Log Analytics agent", + "properties": { + "log_analytics_primary_key": { + "description": "\u003cneeds content added\u003e" + }, + "log_analytics_workspace_id": { + "description": "\u003cneeds content added\u003e" + } + } + }, + "spot_bid_max_price": { + "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." + } + } + }, + "cluster_log_conf": { + "description": "The configuration for delivering spark logs to a long-term storage destination.\nTwo kinds of destinations (dbfs and s3) are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.", + "properties": { + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "properties": { + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" + } + } + }, + "s3": { + "description": "destination and either region or endpoint should also be provided. 
e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "properties": { + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." + }, + "destination": { + "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." + }, + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." + }, + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + }, + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." 
+ } + } + } + } + }, + "cluster_name": { + "description": "Cluster name requested by the user. This doesn't have to be unique.\nIf not specified at creation, the cluster name will be an empty string.\n" + }, + "cluster_source": { + "description": "" + }, + "custom_tags": { + "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", + "additionalproperties": { + "description": "" + } + }, + "driver_instance_pool_id": { + "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." + }, + "driver_node_type_id": { + "description": "The node type of the Spark driver. Note that this field is optional;\nif unset, the driver node type will be set as the same value\nas `node_type_id` defined above.\n" + }, + "enable_elastic_disk": { + "description": "Autoscaling Local Storage: when enabled, this cluster will dynamically acquire additional disk\nspace when its Spark workers are running low on disk space. This feature requires specific AWS\npermissions to function correctly - refer to the User Guide for more details." 
+ }, + "enable_local_disk_encryption": { + "description": "Whether to enable LUKS on cluster VMs' local disks" + }, + "gcp_attributes": { + "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "boot_disk_size": { + "description": "boot disk size in GB" + }, + "google_service_account": { + "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." + } + } + }, + "instance_pool_id": { + "description": "The optional ID of the instance pool to which the cluster belongs." + }, + "node_type_id": { + "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" + }, + "num_workers": { + "description": "Number of worker nodes that this cluster should have. A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." + }, + "policy_id": { + "description": "The ID of the cluster policy used to create the cluster if applicable." 
+ }, + "runtime_engine": { + "description": "" + }, + "spark_conf": { + "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nUsers can also pass in a string of extra JVM options to the driver and the executors via\n`spark.driver.extraJavaOptions` and `spark.executor.extraJavaOptions` respectively.\n", + "additionalproperties": { + "description": "" + } + }, + "spark_env_vars": { + "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", + "additionalproperties": { + "description": "" + } + }, + "spark_version": { + "description": "The Spark version of the cluster, e.g. `3.3.x-scala2.11`.\nA list of available Spark versions can be retrieved by using\nthe :method:clusters/sparkVersions API call.\n" + }, + "ssh_public_keys": { + "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", + "items": { + "description": "" + } + }, + "workload_type": { + "description": "", + "properties": { + "clients": { + "description": " defined what type of clients can use the cluster. E.g. 
Notebooks, Jobs", + "properties": { + "jobs": { + "description": "With jobs set, the cluster can be used for jobs" + }, + "notebooks": { + "description": "With notebooks set, this cluster can be used for notebooks" + } + } + } + } + } + } + } + } + } + }, + "max_concurrent_runs": { + "description": "An optional maximum allowed number of concurrent runs of the job.\n\nSet this value if you want to be able to execute multiple runs of the same job concurrently. This is useful for example if you trigger your job on a frequent schedule and want to allow consecutive runs to overlap with each other, or if you want to trigger multiple runs which differ by their input parameters.\n\nThis setting affects only new runs. For example, suppose the job’s concurrency is 4 and there are 4 concurrent active runs. Then setting the concurrency to 3 won’t kill any of the active runs. However, from then on, new runs are skipped unless there are fewer than 3 active runs.\n\nThis value cannot exceed 1000\\. Setting this value to 0 causes all new runs to be skipped. The default behavior is to allow only 1 concurrent run." + }, + "name": { + "description": "An optional name for the job." + }, + "schedule": { + "description": "An optional periodic schedule for this job. The default behavior is that the job only runs when triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`.", + "properties": { + "pause_status": { + "description": "Indicate whether this schedule is paused or not." + }, + "quartz_cron_expression": { + "description": "A Cron expression using Quartz syntax that describes the schedule for a job.\nSee [Cron Trigger](http://www.quartz-scheduler.org/documentation/quartz-2.3.0/tutorials/crontrigger.html)\nfor details. This field is required.\"\n" + }, + "timezone_id": { + "description": "A Java timezone ID. 
The schedule for a job is resolved with respect to this timezone.\nSee [Java TimeZone](https://docs.oracle.com/javase/7/docs/api/java/util/TimeZone.html) for details.\nThis field is required.\n" + } + } + }, + "tags": { + "description": "A map of tags associated with the job. These are forwarded to the cluster as cluster tags for jobs clusters, and are subject to the same limitations as cluster tags. A maximum of 25 tags can be added to the job.", + "additionalproperties": { + "description": "" + } + }, + "tasks": { + "description": "A list of task specifications to be executed by this job.", + "items": { + "description": "", + "properties": { + "dbt_task": { + "description": "If dbt_task, indicates that this must execute a dbt task. It requires both Databricks SQL and the ability to use a serverless or a pro SQL warehouse.", + "properties": { + "catalog": { + "description": "Optional name of the catalog to use. The value is the top level in the 3-level namespace of Unity Catalog (catalog / schema / relation). The catalog value can only be specified if a warehouse_id is specified. Requires dbt-databricks \u003e= 1.1.1." + }, + "commands": { + "description": "A list of dbt commands to execute. All commands must start with `dbt`. This parameter must not be empty. A maximum of up to 10 commands can be provided.", + "items": { + "description": "" + } + }, + "profiles_directory": { + "description": "Optional (relative) path to the profiles directory. Can only be specified if no warehouse_id is specified. If no warehouse_id is specified and this folder is unset, the root directory is used." + }, + "project_directory": { + "description": "Optional (relative) path to the project directory, if no value is provided, the root of the git repository is used." + }, + "schema": { + "description": "Optional schema to write to. This parameter is only used when a warehouse_id is also provided. If not provided, the `default` schema is used." 
+ }, + "warehouse_id": { + "description": "ID of the SQL warehouse to connect to. If provided, we automatically generate and provide the profile and connection details to dbt. It can be overridden on a per-command basis by using the `--profiles-dir` command line argument." + } + } + }, + "depends_on": { + "description": "", + "items": { + "description": "", + "properties": { + "task_key": { + "description": "" + } + } + } + }, + "description": { + "description": "An optional description for this task.\nThe maximum length is 4096 bytes." + }, + "email_notifications": { + "description": "An optional set of email addresses that is notified when runs of this job begin or complete as well as when this job is deleted. The default behavior is to not send any emails.", + "properties": { + "no_alert_for_skipped_runs": { + "description": "If true, do not send email to recipients specified in `on_failure` if the run is skipped." + }, + "on_failure": { + "description": "A list of email addresses to be notified when a run unsuccessfully completes. A run is considered to have completed unsuccessfully if it ends with an `INTERNAL_ERROR` `life_cycle_state` or a `SKIPPED`, `FAILED`, or `TIMED_OUT` result_state. If this is not specified on job creation, reset, or update the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_start": { + "description": "A list of email addresses to be notified when a run begins. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_success": { + "description": "A list of email addresses to be notified when a run successfully completes. A run is considered to have completed successfully if it ends with a `TERMINATED` `life_cycle_state` and a `SUCCESSFUL` result_state. 
If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + } + } + }, + "existing_cluster_id": { + "description": "If existing_cluster_id, the ID of an existing cluster that is used for all runs of this task. When running tasks on an existing cluster, you may need to manually restart the cluster if it stops responding. We suggest running jobs on new clusters for greater reliability." + }, + "job_cluster_key": { + "description": "If job_cluster_key, this task is executed reusing the cluster specified in `job.settings.job_clusters`." + }, + "libraries": { + "description": "An optional list of libraries to be installed on the cluster that executes the task. The default value is an empty list.", + "items": { + "description": "", + "properties": { + "cran": { + "description": "Specification of a CRAN library to be installed as part of the library", + "properties": { + "package": { + "description": "The name of the CRAN package to install." + }, + "repo": { + "description": "The repository where the package can be found. If not specified, the default CRAN repo is used." + } + } + }, + "egg": { + "description": "URI of the egg to be installed. Currently only DBFS and S3 URIs are supported.\nFor example: `{ \"egg\": \"dbfs:/my/egg\" }` or\n`{ \"egg\": \"s3://my-bucket/egg\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + }, + "jar": { + "description": "URI of the jar to be installed. Currently only DBFS and S3 URIs are supported.\nFor example: `{ \"jar\": \"dbfs:/mnt/databricks/library.jar\" }` or\n`{ \"jar\": \"s3://my-bucket/library.jar\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + }, + "maven": { + "description": "Specification of a maven library to be installed. 
For example:\n`{ \"coordinates\": \"org.jsoup:jsoup:1.7.2\" }`", + "properties": { + "coordinates": { + "description": "Gradle-style maven coordinates. For example: \"org.jsoup:jsoup:1.7.2\"." + }, + "exclusions": { + "description": "List of dependencies to exclude. For example: `[\"slf4j:slf4j\", \"*:hadoop-client\"]`.\n\nMaven dependency exclusions:\nhttps://maven.apache.org/guides/introduction/introduction-to-optional-and-excludes-dependencies.html.", + "items": { + "description": "" + } + }, + "repo": { + "description": "Maven repo to install the Maven package from. If omitted, both Maven Central Repository\nand Spark Packages are searched." + } + } + }, + "pypi": { + "description": "Specification of a PyPi library to be installed. For example:\n`{ \"package\": \"simplejson\" }`", + "properties": { + "package": { + "description": "The name of the pypi package to install. An optional exact version specification is also\nsupported. Examples: \"simplejson\" and \"simplejson==3.8.0\"." + }, + "repo": { + "description": "The repository where the package can be found. If not specified, the default pip index is\nused." + } + } + }, + "whl": { + "description": "URI of the wheel to be installed.\nFor example: `{ \"whl\": \"dbfs:/my/whl\" }` or `{ \"whl\": \"s3://my-bucket/whl\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + } + } + } + }, + "max_retries": { + "description": "An optional maximum number of times to retry an unsuccessful run. A run is considered to be unsuccessful if it completes with the `FAILED` result_state or `INTERNAL_ERROR` `life_cycle_state`. The value -1 means to retry indefinitely and the value 0 means to never retry. The default behavior is to never retry." + }, + "min_retry_interval_millis": { + "description": "An optional minimal interval in milliseconds between the start of the failed run and the subsequent retry run.
The default behavior is that unsuccessful runs are immediately retried." + }, + "new_cluster": { + "description": "If new_cluster, a description of a cluster that is created for each task.", + "properties": { + "autoscale": { + "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", + "properties": { + "max_workers": { + "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." + }, + "min_workers": { + "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." + } + } + }, + "autotermination_minutes": { + "description": "Automatically terminates the cluster after it is inactive for this time in minutes. If not set,\nthis cluster will not be automatically terminated. If specified, the threshold must be between\n10 and 10000 minutes.\nUsers can also set this value to 0 to explicitly disable automatic termination." + }, + "aws_attributes": { + "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "ebs_volume_count": { + "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. 
Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." + }, + "ebs_volume_iops": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_size": { + "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." + }, + "ebs_volume_throughput": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_type": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." 
+ }, + "instance_profile_arn": { + "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nomitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is omitted, we will pull in the default from the conf if it exists." + }, + "spot_bid_price_percent": { + "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." + }, + "zone_id": { + "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment.
For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nSee [[AutoAZHelper.scala]] for more details.\nThe list of available zones as well as the default value can be found by using the\n`List Zones`_ method." + } + } + }, + "azure_attributes": { + "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." 
+ }, + "log_analytics_info": { + "description": "Defines values necessary to configure and run Azure Log Analytics agent", + "properties": { + "log_analytics_primary_key": { + "description": "\u003cneeds content added\u003e" + }, + "log_analytics_workspace_id": { + "description": "\u003cneeds content added\u003e" + } + } + }, + "spot_bid_max_price": { + "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should be \u003e 0 or -1." + } + } + }, + "cluster_log_conf": { + "description": "The configuration for delivering spark logs to a long-term storage destination.\nTwo kinds of destinations (dbfs and s3) are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.", + "properties": { + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "properties": { + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" + } + } + }, + "s3": { + "description": "destination and either region or endpoint should also be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "properties": { + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g.
`bucket-owner-full-control`.\nIf `canned_acl` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." + }, + "destination": { + "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." + }, + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." + }, + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + }, + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." + } + } + } + } + }, + "cluster_name": { + "description": "Cluster name requested by the user. This doesn't have to be unique.\nIf not specified at creation, the cluster name will be an empty string.\n" + }, + "cluster_source": { + "description": "" + }, + "custom_tags": { + "description": "Additional tags for cluster resources.
Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", + "additionalproperties": { + "description": "" + } + }, + "driver_instance_pool_id": { + "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." + }, + "driver_node_type_id": { + "description": "The node type of the Spark driver. Note that this field is optional;\nif unset, the driver node type will be set as the same value\nas `node_type_id` defined above.\n" + }, + "enable_elastic_disk": { + "description": "Autoscaling Local Storage: when enabled, this cluster will dynamically acquire additional disk\nspace when its Spark workers are running low on disk space. This feature requires specific AWS\npermissions to function correctly - refer to the User Guide for more details." + }, + "enable_local_disk_encryption": { + "description": "Whether to enable LUKS on cluster VMs' local disks" + }, + "gcp_attributes": { + "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "boot_disk_size": { + "description": "boot disk size in GB" + }, + "google_service_account": { + "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." + } + } + }, + "instance_pool_id": { + "description": "The optional ID of the instance pool to which the cluster belongs." 
+ }, + "node_type_id": { + "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" + }, + "num_workers": { + "description": "Number of worker nodes that this cluster should have. A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." + }, + "policy_id": { + "description": "The ID of the cluster policy used to create the cluster if applicable." + }, + "runtime_engine": { + "description": "" + }, + "spark_conf": { + "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nUsers can also pass in a string of extra JVM options to the driver and the executors via\n`spark.driver.extraJavaOptions` and `spark.executor.extraJavaOptions` respectively.\n", + "additionalproperties": { + "description": "" + } + }, + "spark_env_vars": { + "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. 
This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", + "additionalproperties": { + "description": "" + } + }, + "spark_version": { + "description": "The Spark version of the cluster, e.g. `3.3.x-scala2.11`.\nA list of available Spark versions can be retrieved by using\nthe :method:clusters/sparkVersions API call.\n" + }, + "ssh_public_keys": { + "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", + "items": { + "description": "" + } + }, + "workload_type": { + "description": "", + "properties": { + "clients": { + "description": " defined what type of clients can use the cluster. E.g. Notebooks, Jobs", + "properties": { + "jobs": { + "description": "With jobs set, the cluster can be used for jobs" + }, + "notebooks": { + "description": "With notebooks set, this cluster can be used for notebooks" + } + } + } + } + } + } + }, + "notebook_task": { + "description": "If notebook_task, indicates that this task must run a notebook. This field may not be specified in conjunction with spark_jar_task.", + "properties": { + "base_parameters": { + "description": "Base parameters to be used for each run of this job. If the run is initiated by a call to\n:method:jobs/runNow with parameters specified, the two parameters maps are merged. 
If the same key is specified in\n`base_parameters` and in `run-now`, the value from `run-now` is used.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n\nIf the notebook takes a parameter that is not specified in the job’s `base_parameters` or the `run-now` override parameters,\nthe default value from the notebook is used.\n\nRetrieve these parameters in a notebook using [dbutils.widgets.get](https://docs.databricks.com/dev-tools/databricks-utils.html#dbutils-widgets).\n", + "additionalproperties": { + "description": "" + } + }, + "notebook_path": { + "description": "The path of the notebook to be run in the Databricks workspace or remote repository.\nFor notebooks stored in the Databricks workspace, the path must be absolute and begin with a slash.\nFor notebooks stored in a remote repository, the path must be relative. This field is required.\n" + }, + "source": { + "description": "This describes an enum" + } + } + }, + "pipeline_task": { + "description": "If pipeline_task, indicates that this task must execute a Pipeline.", + "properties": { + "full_refresh": { + "description": "If true, a full refresh will be triggered on the delta live table." + }, + "pipeline_id": { + "description": "The full name of the pipeline task to execute." + } + } + }, + "python_wheel_task": { + "description": "If python_wheel_task, indicates that this job must execute a PythonWheel.", + "properties": { + "entry_point": { + "description": "Named entry point to use, if it does not exist in the metadata of the package it executes the function from the package directly using `$packageName.$entryPoint()`" + }, + "named_parameters": { + "description": "Command-line parameters passed to Python wheel task in the form of `[\"--name=task\", \"--data=dbfs:/path/to/data.json\"]`. 
Leave it empty if `parameters` is not null.", + "additionalproperties": { + "description": "" + } + }, + "package_name": { + "description": "Name of the package to execute" + }, + "parameters": { + "description": "Command-line parameters passed to Python wheel task. Leave it empty if `named_parameters` is not null.", + "items": { + "description": "" + } + } + } + }, + "retry_on_timeout": { + "description": "An optional policy to specify whether to retry a task when it times out. The default behavior is to not retry on timeout." + }, + "spark_jar_task": { + "description": "If spark_jar_task, indicates that this task must run a JAR.", + "properties": { + "jar_uri": { + "description": "Deprecated since 04/2016\\\\. Provide a `jar` through the `libraries` field instead. For an example, see :method:jobs/create.\n" + }, + "main_class_name": { + "description": "The full name of the class containing the main method to be executed. This class must be contained in a JAR provided as a library.\n\nThe code must use `SparkContext.getOrCreate` to obtain a Spark context; otherwise, runs of the job fail." + }, + "parameters": { + "description": "Parameters passed to the main method.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", + "items": { + "description": "" + } + } + } + }, + "spark_python_task": { + "description": "If spark_python_task, indicates that this task must run a Python file.", + "properties": { + "parameters": { + "description": "Command line parameters passed to the Python file.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", + "items": { + "description": "" + } + }, + "python_file": { + "description": "" + } + } + }, + "spark_submit_task": { + "description": "If spark_submit_task, indicates that this task must be launched by the spark submit script. 
This task can run only on new clusters.", + "properties": { + "parameters": { + "description": "Command-line parameters passed to spark submit.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", + "items": { + "description": "" + } + } + } + }, + "sql_task": { + "description": "If sql_task, indicates that this job must execute a SQL task.", + "properties": { + "alert": { + "description": "If alert, indicates that this job must refresh a SQL alert.", + "properties": { + "alert_id": { + "description": "The canonical identifier of the SQL alert." + }, + "pause_subscriptions": { + "description": "If true, the alert notifications are not sent to subscribers." + }, + "subscriptions": { + "description": "If specified, alert notifications are sent to subscribers.", + "items": { + "description": "", + "properties": { + "destination_id": { + "description": "The canonical identifier of the destination to receive email notification." + }, + "user_name": { + "description": "The user name to receive the subscription email." + } + } + } + } + } + }, + "dashboard": { + "description": "If dashboard, indicates that this job must refresh a SQL dashboard.", + "properties": { + "custom_subject": { + "description": "Subject of the email sent to subscribers of this task." + }, + "dashboard_id": { + "description": "The canonical identifier of the SQL dashboard." + }, + "pause_subscriptions": { + "description": "If true, the dashboard snapshot is not taken, and emails are not sent to subscribers." + }, + "subscriptions": { + "description": "If specified, dashboard snapshots are sent to subscriptions.", + "items": { + "description": "", + "properties": { + "destination_id": { + "description": "The canonical identifier of the destination to receive email notification." + }, + "user_name": { + "description": "The user name to receive the subscription email." 
+ } + } + } + } + } + }, + "parameters": { + "description": "Parameters to be used for each run of this job. The SQL alert task does not support custom parameters.", + "additionalproperties": { + "description": "" + } + }, + "query": { + "description": "If query, indicates that this job must execute a SQL query.", + "properties": { + "query_id": { + "description": "The canonical identifier of the SQL query." + } + } + }, + "warehouse_id": { + "description": "The canonical identifier of the SQL warehouse. Only serverless and pro SQL warehouses are supported." + } + } + }, + "task_key": { + "description": "A unique name for the task. This field is used to refer to this task from other tasks.\nThis field is required and must be unique within its parent job.\nOn Update or Reset, this field is used to reference the tasks to be updated or reset.\nThe maximum length is 100 characters." + }, + "timeout_seconds": { + "description": "An optional timeout applied to each run of this job task. The default behavior is to have no timeout." + } + } + } + }, + "timeout_seconds": { + "description": "An optional timeout applied to each run of this job. The default behavior is to have no timeout." + }, + "webhook_notifications": { + "description": "A collection of system notification IDs to notify when the run begins or completes. The default behavior is to not send any system notifications.", + "properties": { + "on_failure": { + "description": "An optional list of system notification IDs to call when the run fails. A maximum of 3 destinations can be specified for the `on_failure` property.", + "items": { + "description": "", + "properties": { + "id": { + "description": "" + } + } + } + }, + "on_start": { + "description": "An optional list of system notification IDs to call when the run starts. 
A maximum of 3 destinations can be specified for the `on_start` property.", + "items": { + "description": "", + "properties": { + "id": { + "description": "" + } + } + } + }, + "on_success": { + "description": "An optional list of system notification IDs to call when the run completes successfully. A maximum of 3 destinations can be specified for the `on_success` property.", + "items": { + "description": "", + "properties": { + "id": { + "description": "" + } + } + } + } + } + } + } + } + }, + "pipelines": { + "description": "List of pipeline definitions", + "additionalproperties": { + "description": "", + "properties": { + "catalog": { + "description": "Catalog in UC to add tables to. If target is specified, tables in this pipeline will be\npublished to a \"target\" schema inside catalog (i.e. \u003ccatalog\u003e.\u003ctarget\u003e.\u003ctable\u003e)." + }, + "channel": { + "description": "DLT Release Channel that specifies which version to use." + }, + "clusters": { + "description": "Cluster settings for this pipeline deployment.", + "items": { + "description": "", + "properties": { + "apply_policy_default_values": { + "description": "Note: This field won't be persisted. Only API users will check this field." + }, + "autoscale": { + "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", + "properties": { + "max_workers": { + "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." + }, + "min_workers": { + "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation."
+ } + } + }, + "aws_attributes": { + "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "ebs_volume_count": { + "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." + }, + "ebs_volume_iops": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_size": { + "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." + }, + "ebs_volume_throughput": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_type": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. 
If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "instance_profile_arn": { + "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nomitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is omitted, we will pull in the default from the conf if it exists." + }, + "spot_bid_price_percent": { + "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." + }, + "zone_id": { + "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\".
The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nSee [[AutoAZHelper.scala]] for more details.\nThe list of available zones as well as the default value can be found by using the\n`List Zones`_ method." + } + } + }, + "azure_attributes": { + "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." 
+ }, + "log_analytics_info": { + "description": "Defines values necessary to configure and run Azure Log Analytics agent", + "properties": { + "log_analytics_primary_key": { + "description": "\u003cneeds content added\u003e" + }, + "log_analytics_workspace_id": { + "description": "\u003cneeds content added\u003e" + } + } + }, + "spot_bid_max_price": { + "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." + } + } + }, + "cluster_log_conf": { + "description": "The configuration for delivering spark logs to a long-term storage destination.\nTwo kinds of destinations (dbfs and s3) are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.", + "properties": { + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "properties": { + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" + } + } + }, + "s3": { + "description": "destination and either region or endpoint should also be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "properties": { + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. 
`bucket-owner-full-control`.\nIf `canned_acl` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." + }, + "destination": { + "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." + }, + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." + }, + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + }, + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." + } + } + } + } + }, + "custom_tags": { + "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. 
Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", + "additionalproperties": { + "description": "" + } + }, + "driver_instance_pool_id": { + "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." + }, + "driver_node_type_id": { + "description": "The node type of the Spark driver.\nNote that this field is optional; if unset, the driver node type will be set as the same value\nas `node_type_id` defined above." + }, + "gcp_attributes": { + "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "boot_disk_size": { + "description": "boot disk size in GB" + }, + "google_service_account": { + "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." + } + } + }, + "instance_pool_id": { + "description": "The optional ID of the instance pool to which the cluster belongs." + }, + "label": { + "description": "Cluster label" + }, + "node_type_id": { + "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" + }, + "num_workers": { + "description": "Number of worker nodes that this cluster should have. 
A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." + }, + "policy_id": { + "description": "The ID of the cluster policy used to create the cluster if applicable." + }, + "spark_conf": { + "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nSee :method:clusters/create for more details.\n", + "additionalproperties": { + "description": "" + } + }, + "spark_env_vars": { + "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", + "additionalproperties": { + "description": "" + } + }, + "ssh_public_keys": { + "description": "SSH public key contents that will be added to each Spark node in this cluster. 
The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", + "items": { + "description": "" + } + } + } + } + }, + "configuration": { + "description": "String-String configuration for this pipeline execution.", + "additionalproperties": { + "description": "" + } + }, + "continuous": { + "description": "Whether the pipeline is continuous or triggered. This replaces `trigger`." + }, + "development": { + "description": "Whether the pipeline is in Development mode. Defaults to false." + }, + "edition": { + "description": "Pipeline product edition." + }, + "filters": { + "description": "Filters on which Pipeline packages to include in the deployed graph.", + "properties": { + "exclude": { + "description": "Paths to exclude.", + "items": { + "description": "" + } + }, + "include": { + "description": "Paths to include.", + "items": { + "description": "" + } + } + } + }, + "id": { + "description": "Unique identifier for this pipeline." + }, + "libraries": { + "description": "Libraries or code needed by this deployment.", + "items": { + "description": "", + "properties": { + "jar": { + "description": "URI of the jar to be installed. Currently only DBFS and S3 URIs are supported.\nFor example: `{ \"jar\": \"dbfs:/mnt/databricks/library.jar\" }` or\n`{ \"jar\": \"s3://my-bucket/library.jar\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + }, + "maven": { + "description": "Specification of a maven library to be installed. For example:\n`{ \"coordinates\": \"org.jsoup:jsoup:1.7.2\" }`", + "properties": { + "coordinates": { + "description": "Gradle-style maven coordinates. For example: \"org.jsoup:jsoup:1.7.2\"." + }, + "exclusions": { + "description": "List of dependencies to exclude. 
For example: `[\"slf4j:slf4j\", \"*:hadoop-client\"]`.\n\nMaven dependency exclusions:\nhttps://maven.apache.org/guides/introduction/introduction-to-optional-and-excludes-dependencies.html.", + "items": { + "description": "" + } + }, + "repo": { + "description": "Maven repo to install the Maven package from. If omitted, both Maven Central Repository\nand Spark Packages are searched." + } + } + }, + "notebook": { + "description": "The path to a notebook that defines a pipeline and is stored in the Databricks workspace.\nFor example: `{ \"notebook\" : { \"path\" : \"/my-pipeline-notebook-path\" } }`.\nCurrently, only Scala notebooks are supported, and pipelines must be defined in a package\ncell.", + "properties": { + "path": { + "description": "The absolute path of the notebook." + } + } + }, + "whl": { + "description": "URI of the wheel to be installed.\nFor example: `{ \"whl\": \"dbfs:/my/whl\" }` or `{ \"whl\": \"s3://my-bucket/whl\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + } + } + } + }, + "name": { + "description": "Friendly identifier for this pipeline." + }, + "photon": { + "description": "Whether Photon is enabled for this pipeline." + }, + "storage": { + "description": "DBFS root directory for storing checkpoints and tables." + }, + "target": { + "description": "Target schema (database) to add tables in this pipeline to." + }, + "trigger": { + "description": "Which pipeline trigger to use. 
Deprecated: Use `continuous` instead.", + "properties": { + "cron": { + "description": "", + "properties": { + "quartz_cron_schedule": { + "description": "" + }, + "timezone_id": { + "description": "" + } + } + }, + "manual": { + "description": "" + } + } + } + } + } + } + } + }, + "workspace": { + "description": "", + "properties": { + "artifact_path": { + "description": "", + "properties": { + "dbfs": { + "description": "" + }, + "workspace": { + "description": "" + } + } + }, + "azure_client_id": { + "description": "" + }, + "azure_environment": { + "description": "" + }, + "azure_login_app_id": { + "description": "" + }, + "azure_tenant_id": { + "description": "" + }, + "azure_use_msi": { + "description": "" + }, + "azure_workspace_resource_id": { + "description": "" + }, + "current_user": { + "description": "", + "properties": { + "active": { + "description": "" + }, + "displayName": { + "description": "" + }, + "emails": { + "description": "", + "items": { + "description": "", + "properties": { + "$ref": { + "description": "" + }, + "display": { + "description": "" + }, + "primary": { + "description": "" + }, + "type": { + "description": "" + }, + "value": { + "description": "" + } + } + } + }, + "entitlements": { + "description": "", + "items": { + "description": "", + "properties": { + "$ref": { + "description": "" + }, + "display": { + "description": "" + }, + "primary": { + "description": "" + }, + "type": { + "description": "" + }, + "value": { + "description": "" + } + } + } + }, + "externalId": { + "description": "" + }, + "groups": { + "description": "", + "items": { + "description": "", + "properties": { + "$ref": { + "description": "" + }, + "display": { + "description": "" + }, + "primary": { + "description": "" + }, + "type": { + "description": "" + }, + "value": { + "description": "" + } + } + } + }, + "id": { + "description": "" + }, + "name": { + "description": "", + "properties": { + "familyName": { + "description": "" + }, + "givenName": { + 
"description": "" + } + } + }, + "roles": { + "description": "", + "items": { + "description": "", + "properties": { + "$ref": { + "description": "" + }, + "display": { + "description": "" + }, + "primary": { + "description": "" + }, + "type": { + "description": "" + }, + "value": { + "description": "" + } + } + } + }, + "userName": { + "description": "" + } + } + }, + "file_path": { + "description": "", + "properties": { + "dbfs": { + "description": "" + }, + "workspace": { + "description": "" + } + } + }, + "google_service_account": { + "description": "" + }, + "host": { + "description": "" + }, + "profile": { + "description": "" + }, + "root": { + "description": "" + }, + "state_path": { + "description": "", + "properties": { + "dbfs": { + "description": "" + }, + "workspace": { + "description": "" + } + } + } + } + } + } + } + }, + "include": { + "description": "", + "items": { + "description": "" + } + }, + "resources": { + "description": "Specification of databricks resources to instantiate", + "properties": { + "jobs": { + "description": "List of job definitions", + "additionalproperties": { + "description": "", + "properties": { + "email_notifications": { + "description": "An optional set of email addresses that is notified when runs of this job begin or complete as well as when this job is deleted. The default behavior is to not send any emails.", + "properties": { + "no_alert_for_skipped_runs": { + "description": "If true, do not send email to recipients specified in `on_failure` if the run is skipped." + }, + "on_failure": { + "description": "A list of email addresses to be notified when a run unsuccessfully completes. A run is considered to have completed unsuccessfully if it ends with an `INTERNAL_ERROR` `life_cycle_state` or a `SKIPPED`, `FAILED`, or `TIMED_OUT` result_state. 
If this is not specified on job creation, reset, or update the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_start": { + "description": "A list of email addresses to be notified when a run begins. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_success": { + "description": "A list of email addresses to be notified when a run successfully completes. A run is considered to have completed successfully if it ends with a `TERMINATED` `life_cycle_state` and a `SUCCESSFUL` result_state. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + } + } + }, + "format": { + "description": "Used to tell what is the format of the job. This field is ignored in Create/Update/Reset calls. When using the Jobs API 2.1 this value is always set to `\"MULTI_TASK\"`." + }, + "git_source": { + "description": "An optional specification for a remote repository containing the notebooks used by this job's notebook tasks.", + "properties": { + "git_branch": { + "description": "Name of the branch to be checked out and used by this job.\nThis field cannot be specified in conjunction with git_tag or git_commit.\n\nThe maximum length is 255 characters.\n" + }, + "git_commit": { + "description": "Commit to be checked out and used by this job. This field cannot be specified in conjunction with git_branch or git_tag.\nThe maximum length is 64 characters." + }, + "git_provider": { + "description": "Unique identifier of the service used to host the Git repository. The value is case insensitive." + }, + "git_snapshot": { + "description": "", + "properties": { + "used_commit": { + "description": "Commit that was used to execute the run. 
If git_branch was specified, this points to the HEAD of the branch at the time of the run; if git_tag was specified, this points to the commit the tag points to." + } + } + }, + "git_tag": { + "description": "Name of the tag to be checked out and used by this job.\nThis field cannot be specified in conjunction with git_branch or git_commit.\n\nThe maximum length is 255 characters.\n" + }, + "git_url": { + "description": "URL of the repository to be cloned by this job.\nThe maximum length is 300 characters." + } + } + }, + "id": { + "description": "" + }, + "job_clusters": { + "description": "A list of job cluster specifications that can be shared and reused by tasks of this job. Libraries cannot be declared in a shared job cluster. You must declare dependent libraries in task settings.", + "items": { + "description": "", + "properties": { + "job_cluster_key": { + "description": "A unique name for the job cluster. This field is required and must be unique within the job.\n`JobTaskSettings` may refer to this field to determine which cluster to launch for the task execution." + }, + "new_cluster": { + "description": "If new_cluster, a description of a cluster that is created for each task.", + "properties": { + "autoscale": { + "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", + "properties": { + "max_workers": { + "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." + }, + "min_workers": { + "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." + } + } + }, + "autotermination_minutes": { + "description": "Automatically terminates the cluster after it is inactive for this time in minutes. 
If not set,\nthis cluster will not be automatically terminated. If specified, the threshold must be between\n10 and 10000 minutes.\nUsers can also set this value to 0 to explicitly disable automatic termination." + }, + "aws_attributes": { + "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "ebs_volume_count": { + "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." + }, + "ebs_volume_iops": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_size": { + "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." 
+ }, + "ebs_volume_throughput": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_type": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "instance_profile_arn": { + "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nomitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is omitted, we will pull in the default from the conf if it exists." + }, + "spot_bid_price_percent": { + "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. 
If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." + }, + "zone_id": { + "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nSee [[AutoAZHelper.scala]] for more details.\nThe list of available zones as well as the default value can be found by using the\n`List Zones`_ method." + } + } + }, + "azure_attributes": { + "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. 
If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "log_analytics_info": { + "description": "Defines values necessary to configure and run Azure Log Analytics agent", + "properties": { + "log_analytics_primary_key": { + "description": "\u003cneeds content added\u003e" + }, + "log_analytics_workspace_id": { + "description": "\u003cneeds content added\u003e" + } + } + }, + "spot_bid_max_price": { + "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." + } + } + }, + "cluster_log_conf": { + "description": "The configuration for delivering spark logs to a long-term storage destination.\nTwo kinds of destinations (dbfs and s3) are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.", + "properties": { + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "properties": { + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" + } + } + }, + "s3": { + "description": "destination and either region or endpoint should also be provided. 
e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "properties": { + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_acl` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." + }, + "destination": { + "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." + }, + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." + }, + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + }, + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." 
+ } + } + } + } + }, + "cluster_name": { + "description": "Cluster name requested by the user. This doesn't have to be unique.\nIf not specified at creation, the cluster name will be an empty string.\n" + }, + "cluster_source": { + "description": "" + }, + "custom_tags": { + "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", + "additionalproperties": { + "description": "" + } + }, + "driver_instance_pool_id": { + "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." + }, + "driver_node_type_id": { + "description": "The node type of the Spark driver. Note that this field is optional;\nif unset, the driver node type will be set as the same value\nas `node_type_id` defined above.\n" + }, + "enable_elastic_disk": { + "description": "Autoscaling Local Storage: when enabled, this cluster will dynamically acquire additional disk\nspace when its Spark workers are running low on disk space. This feature requires specific AWS\npermissions to function correctly - refer to the User Guide for more details." 
+ }, + "enable_local_disk_encryption": { + "description": "Whether to enable LUKS on cluster VMs' local disks" + }, + "gcp_attributes": { + "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "boot_disk_size": { + "description": "boot disk size in GB" + }, + "google_service_account": { + "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." + } + } + }, + "instance_pool_id": { + "description": "The optional ID of the instance pool to which the cluster belongs." + }, + "node_type_id": { + "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" + }, + "num_workers": { + "description": "Number of worker nodes that this cluster should have. A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." + }, + "policy_id": { + "description": "The ID of the cluster policy used to create the cluster if applicable." 
+ }, + "runtime_engine": { + "description": "" + }, + "spark_conf": { + "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nUsers can also pass in a string of extra JVM options to the driver and the executors via\n`spark.driver.extraJavaOptions` and `spark.executor.extraJavaOptions` respectively.\n", + "additionalproperties": { + "description": "" + } + }, + "spark_env_vars": { + "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", + "additionalproperties": { + "description": "" + } + }, + "spark_version": { + "description": "The Spark version of the cluster, e.g. `3.3.x-scala2.11`.\nA list of available Spark versions can be retrieved by using\nthe :method:clusters/sparkVersions API call.\n" + }, + "ssh_public_keys": { + "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", + "items": { + "description": "" + } + }, + "workload_type": { + "description": "", + "properties": { + "clients": { + "description": " defined what type of clients can use the cluster. E.g. 
Notebooks, Jobs", + "properties": { + "jobs": { + "description": "With jobs set, the cluster can be used for jobs" + }, + "notebooks": { + "description": "With notebooks set, this cluster can be used for notebooks" + } + } + } + } + } + } + } + } + } + }, + "max_concurrent_runs": { + "description": "An optional maximum allowed number of concurrent runs of the job.\n\nSet this value if you want to be able to execute multiple runs of the same job concurrently. This is useful for example if you trigger your job on a frequent schedule and want to allow consecutive runs to overlap with each other, or if you want to trigger multiple runs which differ by their input parameters.\n\nThis setting affects only new runs. For example, suppose the job’s concurrency is 4 and there are 4 concurrent active runs. Then setting the concurrency to 3 won’t kill any of the active runs. However, from then on, new runs are skipped unless there are fewer than 3 active runs.\n\nThis value cannot exceed 1000\\. Setting this value to 0 causes all new runs to be skipped. The default behavior is to allow only 1 concurrent run." + }, + "name": { + "description": "An optional name for the job." + }, + "schedule": { + "description": "An optional periodic schedule for this job. The default behavior is that the job only runs when triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`.", + "properties": { + "pause_status": { + "description": "Indicate whether this schedule is paused or not." + }, + "quartz_cron_expression": { + "description": "A Cron expression using Quartz syntax that describes the schedule for a job.\nSee [Cron Trigger](http://www.quartz-scheduler.org/documentation/quartz-2.3.0/tutorials/crontrigger.html)\nfor details. This field is required.\n" + }, + "timezone_id": { + "description": "A Java timezone ID. 
The schedule for a job is resolved with respect to this timezone.\nSee [Java TimeZone](https://docs.oracle.com/javase/7/docs/api/java/util/TimeZone.html) for details.\nThis field is required.\n" + } + } + }, + "tags": { + "description": "A map of tags associated with the job. These are forwarded to the cluster as cluster tags for jobs clusters, and are subject to the same limitations as cluster tags. A maximum of 25 tags can be added to the job.", + "additionalproperties": { + "description": "" + } + }, + "tasks": { + "description": "A list of task specifications to be executed by this job.", + "items": { + "description": "", + "properties": { + "dbt_task": { + "description": "If dbt_task, indicates that this must execute a dbt task. It requires both Databricks SQL and the ability to use a serverless or a pro SQL warehouse.", + "properties": { + "catalog": { + "description": "Optional name of the catalog to use. The value is the top level in the 3-level namespace of Unity Catalog (catalog / schema / relation). The catalog value can only be specified if a warehouse_id is specified. Requires dbt-databricks \u003e= 1.1.1." + }, + "commands": { + "description": "A list of dbt commands to execute. All commands must start with `dbt`. This parameter must not be empty. A maximum of up to 10 commands can be provided.", + "items": { + "description": "" + } + }, + "profiles_directory": { + "description": "Optional (relative) path to the profiles directory. Can only be specified if no warehouse_id is specified. If no warehouse_id is specified and this folder is unset, the root directory is used." + }, + "project_directory": { + "description": "Optional (relative) path to the project directory, if no value is provided, the root of the git repository is used." + }, + "schema": { + "description": "Optional schema to write to. This parameter is only used when a warehouse_id is also provided. If not provided, the `default` schema is used." 
+ }, + "warehouse_id": { + "description": "ID of the SQL warehouse to connect to. If provided, we automatically generate and provide the profile and connection details to dbt. It can be overridden on a per-command basis by using the `--profiles-dir` command line argument." + } + } + }, + "depends_on": { + "description": "", + "items": { + "description": "", + "properties": { + "task_key": { + "description": "" + } + } + } + }, + "description": { + "description": "An optional description for this task.\nThe maximum length is 4096 bytes." + }, + "email_notifications": { + "description": "An optional set of email addresses that is notified when runs of this job begin or complete as well as when this job is deleted. The default behavior is to not send any emails.", + "properties": { + "no_alert_for_skipped_runs": { + "description": "If true, do not send email to recipients specified in `on_failure` if the run is skipped." + }, + "on_failure": { + "description": "A list of email addresses to be notified when a run unsuccessfully completes. A run is considered to have completed unsuccessfully if it ends with an `INTERNAL_ERROR` `life_cycle_state` or a `SKIPPED`, `FAILED`, or `TIMED_OUT` result_state. If this is not specified on job creation, reset, or update the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_start": { + "description": "A list of email addresses to be notified when a run begins. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_success": { + "description": "A list of email addresses to be notified when a run successfully completes. A run is considered to have completed successfully if it ends with a `TERMINATED` `life_cycle_state` and a `SUCCESSFUL` result_state. 
If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + } + } + }, + "existing_cluster_id": { + "description": "If existing_cluster_id, the ID of an existing cluster that is used for all runs of this task. When running tasks on an existing cluster, you may need to manually restart the cluster if it stops responding. We suggest running jobs on new clusters for greater reliability." + }, + "job_cluster_key": { + "description": "If job_cluster_key, this task is executed reusing the cluster specified in `job.settings.job_clusters`." + }, + "libraries": { + "description": "An optional list of libraries to be installed on the cluster that executes the task. The default value is an empty list.", + "items": { + "description": "", + "properties": { + "cran": { + "description": "Specification of a CRAN library to be installed as part of the library", + "properties": { + "package": { + "description": "The name of the CRAN package to install." + }, + "repo": { + "description": "The repository where the package can be found. If not specified, the default CRAN repo is used." + } + } + }, + "egg": { + "description": "URI of the egg to be installed. Currently only DBFS and S3 URIs are supported.\nFor example: `{ \"egg\": \"dbfs:/my/egg\" }` or\n`{ \"egg\": \"s3://my-bucket/egg\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + }, + "jar": { + "description": "URI of the jar to be installed. Currently only DBFS and S3 URIs are supported.\nFor example: `{ \"jar\": \"dbfs:/mnt/databricks/library.jar\" }` or\n`{ \"jar\": \"s3://my-bucket/library.jar\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + }, + "maven": { + "description": "Specification of a maven library to be installed. 
For example:\n`{ \"coordinates\": \"org.jsoup:jsoup:1.7.2\" }`", + "properties": { + "coordinates": { + "description": "Gradle-style maven coordinates. For example: \"org.jsoup:jsoup:1.7.2\"." + }, + "exclusions": { + "description": "List of dependencies to exclude. For example: `[\"slf4j:slf4j\", \"*:hadoop-client\"]`.\n\nMaven dependency exclusions:\nhttps://maven.apache.org/guides/introduction/introduction-to-optional-and-excludes-dependencies.html.", + "items": { + "description": "" + } + }, + "repo": { + "description": "Maven repo to install the Maven package from. If omitted, both Maven Central Repository\nand Spark Packages are searched." + } + } + }, + "pypi": { + "description": "Specification of a PyPi library to be installed. For example:\n`{ \"package\": \"simplejson\" }`", + "properties": { + "package": { + "description": "The name of the pypi package to install. An optional exact version specification is also\nsupported. Examples: \"simplejson\" and \"simplejson==3.8.0\"." + }, + "repo": { + "description": "The repository where the package can be found. If not specified, the default pip index is\nused." + } + } + }, + "whl": { + "description": "URI of the wheel to be installed.\nFor example: `{ \"whl\": \"dbfs:/my/whl\" }` or `{ \"whl\": \"s3://my-bucket/whl\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + } + } + } + }, + "max_retries": { + "description": "An optional maximum number of times to retry an unsuccessful run. A run is considered to be unsuccessful if it completes with the `FAILED` result_state or `INTERNAL_ERROR` `life_cycle_state`. The value -1 means to retry indefinitely and the value 0 means to never retry. The default behavior is to never retry." + }, + "min_retry_interval_millis": { + "description": "An optional minimal interval in milliseconds between the start of the failed run and the subsequent retry run. 
The default behavior is that unsuccessful runs are immediately retried." + }, + "new_cluster": { + "description": "If new_cluster, a description of a cluster that is created for each task.", + "properties": { + "autoscale": { + "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", + "properties": { + "max_workers": { + "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." + }, + "min_workers": { + "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." + } + } + }, + "autotermination_minutes": { + "description": "Automatically terminates the cluster after it is inactive for this time in minutes. If not set,\nthis cluster will not be automatically terminated. If specified, the threshold must be between\n10 and 10000 minutes.\nUsers can also set this value to 0 to explicitly disable automatic termination." + }, + "aws_attributes": { + "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "ebs_volume_count": { + "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. 
Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogeneously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." + }, + "ebs_volume_iops": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_size": { + "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." + }, + "ebs_volume_throughput": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_type": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." 
+ }, + "instance_profile_arn": { + "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nomitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is omitted, we will pull in the default from the conf if it exists." + }, + "spot_bid_price_percent": { + "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." + }, + "zone_id": { + "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. 
For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nSee [[AutoAZHelper.scala]] for more details.\nThe list of available zones as well as the default value can be found by using the\n`List Zones`_ method." + } + } + }, + "azure_attributes": { + "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." 
+ }, + "log_analytics_info": { + "description": "Defines values necessary to configure and run Azure Log Analytics agent", + "properties": { + "log_analytics_primary_key": { + "description": "\u003cneeds content added\u003e" + }, + "log_analytics_workspace_id": { + "description": "\u003cneeds content added\u003e" + } + } + }, + "spot_bid_max_price": { + "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." + } + } + }, + "cluster_log_conf": { + "description": "The configuration for delivering spark logs to a long-term storage destination.\nTwo kinds of destinations (dbfs and s3) are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.", + "properties": { + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "properties": { + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" + } + } + }, + "s3": { + "description": "destination and either region or endpoint should also be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "properties": { + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. 
`bucket-owner-full-control`.\nIf `canned_acl` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." + }, + "destination": { + "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." + }, + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." + }, + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + }, + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." + } + } + } + } + }, + "cluster_name": { + "description": "Cluster name requested by the user. This doesn't have to be unique.\nIf not specified at creation, the cluster name will be an empty string.\n" + }, + "cluster_source": { + "description": "" + }, + "custom_tags": { + "description": "Additional tags for cluster resources. 
Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", + "additionalproperties": { + "description": "" + } + }, + "driver_instance_pool_id": { + "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." + }, + "driver_node_type_id": { + "description": "The node type of the Spark driver. Note that this field is optional;\nif unset, the driver node type will be set as the same value\nas `node_type_id` defined above.\n" + }, + "enable_elastic_disk": { + "description": "Autoscaling Local Storage: when enabled, this cluster will dynamically acquire additional disk\nspace when its Spark workers are running low on disk space. This feature requires specific AWS\npermissions to function correctly - refer to the User Guide for more details." + }, + "enable_local_disk_encryption": { + "description": "Whether to enable LUKS on cluster VMs' local disks" + }, + "gcp_attributes": { + "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "boot_disk_size": { + "description": "boot disk size in GB" + }, + "google_service_account": { + "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." + } + } + }, + "instance_pool_id": { + "description": "The optional ID of the instance pool to which the cluster belongs." 
+ }, + "node_type_id": { + "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" + }, + "num_workers": { + "description": "Number of worker nodes that this cluster should have. A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." + }, + "policy_id": { + "description": "The ID of the cluster policy used to create the cluster if applicable." + }, + "runtime_engine": { + "description": "" + }, + "spark_conf": { + "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nUsers can also pass in a string of extra JVM options to the driver and the executors via\n`spark.driver.extraJavaOptions` and `spark.executor.extraJavaOptions` respectively.\n", + "additionalproperties": { + "description": "" + } + }, + "spark_env_vars": { + "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. 
This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", + "additionalproperties": { + "description": "" + } + }, + "spark_version": { + "description": "The Spark version of the cluster, e.g. `3.3.x-scala2.11`.\nA list of available Spark versions can be retrieved by using\nthe :method:clusters/sparkVersions API call.\n" + }, + "ssh_public_keys": { + "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", + "items": { + "description": "" + } + }, + "workload_type": { + "description": "", + "properties": { + "clients": { + "description": " defined what type of clients can use the cluster. E.g. Notebooks, Jobs", + "properties": { + "jobs": { + "description": "With jobs set, the cluster can be used for jobs" + }, + "notebooks": { + "description": "With notebooks set, this cluster can be used for notebooks" + } + } + } + } + } + } + }, + "notebook_task": { + "description": "If notebook_task, indicates that this task must run a notebook. This field may not be specified in conjunction with spark_jar_task.", + "properties": { + "base_parameters": { + "description": "Base parameters to be used for each run of this job. If the run is initiated by a call to\n:method:jobs/runNow with parameters specified, the two parameters maps are merged. 
If the same key is specified in\n`base_parameters` and in `run-now`, the value from `run-now` is used.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n\nIf the notebook takes a parameter that is not specified in the job’s `base_parameters` or the `run-now` override parameters,\nthe default value from the notebook is used.\n\nRetrieve these parameters in a notebook using [dbutils.widgets.get](https://docs.databricks.com/dev-tools/databricks-utils.html#dbutils-widgets).\n", + "additionalproperties": { + "description": "" + } + }, + "notebook_path": { + "description": "The path of the notebook to be run in the Databricks workspace or remote repository.\nFor notebooks stored in the Databricks workspace, the path must be absolute and begin with a slash.\nFor notebooks stored in a remote repository, the path must be relative. This field is required.\n" + }, + "source": { + "description": "This describes an enum" + } + } + }, + "pipeline_task": { + "description": "If pipeline_task, indicates that this task must execute a Pipeline.", + "properties": { + "full_refresh": { + "description": "If true, a full refresh will be triggered on the delta live table." + }, + "pipeline_id": { + "description": "The full name of the pipeline task to execute." + } + } + }, + "python_wheel_task": { + "description": "If python_wheel_task, indicates that this job must execute a PythonWheel.", + "properties": { + "entry_point": { + "description": "Named entry point to use, if it does not exist in the metadata of the package it executes the function from the package directly using `$packageName.$entryPoint()`" + }, + "named_parameters": { + "description": "Command-line parameters passed to Python wheel task in the form of `[\"--name=task\", \"--data=dbfs:/path/to/data.json\"]`. 
Leave it empty if `parameters` is not null.", + "additionalproperties": { + "description": "" + } + }, + "package_name": { + "description": "Name of the package to execute" + }, + "parameters": { + "description": "Command-line parameters passed to Python wheel task. Leave it empty if `named_parameters` is not null.", + "items": { + "description": "" + } + } + } + }, + "retry_on_timeout": { + "description": "An optional policy to specify whether to retry a task when it times out. The default behavior is to not retry on timeout." + }, + "spark_jar_task": { + "description": "If spark_jar_task, indicates that this task must run a JAR.", + "properties": { + "jar_uri": { + "description": "Deprecated since 04/2016\\\\. Provide a `jar` through the `libraries` field instead. For an example, see :method:jobs/create.\n" + }, + "main_class_name": { + "description": "The full name of the class containing the main method to be executed. This class must be contained in a JAR provided as a library.\n\nThe code must use `SparkContext.getOrCreate` to obtain a Spark context; otherwise, runs of the job fail." + }, + "parameters": { + "description": "Parameters passed to the main method.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", + "items": { + "description": "" + } + } + } + }, + "spark_python_task": { + "description": "If spark_python_task, indicates that this task must run a Python file.", + "properties": { + "parameters": { + "description": "Command line parameters passed to the Python file.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", + "items": { + "description": "" + } + }, + "python_file": { + "description": "" + } + } + }, + "spark_submit_task": { + "description": "If spark_submit_task, indicates that this task must be launched by the spark submit script. 
This task can run only on new clusters.", + "properties": { + "parameters": { + "description": "Command-line parameters passed to spark submit.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", + "items": { + "description": "" + } + } + } + }, + "sql_task": { + "description": "If sql_task, indicates that this job must execute a SQL task.", + "properties": { + "alert": { + "description": "If alert, indicates that this job must refresh a SQL alert.", + "properties": { + "alert_id": { + "description": "The canonical identifier of the SQL alert." + }, + "pause_subscriptions": { + "description": "If true, the alert notifications are not sent to subscribers." + }, + "subscriptions": { + "description": "If specified, alert notifications are sent to subscribers.", + "items": { + "description": "", + "properties": { + "destination_id": { + "description": "The canonical identifier of the destination to receive email notification." + }, + "user_name": { + "description": "The user name to receive the subscription email." + } + } + } + } + } + }, + "dashboard": { + "description": "If dashboard, indicates that this job must refresh a SQL dashboard.", + "properties": { + "custom_subject": { + "description": "Subject of the email sent to subscribers of this task." + }, + "dashboard_id": { + "description": "The canonical identifier of the SQL dashboard." + }, + "pause_subscriptions": { + "description": "If true, the dashboard snapshot is not taken, and emails are not sent to subscribers." + }, + "subscriptions": { + "description": "If specified, dashboard snapshots are sent to subscriptions.", + "items": { + "description": "", + "properties": { + "destination_id": { + "description": "The canonical identifier of the destination to receive email notification." + }, + "user_name": { + "description": "The user name to receive the subscription email." 
+ } + } + } + } + } + }, + "parameters": { + "description": "Parameters to be used for each run of this job. The SQL alert task does not support custom parameters.", + "additionalproperties": { + "description": "" + } + }, + "query": { + "description": "If query, indicates that this job must execute a SQL query.", + "properties": { + "query_id": { + "description": "The canonical identifier of the SQL query." + } + } + }, + "warehouse_id": { + "description": "The canonical identifier of the SQL warehouse. Only serverless and pro SQL warehouses are supported." + } + } + }, + "task_key": { + "description": "A unique name for the task. This field is used to refer to this task from other tasks.\nThis field is required and must be unique within its parent job.\nOn Update or Reset, this field is used to reference the tasks to be updated or reset.\nThe maximum length is 100 characters." + }, + "timeout_seconds": { + "description": "An optional timeout applied to each run of this job task. The default behavior is to have no timeout." + } + } + } + }, + "timeout_seconds": { + "description": "An optional timeout applied to each run of this job. The default behavior is to have no timeout." + }, + "webhook_notifications": { + "description": "A collection of system notification IDs to notify when the run begins or completes. The default behavior is to not send any system notifications.", + "properties": { + "on_failure": { + "description": "An optional list of system notification IDs to call when the run fails. A maximum of 3 destinations can be specified for the `on_failure` property.", + "items": { + "description": "", + "properties": { + "id": { + "description": "" + } + } + } + }, + "on_start": { + "description": "An optional list of system notification IDs to call when the run starts. 
A maximum of 3 destinations can be specified for the `on_start` property.", + "items": { + "description": "", + "properties": { + "id": { + "description": "" + } + } + } + }, + "on_success": { + "description": "An optional list of system notification IDs to call when the run completes successfully. A maximum of 3 destinations can be specified for the `on_success` property.", + "items": { + "description": "", + "properties": { + "id": { + "description": "" + } + } + } + } + } + } + } + } + }, + "pipelines": { + "description": "List of pipeline definations", + "additionalproperties": { + "description": "", + "properties": { + "catalog": { + "description": "Catalog in UC to add tables to. If target is specified, tables in this pipeline will be\npublished to a \"target\" schema inside catalog (i.e. \u003ccatalog\u003e.\u003ctarget\u003e.\u003ctable\u003e)." + }, + "channel": { + "description": "DLT Release Channel that specifies which version to use." + }, + "clusters": { + "description": "Cluster settings for this pipeline deployment.", + "items": { + "description": "", + "properties": { + "apply_policy_default_values": { + "description": "Note: This field won't be persisted. Only API users will check this field." + }, + "autoscale": { + "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", + "properties": { + "max_workers": { + "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." + }, + "min_workers": { + "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." 
+ } + } + }, + "aws_attributes": { + "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "ebs_volume_count": { + "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." + }, + "ebs_volume_iops": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_size": { + "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." + }, + "ebs_volume_throughput": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_type": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. 
If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "instance_profile_arn": { + "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nommitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is ommitted, we will pull in the default from the conf if it exists." + }, + "spot_bid_price_percent": { + "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." + }, + "zone_id": { + "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". 
The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nSee [[AutoAZHelper.scala]] for more details.\nThe list of available zones as well as the default value can be found by using the\n`List Zones`_ method." + } + } + }, + "azure_attributes": { + "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." 
+ }, + "log_analytics_info": { + "description": "Defines values necessary to configure and run Azure Log Analytics agent", + "properties": { + "log_analytics_primary_key": { + "description": "\u003cneeds content added\u003e" + }, + "log_analytics_workspace_id": { + "description": "\u003cneeds content added\u003e" + } + } + }, + "spot_bid_max_price": { + "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." + } + } + }, + "cluster_log_conf": { + "description": "The configuration for delivering spark logs to a long-term storage destination.\nTwo kinds of destinations (dbfs and s3) are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.", + "properties": { + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "properties": { + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" + } + } + }, + "s3": { + "description": "destination and either region or endpoint should also be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "properties": { + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. 
`bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." + }, + "destination": { + "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." + }, + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." + }, + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + }, + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." + } + } + } + } + }, + "custom_tags": { + "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. 
Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", + "additionalproperties": { + "description": "" + } + }, + "driver_instance_pool_id": { + "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." + }, + "driver_node_type_id": { + "description": "The node type of the Spark driver.\nNote that this field is optional; if unset, the driver node type will be set as the same value\nas `node_type_id` defined above." + }, + "gcp_attributes": { + "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "boot_disk_size": { + "description": "boot disk size in GB" + }, + "google_service_account": { + "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." + } + } + }, + "instance_pool_id": { + "description": "The optional ID of the instance pool to which the cluster belongs." + }, + "label": { + "description": "Cluster label" + }, + "node_type_id": { + "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" + }, + "num_workers": { + "description": "Number of worker nodes that this cluster should have. 
A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." + }, + "policy_id": { + "description": "The ID of the cluster policy used to create the cluster if applicable." + }, + "spark_conf": { + "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nSee :method:clusters/create for more details.\n", + "additionalproperties": { + "description": "" + } + }, + "spark_env_vars": { + "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", + "additionalproperties": { + "description": "" + } + }, + "ssh_public_keys": { + "description": "SSH public key contents that will be added to each Spark node in this cluster. 
The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", + "items": { + "description": "" + } + } + } + } + }, + "configuration": { + "description": "String-String configuration for this pipeline execution.", + "additionalproperties": { + "description": "" + } + }, + "continuous": { + "description": "Whether the pipeline is continuous or triggered. This replaces `trigger`." + }, + "development": { + "description": "Whether the pipeline is in Development mode. Defaults to false." + }, + "edition": { + "description": "Pipeline product edition." + }, + "filters": { + "description": "Filters on which Pipeline packages to include in the deployed graph.", + "properties": { + "exclude": { + "description": "Paths to exclude.", + "items": { + "description": "" + } + }, + "include": { + "description": "Paths to include.", + "items": { + "description": "" + } + } + } + }, + "id": { + "description": "Unique identifier for this pipeline." + }, + "libraries": { + "description": "Libraries or code needed by this deployment.", + "items": { + "description": "", + "properties": { + "jar": { + "description": "URI of the jar to be installed. Currently only DBFS and S3 URIs are supported.\nFor example: `{ \"jar\": \"dbfs:/mnt/databricks/library.jar\" }` or\n`{ \"jar\": \"s3://my-bucket/library.jar\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + }, + "maven": { + "description": "Specification of a maven library to be installed. For example:\n`{ \"coordinates\": \"org.jsoup:jsoup:1.7.2\" }`", + "properties": { + "coordinates": { + "description": "Gradle-style maven coordinates. For example: \"org.jsoup:jsoup:1.7.2\"." + }, + "exclusions": { + "description": "List of dependences to exclude. 
For example: `[\"slf4j:slf4j\", \"*:hadoop-client\"]`.\n\nMaven dependency exclusions:\nhttps://maven.apache.org/guides/introduction/introduction-to-optional-and-excludes-dependencies.html.", + "items": { + "description": "" + } + }, + "repo": { + "description": "Maven repo to install the Maven package from. If omitted, both Maven Central Repository\nand Spark Packages are searched." + } + } + }, + "notebook": { + "description": "The path to a notebook that defines a pipeline and is stored in the Databricks workspace.\nFor example: `{ \"notebook\" : { \"path\" : \"/my-pipeline-notebook-path\" } }`.\nCurrently, only Scala notebooks are supported, and pipelines must be defined in a package\ncell.", + "properties": { + "path": { + "description": "The absolute path of the notebook." + } + } + }, + "whl": { + "description": "URI of the wheel to be installed.\nFor example: `{ \"whl\": \"dbfs:/my/whl\" }` or `{ \"whl\": \"s3://my-bucket/whl\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + } + } + } + }, + "name": { + "description": "Friendly identifier for this pipeline." + }, + "photon": { + "description": "Whether Photon is enabled for this pipeline." + }, + "storage": { + "description": "DBFS root directory for storing checkpoints and tables." + }, + "target": { + "description": "Target schema (database) to add tables in this pipeline to." + }, + "trigger": { + "description": "Which pipeline trigger to use. 
Deprecated: Use `continuous` instead.", + "properties": { + "cron": { + "description": "", + "properties": { + "quartz_cron_schedule": { + "description": "" + }, + "timezone_id": { + "description": "" + } + } + }, + "manual": { + "description": "" + } + } + } + } + } + } + } + }, + "workspace": { + "description": "", + "properties": { + "artifact_path": { + "description": "", + "properties": { + "dbfs": { + "description": "" + }, + "workspace": { + "description": "" + } + } + }, + "azure_client_id": { + "description": "" + }, + "azure_environment": { + "description": "" + }, + "azure_login_app_id": { + "description": "" + }, + "azure_tenant_id": { + "description": "" + }, + "azure_use_msi": { + "description": "" + }, + "azure_workspace_resource_id": { + "description": "" + }, + "current_user": { + "description": "", + "properties": { + "active": { + "description": "" + }, + "displayName": { + "description": "" + }, + "emails": { + "description": "", + "items": { + "description": "", + "properties": { + "$ref": { + "description": "" + }, + "display": { + "description": "" + }, + "primary": { + "description": "" + }, + "type": { + "description": "" + }, + "value": { + "description": "" + } + } + } + }, + "entitlements": { + "description": "", + "items": { + "description": "", + "properties": { + "$ref": { + "description": "" + }, + "display": { + "description": "" + }, + "primary": { + "description": "" + }, + "type": { + "description": "" + }, + "value": { + "description": "" + } + } + } + }, + "externalId": { + "description": "" + }, + "groups": { + "description": "", + "items": { + "description": "", + "properties": { + "$ref": { + "description": "" + }, + "display": { + "description": "" + }, + "primary": { + "description": "" + }, + "type": { + "description": "" + }, + "value": { + "description": "" + } + } + } + }, + "id": { + "description": "" + }, + "name": { + "description": "", + "properties": { + "familyName": { + "description": "" + }, + "givenName": { + 
"description": "" + } + } + }, + "roles": { + "description": "", + "items": { + "description": "", + "properties": { + "$ref": { + "description": "" + }, + "display": { + "description": "" + }, + "primary": { + "description": "" + }, + "type": { + "description": "" + }, + "value": { + "description": "" + } + } + } + }, + "userName": { + "description": "" + } + } + }, + "file_path": { + "description": "", + "properties": { + "dbfs": { + "description": "" + }, + "workspace": { + "description": "" + } + } + }, + "google_service_account": { + "description": "" + }, + "host": { + "description": "" + }, + "profile": { + "description": "" + }, + "root": { + "description": "" + }, + "state_path": { + "description": "", + "properties": { + "dbfs": { + "description": "" + }, + "workspace": { + "description": "" + } + } + } + } + } + } +} \ No newline at end of file diff --git a/bundle/schema/docs_test.go b/bundle/schema/docs_test.go new file mode 100644 index 00000000..84d804b0 --- /dev/null +++ b/bundle/schema/docs_test.go @@ -0,0 +1,61 @@ +package schema + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestSchemaToDocs(t *testing.T) { + schema := &Schema{ + Type: "object", + Description: "root doc", + Properties: map[string]*Schema{ + "foo": {Type: "number", Description: "foo doc"}, + "bar": {Type: "string"}, + "octave": { + Type: "object", + AdditionalProperties: &Schema{Type: "number"}, + Description: "octave docs", + }, + "scales": { + Type: "object", + Description: "scale docs", + Items: &Schema{Type: "string"}, + }, + }, + } + docs := schemaToDocs(schema) + docsJson, err := json.MarshalIndent(docs, " ", " ") + require.NoError(t, err) + + expected := + `{ + "description": "root doc", + "properties": { + "bar": { + "description": "" + }, + "foo": { + "description": "foo doc" + }, + "octave": { + "description": "octave docs", + "additionalproperties": { + "description": "" + } + }, + 
"scales": { + "description": "scale docs", + "items": { + "description": "" + } + } + } + }` + t.Log("[DEBUG] actual: ", string(docsJson)) + t.Log("[DEBUG] expected: ", expected) + assert.Equal(t, expected, string(docsJson)) +} diff --git a/bundle/schema/openapi.go b/bundle/schema/openapi.go new file mode 100644 index 00000000..97ec5141 --- /dev/null +++ b/bundle/schema/openapi.go @@ -0,0 +1,214 @@ +package schema + +import ( + "encoding/json" + "fmt" + "strings" + + "github.com/databricks/databricks-sdk-go/openapi" +) + +type OpenapiReader struct { + OpenapiSpec *openapi.Specification + Memo map[string]*Schema +} + +const SchemaPathPrefix = "#/components/schemas/" + +func (reader *OpenapiReader) readOpenapiSchema(path string) (*Schema, error) { + schemaKey := strings.TrimPrefix(path, SchemaPathPrefix) + + // return early if we already have a computed schema + memoSchema, ok := reader.Memo[schemaKey] + if ok { + return memoSchema, nil + } + + // check path is present in openapi spec + openapiSchema, ok := reader.OpenapiSpec.Components.Schemas[schemaKey] + if !ok { + return nil, fmt.Errorf("schema with path %s not found in openapi spec", path) + } + + // convert openapi schema to the native schema struct + bytes, err := json.Marshal(*openapiSchema) + if err != nil { + return nil, err + } + jsonSchema := &Schema{} + err = json.Unmarshal(bytes, jsonSchema) + if err != nil { + return nil, err + } + + // A hack to convert a map[string]interface{} to *Schema + // We rely on the type of a AdditionalProperties in downstream functions + // to do reference interpolation + _, ok = jsonSchema.AdditionalProperties.(map[string]interface{}) + if ok { + b, err := json.Marshal(jsonSchema.AdditionalProperties) + if err != nil { + return nil, err + } + additionalProperties := &Schema{} + err = json.Unmarshal(b, additionalProperties) + if err != nil { + return nil, err + } + jsonSchema.AdditionalProperties = additionalProperties + } + + // store read schema into memo + 
reader.Memo[schemaKey] = jsonSchema + + return jsonSchema, nil +} + +// safe againt loops in refs +func (reader *OpenapiReader) safeResolveRefs(root *Schema, seenRefs map[string]struct{}) (*Schema, error) { + if root.Reference == nil { + return reader.traverseSchema(root, seenRefs) + } + key := *root.Reference + _, ok := seenRefs[key] + if ok { + // self reference loops can be supported however the logic is non-trivial because + // cross refernce loops are not allowed (see: http://json-schema.org/understanding-json-schema/structuring.html#recursion) + return nil, fmt.Errorf("references loop detected") + } + ref := *root.Reference + description := root.Description + seenRefs[ref] = struct{}{} + + // Mark reference nil, so we do not traverse this again. This is tracked + // in the memo + root.Reference = nil + + // unroll one level of reference + selfRef, err := reader.readOpenapiSchema(ref) + if err != nil { + return nil, err + } + root = selfRef + root.Description = description + + // traverse again to find new references + root, err = reader.traverseSchema(root, seenRefs) + if err != nil { + return nil, err + } + delete(seenRefs, ref) + return root, err +} + +func (reader *OpenapiReader) traverseSchema(root *Schema, seenRefs map[string]struct{}) (*Schema, error) { + // case primitive (or invalid) + if root.Type != Object && root.Type != Array { + return root, nil + } + // only root references are resolved + if root.Reference != nil { + return reader.safeResolveRefs(root, seenRefs) + } + // case struct + if len(root.Properties) > 0 { + for k, v := range root.Properties { + childSchema, err := reader.safeResolveRefs(v, seenRefs) + if err != nil { + return nil, err + } + root.Properties[k] = childSchema + } + } + // case array + if root.Items != nil { + itemsSchema, err := reader.safeResolveRefs(root.Items, seenRefs) + if err != nil { + return nil, err + } + root.Items = itemsSchema + } + // case map + additionionalProperties, ok := 
root.AdditionalProperties.(*Schema) + if ok && additionionalProperties != nil { + valueSchema, err := reader.safeResolveRefs(additionionalProperties, seenRefs) + if err != nil { + return nil, err + } + root.AdditionalProperties = valueSchema + } + return root, nil +} + +func (reader *OpenapiReader) readResolvedSchema(path string) (*Schema, error) { + root, err := reader.readOpenapiSchema(path) + if err != nil { + return nil, err + } + seenRefs := make(map[string]struct{}) + seenRefs[path] = struct{}{} + root, err = reader.safeResolveRefs(root, seenRefs) + if err != nil { + trace := "" + count := 0 + for k := range seenRefs { + if count == len(seenRefs)-1 { + trace += k + break + } + trace += k + " -> " + count++ + } + return nil, fmt.Errorf("%s. schema ref trace: %s", err, trace) + } + return root, nil +} + +func (reader *OpenapiReader) jobsDocs() (*Docs, error) { + jobSettingsSchema, err := reader.readResolvedSchema(SchemaPathPrefix + "jobs.JobSettings") + if err != nil { + return nil, err + } + jobDocs := schemaToDocs(jobSettingsSchema) + // TODO: add description for id if needed. + // Tracked in https://github.com/databricks/bricks/issues/242 + jobsDocs := &Docs{ + Description: "List of job definations", + AdditionalProperties: jobDocs, + } + return jobsDocs, nil +} + +func (reader *OpenapiReader) pipelinesDocs() (*Docs, error) { + pipelineSpecSchema, err := reader.readResolvedSchema(SchemaPathPrefix + "pipelines.PipelineSpec") + if err != nil { + return nil, err + } + pipelineDocs := schemaToDocs(pipelineSpecSchema) + // TODO: Two fields in resources.Pipeline have the json tag id. Clarify the + // semantics and then add a description if needed. 
(https://github.com/databricks/bricks/issues/242) + pipelinesDocs := &Docs{ + Description: "List of pipeline definations", + AdditionalProperties: pipelineDocs, + } + return pipelinesDocs, nil +} + +func (reader *OpenapiReader) ResourcesDocs() (*Docs, error) { + jobsDocs, err := reader.jobsDocs() + if err != nil { + return nil, err + } + pipelinesDocs, err := reader.pipelinesDocs() + if err != nil { + return nil, err + } + + return &Docs{ + Description: "Specification of databricks resources to instantiate", + Properties: map[string]*Docs{ + "jobs": jobsDocs, + "pipelines": pipelinesDocs, + }, + }, nil +} diff --git a/bundle/schema/openapi_test.go b/bundle/schema/openapi_test.go new file mode 100644 index 00000000..faee64fa --- /dev/null +++ b/bundle/schema/openapi_test.go @@ -0,0 +1,435 @@ +package schema + +import ( + "encoding/json" + "testing" + + "github.com/databricks/databricks-sdk-go/openapi" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestReadSchemaForObject(t *testing.T) { + specString := ` + { + "components": { + "schemas": { + "foo": { + "type": "number" + }, + "fruits": { + "type": "object", + "description": "fruits that are cool", + "properties": { + "guava": { + "type": "string", + "description": "a guava for my schema" + }, + "mango": { + "type": "object", + "description": "a mango for my schema", + "$ref": "#/components/schemas/mango" + } + } + }, + "mango": { + "type": "object", + "properties": { + "foo": { + "$ref": "#/components/schemas/foo" + } + } + } + } + } + } + ` + spec := &openapi.Specification{} + reader := &OpenapiReader{ + OpenapiSpec: spec, + Memo: make(map[string]*Schema), + } + err := json.Unmarshal([]byte(specString), spec) + require.NoError(t, err) + + fruitsSchema, err := reader.readResolvedSchema("#/components/schemas/fruits") + require.NoError(t, err) + + fruitsSchemaJson, err := json.MarshalIndent(fruitsSchema, " ", " ") + require.NoError(t, err) + + expected := `{ + "type": 
"object", + "description": "fruits that are cool", + "properties": { + "guava": { + "type": "string", + "description": "a guava for my schema" + }, + "mango": { + "type": "object", + "description": "a mango for my schema", + "properties": { + "foo": { + "type": "number" + } + } + } + } + }` + + t.Log("[DEBUG] actual: ", string(fruitsSchemaJson)) + t.Log("[DEBUG] expected: ", expected) + assert.Equal(t, expected, string(fruitsSchemaJson)) +} + +func TestReadSchemaForArray(t *testing.T) { + specString := ` + { + "components": { + "schemas": { + "fruits": { + "type": "object", + "description": "fruits that are cool", + "items": { + "description": "some papayas, because papayas are fruits too", + "$ref": "#/components/schemas/papaya" + } + }, + "papaya": { + "type": "number" + } + } + } + }` + spec := &openapi.Specification{} + reader := &OpenapiReader{ + OpenapiSpec: spec, + Memo: make(map[string]*Schema), + } + err := json.Unmarshal([]byte(specString), spec) + require.NoError(t, err) + + fruitsSchema, err := reader.readResolvedSchema("#/components/schemas/fruits") + require.NoError(t, err) + + fruitsSchemaJson, err := json.MarshalIndent(fruitsSchema, " ", " ") + require.NoError(t, err) + + expected := `{ + "type": "object", + "description": "fruits that are cool", + "items": { + "type": "number", + "description": "some papayas, because papayas are fruits too" + } + }` + + t.Log("[DEBUG] actual: ", string(fruitsSchemaJson)) + t.Log("[DEBUG] expected: ", expected) + assert.Equal(t, expected, string(fruitsSchemaJson)) +} + +func TestReadSchemaForMap(t *testing.T) { + specString := `{ + "components": { + "schemas": { + "fruits": { + "type": "object", + "description": "fruits that are meh", + "additionalProperties": { + "description": "watermelons. 
watermelons.", + "$ref": "#/components/schemas/watermelon" + } + }, + "watermelon": { + "type": "number" + } + } + } + }` + spec := &openapi.Specification{} + reader := &OpenapiReader{ + OpenapiSpec: spec, + Memo: make(map[string]*Schema), + } + err := json.Unmarshal([]byte(specString), spec) + require.NoError(t, err) + + fruitsSchema, err := reader.readResolvedSchema("#/components/schemas/fruits") + require.NoError(t, err) + + fruitsSchemaJson, err := json.MarshalIndent(fruitsSchema, " ", " ") + require.NoError(t, err) + + expected := `{ + "type": "object", + "description": "fruits that are meh", + "additionalProperties": { + "type": "number", + "description": "watermelons. watermelons." + } + }` + + t.Log("[DEBUG] actual: ", string(fruitsSchemaJson)) + t.Log("[DEBUG] expected: ", expected) + assert.Equal(t, expected, string(fruitsSchemaJson)) +} + +func TestRootReferenceIsResolved(t *testing.T) { + specString := `{ + "components": { + "schemas": { + "foo": { + "type": "object", + "description": "this description is ignored", + "properties": { + "abc": { + "type": "string" + } + } + }, + "fruits": { + "type": "object", + "description": "foo fighters fighting fruits", + "$ref": "#/components/schemas/foo" + } + } + } + }` + spec := &openapi.Specification{} + reader := &OpenapiReader{ + OpenapiSpec: spec, + Memo: make(map[string]*Schema), + } + err := json.Unmarshal([]byte(specString), spec) + require.NoError(t, err) + + schema, err := reader.readResolvedSchema("#/components/schemas/fruits") + require.NoError(t, err) + fruitsSchemaJson, err := json.MarshalIndent(schema, " ", " ") + require.NoError(t, err) + + expected := `{ + "type": "object", + "description": "foo fighters fighting fruits", + "properties": { + "abc": { + "type": "string" + } + } + }` + + t.Log("[DEBUG] actual: ", string(fruitsSchemaJson)) + t.Log("[DEBUG] expected: ", expected) + assert.Equal(t, expected, string(fruitsSchemaJson)) +} + +func TestSelfReferenceLoopErrors(t *testing.T) { + specString 
:= `{ + "components": { + "schemas": { + "foo": { + "type": "object", + "description": "this description is ignored", + "properties": { + "bar": { + "type": "object", + "$ref": "#/components/schemas/foo" + } + } + }, + "fruits": { + "type": "object", + "description": "foo fighters fighting fruits", + "$ref": "#/components/schemas/foo" + } + } + } + }` + spec := &openapi.Specification{} + reader := &OpenapiReader{ + OpenapiSpec: spec, + Memo: make(map[string]*Schema), + } + err := json.Unmarshal([]byte(specString), spec) + require.NoError(t, err) + + _, err = reader.readResolvedSchema("#/components/schemas/fruits") + assert.ErrorContains(t, err, "references loop detected. schema ref trace: #/components/schemas/fruits -> #/components/schemas/foo") +} + +func TestCrossReferenceLoopErrors(t *testing.T) { + specString := `{ + "components": { + "schemas": { + "foo": { + "type": "object", + "description": "this description is ignored", + "properties": { + "bar": { + "type": "object", + "$ref": "#/components/schemas/fruits" + } + } + }, + "fruits": { + "type": "object", + "description": "foo fighters fighting fruits", + "$ref": "#/components/schemas/foo" + } + } + } + }` + spec := &openapi.Specification{} + reader := &OpenapiReader{ + OpenapiSpec: spec, + Memo: make(map[string]*Schema), + } + err := json.Unmarshal([]byte(specString), spec) + require.NoError(t, err) + + _, err = reader.readResolvedSchema("#/components/schemas/fruits") + assert.ErrorContains(t, err, "references loop detected. 
schema ref trace: #/components/schemas/fruits -> #/components/schemas/foo") +} + +func TestReferenceResolutionForMapInObject(t *testing.T) { + specString := ` + { + "components": { + "schemas": { + "foo": { + "type": "number" + }, + "fruits": { + "type": "object", + "description": "fruits that are cool", + "properties": { + "guava": { + "type": "string", + "description": "a guava for my schema" + }, + "mangos": { + "type": "object", + "description": "multiple mangos", + "$ref": "#/components/schemas/mango" + } + } + }, + "mango": { + "type": "object", + "additionalProperties": { + "description": "a single mango", + "$ref": "#/components/schemas/foo" + } + } + } + } + }` + spec := &openapi.Specification{} + reader := &OpenapiReader{ + OpenapiSpec: spec, + Memo: make(map[string]*Schema), + } + err := json.Unmarshal([]byte(specString), spec) + require.NoError(t, err) + + fruitsSchema, err := reader.readResolvedSchema("#/components/schemas/fruits") + require.NoError(t, err) + + fruitsSchemaJson, err := json.MarshalIndent(fruitsSchema, " ", " ") + require.NoError(t, err) + + expected := `{ + "type": "object", + "description": "fruits that are cool", + "properties": { + "guava": { + "type": "string", + "description": "a guava for my schema" + }, + "mangos": { + "type": "object", + "description": "multiple mangos", + "additionalProperties": { + "type": "number", + "description": "a single mango" + } + } + } + }` + + t.Log("[DEBUG] actual: ", string(fruitsSchemaJson)) + t.Log("[DEBUG] expected: ", expected) + assert.Equal(t, expected, string(fruitsSchemaJson)) +} + +func TestReferenceResolutionForArrayInObject(t *testing.T) { + specString := `{ + "components": { + "schemas": { + "foo": { + "type": "number" + }, + "fruits": { + "type": "object", + "description": "fruits that are cool", + "properties": { + "guava": { + "type": "string", + "description": "a guava for my schema" + }, + "mangos": { + "type": "object", + "description": "multiple mangos", + "$ref": 
"#/components/schemas/mango" + } + } + }, + "mango": { + "type": "object", + "items": { + "description": "a single mango", + "$ref": "#/components/schemas/foo" + } + } + } + } + }` + spec := &openapi.Specification{} + reader := &OpenapiReader{ + OpenapiSpec: spec, + Memo: make(map[string]*Schema), + } + err := json.Unmarshal([]byte(specString), spec) + require.NoError(t, err) + + fruitsSchema, err := reader.readResolvedSchema("#/components/schemas/fruits") + require.NoError(t, err) + + fruitsSchemaJson, err := json.MarshalIndent(fruitsSchema, " ", " ") + require.NoError(t, err) + + expected := `{ + "type": "object", + "description": "fruits that are cool", + "properties": { + "guava": { + "type": "string", + "description": "a guava for my schema" + }, + "mangos": { + "type": "object", + "description": "multiple mangos", + "items": { + "type": "number", + "description": "a single mango" + } + } + } + }` + + t.Log("[DEBUG] actual: ", string(fruitsSchemaJson)) + t.Log("[DEBUG] expected: ", expected) + assert.Equal(t, expected, string(fruitsSchemaJson)) +} diff --git a/bundle/schema/schema.go b/bundle/schema/schema.go index d6d0295d..2e50298c 100644 --- a/bundle/schema/schema.go +++ b/bundle/schema/schema.go @@ -35,6 +35,9 @@ type Schema struct { // Required properties for the object. Any fields missing the "omitempty" // json tag will be included Required []string `json:"required,omitempty"` + + // URI to a json schema + Reference *string `json:"$ref,omitempty"` } // This function translates golang types into json schema. 
Here is the mapping @@ -187,7 +190,7 @@ func toSchema(golangType reflect.Type, docs *Docs, tracker *tracker) (*Schema, e schema := &Schema{Type: rootJavascriptType} if docs != nil { - schema.Description = docs.Documentation + schema.Description = docs.Description } // case array/slice @@ -197,7 +200,11 @@ func toSchema(golangType reflect.Type, docs *Docs, tracker *tracker) (*Schema, e if err != nil { return nil, err } - elemProps, err := safeToSchema(elemGolangType, docs, "", tracker) + var childDocs *Docs + if docs != nil { + childDocs = docs.Items + } + elemProps, err := safeToSchema(elemGolangType, childDocs, "", tracker) if err != nil { return nil, err } @@ -215,7 +222,11 @@ func toSchema(golangType reflect.Type, docs *Docs, tracker *tracker) (*Schema, e if golangType.Key().Kind() != reflect.String { return nil, fmt.Errorf("only string keyed maps allowed") } - schema.AdditionalProperties, err = safeToSchema(golangType.Elem(), docs, "", tracker) + var childDocs *Docs + if docs != nil { + childDocs = docs.AdditionalProperties + } + schema.AdditionalProperties, err = safeToSchema(golangType.Elem(), childDocs, "", tracker) if err != nil { return nil, err } @@ -240,8 +251,8 @@ func toSchema(golangType reflect.Type, docs *Docs, tracker *tracker) (*Schema, e // get docs for the child if they exist var childDocs *Docs if docs != nil { - if val, ok := docs.Children[childName]; ok { - childDocs = &val + if val, ok := docs.Properties[childName]; ok { + childDocs = val } } diff --git a/bundle/schema/schema_test.go b/bundle/schema/schema_test.go index 1539e35f..966aab81 100644 --- a/bundle/schema/schema_test.go +++ b/bundle/schema/schema_test.go @@ -1062,60 +1062,31 @@ func TestFieldsWithoutOmitEmptyAreRequired(t *testing.T) { assert.Equal(t, expectedSchema, string(jsonSchema)) } -func TestDocIngestionInSchema(t *testing.T) { +func TestDocIngestionForObject(t *testing.T) { docs := &Docs{ - Documentation: "docs for root", - Children: map[string]Docs{ + Description: "docs for 
root", + Properties: map[string]*Docs{ "my_struct": { - Documentation: "docs for my struct", - }, - "my_val": { - Documentation: "docs for my val", - }, - "my_slice": { - Documentation: "docs for my slice", - Children: map[string]Docs{ - "guava": { - Documentation: "docs for guava", + Description: "docs for my struct", + Properties: map[string]*Docs{ + "a": { + Description: "docs for a", }, - "pineapple": { - Documentation: "docs for pineapple", - }, - }, - }, - "my_map": { - Documentation: "docs for my map", - Children: map[string]Docs{ - "apple": { - Documentation: "docs for apple", - }, - "mango": { - Documentation: "docs for mango", + "c": { + Description: "docs for c which does not exist on my_struct", }, }, }, }, } - type Foo struct { - Apple int `json:"apple"` - Mango int `json:"mango"` - } - - type Bar struct { - Guava int `json:"guava"` - Pineapple int `json:"pineapple"` - } - type MyStruct struct { A string `json:"a"` + B int `json:"b"` } type Root struct { - MyStruct *MyStruct `json:"my_struct"` - MyVal int `json:"my_val"` - MySlice []Bar `json:"my_slice"` - MyMap map[string]*Foo `json:"my_map"` + MyStruct *MyStruct `json:"my_struct"` } elem := Root{} @@ -1131,29 +1102,82 @@ func TestDocIngestionInSchema(t *testing.T) { "type": "object", "description": "docs for root", "properties": { - "my_map": { + "my_struct": { "type": "object", - "description": "docs for my map", - "additionalProperties": { - "type": "object", - "description": "docs for my map", - "properties": { - "apple": { - "type": "number", - "description": "docs for apple" - }, - "mango": { - "type": "number", - "description": "docs for mango" - } + "description": "docs for my struct", + "properties": { + "a": { + "type": "string", + "description": "docs for a" }, - "additionalProperties": false, - "required": [ - "apple", - "mango" - ] - } + "b": { + "type": "number" + } + }, + "additionalProperties": false, + "required": [ + "a", + "b" + ] + } + }, + "additionalProperties": false, + 
"required": [ + "my_struct" + ] + }` + + t.Log("[DEBUG] actual: ", string(jsonSchema)) + t.Log("[DEBUG] expected: ", expectedSchema) + + assert.Equal(t, expectedSchema, string(jsonSchema)) +} + +func TestDocIngestionForSlice(t *testing.T) { + docs := &Docs{ + Description: "docs for root", + Properties: map[string]*Docs{ + "my_slice": { + Description: "docs for my slice", + Items: &Docs{ + Properties: map[string]*Docs{ + "guava": { + Description: "docs for guava", + }, + "pineapple": { + Description: "docs for pineapple", + }, + "watermelon": { + Description: "docs for watermelon which does not exist in schema", + }, + }, }, + }, + }, + } + + type Bar struct { + Guava int `json:"guava"` + Pineapple int `json:"pineapple"` + } + + type Root struct { + MySlice []Bar `json:"my_slice"` + } + + elem := Root{} + + schema, err := New(reflect.TypeOf(elem), docs) + require.NoError(t, err) + + jsonSchema, err := json.MarshalIndent(schema, " ", " ") + assert.NoError(t, err) + + expectedSchema := + `{ + "type": "object", + "description": "docs for root", + "properties": { "my_slice": { "type": "array", "description": "docs for my slice", @@ -1175,20 +1199,130 @@ func TestDocIngestionInSchema(t *testing.T) { "pineapple" ] } - }, - "my_struct": { - "type": "object", - "description": "docs for my struct", - "properties": { - "a": { - "type": "string" - } + } + }, + "additionalProperties": false, + "required": [ + "my_slice" + ] + }` + + t.Log("[DEBUG] actual: ", string(jsonSchema)) + t.Log("[DEBUG] expected: ", expectedSchema) + + assert.Equal(t, expectedSchema, string(jsonSchema)) +} + +func TestDocIngestionForMap(t *testing.T) { + docs := &Docs{ + Description: "docs for root", + Properties: map[string]*Docs{ + "my_map": { + Description: "docs for my map", + AdditionalProperties: &Docs{ + Properties: map[string]*Docs{ + "apple": { + Description: "docs for apple", + }, + "mango": { + Description: "docs for mango", + }, + "watermelon": { + Description: "docs for watermelon which 
does not exist in schema", + }, + "papaya": { + Description: "docs for papaya which does not exist in schema", + }, }, - "additionalProperties": false, - "required": [ - "a" - ] }, + }, + }, + } + + type Foo struct { + Apple int `json:"apple"` + Mango int `json:"mango"` + } + + type Root struct { + MyMap map[string]*Foo `json:"my_map"` + } + + elem := Root{} + + schema, err := New(reflect.TypeOf(elem), docs) + require.NoError(t, err) + + jsonSchema, err := json.MarshalIndent(schema, " ", " ") + assert.NoError(t, err) + + expectedSchema := + `{ + "type": "object", + "description": "docs for root", + "properties": { + "my_map": { + "type": "object", + "description": "docs for my map", + "additionalProperties": { + "type": "object", + "properties": { + "apple": { + "type": "number", + "description": "docs for apple" + }, + "mango": { + "type": "number", + "description": "docs for mango" + } + }, + "additionalProperties": false, + "required": [ + "apple", + "mango" + ] + } + } + }, + "additionalProperties": false, + "required": [ + "my_map" + ] + }` + + t.Log("[DEBUG] actual: ", string(jsonSchema)) + t.Log("[DEBUG] expected: ", expectedSchema) + + assert.Equal(t, expectedSchema, string(jsonSchema)) +} + +func TestDocIngestionForTopLevelPrimitive(t *testing.T) { + docs := &Docs{ + Description: "docs for root", + Properties: map[string]*Docs{ + "my_val": { + Description: "docs for my val", + }, + }, + } + + type Root struct { + MyVal int `json:"my_val"` + } + + elem := Root{} + + schema, err := New(reflect.TypeOf(elem), docs) + require.NoError(t, err) + + jsonSchema, err := json.MarshalIndent(schema, " ", " ") + assert.NoError(t, err) + + expectedSchema := + `{ + "type": "object", + "description": "docs for root", + "properties": { "my_val": { "type": "number", "description": "docs for my val" @@ -1196,10 +1330,7 @@ func TestDocIngestionInSchema(t *testing.T) { }, "additionalProperties": false, "required": [ - "my_struct", - "my_val", - "my_slice", - "my_map" + "my_val" 
] }` diff --git a/cmd/bundle/schema.go b/cmd/bundle/schema.go index e3c19c46..d59e08d1 100644 --- a/cmd/bundle/schema.go +++ b/cmd/bundle/schema.go @@ -14,7 +14,7 @@ var schemaCmd = &cobra.Command{ Short: "Generate JSON Schema for bundle configuration", RunE: func(cmd *cobra.Command, args []string) error { - docs, err := schema.GetBundleDocs() + docs, err := schema.BundleDocs(openapi) if err != nil { return err } @@ -22,15 +22,26 @@ var schemaCmd = &cobra.Command{ if err != nil { return err } - jsonSchema, err := json.MarshalIndent(schema, "", " ") + result, err := json.MarshalIndent(schema, "", " ") if err != nil { return err } - cmd.OutOrStdout().Write(jsonSchema) + if onlyDocs { + result, err = json.MarshalIndent(docs, "", " ") + if err != nil { + return err + } + } + cmd.OutOrStdout().Write(result) return nil }, } +var openapi string +var onlyDocs bool + func init() { AddCommand(schemaCmd) + schemaCmd.Flags().StringVar(&openapi, "openapi", "", "path to a databricks openapi spec") + schemaCmd.Flags().BoolVar(&onlyDocs, "only-docs", false, "only generate descriptions for the schema") } diff --git a/go.mod b/go.mod index 4e76dd26..8c02d1f4 100644 --- a/go.mod +++ b/go.mod @@ -62,5 +62,5 @@ require ( google.golang.org/grpc v1.53.0 // indirect google.golang.org/protobuf v1.28.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect - gopkg.in/yaml.v3 v3.0.1 + gopkg.in/yaml.v3 v3.0.1 // indirect )