Add an experimental default-sql template (#1051)

## Changes

This adds a `default-sql` template! 

In this latest revision, I've hidden the new template from the list so
we can merge it, iterate over it, and properly release the template at
the right time.
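
For context on how the hiding works: each entry in `nativeTemplates` now carries a `hidden` flag (see the first diff below), and the list shown by `databricks bundle init` would then skip entries with `hidden: true`. A minimal sketch of that filtering, with illustrative names rather than the CLI's exact helpers:

```go
package main

// Sketch only: the struct mirrors the nativeTemplate entries in the
// first diff below; visibleTemplates is an illustrative name, not the
// CLI's actual helper.
type nativeTemplate struct {
    name        string
    description string
    hidden      bool
}

// visibleTemplates returns the templates offered in the interactive list;
// a hidden template can still be used by passing its name explicitly.
func visibleTemplates(all []nativeTemplate) []nativeTemplate {
    visible := make([]nativeTemplate, 0, len(all))
    for _, tmpl := range all {
        if tmpl.hidden {
            continue
        }
        visible = append(visible, tmpl)
    }
    return visible
}
```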

- [x] WorkspaceFS support for .sql files is in prod
- [x] SQL extension is preconfigured based on extension settings (if
possible)
- [ ] Streaming tables support is either ungated or the template
provides instructions about signup
- _Mitigation for now: this template is hidden from the list of
templates._
- [x] Support non-UC workspaces

## Tests
- [x] Unit tests
- [x] Manual testing
- [x] More manual testing
- [x] Reviewer testing

---------

Co-authored-by: Pieter Noordhuis <pieter.noordhuis@databricks.com>
Co-authored-by: PaulCornellDB <paul.cornell@databricks.com>


@@ -35,6 +35,11 @@ var nativeTemplates = []nativeTemplate{
         name:        "default-python",
         description: "The default Python template for Notebooks / Delta Live Tables / Workflows",
     },
+    {
+        name:        "default-sql",
+        description: "The default SQL template for .sql files that run with Databricks SQL",
+        hidden:      true,
+    },
     {
         name:        "dbt-sql",
         description: "The dbt SQL template (https://www.databricks.com/blog/delivering-cost-effective-data-real-time-dbt-and-databricks)",


@@ -109,9 +109,9 @@ func TestBuiltinPythonTemplateValid(t *testing.T) {
     // Test option combinations
     options := []string{"yes", "no"}
     isServicePrincipal := false
-    build := false
     catalog := "hive_metastore"
     cachedCatalog = &catalog
+    build := false
     for _, includeNotebook := range options {
         for _, includeDlt := range options {
             for _, includePython := range options {
@@ -149,6 +149,24 @@ func TestBuiltinPythonTemplateValid(t *testing.T) {
     defer os.RemoveAll(tempDir)
 }
 
+func TestBuiltinSQLTemplateValid(t *testing.T) {
+    for _, personal_schemas := range []string{"yes", "no"} {
+        for _, target := range []string{"dev", "prod"} {
+            for _, isServicePrincipal := range []bool{true, false} {
+                config := map[string]any{
+                    "project_name":     "my_project",
+                    "http_path":        "/sql/1.0/warehouses/123abc",
+                    "default_catalog":  "users",
+                    "shared_schema":    "lennart",
+                    "personal_schemas": personal_schemas,
+                }
+                build := false
+                assertBuiltinTemplateValid(t, "default-sql", config, target, isServicePrincipal, build, t.TempDir())
+            }
+        }
+    }
+}
+
 func TestBuiltinDbtTemplateValid(t *testing.T) {
     for _, personal_schemas := range []string{"yes", "no"} {
         for _, target := range []string{"dev", "prod"} {


@@ -3,8 +3,8 @@
   "properties": {
     "project_name": {
       "type": "string",
-      "pattern": "^[A-Za-z_][A-Za-z0-9_]+$",
-      "pattern_match_failure_message": "Name must consist of letters, numbers, and underscores.",
+      "pattern": "^[A-Za-z_][A-Za-z0-9-_]+$",
+      "pattern_match_failure_message": "Name must consist of letters, numbers, dashes, and underscores.",
       "default": "dbt_project",
       "description": "\nPlease provide a unique name for this project.\nproject_name",
       "order": 1


@@ -19,7 +19,7 @@ targets:
       host: {{workspace_host}}
 
   ## Optionally, there could be a 'staging' target here.
-  ## (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.)
+  ## (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/ci-cd.html.)
   #
   # staging:
   #   workspace:


@@ -0,0 +1,3 @@
# sql template

This folder provides a template for using SQL with Databricks Asset Bundles.


@@ -0,0 +1,53 @@
{
  "welcome_message": "\nWelcome to the (EXPERIMENTAL) default SQL template for Databricks Asset Bundles!",
  "properties": {
    "project_name": {
      "type": "string",
      "default": "sql_project",
      "description": "\nPlease provide a unique name for this project.\nproject_name",
      "order": 1,
      "pattern": "^[A-Za-z_][A-Za-z0-9-_]+$",
      "pattern_match_failure_message": "Name must consist of letters, numbers, dashes, and underscores."
    },
    "http_path": {
      "type": "string",
      "pattern": "^/sql/.\\../warehouses/[a-z0-9]+$",
      "pattern_match_failure_message": "Path must be of the form /sql/1.0/warehouses/<warehouse id>",
      "description": "\nPlease provide the HTTP Path of the SQL warehouse you would like to use during development.\nYou can find this path by clicking on \"Connection details\" for your SQL warehouse.\nhttp_path [example: /sql/1.0/warehouses/abcdef1234567890]",
      "order": 2
    },
    "default_catalog": {
      "type": "string",
      "default": "{{if eq (default_catalog) \"\"}}hive_metastore{{else}}{{default_catalog}}{{end}}",
      "pattern": "^\\w*$",
      "pattern_match_failure_message": "Invalid catalog name.",
      "description": "\nPlease provide an initial catalog{{if eq (default_catalog) \"\"}} or metastore{{end}}.\ndefault_catalog",
      "order": 3
    },
    "personal_schemas": {
      "type": "string",
      "description": "\nWould you like to use a personal schema for each user working on this project? (e.g., 'catalog.{{short_name}}')\npersonal_schemas",
      "enum": [
        "yes, automatically use a schema based on the current user name during development",
        "no, use a single schema for all users during development"
      ],
      "order": 4
    },
    "shared_schema": {
      "skip_prompt_if": {
        "properties": {
          "personal_schemas": {
            "const": "yes, automatically use a schema based on the current user name during development"
          }
        }
      },
      "type": "string",
      "default": "default",
      "pattern": "^\\w+$",
      "pattern_match_failure_message": "Invalid schema name.",
      "description": "\nPlease provide an initial schema during development.\nshared_schema",
      "order": 5
    }
  },
  "success_message": "\n✨ Your new project has been created in the '{{.project_name}}' directory!\n\nPlease refer to the README.md file for \"getting started\" instructions.\nSee also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html."
}
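
As a sanity check on the `http_path` pattern above (once unescaped from JSON), here is a small self-contained Go program exercising it. This is purely illustrative; the CLI applies the pattern through the template schema, not through code like this:

```go
package main

import (
    "fmt"
    "regexp"
)

func main() {
    // The "http_path" pattern from the schema above, unescaped from JSON.
    pattern := regexp.MustCompile(`^/sql/.\../warehouses/[a-z0-9]+$`)
    fmt.Println(pattern.MatchString("/sql/1.0/warehouses/abcdef1234567890")) // true
    fmt.Println(pattern.MatchString("/sql/warehouses/abcdef1234567890"))     // false: no version segment
}
```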


@@ -0,0 +1,7 @@
{{define "latest_lts_dbr_version" -}}
13.3.x-scala2.12
{{- end}}
{{define "latest_lts_db_connect_version_spec" -}}
>=13.3,<13.4
{{- end}}


@@ -0,0 +1,7 @@
{
    "recommendations": [
        "databricks.databricks",
        "redhat.vscode-yaml",
        "databricks.sqltools-databricks-driver",
    ]
}


@@ -0,0 +1,28 @@
{
    "python.analysis.stubPath": ".vscode",
    "databricks.python.envFile": "${workspaceFolder}/.env",
    "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\<codecell\\>|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])",
    "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------",
    "python.testing.pytestArgs": [
        "."
    ],
    "python.testing.unittestEnabled": false,
    "python.testing.pytestEnabled": true,
    "python.analysis.extraPaths": ["src"],
    "files.exclude": {
        "**/*.egg-info": true,
        "**/__pycache__": true,
        ".pytest_cache": true,
    },
    "sqltools.connections": [
        {
            "connectionMethod": "VS Code Extension (beta)",
            "catalog": "{{.default_catalog}}",
            "previewLimit": 50,
            "driver": "Databricks",
            "name": "databricks",
            "path": "{{.http_path}}"
        }
    ],
    "sqltools.autoConnectTo": "",
}


@@ -0,0 +1,41 @@
# {{.project_name}}

The '{{.project_name}}' project was generated by using the default-sql template.

## Getting started

1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/install.html

2. Authenticate to your Databricks workspace (if you have not done so already):
   ```
   $ databricks configure
   ```

3. To deploy a development copy of this project, type:
   ```
   $ databricks bundle deploy --target dev
   ```
   (Note that "dev" is the default target, so the `--target` parameter
   is optional here.)

   This deploys everything that's defined for this project.
   For example, the default template would deploy a job called
   `[dev yourname] {{.project_name}}_sql_job` to your workspace.
   You can find that job by opening your workspace and clicking on **Workflows**.

4. Similarly, to deploy a production copy, type:
   ```
   $ databricks bundle deploy --target prod
   ```

5. To run a job, use the "run" command:
   ```
   $ databricks bundle run
   ```

6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
   https://docs.databricks.com/dev-tools/vscode-ext.html.

7. For documentation on the Databricks Asset Bundles format used
   for this project, and for CI/CD configuration, see
   https://docs.databricks.com/dev-tools/bundles/index.html.


@@ -0,0 +1,71 @@
# This is a Databricks asset bundle definition for {{.project_name}}.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: {{.project_name}}

include:
  - resources/*.yml

# Variable declarations. These variables are assigned in the dev/prod targets below.
variables:
  warehouse_id:
    description: The warehouse to use
  catalog:
    description: The catalog to use
  schema:
    description: The schema to use

{{- $dev_schema := .shared_schema }}
{{- $prod_schema := .shared_schema }}
{{- if (regexp "^yes").MatchString .personal_schemas}}
{{- $dev_schema = "${workspace.current_user.short_name}"}}
{{- $prod_schema = "default"}}
{{- end}}

# Deployment targets.
targets:
  # The 'dev' target, for development purposes. This target is the default.
  dev:
    # We use 'mode: development' to indicate this is a personal development copy.
    # Any job schedules and triggers are paused by default.
    mode: development
    default: true
    workspace:
      host: {{workspace_host}}
    variables:
      warehouse_id: {{index ((regexp "[^/]+$").FindStringSubmatch .http_path) 0}}
      catalog: {{.default_catalog}}
      schema: {{$dev_schema}}

  ## Optionally, there could be a 'staging' target here.
  ## (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/ci-cd.html.)
  #
  # staging:
  #   workspace:
  #     host: {{workspace_host}}

  # The 'prod' target, used for production deployment.
  prod:
    # We use 'mode: production' to indicate this is a production deployment.
    # Doing so enables strict verification of the settings below.
    mode: production
    workspace:
      host: {{workspace_host}}
      # We always use /Users/{{user_name}} for all resources to make sure we only have a single copy.
      {{- /*
        Internal note 2023-12: CLI versions v0.211.0 and before would show an error when using `mode: production`
        with a path that doesn't say "/Shared". For now, we'll include an extra comment in the template
        to explain that customers should update if they see this.
      */}}
      # If this path results in an error, please make sure you have a recent version of the CLI installed.
      root_path: /Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target}
    variables:
      warehouse_id: {{index ((regexp "[^/]+$").FindStringSubmatch .http_path) 0}}
      catalog: {{.default_catalog}}
      schema: {{$prod_schema}}
    {{- if not is_service_principal}}
    run_as:
      # This runs as {{user_name}} in production. We could also use a service principal here
      # using service_principal_name (see https://docs.databricks.com/en/dev-tools/bundles/permissions.html).
      user_name: {{user_name}}
    {{end -}}
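
The `warehouse_id` variables above are derived from the prompted `http_path` with the template expression `{{index ((regexp "[^/]+$").FindStringSubmatch .http_path) 0}}`, which takes the trailing path segment. A standalone Go equivalent, for illustration only:

```go
package main

import (
    "fmt"
    "regexp"
)

func main() {
    // Same effect as the template expression: the last path segment
    // of the HTTP path is the warehouse ID.
    httpPath := "/sql/1.0/warehouses/123abc"
    warehouseID := regexp.MustCompile(`[^/]+$`).FindString(httpPath)
    fmt.Println(warehouseID) // 123abc
}
```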


@@ -0,0 +1,43 @@
# A job running SQL queries on a SQL warehouse
resources:
  jobs:
    {{.project_name}}_sql_job:
      name: {{.project_name}}_sql_job

      schedule:
        # Run every day at 7:17 AM
        quartz_cron_expression: '44 17 7 * * ?'
        timezone_id: Europe/Amsterdam
      {{- if not is_service_principal}}

      email_notifications:
        on_failure:
          - {{user_name}}
      {{else}}
      {{end -}}

      parameters:
        - name: catalog
          default: ${var.catalog}
        - name: schema
          default: ${var.schema}
        - name: bundle_target
          default: ${bundle.target}

      tasks:
        - task_key: orders_raw
          sql_task:
            warehouse_id: ${var.warehouse_id}
            file:
              path: ../src/orders_raw.sql

        - task_key: orders_daily
          depends_on:
            - task_key: orders_raw
          sql_task:
            warehouse_id: ${var.warehouse_id}
            file:
              path: ../src/orders_daily.sql


@@ -0,0 +1,4 @@
# scratch

This folder is reserved for personal, exploratory notebooks and SQL files.
By default these are not committed to Git, as 'scratch' is listed in .gitignore.


@@ -0,0 +1,35 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "dc8c630c-1ea0-42e4-873f-e4dec4d3d416",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "%sql\n",
    "SELECT * FROM json.`/databricks-datasets/nyctaxi/sample/json/`"
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [],
   "language": "python",
   "notebookMetadata": {
    "pythonIndentUnit": 2
   },
   "notebookName": "exploration",
   "widgets": {}
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}


@@ -0,0 +1,14 @@
-- This query is executed using Databricks Workflows (see resources/{{.project_name}}_sql_job.yml)

{{- /* We can't use a materialized view here since they don't support 'create or refresh' yet. */}}
CREATE OR REPLACE VIEW
  IDENTIFIER(CONCAT({{"{{"}}catalog{{"}}"}}, '.', {{"{{"}}schema{{"}}"}}, '.', 'orders_daily'))
AS SELECT
  order_date, count(*) AS number_of_orders
FROM
  IDENTIFIER(CONCAT({{"{{"}}catalog{{"}}"}}, '.', {{"{{"}}schema{{"}}"}}, '.', 'orders_raw'))

-- During development, only process a smaller range of data
WHERE {{"{{"}}bundle_target{{"}}"}} == "prod" OR (order_date >= '2019-08-01' AND order_date < '2019-09-01')
GROUP BY order_date
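
Note the `{{"{{"}}` / `{{"}}"}}` escaping above: it makes the Go template emit literal braces, so the rendered .sql file contains `{{catalog}}`-style markers, which the SQL task can then substitute with the job parameters defined in the job resource. A minimal demonstration of just the rendering step:

```go
package main

import (
    "os"
    "text/template"
)

func main() {
    // {{"{{"}} and {{"}}"}} emit literal braces into the output.
    src := `IDENTIFIER(CONCAT({{"{{"}}catalog{{"}}"}}, '.', {{"{{"}}schema{{"}}"}}, '.', 'orders_daily'))`
    t := template.Must(template.New("sql").Parse(src))
    _ = t.Execute(os.Stdout, nil)
    // Output: IDENTIFIER(CONCAT({{catalog}}, '.', {{schema}}, '.', 'orders_daily'))
}
```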


@@ -0,0 +1,16 @@
-- This query is executed using Databricks Workflows (see resources/{{.project_name}}_sql_job.yml)
--
-- The streaming table below ingests all JSON files in /databricks-datasets/retail-org/sales_orders/
-- See also https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-create-streaming-table.html

CREATE OR REFRESH STREAMING TABLE
  IDENTIFIER(CONCAT({{"{{"}}catalog{{"}}"}}, '.', {{"{{"}}schema{{"}}"}}, '.', 'orders_raw'))
AS SELECT
  customer_name,
  DATE(TIMESTAMP(FROM_UNIXTIME(TRY_CAST(order_datetime AS BIGINT)))) AS order_date,
  order_number
FROM STREAM READ_FILES(
  "/databricks-datasets/retail-org/sales_orders/",
  format => "json",
  header => true
)