diff --git a/cmd/bundle/init.go b/cmd/bundle/init.go index 306e2903..704bad64 100644 --- a/cmd/bundle/init.go +++ b/cmd/bundle/init.go @@ -35,6 +35,11 @@ var nativeTemplates = []nativeTemplate{ name: "default-python", description: "The default Python template for Notebooks / Delta Live Tables / Workflows", }, + { + name: "default-sql", + description: "The default SQL template for .sql files that run with Databricks SQL", + hidden: true, + }, { name: "dbt-sql", description: "The dbt SQL template (https://www.databricks.com/blog/delivering-cost-effective-data-real-time-dbt-and-databricks)", diff --git a/libs/template/renderer_test.go b/libs/template/renderer_test.go index 964159ec..dc287440 100644 --- a/libs/template/renderer_test.go +++ b/libs/template/renderer_test.go @@ -109,9 +109,9 @@ func TestBuiltinPythonTemplateValid(t *testing.T) { // Test option combinations options := []string{"yes", "no"} isServicePrincipal := false - build := false catalog := "hive_metastore" cachedCatalog = &catalog + build := false for _, includeNotebook := range options { for _, includeDlt := range options { for _, includePython := range options { @@ -149,6 +149,24 @@ func TestBuiltinPythonTemplateValid(t *testing.T) { defer os.RemoveAll(tempDir) } +func TestBuiltinSQLTemplateValid(t *testing.T) { + for _, personal_schemas := range []string{"yes", "no"} { + for _, target := range []string{"dev", "prod"} { + for _, isServicePrincipal := range []bool{true, false} { + config := map[string]any{ + "project_name": "my_project", + "http_path": "/sql/1.0/warehouses/123abc", + "default_catalog": "users", + "shared_schema": "lennart", + "personal_schemas": personal_schemas, + } + build := false + assertBuiltinTemplateValid(t, "default-sql", config, target, isServicePrincipal, build, t.TempDir()) + } + } + } +} + func TestBuiltinDbtTemplateValid(t *testing.T) { for _, personal_schemas := range []string{"yes", "no"} { for _, target := range []string{"dev", "prod"} { diff --git 
a/libs/template/templates/dbt-sql/databricks_template_schema.json b/libs/template/templates/dbt-sql/databricks_template_schema.json index 736b1232..7b39f618 100644 --- a/libs/template/templates/dbt-sql/databricks_template_schema.json +++ b/libs/template/templates/dbt-sql/databricks_template_schema.json @@ -3,8 +3,8 @@ "properties": { "project_name": { "type": "string", - "pattern": "^[A-Za-z_][A-Za-z0-9_]+$", - "pattern_match_failure_message": "Name must consist of letters, numbers, and underscores.", + "pattern": "^[A-Za-z_][A-Za-z0-9_-]+$", + "pattern_match_failure_message": "Name must consist of letters, numbers, dashes, and underscores.", "default": "dbt_project", "description": "\nPlease provide a unique name for this project.\nproject_name", "order": 1 diff --git a/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl index ea432f8d..e3572326 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl @@ -19,7 +19,7 @@ targets: host: {{workspace_host}} ## Optionally, there could be a 'staging' target here. - ## (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.) + ## (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/ci-cd.html.) # # staging: # workspace: diff --git a/libs/template/templates/default-sql/README.md b/libs/template/templates/default-sql/README.md new file mode 100644 index 00000000..6b7140f0 --- /dev/null +++ b/libs/template/templates/default-sql/README.md @@ -0,0 +1,3 @@ +# sql template + +This folder provides a template for using SQL with Databricks Asset Bundles. 
diff --git a/libs/template/templates/default-sql/databricks_template_schema.json b/libs/template/templates/default-sql/databricks_template_schema.json new file mode 100644 index 00000000..b7a42e19 --- /dev/null +++ b/libs/template/templates/default-sql/databricks_template_schema.json @@ -0,0 +1,53 @@ +{ + "welcome_message": "\nWelcome to the (EXPERIMENTAL) default SQL template for Databricks Asset Bundles!", + "properties": { + "project_name": { + "type": "string", + "default": "sql_project", + "description": "\nPlease provide a unique name for this project.\nproject_name", + "order": 1, + "pattern": "^[A-Za-z_][A-Za-z0-9_-]+$", + "pattern_match_failure_message": "Name must consist of letters, numbers, dashes, and underscores." + }, + "http_path": { + "type": "string", + "pattern": "^/sql/.\\../warehouses/[a-z0-9]+$", + "pattern_match_failure_message": "Path must be of the form /sql/1.0/warehouses/<warehouse id>", + "description": "\nPlease provide the HTTP Path of the SQL warehouse you would like to use during development.\nYou can find this path by clicking on \"Connection details\" for your SQL warehouse.\nhttp_path [example: /sql/1.0/warehouses/abcdef1234567890]", + "order": 2 + }, + "default_catalog": { + "type": "string", + "default": "{{if eq (default_catalog) \"\"}}hive_metastore{{else}}{{default_catalog}}{{end}}", + "pattern": "^\\w*$", + "pattern_match_failure_message": "Invalid catalog name.", + "description": "\nPlease provide an initial catalog{{if eq (default_catalog) \"\"}} or metastore{{end}}.\ndefault_catalog", + "order": 3 + }, + "personal_schemas": { + "type": "string", + "description": "\nWould you like to use a personal schema for each user working on this project? 
(e.g., 'catalog.{{short_name}}')\npersonal_schemas", + "enum": [ + "yes, automatically use a schema based on the current user name during development", + "no, use a single schema for all users during development" + ], + "order": 4 + }, + "shared_schema": { + "skip_prompt_if": { + "properties": { + "personal_schemas": { + "const": "yes, automatically use a schema based on the current user name during development" + } + } + }, + "type": "string", + "default": "default", + "pattern": "^\\w+$", + "pattern_match_failure_message": "Invalid schema name.", + "description": "\nPlease provide an initial schema during development.\ndefault_schema", + "order": 5 + } + }, + "success_message": "\n✨ Your new project has been created in the '{{.project_name}}' directory!\n\nPlease refer to the README.md file for \"getting started\" instructions.\nSee also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html." +} diff --git a/libs/template/templates/default-sql/library/versions.tmpl b/libs/template/templates/default-sql/library/versions.tmpl new file mode 100644 index 00000000..f9a879d2 --- /dev/null +++ b/libs/template/templates/default-sql/library/versions.tmpl @@ -0,0 +1,7 @@ +{{define "latest_lts_dbr_version" -}} + 13.3.x-scala2.12 +{{- end}} + +{{define "latest_lts_db_connect_version_spec" -}} + >=13.3,<13.4 +{{- end}} diff --git a/libs/template/templates/default-sql/template/{{.project_name}}/.vscode/extensions.json b/libs/template/templates/default-sql/template/{{.project_name}}/.vscode/extensions.json new file mode 100644 index 00000000..8e102346 --- /dev/null +++ b/libs/template/templates/default-sql/template/{{.project_name}}/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "redhat.vscode-yaml", + "databricks.sqltools-databricks-driver", + ] +} diff --git a/libs/template/templates/default-sql/template/{{.project_name}}/.vscode/settings.json.tmpl 
b/libs/template/templates/default-sql/template/{{.project_name}}/.vscode/settings.json.tmpl new file mode 100644 index 00000000..c63af24b --- /dev/null +++ b/libs/template/templates/default-sql/template/{{.project_name}}/.vscode/settings.json.tmpl @@ -0,0 +1,28 @@ +{ + "python.analysis.stubPath": ".vscode", + "databricks.python.envFile": "${workspaceFolder}/.env", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "python.analysis.extraPaths": ["src"], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + }, + "sqltools.connections": [ + { + "connectionMethod": "VS Code Extension (beta)", + "catalog": "{{.default_catalog}}", + "previewLimit": 50, + "driver": "Databricks", + "name": "databricks", + "path": "{{.http_path}}" + } + ], + "sqltools.autoConnectTo": "", +} diff --git a/libs/template/templates/default-sql/template/{{.project_name}}/README.md.tmpl b/libs/template/templates/default-sql/template/{{.project_name}}/README.md.tmpl new file mode 100644 index 00000000..e5c44320 --- /dev/null +++ b/libs/template/templates/default-sql/template/{{.project_name}}/README.md.tmpl @@ -0,0 +1,41 @@ +# {{.project_name}} + +The '{{.project_name}}' project was generated by using the default-sql template. + +## Getting started + +1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/install.html + +2. Authenticate to your Databricks workspace (if you have not done so already): + ``` + $ databricks configure + ``` + +3. 
To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + + This deploys everything that's defined for this project. + For example, the default template would deploy a job called + `[dev yourname] {{.project_name}}_job` to your workspace. + You can find that job by opening your workspace and clicking on **Workflows**. + +4. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + +5. To run a job, use the "run" command: + ``` + $ databricks bundle run + ``` + +6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from + https://docs.databricks.com/dev-tools/vscode-ext.html. + +7. For documentation on the Databricks Asset Bundles format used + for this project, and for CI/CD configuration, see + https://docs.databricks.com/dev-tools/bundles/index.html. diff --git a/libs/template/templates/default-sql/template/{{.project_name}}/databricks.yml.tmpl b/libs/template/templates/default-sql/template/{{.project_name}}/databricks.yml.tmpl new file mode 100644 index 00000000..a47fb7c1 --- /dev/null +++ b/libs/template/templates/default-sql/template/{{.project_name}}/databricks.yml.tmpl @@ -0,0 +1,71 @@ +# This is a Databricks asset bundle definition for {{.project_name}}. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: {{.project_name}} + +include: + - resources/*.yml + +# Variable declarations. These variables are assigned in the dev/prod targets below. 
+variables: + warehouse_id: + description: The warehouse to use + catalog: + description: The catalog to use + schema: + description: The schema to use + +{{- $dev_schema := .shared_schema }} +{{- $prod_schema := .shared_schema }} +{{- if (regexp "^yes").MatchString .personal_schemas}} +{{- $dev_schema = "${workspace.current_user.short_name}"}} +{{- $prod_schema = "default"}} +{{- end}} + +# Deployment targets. +targets: + # The 'dev' target, for development purposes. This target is the default. + dev: + # We use 'mode: development' to indicate this is a personal development copy. + # Any job schedules and triggers are paused by default + mode: development + default: true + workspace: + host: {{workspace_host}} + variables: + warehouse_id: {{index ((regexp "[^/]+$").FindStringSubmatch .http_path) 0}} + catalog: {{.default_catalog}} + schema: {{$dev_schema}} + + ## Optionally, there could be a 'staging' target here. + ## (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/ci-cd.html.) + # + # staging: + # workspace: + # host: {{workspace_host}} + + # The 'prod' target, used for production deployment. + prod: + # We use 'mode: production' to indicate this is a production deployment. + # Doing so enables strict verification of the settings below. + mode: production + workspace: + host: {{workspace_host}} + # We always use /Users/{{user_name}} for all resources to make sure we only have a single copy. + {{- /* + Internal note 2023-12: CLI versions v0.211.0 and before would show an error when using `mode: production` + with a path that doesn't say "/Shared". For now, we'll include an extra comment in the template + to explain that customers should update if they see this. + */}} + # If this path results in an error, please make sure you have a recent version of the CLI installed. 
+ root_path: /Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target} + variables: + warehouse_id: {{index ((regexp "[^/]+$").FindStringSubmatch .http_path) 0}} + catalog: {{.default_catalog}} + schema: {{$prod_schema}} + {{- if not is_service_principal}} + run_as: + # This runs as {{user_name}} in production. We could also use a service principal here + # using service_principal_name (see https://docs.databricks.com/en/dev-tools/bundles/permissions.html). + user_name: {{user_name}} + {{end -}} diff --git a/libs/template/templates/default-sql/template/{{.project_name}}/resources/{{.project_name}}_sql_job.yml.tmpl b/libs/template/templates/default-sql/template/{{.project_name}}/resources/{{.project_name}}_sql_job.yml.tmpl new file mode 100644 index 00000000..31d2d21a --- /dev/null +++ b/libs/template/templates/default-sql/template/{{.project_name}}/resources/{{.project_name}}_sql_job.yml.tmpl @@ -0,0 +1,43 @@ +# A job running SQL queries on a SQL warehouse +resources: + jobs: + {{.project_name}}_sql_job: + name: {{.project_name}}_sql_job + + schedule: + # Run every day at 7:17 AM + quartz_cron_expression: '44 17 7 * * ?' 
+ timezone_id: Europe/Amsterdam + + {{- if not is_service_principal}} + + email_notifications: + on_failure: + - {{user_name}} + + {{else}} + + {{end -}} + + parameters: + - name: catalog + default: ${var.catalog} + - name: schema + default: ${var.schema} + - name: bundle_target + default: ${bundle.target} + + tasks: + - task_key: orders_raw + sql_task: + warehouse_id: ${var.warehouse_id} + file: + path: ../src/orders_raw.sql + + - task_key: orders_daily + depends_on: + - task_key: orders_raw + sql_task: + warehouse_id: ${var.warehouse_id} + file: + path: ../src/orders_daily.sql diff --git a/libs/template/templates/default-sql/template/{{.project_name}}/scratch/README.md b/libs/template/templates/default-sql/template/{{.project_name}}/scratch/README.md new file mode 100644 index 00000000..5350d09c --- /dev/null +++ b/libs/template/templates/default-sql/template/{{.project_name}}/scratch/README.md @@ -0,0 +1,4 @@ +# scratch + +This folder is reserved for personal, exploratory notebooks and SQL files. +By default these are not committed to Git, as 'scratch' is listed in .gitignore. 
diff --git a/libs/template/templates/default-sql/template/{{.project_name}}/scratch/exploration.ipynb.tmpl b/libs/template/templates/default-sql/template/{{.project_name}}/scratch/exploration.ipynb.tmpl new file mode 100644 index 00000000..becee5fb --- /dev/null +++ b/libs/template/templates/default-sql/template/{{.project_name}}/scratch/exploration.ipynb.tmpl @@ -0,0 +1,35 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "dc8c630c-1ea0-42e4-873f-e4dec4d3d416", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "%sql\n", + "SELECT * FROM json.`/databricks-datasets/nyctaxi/sample/json/`" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "exploration", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/libs/template/templates/default-sql/template/{{.project_name}}/src/orders_daily.sql.tmpl b/libs/template/templates/default-sql/template/{{.project_name}}/src/orders_daily.sql.tmpl new file mode 100644 index 00000000..76ecadd3 --- /dev/null +++ b/libs/template/templates/default-sql/template/{{.project_name}}/src/orders_daily.sql.tmpl @@ -0,0 +1,14 @@ +-- This query is executed using Databricks Workflows (see resources/{{.project_name}}_sql_job.yml) +{{- /* We can't use a materialized view here since they don't support 'create or refresh' yet.*/}} + +CREATE OR REPLACE VIEW + IDENTIFIER(CONCAT({{"{{"}}catalog{{"}}"}}, '.', {{"{{"}}schema{{"}}"}}, '.', 'orders_daily')) +AS SELECT + order_date, count(*) AS number_of_orders +FROM + IDENTIFIER(CONCAT({{"{{"}}catalog{{"}}"}}, '.', {{"{{"}}schema{{"}}"}}, '.', 'orders_raw')) + +-- During development, only process a smaller range of data +WHERE {{"{{"}}bundle_target{{"}}"}} == "prod" OR (order_date >= 
'2019-08-01' AND order_date < '2019-09-01') + +GROUP BY order_date diff --git a/libs/template/templates/default-sql/template/{{.project_name}}/src/orders_raw.sql.tmpl b/libs/template/templates/default-sql/template/{{.project_name}}/src/orders_raw.sql.tmpl new file mode 100644 index 00000000..96769062 --- /dev/null +++ b/libs/template/templates/default-sql/template/{{.project_name}}/src/orders_raw.sql.tmpl @@ -0,0 +1,16 @@ +-- This query is executed using Databricks Workflows (see resources/{{.project_name}}_sql_job.yml) +-- +-- The streaming table below ingests all JSON files in /databricks-datasets/retail-org/sales_orders/ +-- See also https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-create-streaming-table.html + +CREATE OR REFRESH STREAMING TABLE + IDENTIFIER(CONCAT({{"{{"}}catalog{{"}}"}}, '.', {{"{{"}}schema{{"}}"}}, '.', 'orders_raw')) +AS SELECT + customer_name, + DATE(TIMESTAMP(FROM_UNIXTIME(TRY_CAST(order_datetime AS BIGINT)))) AS order_date, + order_number +FROM STREAM READ_FILES( + "/databricks-datasets/retail-org/sales_orders/", + format => "json", + header => true +)