Add an experimental default-sql template (#1051)

## Changes

This adds a `default-sql` template! 

In this latest revision, I've hidden the new template from the list so
we can merge it, iterate over it, and properly release the template at
the right time.
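
For context on how the hiding works: each entry in `nativeTemplates` now carries a `hidden` flag (see the first diff below), and the list shown by `databricks bundle init` would then skip entries with `hidden: true`. A minimal sketch of that filtering, with illustrative names rather than the CLI's exact helpers:

```go
package main

// Sketch only: the struct mirrors the nativeTemplate entries in the
// first diff below; visibleTemplates is an illustrative name, not the
// CLI's actual helper.
type nativeTemplate struct {
    name        string
    description string
    hidden      bool
}

// visibleTemplates returns the templates offered in the interactive list;
// a hidden template can still be used by passing its name explicitly.
func visibleTemplates(all []nativeTemplate) []nativeTemplate {
    visible := make([]nativeTemplate, 0, len(all))
    for _, tmpl := range all {
        if tmpl.hidden {
            continue
        }
        visible = append(visible, tmpl)
    }
    return visible
}
```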

- [x] WorkspaceFS support for .sql files is in prod
- [x] SQL extension is preconfigured based on extension settings (if
possible)
- [ ] Streaming tables support is either ungated or the template
provides instructions about signup
- _Mitigation for now: this template is hidden from the list of
templates._
- [x] Support non-UC workspaces

## Tests
- [x] Unit tests
- [x] Manual testing
- [x] More manual testing
- [x] Reviewer testing

---------

Co-authored-by: Pieter Noordhuis <pieter.noordhuis@databricks.com>
Co-authored-by: PaulCornellDB <paul.cornell@databricks.com>


@@ -35,6 +35,11 @@ var nativeTemplates = []nativeTemplate{
         name:        "default-python",
         description: "The default Python template for Notebooks / Delta Live Tables / Workflows",
     },
+    {
+        name:        "default-sql",
+        description: "The default SQL template for .sql files that run with Databricks SQL",
+        hidden:      true,
+    },
     {
         name:        "dbt-sql",
         description: "The dbt SQL template (https://www.databricks.com/blog/delivering-cost-effective-data-real-time-dbt-and-databricks)",


@@ -109,9 +109,9 @@ func TestBuiltinPythonTemplateValid(t *testing.T) {
     // Test option combinations
     options := []string{"yes", "no"}
     isServicePrincipal := false
-    build := false
     catalog := "hive_metastore"
     cachedCatalog = &catalog
+    build := false
     for _, includeNotebook := range options {
         for _, includeDlt := range options {
             for _, includePython := range options {
@@ -149,6 +149,24 @@ func TestBuiltinPythonTemplateValid(t *testing.T) {
     defer os.RemoveAll(tempDir)
 }
 
+func TestBuiltinSQLTemplateValid(t *testing.T) {
+    for _, personal_schemas := range []string{"yes", "no"} {
+        for _, target := range []string{"dev", "prod"} {
+            for _, isServicePrincipal := range []bool{true, false} {
+                config := map[string]any{
+                    "project_name":     "my_project",
+                    "http_path":        "/sql/1.0/warehouses/123abc",
+                    "default_catalog":  "users",
+                    "shared_schema":    "lennart",
+                    "personal_schemas": personal_schemas,
+                }
+                build := false
+                assertBuiltinTemplateValid(t, "default-sql", config, target, isServicePrincipal, build, t.TempDir())
+            }
+        }
+    }
+}
+
 func TestBuiltinDbtTemplateValid(t *testing.T) {
     for _, personal_schemas := range []string{"yes", "no"} {
         for _, target := range []string{"dev", "prod"} {


@@ -3,8 +3,8 @@
   "properties": {
     "project_name": {
       "type": "string",
-      "pattern": "^[A-Za-z_][A-Za-z0-9_]+$",
-      "pattern_match_failure_message": "Name must consist of letters, numbers, and underscores.",
+      "pattern": "^[A-Za-z_][A-Za-z0-9-_]+$",
+      "pattern_match_failure_message": "Name must consist of letters, numbers, dashes, and underscores.",
       "default": "dbt_project",
       "description": "\nPlease provide a unique name for this project.\nproject_name",
       "order": 1


@@ -19,7 +19,7 @@ targets:
       host: {{workspace_host}}
 
   ## Optionally, there could be a 'staging' target here.
-  ## (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.)
+  ## (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/ci-cd.html.)
   #
   # staging:
   #   workspace:


@@ -0,0 +1,3 @@
# sql template

This folder provides a template for using SQL with Databricks Asset Bundles.


@@ -0,0 +1,53 @@
{
  "welcome_message": "\nWelcome to the (EXPERIMENTAL) default SQL template for Databricks Asset Bundles!",
  "properties": {
    "project_name": {
      "type": "string",
      "default": "sql_project",
      "description": "\nPlease provide a unique name for this project.\nproject_name",
      "order": 1,
      "pattern": "^[A-Za-z_][A-Za-z0-9-_]+$",
      "pattern_match_failure_message": "Name must consist of letters, numbers, dashes, and underscores."
    },
    "http_path": {
      "type": "string",
      "pattern": "^/sql/.\\../warehouses/[a-z0-9]+$",
      "pattern_match_failure_message": "Path must be of the form /sql/1.0/warehouses/<warehouse id>",
      "description": "\nPlease provide the HTTP Path of the SQL warehouse you would like to use during development.\nYou can find this path by clicking on \"Connection details\" for your SQL warehouse.\nhttp_path [example: /sql/1.0/warehouses/abcdef1234567890]",
      "order": 2
    },
    "default_catalog": {
      "type": "string",
      "default": "{{if eq (default_catalog) \"\"}}hive_metastore{{else}}{{default_catalog}}{{end}}",
      "pattern": "^\\w*$",
      "pattern_match_failure_message": "Invalid catalog name.",
      "description": "\nPlease provide an initial catalog{{if eq (default_catalog) \"\"}} or metastore{{end}}.\ndefault_catalog",
      "order": 3
    },
    "personal_schemas": {
      "type": "string",
      "description": "\nWould you like to use a personal schema for each user working on this project? (e.g., 'catalog.{{short_name}}')\npersonal_schemas",
      "enum": [
        "yes, automatically use a schema based on the current user name during development",
        "no, use a single schema for all users during development"
      ],
      "order": 4
    },
    "shared_schema": {
      "skip_prompt_if": {
        "properties": {
          "personal_schemas": {
            "const": "yes, automatically use a schema based on the current user name during development"
          }
        }
      },
      "type": "string",
      "default": "default",
      "pattern": "^\\w+$",
      "pattern_match_failure_message": "Invalid schema name.",
      "description": "\nPlease provide an initial schema during development.\nshared_schema",
      "order": 5
    }
  },
  "success_message": "\n✨ Your new project has been created in the '{{.project_name}}' directory!\n\nPlease refer to the README.md file for \"getting started\" instructions.\nSee also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html."
}
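
As a sanity check on the `http_path` pattern above (once unescaped from JSON), here is a small self-contained Go program exercising it. This is purely illustrative; the CLI applies the pattern through the template schema, not through code like this:

```go
package main

import (
    "fmt"
    "regexp"
)

func main() {
    // The "http_path" pattern from the schema above, unescaped from JSON.
    pattern := regexp.MustCompile(`^/sql/.\../warehouses/[a-z0-9]+$`)
    fmt.Println(pattern.MatchString("/sql/1.0/warehouses/abcdef1234567890")) // true
    fmt.Println(pattern.MatchString("/sql/warehouses/abcdef1234567890"))     // false: no version segment
}
```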


@@ -0,0 +1,7 @@
{{define "latest_lts_dbr_version" -}}
13.3.x-scala2.12
{{- end}}
{{define "latest_lts_db_connect_version_spec" -}}
>=13.3,<13.4
{{- end}}


@@ -0,0 +1,7 @@
{
    "recommendations": [
        "databricks.databricks",
        "redhat.vscode-yaml",
        "databricks.sqltools-databricks-driver",
    ]
}


@@ -0,0 +1,28 @@
{
    "python.analysis.stubPath": ".vscode",
    "databricks.python.envFile": "${workspaceFolder}/.env",
    "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\<codecell\\>|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])",
    "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------",
    "python.testing.pytestArgs": [
        "."
    ],
    "python.testing.unittestEnabled": false,
    "python.testing.pytestEnabled": true,
    "python.analysis.extraPaths": ["src"],
    "files.exclude": {
        "**/*.egg-info": true,
        "**/__pycache__": true,
        ".pytest_cache": true,
    },
    "sqltools.connections": [
        {
            "connectionMethod": "VS Code Extension (beta)",
            "catalog": "{{.default_catalog}}",
            "previewLimit": 50,
            "driver": "Databricks",
            "name": "databricks",
            "path": "{{.http_path}}"
        }
    ],
    "sqltools.autoConnectTo": "",
}


@@ -0,0 +1,41 @@
# {{.project_name}}

The '{{.project_name}}' project was generated by using the default-sql template.

## Getting started

1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/install.html

2. Authenticate to your Databricks workspace (if you have not done so already):
   ```
   $ databricks configure
   ```

3. To deploy a development copy of this project, type:
   ```
   $ databricks bundle deploy --target dev
   ```
   (Note that "dev" is the default target, so the `--target` parameter
   is optional here.)

   This deploys everything that's defined for this project.
   For example, the default template would deploy a job called
   `[dev yourname] {{.project_name}}_sql_job` to your workspace.
   You can find that job by opening your workspace and clicking on **Workflows**.

4. Similarly, to deploy a production copy, type:
   ```
   $ databricks bundle deploy --target prod
   ```

5. To run a job, use the "run" command:
   ```
   $ databricks bundle run
   ```

6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
   https://docs.databricks.com/dev-tools/vscode-ext.html.

7. For documentation on the Databricks Asset Bundles format used
   for this project, and for CI/CD configuration, see
   https://docs.databricks.com/dev-tools/bundles/index.html.


@@ -0,0 +1,71 @@
# This is a Databricks asset bundle definition for {{.project_name}}.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: {{.project_name}}

include:
  - resources/*.yml

# Variable declarations. These variables are assigned in the dev/prod targets below.
variables:
  warehouse_id:
    description: The warehouse to use
  catalog:
    description: The catalog to use
  schema:
    description: The schema to use

{{- $dev_schema := .shared_schema }}
{{- $prod_schema := .shared_schema }}
{{- if (regexp "^yes").MatchString .personal_schemas}}
{{- $dev_schema = "${workspace.current_user.short_name}"}}
{{- $prod_schema = "default"}}
{{- end}}

# Deployment targets.
targets:
  # The 'dev' target, for development purposes. This target is the default.
  dev:
    # We use 'mode: development' to indicate this is a personal development copy.
    # Any job schedules and triggers are paused by default.
    mode: development
    default: true
    workspace:
      host: {{workspace_host}}
    variables:
      warehouse_id: {{index ((regexp "[^/]+$").FindStringSubmatch .http_path) 0}}
      catalog: {{.default_catalog}}
      schema: {{$dev_schema}}

  ## Optionally, there could be a 'staging' target here.
  ## (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/ci-cd.html.)
  #
  # staging:
  #   workspace:
  #     host: {{workspace_host}}

  # The 'prod' target, used for production deployment.
  prod:
    # We use 'mode: production' to indicate this is a production deployment.
    # Doing so enables strict verification of the settings below.
    mode: production
    workspace:
      host: {{workspace_host}}
      # We always use /Users/{{user_name}} for all resources to make sure we only have a single copy.
      {{- /*
        Internal note 2023-12: CLI versions v0.211.0 and before would show an error when using `mode: production`
        with a path that doesn't say "/Shared". For now, we'll include an extra comment in the template
        to explain that customers should update if they see this.
      */}}
      # If this path results in an error, please make sure you have a recent version of the CLI installed.
      root_path: /Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target}
    variables:
      warehouse_id: {{index ((regexp "[^/]+$").FindStringSubmatch .http_path) 0}}
      catalog: {{.default_catalog}}
      schema: {{$prod_schema}}
    {{- if not is_service_principal}}
    run_as:
      # This runs as {{user_name}} in production. We could also use a service principal here
      # using service_principal_name (see https://docs.databricks.com/en/dev-tools/bundles/permissions.html).
      user_name: {{user_name}}
    {{end -}}
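
The `warehouse_id` variables above are derived from the prompted `http_path` with the template expression `{{index ((regexp "[^/]+$").FindStringSubmatch .http_path) 0}}`, which takes the trailing path segment. A standalone Go equivalent, for illustration only:

```go
package main

import (
    "fmt"
    "regexp"
)

func main() {
    // Same effect as the template expression: the last path segment
    // of the HTTP path is the warehouse ID.
    httpPath := "/sql/1.0/warehouses/123abc"
    warehouseID := regexp.MustCompile(`[^/]+$`).FindString(httpPath)
    fmt.Println(warehouseID) // 123abc
}
```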


@@ -0,0 +1,43 @@
# A job running SQL queries on a SQL warehouse
resources:
  jobs:
    {{.project_name}}_sql_job:
      name: {{.project_name}}_sql_job

      schedule:
        # Run every day at 7:17 AM
        quartz_cron_expression: '44 17 7 * * ?'
        timezone_id: Europe/Amsterdam
      {{- if not is_service_principal}}

      email_notifications:
        on_failure:
          - {{user_name}}
      {{else}}
      {{end -}}

      parameters:
        - name: catalog
          default: ${var.catalog}
        - name: schema
          default: ${var.schema}
        - name: bundle_target
          default: ${bundle.target}

      tasks:
        - task_key: orders_raw
          sql_task:
            warehouse_id: ${var.warehouse_id}
            file:
              path: ../src/orders_raw.sql

        - task_key: orders_daily
          depends_on:
            - task_key: orders_raw
          sql_task:
            warehouse_id: ${var.warehouse_id}
            file:
              path: ../src/orders_daily.sql


@@ -0,0 +1,4 @@
# scratch

This folder is reserved for personal, exploratory notebooks and SQL files.
By default these are not committed to Git, as 'scratch' is listed in .gitignore.


@@ -0,0 +1,35 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "dc8c630c-1ea0-42e4-873f-e4dec4d3d416",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "%sql\n",
    "SELECT * FROM json.`/databricks-datasets/nyctaxi/sample/json/`"
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [],
   "language": "python",
   "notebookMetadata": {
    "pythonIndentUnit": 2
   },
   "notebookName": "exploration",
   "widgets": {}
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}


@@ -0,0 +1,14 @@
-- This query is executed using Databricks Workflows (see resources/{{.project_name}}_sql_job.yml)

{{- /* We can't use a materialized view here since they don't support 'create or refresh' yet. */}}
CREATE OR REPLACE VIEW
  IDENTIFIER(CONCAT({{"{{"}}catalog{{"}}"}}, '.', {{"{{"}}schema{{"}}"}}, '.', 'orders_daily'))
AS SELECT
  order_date, count(*) AS number_of_orders
FROM
  IDENTIFIER(CONCAT({{"{{"}}catalog{{"}}"}}, '.', {{"{{"}}schema{{"}}"}}, '.', 'orders_raw'))

-- During development, only process a smaller range of data
WHERE {{"{{"}}bundle_target{{"}}"}} == "prod" OR (order_date >= '2019-08-01' AND order_date < '2019-09-01')
GROUP BY order_date
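
Note the `{{"{{"}}` / `{{"}}"}}` escaping above: it makes the Go template emit literal braces, so the rendered .sql file contains `{{catalog}}`-style markers, which the SQL task can then substitute with the job parameters defined in the job resource. A minimal demonstration of just the rendering step:

```go
package main

import (
    "os"
    "text/template"
)

func main() {
    // {{"{{"}} and {{"}}"}} emit literal braces into the output.
    src := `IDENTIFIER(CONCAT({{"{{"}}catalog{{"}}"}}, '.', {{"{{"}}schema{{"}}"}}, '.', 'orders_daily'))`
    t := template.Must(template.New("sql").Parse(src))
    _ = t.Execute(os.Stdout, nil)
    // Output: IDENTIFIER(CONCAT({{catalog}}, '.', {{schema}}, '.', 'orders_daily'))
}
```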


@@ -0,0 +1,16 @@
-- This query is executed using Databricks Workflows (see resources/{{.project_name}}_sql_job.yml)
--
-- The streaming table below ingests all JSON files in /databricks-datasets/retail-org/sales_orders/
-- See also https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-create-streaming-table.html

CREATE OR REFRESH STREAMING TABLE
  IDENTIFIER(CONCAT({{"{{"}}catalog{{"}}"}}, '.', {{"{{"}}schema{{"}}"}}, '.', 'orders_raw'))
AS SELECT
  customer_name,
  DATE(TIMESTAMP(FROM_UNIXTIME(TRY_CAST(order_datetime AS BIGINT)))) AS order_date,
  order_number
FROM STREAM READ_FILES(
  "/databricks-datasets/retail-org/sales_orders/",
  format => "json",
  header => true
)