Make the default `databricks bundle init` template more self-explanatory (#796)

This makes the default-python template more self-explanatory and adds a
few other tweaks for a better out-of-the-box experience.
Author: Lennart Kats (databricks), 2023-09-26 11:12:34 +02:00 (committed by GitHub)
Parent: 757d5efe8d
Commit: 0c1516c4ba
8 changed files with 72 additions and 11 deletions

@@ -41,6 +41,7 @@ func assertBuiltinTemplateValid(t *testing.T, settings map[string]any, target st
 	templatePath, err := prepareBuiltinTemplates("default-python", tempDir)
 	require.NoError(t, err)
+	libraryPath := filepath.Join(templatePath, "library")

 	w := &databricks.WorkspaceClient{
 		Config: &workspaceConfig.Config{Host: "https://myhost.com"},
@@ -52,7 +53,7 @@ func assertBuiltinTemplateValid(t *testing.T, settings map[string]any, target st
 	ctx = root.SetWorkspaceClient(ctx, w)
 	helpers := loadHelpers(ctx)
-	renderer, err := newRenderer(ctx, settings, helpers, templatePath, "./testdata/template-in-path/library", tempDir)
+	renderer, err := newRenderer(ctx, settings, helpers, templatePath, libraryPath, tempDir)
 	require.NoError(t, err)

 	// Evaluate template

@@ -0,0 +1,7 @@
+{{define "latest_lts_dbr_version" -}}
+    13.3.x-scala2.12
+{{- end}}
+
+{{define "latest_lts_db_connect_version_spec" -}}
+    >=13.3,<13.4
+{{- end}}
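These named definitions live in the template's library directory, which is what the test change above now wires in as libraryPath. Other template files reference them with {{template "latest_lts_dbr_version"}} and {{template "latest_lts_db_connect_version_spec"}}; see the requirements-dev.txt and job definition hunks below.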

@@ -8,7 +8,10 @@
     ],
     "python.testing.unittestEnabled": false,
     "python.testing.pytestEnabled": true,
+    "python.analysis.extraPaths": ["src"],
     "files.exclude": {
-        "**/*.egg-info": true
+        "**/*.egg-info": true,
+        "**/__pycache__": true,
+        ".pytest_cache": true,
     },
 }

@@ -30,7 +30,7 @@ The '{{.project_name}}' project was generated by using the default-python templa
 5. To run a job or pipeline, use the "run" comand:
    ```
-   $ databricks bundle run {{.project_name}}_job
+   $ databricks bundle run
    ```

 6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from

@@ -0,0 +1,22 @@
+## requirements-dev.txt: dependencies for local development.
+##
+## For defining dependencies used by jobs in Databricks Workflows, see
+## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
+
+## pytest is the default package used for testing
+pytest
+
+## databricks-connect can be used to run parts of this project locally.
+## See https://docs.databricks.com/dev-tools/databricks-connect.html.
+##
+## databricks-connect is automatically installed if you're using Databricks
+## extension for Visual Studio Code
+## (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html).
+##
+## To manually install databricks-connect, either follow the instructions
+## at https://docs.databricks.com/dev-tools/databricks-connect.html
+## to install the package system-wide. Or uncomment the line below to install a
+## version of db-connect that corresponds to the Databricks Runtime version used
+## for this project.
+#
+# databricks-connect{{template "latest_lts_db_connect_version_spec"}}
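When a user uncomments that last line, the {{template "latest_lts_db_connect_version_spec"}} reference expands to the version spec defined in the shared library file above, so the generated requirements-dev.txt ends with a commented-out pin equivalent to databricks-connect>=13.3,<13.4.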

@@ -49,6 +49,9 @@ resources:
             package_name: {{.project_name}}
             entry_point: main
           libraries:
+            # By default we just include the .whl file generated for the {{.project_name}} package.
+            # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
+            # for more information on how to add other libraries.
             - whl: ../dist/*.whl
 {{else}}
@@ -56,8 +59,7 @@ resources:
       job_clusters:
         - job_cluster_key: job_cluster
           new_cluster:
-            {{- /* we should always use an LTS version in our templates */}}
-            spark_version: 13.3.x-scala2.12
+            spark_version: {{template "latest_lts_dbr_version"}}
             node_type_id: {{smallest_node_type}}
             autoscale:
               min_workers: 1
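The rendered bundle is unchanged by this: {{template "latest_lts_dbr_version"}} expands to 13.3.x-scala2.12, the same LTS runtime that was previously hard-coded, but the version now lives in a single shared definition.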

@@ -1,8 +1,9 @@
 """
-Setup script for {{.project_name}}.
-
-This script packages and distributes the associated wheel file(s).
-Source code is in ./src/. Run 'python setup.py sdist bdist_wheel' to build.
+setup.py configuration script describing how to build and package this project.
+
+This file is primarily used by the setuptools library and typically should not
+be executed directly. See README.md for how to deploy, test, and run
+the {{.project_name}} project.
 """
 from setuptools import setup, find_packages
@@ -16,9 +17,18 @@ setup(
     version={{.project_name}}.__version__,
     url="https://databricks.com",
     author="{{user_name}}",
-    description="my test wheel",
+    description="wheel file based on {{.project_name}}/src",
     packages=find_packages(where='./src'),
     package_dir={'': 'src'},
-    entry_points={"entry_points": "main={{.project_name}}.main:main"},
-    install_requires=["setuptools"],
+    entry_points={
+        "packages": [
+            "main={{.project_name}}.main:main"
+        ]
+    },
+    install_requires=[
+        # Dependencies in case the output wheel file is used as a library dependency.
+        # For defining dependencies, when this package is used in Databricks, see:
+        # https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
+        "setuptools"
+    ],
 )
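The main={{.project_name}}.main:main entry point declared here is what the job's python_wheel_task resolves through its package_name and entry_point: main settings (see the job definition hunk above) once the wheel built from ../dist/*.whl is installed on the cluster.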

@@ -1,5 +1,21 @@
+from databricks.connect import DatabricksSession
+from pyspark.sql import SparkSession
 from {{.project_name}} import main

+# Create a new Databricks Connect session. If this fails,
+# check that you have configured Databricks Connect correctly.
+# See https://docs.databricks.com/dev-tools/databricks-connect.html.
+{{/*
+The below works around a problematic error message from Databricks Connect.
+The standard SparkSession is supported in all configurations (workspace, IDE,
+all runtime versions, CLI). But on the CLI it currently gives a confusing
+error message if SPARK_REMOTE is not set. We can't directly use
+DatabricksSession.builder in main.py, so we're re-assigning it here so
+everything works out of the box, even for CLI users who don't set SPARK_REMOTE.
+*/}}
+SparkSession.builder = DatabricksSession.builder
+SparkSession.builder.getOrCreate()
+
 def test_main():
     taxis = main.get_taxis()
     assert taxis.count() > 5
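
For context, here is a minimal, hypothetical sketch of the kind of project code this test exercises; the template's actual src/{{.project_name}}/main.py is not part of this diff, so the function body and table name below are assumptions. The point of the builder reassignment above is that project code can keep using the plain SparkSession builder: on a Databricks cluster it gets the regular Spark session, while locally the same call now goes through Databricks Connect.

    # Hypothetical example only: the real main.py is not shown in this commit.
    from pyspark.sql import SparkSession


    def get_taxis():
        # Plain SparkSession usage; the test above re-points SparkSession.builder
        # at DatabricksSession.builder, so this also works locally via Databricks Connect.
        spark = SparkSession.builder.getOrCreate()
        return spark.read.table("samples.nyctaxi.trips")  # illustrative table name


    def main():
        get_taxis().show(5)


    if __name__ == "__main__":
        main()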