Make the default `databricks bundle init` template more self-explanatory (#796)

This makes the default-python template more self-explanatory and adds a
few other tweaks for a better out-of-the-box experience.
Lennart Kats (databricks) 2023-09-26 11:12:34 +02:00 committed by GitHub
parent 757d5efe8d
commit 0c1516c4ba
8 changed files with 72 additions and 11 deletions

View File

@@ -41,6 +41,7 @@ func assertBuiltinTemplateValid(t *testing.T, settings map[string]any, target st
templatePath, err := prepareBuiltinTemplates("default-python", tempDir)
require.NoError(t, err)
libraryPath := filepath.Join(templatePath, "library")
w := &databricks.WorkspaceClient{
Config: &workspaceConfig.Config{Host: "https://myhost.com"},
@@ -52,7 +53,7 @@ func assertBuiltinTemplateValid(t *testing.T, settings map[string]any, target st
ctx = root.SetWorkspaceClient(ctx, w)
helpers := loadHelpers(ctx)
renderer, err := newRenderer(ctx, settings, helpers, templatePath, "./testdata/template-in-path/library", tempDir)
renderer, err := newRenderer(ctx, settings, helpers, templatePath, libraryPath, tempDir)
require.NoError(t, err)
// Evaluate template

View File

@@ -0,0 +1,7 @@
{{define "latest_lts_dbr_version" -}}
13.3.x-scala2.12
{{- end}}
{{define "latest_lts_db_connect_version_spec" -}}
>=13.3,<13.4
{{- end}}
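
The `latest_lts_db_connect_version_spec` helper renders the PEP 440 specifier `>=13.3,<13.4`, pinning databricks-connect to the same 13.3 minor line as the LTS runtime version defined above it. As a rough illustration (not part of the template, and assuming the third-party `packaging` library is installed), the sketch below shows what that specifier accepts:

```
# Illustrative only; assumes the third-party `packaging` library is installed.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=13.3,<13.4")

# Any databricks-connect release in the 13.3.x line satisfies the spec;
# releases matching a different runtime minor version do not.
print(Version("13.3.2") in spec)  # True
print(Version("13.4.0") in spec)  # False
```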

View File

@@ -8,7 +8,10 @@
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.analysis.extraPaths": ["src"],
"files.exclude": {
"**/*.egg-info": true
"**/*.egg-info": true,
"**/__pycache__": true,
".pytest_cache": true,
},
}

View File

@@ -30,7 +30,7 @@ The '{{.project_name}}' project was generated by using the default-python template
5. To run a job or pipeline, use the "run" command:
```
$ databricks bundle run {{.project_name}}_job
$ databricks bundle run
```
6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from

View File

@@ -0,0 +1,22 @@
## requirements-dev.txt: dependencies for local development.
##
## For defining dependencies used by jobs in Databricks Workflows, see
## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
## pytest is the default package used for testing
pytest
## databricks-connect can be used to run parts of this project locally.
## See https://docs.databricks.com/dev-tools/databricks-connect.html.
##
## databricks-connect is automatically installed if you're using the Databricks
## extension for Visual Studio Code
## (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html).
##
## To manually install databricks-connect, either follow the instructions
## at https://docs.databricks.com/dev-tools/databricks-connect.html
## to install the package system-wide, or uncomment the line below to install a
## version of databricks-connect that corresponds to the Databricks Runtime
## version used for this project.
#
# databricks-connect{{template "latest_lts_db_connect_version_spec"}}
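
The comments above describe running parts of the project locally over Databricks Connect. A minimal sketch of what that can look like, assuming authentication is already configured (for example via the CLI or the VS Code extension); the table name is only an example:

```
# Sketch: run project code locally through Databricks Connect.
# Assumes a configured Databricks profile; the table name is illustrative.
from databricks.connect import DatabricksSession

spark = DatabricksSession.builder.getOrCreate()
taxis = spark.read.table("samples.nyctaxi.trips")
print(taxis.count())
```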

View File

@@ -49,6 +49,9 @@ resources:
package_name: {{.project_name}}
entry_point: main
libraries:
# By default we just include the .whl file generated for the {{.project_name}} package.
# See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
# for more information on how to add other libraries.
- whl: ../dist/*.whl
{{else}}
@@ -56,8 +59,7 @@ resources:
job_clusters:
- job_cluster_key: job_cluster
new_cluster:
{{- /* we should always use an LTS version in our templates */}}
spark_version: 13.3.x-scala2.12
spark_version: {{template "latest_lts_dbr_version"}}
node_type_id: {{smallest_node_type}}
autoscale:
min_workers: 1

View File

@@ -1,8 +1,9 @@
"""
Setup script for {{.project_name}}.
setup.py configuration script describing how to build and package this project.
This script packages and distributes the associated wheel file(s).
Source code is in ./src/. Run 'python setup.py sdist bdist_wheel' to build.
This file is primarily used by the setuptools library and typically should not
be executed directly. See README.md for how to deploy, test, and run
the {{.project_name}} project.
"""
from setuptools import setup, find_packages
@@ -16,9 +17,18 @@ setup(
version={{.project_name}}.__version__,
url="https://databricks.com",
author="{{user_name}}",
description="my test wheel",
description="wheel file based on {{.project_name}}/src",
packages=find_packages(where='./src'),
package_dir={'': 'src'},
entry_points={"entry_points": "main={{.project_name}}.main:main"},
install_requires=["setuptools"],
entry_points={
    "packages": [
        "main={{.project_name}}.main:main"
    ]
},
install_requires=[
    # Dependencies in case the output wheel file is used as a library dependency.
    # For defining dependencies, when this package is used in Databricks, see:
    # https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
    "setuptools"
],
)
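
The new `entry_points` and `install_requires` blocks become metadata in the built wheel; the `python_wheel_task` in the job definition resolves `entry_point: main` against that metadata. As a hedged sketch (assuming Python 3.10+ and that the built wheel is installed in the current environment), the same entry point can be read back and loaded programmatically:

```
# Illustrative only: look up the "main" entry point declared above from the
# installed wheel's metadata and call it. Assumes Python 3.10+.
from importlib.metadata import entry_points

for ep in entry_points(group="packages"):
    if ep.name == "main":
        main_fn = ep.load()  # imports <project_name>.main and returns its main()
        main_fn()
```

Note that installers only generate console executables for the conventional `console_scripts` group; a custom group such as `packages` remains discoverable through metadata, as shown.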

View File

@@ -1,5 +1,21 @@
from databricks.connect import DatabricksSession
from pyspark.sql import SparkSession
from {{.project_name}} import main
# Create a new Databricks Connect session. If this fails,
# check that you have configured Databricks Connect correctly.
# See https://docs.databricks.com/dev-tools/databricks-connect.html.
{{/*
The below works around a problematic error message from Databricks Connect.
The standard SparkSession is supported in all configurations (workspace, IDE,
all runtime versions, CLI). But on the CLI it currently gives a confusing
error message if SPARK_REMOTE is not set. We can't directly use
DatabricksSession.builder in main.py, so we're re-assigning it here so
everything works out of the box, even for CLI users who don't set SPARK_REMOTE.
*/}}
SparkSession.builder = DatabricksSession.builder
SparkSession.builder.getOrCreate()

def test_main():
    taxis = main.get_taxis()
    assert taxis.count() > 5
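
The Go-template comment above explains why `SparkSession.builder` is reassigned: code under test that uses the standard `SparkSession` API then transparently gets a Databricks Connect session when run locally. The template's `main.py` is not part of this diff, but a hypothetical sketch of the kind of code the workaround targets looks like this:

```
# Hypothetical sketch of the code under test (main.py is not shown in this diff);
# the table name is illustrative. With SparkSession.builder reassigned to
# DatabricksSession.builder, this standard call returns a Connect session locally.
from pyspark.sql import SparkSession

def get_taxis():
    spark = SparkSession.builder.getOrCreate()
    return spark.read.table("samples.nyctaxi.trips")
```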