Make the default `databricks bundle init` template more self-explanatory (#796)

This makes the default-python template more self-explanatory and adds a
few other tweaks for a better out-of-the-box experience.
Lennart Kats (databricks) 2023-09-26 11:12:34 +02:00 committed by GitHub
parent 757d5efe8d
commit 0c1516c4ba
8 changed files with 72 additions and 11 deletions

View File

@@ -41,6 +41,7 @@ func assertBuiltinTemplateValid(t *testing.T, settings map[string]any, target st
templatePath, err := prepareBuiltinTemplates("default-python", tempDir)
require.NoError(t, err)
libraryPath := filepath.Join(templatePath, "library")
w := &databricks.WorkspaceClient{
Config: &workspaceConfig.Config{Host: "https://myhost.com"},
@@ -52,7 +53,7 @@ func assertBuiltinTemplateValid(t *testing.T, settings map[string]any, target st
ctx = root.SetWorkspaceClient(ctx, w)
helpers := loadHelpers(ctx)
renderer, err := newRenderer(ctx, settings, helpers, templatePath, "./testdata/template-in-path/library", tempDir)
renderer, err := newRenderer(ctx, settings, helpers, templatePath, libraryPath, tempDir)
require.NoError(t, err)
// Evaluate template

View File

@@ -0,0 +1,7 @@
{{define "latest_lts_dbr_version" -}}
13.3.x-scala2.12
{{- end}}
{{define "latest_lts_db_connect_version_spec" -}}
>=13.3,<13.4
{{- end}}
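
The `latest_lts_db_connect_version_spec` helper renders the PEP 440 specifier `>=13.3,<13.4`, pinning databricks-connect to the same 13.3 minor line as the LTS runtime version defined above it. As a rough illustration (not part of the template, and assuming the third-party `packaging` library is installed), the sketch below shows what that specifier accepts:

```
# Illustrative only; assumes the third-party `packaging` library is installed.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=13.3,<13.4")

# Any databricks-connect release in the 13.3.x line satisfies the spec;
# releases matching a different runtime minor version do not.
print(Version("13.3.2") in spec)  # True
print(Version("13.4.0") in spec)  # False
```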

View File

@@ -8,7 +8,10 @@
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.analysis.extraPaths": ["src"],
"files.exclude": {
"**/*.egg-info": true
"**/*.egg-info": true,
"**/__pycache__": true,
".pytest_cache": true,
},
}

View File

@@ -30,7 +30,7 @@ The '{{.project_name}}' project was generated by using the default-python template
5. To run a job or pipeline, use the "run" command:
```
$ databricks bundle run {{.project_name}}_job
$ databricks bundle run
```
6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from

View File

@@ -0,0 +1,22 @@
## requirements-dev.txt: dependencies for local development.
##
## For defining dependencies used by jobs in Databricks Workflows, see
## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
## pytest is the default package used for testing
pytest
## databricks-connect can be used to run parts of this project locally.
## See https://docs.databricks.com/dev-tools/databricks-connect.html.
##
## databricks-connect is automatically installed if you're using the Databricks
## extension for Visual Studio Code
## (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html).
##
## To manually install databricks-connect, either follow the instructions
## at https://docs.databricks.com/dev-tools/databricks-connect.html
## to install the package system-wide, or uncomment the line below to install a
## version of databricks-connect that corresponds to the Databricks Runtime
## version used for this project.
#
# databricks-connect{{template "latest_lts_db_connect_version_spec"}}
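
The comments above describe running parts of the project locally over Databricks Connect. A minimal sketch of what that can look like, assuming authentication is already configured (for example via the CLI or the VS Code extension); the table name is only an example:

```
# Sketch: run project code locally through Databricks Connect.
# Assumes a configured Databricks profile; the table name is illustrative.
from databricks.connect import DatabricksSession

spark = DatabricksSession.builder.getOrCreate()
taxis = spark.read.table("samples.nyctaxi.trips")
print(taxis.count())
```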

View File

@@ -49,6 +49,9 @@ resources:
package_name: {{.project_name}}
entry_point: main
libraries:
# By default we just include the .whl file generated for the {{.project_name}} package.
# See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
# for more information on how to add other libraries.
- whl: ../dist/*.whl
{{else}}
@@ -56,8 +59,7 @@ resources:
job_clusters:
- job_cluster_key: job_cluster
new_cluster:
{{- /* we should always use an LTS version in our templates */}}
spark_version: 13.3.x-scala2.12
spark_version: {{template "latest_lts_dbr_version"}}
node_type_id: {{smallest_node_type}}
autoscale:
min_workers: 1

View File

@@ -1,8 +1,9 @@
"""
Setup script for {{.project_name}}.
setup.py configuration script describing how to build and package this project.
This script packages and distributes the associated wheel file(s).
Source code is in ./src/. Run 'python setup.py sdist bdist_wheel' to build.
This file is primarily used by the setuptools library and typically should not
be executed directly. See README.md for how to deploy, test, and run
the {{.project_name}} project.
"""
from setuptools import setup, find_packages
@@ -16,9 +17,18 @@ setup(
version={{.project_name}}.__version__,
url="https://databricks.com",
author="{{user_name}}",
description="my test wheel",
description="wheel file based on {{.project_name}}/src",
packages=find_packages(where='./src'),
package_dir={'': 'src'},
entry_points={"entry_points": "main={{.project_name}}.main:main"},
install_requires=["setuptools"],
entry_points={
    "packages": [
        "main={{.project_name}}.main:main"
    ]
},
install_requires=[
    # Dependencies in case the output wheel file is used as a library dependency.
    # For defining dependencies, when this package is used in Databricks, see:
    # https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
    "setuptools"
],
)
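
The new `entry_points` and `install_requires` blocks become metadata in the built wheel; the `python_wheel_task` in the job definition resolves `entry_point: main` against that metadata. As a hedged sketch (assuming Python 3.10+ and that the built wheel is installed in the current environment), the same entry point can be read back and loaded programmatically:

```
# Illustrative only: look up the "main" entry point declared above from the
# installed wheel's metadata and call it. Assumes Python 3.10+.
from importlib.metadata import entry_points

for ep in entry_points(group="packages"):
    if ep.name == "main":
        main_fn = ep.load()  # imports <project_name>.main and returns its main()
        main_fn()
```

Note that installers only generate console executables for the conventional `console_scripts` group; a custom group such as `packages` remains discoverable through metadata, as shown.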

View File

@@ -1,5 +1,21 @@
from databricks.connect import DatabricksSession
from pyspark.sql import SparkSession
from {{.project_name}} import main
# Create a new Databricks Connect session. If this fails,
# check that you have configured Databricks Connect correctly.
# See https://docs.databricks.com/dev-tools/databricks-connect.html.
{{/*
The below works around a problematic error message from Databricks Connect.
The standard SparkSession is supported in all configurations (workspace, IDE,
all runtime versions, CLI). But on the CLI it currently gives a confusing
error message if SPARK_REMOTE is not set. We can't directly use
DatabricksSession.builder in main.py, so we're re-assigning it here so
everything works out of the box, even for CLI users who don't set SPARK_REMOTE.
*/}}
SparkSession.builder = DatabricksSession.builder
SparkSession.builder.getOrCreate()

def test_main():
    taxis = main.get_taxis()
    assert taxis.count() > 5
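
The Go-template comment above explains why `SparkSession.builder` is reassigned: code under test that uses the standard `SparkSession` API then transparently gets a Databricks Connect session when run locally. The template's `main.py` is not part of this diff, but a hypothetical sketch of the kind of code the workaround targets looks like this:

```
# Hypothetical sketch of the code under test (main.py is not shown in this diff);
# the table name is illustrative. With SparkSession.builder reassigned to
# DatabricksSession.builder, this standard call returns a Connect session locally.
from pyspark.sql import SparkSession

def get_taxis():
    spark = SparkSession.builder.getOrCreate()
    return spark.read.table("samples.nyctaxi.trips")
```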