databricks bundle init template v1 (#686)

## Changes This adds a built-in "default-python" template to the CLI. This is based on the new default-template support of https://github.com/databricks/cli/pull/685. The goal here is to offer an experience where customers can simply type `databricks bundle init` to get a default template: ``` $ databricks bundle init Template to use [default-python]: default-python Unique name for this project [my_project]: my_project ✨ Successfully initialized template ``` The present template: - [x] Works well with VS Code - [x] Works well with the workspace - [x] Works well with DB Connect - [x] Uses minimal stubs rather than boiler-plate-heavy examples I'll have a followup with tests + DLT support. --------- Co-authored-by: Andrew Nester <andrew.nester@databricks.com> Co-authored-by: PaulCornellDB <paul.cornell@databricks.com> Co-authored-by: Pieter Noordhuis <pieter.noordhuis@databricks.com>
2023-09-05 04:58:34 -07:00 · 2023-09-05 04:58:34 -07:00 · 8c2cc07f7b
parent 947d5b1e5c
commit 8c2cc07f7b
19 changed files with 361 additions and 5 deletions
--- a/cmd/bundle/init.go
+++ b/cmd/bundle/init.go
@ -59,7 +59,7 @@ func newInitCommand() *cobra.Command {
 		} else {
 			return errors.New("please specify a template")

-			/* TODO: propose to use default-python (once template is ready)
+			/* TODO: propose to use default-python (once #708 is merged)
 			var err error
 			if !cmdio.IsOutTTY(ctx) || !cmdio.IsInTTY(ctx) {
 				return errors.New("please specify a template")
--- a/libs/template/templates/default-python/databricks_template_schema.json
+++ b/libs/template/templates/default-python/databricks_template_schema.json
@ -3,7 +3,7 @@
        "project_name": {
            "type": "string",
            "default": "my_project",
-            "description": "Name of the directory"
+            "description": "Unique name for this project"
        }
    }
 }
--- a/libs/template/templates/default-python/template/{{.project_name}}/.gitignore
+++ b/libs/template/templates/default-python/template/{{.project_name}}/.gitignore
@ -0,0 +1,9 @@
+
+.databricks/
+build/
+dist/
+__pycache__/
+*.egg-info
+.venv/
+scratch/**
+!scratch/README.md
--- a/libs/template/templates/default-python/template/{{.project_name}}/.vscode/builtins.pyi
+++ b/libs/template/templates/default-python/template/{{.project_name}}/.vscode/builtins.pyi
@ -0,0 +1,3 @@
+# Typings for Pylance in Visual Studio Code
+# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md
+from databricks.sdk.runtime import *
--- a/libs/template/templates/default-python/template/{{.project_name}}/.vscode/extensions.json
+++ b/libs/template/templates/default-python/template/{{.project_name}}/.vscode/extensions.json
@ -0,0 +1,7 @@
+{
+    "recommendations": [
+        "databricks.databricks",
+        "ms-python.vscode-pylance",
+        "redhat.vscode-yaml"
+    ]
+}
--- a/libs/template/templates/default-python/template/{{.project_name}}/.vscode/settings.json
+++ b/libs/template/templates/default-python/template/{{.project_name}}/.vscode/settings.json
@ -0,0 +1,14 @@
+{
+    "python.analysis.stubPath": ".vscode",
+    "databricks.python.envFile": "${workspaceFolder}/.env",
+    "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\<codecell\\>|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])",
+    "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------",
+    "python.testing.pytestArgs": [
+        "."
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true,
+    "files.exclude": {
+        "**/*.egg-info": true
+    }
+}
--- a/libs/template/templates/default-python/template/{{.project_name}}/README.md
+++ b/libs/template/templates/default-python/template/{{.project_name}}/README.md
@ -1,3 +0,0 @@
-# {{.project_name}}
-
-The '{{.project_name}}' bundle was generated using the default-python template.
--- a/libs/template/templates/default-python/template/{{.project_name}}/README.md.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/README.md.tmpl
@ -0,0 +1,37 @@
+# {{.project_name}}
+
+The '{{.project_name}}' project was generated by using the default-python template.
+
+## Getting started
+
+1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html
+
+2. Authenticate to your Databricks workspace:
+    ```
+    $ databricks configure
+    ```
+
+3. To deploy a development copy of this project, type:
+    ```
+    $ databricks bundle deploy --target dev
+    ```
+    (Note that "dev" is the default target, so the `--target` parameter
+    is optional here.)
+
+    This deploys everything that's defined for this project.
+    For example, the default template would deploy a job called
+    `[dev yourname] {{.project_name}}_job` to your workspace.
+    You can find that job by opening your workspace and clicking on **Workflows**.
+
+4. Similarly, to deploy a production copy, type:
+   ```
+   $ databricks bundle deploy --target prod
+   ```
+
+5. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
+   https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for
+   **Databricks Connect** for instructions on running the included Python code from a different IDE.
+
+6. For documentation on the Databricks asset bundles format used
+   for this project, and for CI/CD configuration, see
+   https://docs.databricks.com/dev-tools/bundles/index.html.
--- a/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl
@ -0,0 +1,52 @@
+# This is a Databricks asset bundle definition for {{.project_name}}.
+# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
+bundle:
+  name: {{.project_name}}
+
+include:
+  - resources/*.yml
+
+targets:
+  # The 'dev' target, used for development purposes.
+  # Whenever a developer deploys using 'dev', they get their own copy.
+  dev:
+    # We use 'mode: development' to make sure everything deployed to this target gets a prefix
+    # like '[dev my_user_name]'. Setting this mode also disables any schedules and
+    # automatic triggers for jobs and enables the 'development' mode for Delta Live Tables pipelines.
+    mode: development
+    default: true
+    workspace:
+      host: {{workspace_host}}
+
+  # Optionally, there could be a 'staging' target here.
+  # (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.)
+  #
+  # staging:
+  #  workspace:
+  #    host: {{workspace_host}}
+
+  # The 'prod' target, used for production deployment.
+  prod:
+    # For production deployments, we only have a single copy, so we override the
+    # workspace.root_path default of
+    # /Users/${workspace.current_user.userName}/.bundle/${bundle.target}/${bundle.name}
+    # to a path that is not specific to the current user.
+    {{- /*
+    Explaining 'mode: production' isn't as pressing as explaining 'mode: development'.
+    As we already talked about the other mode above, users can just
+    look at documentation or ask the assistant about 'mode: production'.
+    #
+    # By making use of 'mode: production' we enable strict checks
+    # to make sure we have correctly configured this target.
+    */}}
+    mode: production
+    workspace:
+      host: {{workspace_host}}
+      root_path: /Shared/.bundle/prod/${bundle.name}
+    {{- if not is_service_principal}}
+    run_as:
+      # This runs as {{user_name}} in production. Alternatively,
+      # a service principal could be used here using service_principal_name
+      # (see Databricks documentation).
+      user_name: {{user_name}}
+    {{end -}}
--- a/libs/template/templates/default-python/template/{{.project_name}}/fixtures/.gitkeep.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/fixtures/.gitkeep.tmpl
@ -0,0 +1,27 @@
+# Fixtures
+{{- /*
+We don't want to have too many README.md files, since they
+stand out so much. But we do need to have a file here to make
+sure the folder is added to Git.
+*/}}
+
+This folder is reserved for fixtures, such as CSV files.
+
+Below is an example of how to load fixtures as a data frame:
+
+```
+import pandas as pd
+import os
+
+def get_absolute_path(*relative_parts):
+    if 'dbutils' in globals():
+        base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore
+        path = os.path.normpath(os.path.join(base_dir, *relative_parts))
+        return path if path.startswith("/Workspace") else os.path.join("/Workspace", path)
+    else:
+        return os.path.join(*relative_parts)
+
+csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
+df = pd.read_csv(csv_file)
+display(df)
+```
--- a/libs/template/templates/default-python/template/{{.project_name}}/pytest.ini
+++ b/libs/template/templates/default-python/template/{{.project_name}}/pytest.ini
@ -0,0 +1,3 @@
+[pytest]
+testpaths = tests
+pythonpath = src
--- a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}_job.yml.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}_job.yml.tmpl
@ -0,0 +1,42 @@
+# The main job for {{.project_name}}
+resources:
+
+  jobs:
+    {{.project_name}}_job:
+      name: {{.project_name}}_job
+
+      schedule:
+        quartz_cron_expression: '44 37 8 * * ?'
+        timezone_id: Europe/Amsterdam
+
+      {{- if not is_service_principal}}
+      email_notifications:
+        on_failure:
+          - {{user_name}}
+      {{end -}}
+
+      tasks:
+        - task_key: notebook_task
+          job_cluster_key: job_cluster
+          notebook_task:
+            notebook_path: ../src/notebook.ipynb
+
+        - task_key: python_wheel_task
+          depends_on:
+            - task_key: notebook_task
+          job_cluster_key: job_cluster
+          python_wheel_task:
+            package_name: {{.project_name}}
+            entry_point: main
+          libraries:
+            - whl: ../dist/*.whl
+
+      job_clusters:
+        - job_cluster_key: job_cluster
+          new_cluster:
+            {{- /* we should always use an LTS version in our templates */}}
+            spark_version: 13.3.x-scala2.12
+            node_type_id: {{smallest_node_type}}
+            autoscale:
+                min_workers: 1
+                max_workers: 4
--- a/libs/template/templates/default-python/template/{{.project_name}}/scratch/README.md
+++ b/libs/template/templates/default-python/template/{{.project_name}}/scratch/README.md
@ -0,0 +1,4 @@
+# scratch
+
+This folder is reserved for personal, exploratory notebooks.
+By default these are not committed to Git, as 'scratch' is listed in .gitignore.
--- a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl
@ -0,0 +1,50 @
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.append('../src')\n",
+    "from {{.project_name}} import main\n",
+    "\n",
+    "main.get_taxis().show(10)"
+   ]
+  }
+ ],
+ "metadata": {
+  "application/vnd.databricks.v1+notebook": {
+   "dashboards": [],
+   "language": "python",
+   "notebookMetadata": {
+    "pythonIndentUnit": 2
+   },
+   "notebookName": "ipynb-notebook",
+   "widgets": {}
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
--- a/libs/template/templates/default-python/template/{{.project_name}}/setup.py.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/setup.py.tmpl
@ -0,0 +1,24 @@
+"""
+Setup script for {{.project_name}}.
+
+This script packages and distributes the associated wheel file(s).
+Source code is in ./src/. Run 'python setup.py sdist bdist_wheel' to build.
+"""
+from setuptools import setup, find_packages
+
+import sys
+sys.path.append('./src')
+
+import {{.project_name}}
+
+setup(
+    name="{{.project_name}}",
+    version={{.project_name}}.__version__,
+    url="https://databricks.com",
+    author="{{.user_name}}",
+    description="my test wheel",
+    packages=find_packages(where='./src'),
+    package_dir={'': 'src'},
+    entry_points={"entry_points": "main={{.project_name}}.main:main"},
+    install_requires=["setuptools"],
+)
--- a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl
@ -0,0 +1,65 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "ee353e42-ff58-4955-9608-12865bd0950e",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "# Default notebook\n",
+    "\n",
+    "This default notebook is executed using Databricks Workflows as defined in resources/{{.project_name}}_job.yml."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from {{.project_name}} import main\n",
+    "\n",
+    "main.get_taxis().show(10)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "application/vnd.databricks.v1+notebook": {
+   "dashboards": [],
+   "language": "python",
+   "notebookMetadata": {
+    "pythonIndentUnit": 2
+   },
+   "notebookName": "notebook",
+   "widgets": {}
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
--- a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/__init__.py
+++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/__init__.py
@ -0,0 +1 @@
+__version__ = "0.0.1"
--- a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl
@ -0,0 +1,16 @@
+{{- /*
+We use pyspark.sql rather than DatabricksSession.builder.getOrCreate()
+for compatibility with older runtimes. With a new runtime, it's
+equivalent to DatabricksSession.builder.getOrCreate().
+*/ -}}
+from pyspark.sql import SparkSession
+
+def get_taxis():  # Read the samples.nyctaxi.trips table using the existing (or a newly created) SparkSession.
+  spark = SparkSession.builder.getOrCreate()
+  return spark.read.table("samples.nyctaxi.trips")
+
+def main():  # Show the first 5 rows of the table returned by get_taxis().
+  get_taxis().show(5)
+
+if __name__ == '__main__':
+  main()
--- a/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl
@ -0,0 +1,5 @@
+from {{.project_name}} import main
+
+def test_main():
+    taxis = main.get_taxis()
+    assert taxis.count() == 5