databricks bundle init template v1 (#686)

## Changes

This adds a built-in "default-python" template to the CLI. This is based
on the new default-template support of
https://github.com/databricks/cli/pull/685.

The goal here is to offer an experience where customers can simply type
`databricks bundle init` to get a default template:

```
$ databricks bundle init
Template to use [default-python]: default-python
Unique name for this project [my_project]: my_project
 Successfully initialized template
```

The present template:
- [x] Works well with VS Code
- [x] Works well with the workspace
- [x] Works well with DB Connect
- [x] Uses minimal stubs rather than boiler-plate-heavy examples

I'll have a followup with tests + DLT support.

---------

Co-authored-by: Andrew Nester <andrew.nester@databricks.com>
Co-authored-by: PaulCornellDB <paul.cornell@databricks.com>
Co-authored-by: Pieter Noordhuis <pieter.noordhuis@databricks.com>
This commit is contained in:
Lennart Kats (databricks) 2023-09-05 04:58:34 -07:00 committed by GitHub
parent 947d5b1e5c
commit 8c2cc07f7b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 361 additions and 5 deletions

View File

@ -59,7 +59,7 @@ func newInitCommand() *cobra.Command {
} else { } else {
return errors.New("please specify a template") return errors.New("please specify a template")
/* TODO: propose to use default-python (once template is ready) /* TODO: propose to use default-python (once #708 is merged)
var err error var err error
if !cmdio.IsOutTTY(ctx) || !cmdio.IsInTTY(ctx) { if !cmdio.IsOutTTY(ctx) || !cmdio.IsInTTY(ctx) {
return errors.New("please specify a template") return errors.New("please specify a template")

View File

@ -3,7 +3,7 @@
"project_name": { "project_name": {
"type": "string", "type": "string",
"default": "my_project", "default": "my_project",
"description": "Name of the directory" "description": "Unique name for this project"
} }
} }
} }

View File

@ -0,0 +1,9 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md

View File

@ -0,0 +1,3 @@
# Typings for Pylance in Visual Studio Code
# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md
from databricks.sdk.runtime import *

View File

@ -0,0 +1,7 @@
{
"recommendations": [
"databricks.databricks",
"ms-python.vscode-pylance",
"redhat.vscode-yaml"
]
}

View File

@ -0,0 +1,14 @@
{
"python.analysis.stubPath": ".vscode",
"databricks.python.envFile": "${workspaceFolder}/.env",
"jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\<codecell\\>|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])",
"jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------",
"python.testing.pytestArgs": [
"."
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"files.exclude": {
"**/*.egg-info": true
},
}

View File

@ -1,3 +0,0 @@
# {{.project_name}}
The '{{.project_name}}' bundle was generated using the default-python template.

View File

@ -0,0 +1,37 @@
# {{.project_name}}
The '{{.project_name}}' project was generated by using the default-python template.
## Getting started
1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html
2. Authenticate to your Databricks workspace:
```
$ databricks configure
```
3. To deploy a development copy of this project, type:
```
$ databricks bundle deploy --target dev
```
(Note that "dev" is the default target, so the `--target` parameter
is optional here.)
This deploys everything that's defined for this project.
For example, the default template would deploy a job called
`[dev yourname] {{.project_name}}_job` to your workspace.
You can find that job by opening your workspace and clicking on **Workflows**.
4. Similarly, to deploy a production copy, type:
```
$ databricks bundle deploy --target prod
```
5. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for
**Databricks Connect** for instructions on running the included Python code from a different IDE.
6. For documentation on the Databricks asset bundles format used
for this project, and for CI/CD configuration, see
https://docs.databricks.com/dev-tools/bundles/index.html.

View File

@ -0,0 +1,52 @@
# This is a Databricks asset bundle definition for {{.project_name}}.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
name: {{.project_name}}
include:
- resources/*.yml
targets:
# The 'dev' target, used for development purposes.
# Whenever a developer deploys using 'dev', they get their own copy.
dev:
# We use 'mode: development' to make sure everything deployed to this target gets a prefix
# like '[dev my_user_name]'. Setting this mode also disables any schedules and
# automatic triggers for jobs and enables the 'development' mode for Delta Live Tables pipelines.
mode: development
default: true
workspace:
host: {{workspace_host}}
# Optionally, there could be a 'staging' target here.
# (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.)
#
# staging:
# workspace:
# host: {{workspace_host}}
# The 'prod' target, used for production deployment.
prod:
# For production deployments, we only have a single copy, so we override the
# workspace.root_path default of
# /Users/${workspace.current_user.userName}/.bundle/${bundle.target}/${bundle.name}
# to a path that is not specific to the current user.
{{- /*
Explaining 'mode: production' isn't as pressing as explaining 'mode: development'.
As we already talked about the other mode above, users can just
look at documentation or ask the assistant about 'mode: production'.
#
# By making use of 'mode: production' we enable strict checks
# to make sure we have correctly configured this target.
*/}}
mode: production
workspace:
host: {{workspace_host}}
root_path: /Shared/.bundle/prod/${bundle.name}
{{- if not is_service_principal}}
run_as:
# This runs as {{user_name}} in production. Alternatively,
# a service principal could be used here using service_principal_name
# (see Databricks documentation).
user_name: {{user_name}}
{{end -}}

View File

@ -0,0 +1,27 @@
# Fixtures
{{- /*
We don't want to have too many README.md files, since they
stand out so much. But we do need to have a file here to make
sure the folder is added to Git.
*/}}
This folder is reserved for fixtures, such as CSV files.
Below is an example of how to load fixtures as a data frame:
```
import pandas as pd
import os
def get_absolute_path(*relative_parts):
if 'dbutils' in globals():
base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore
path = os.path.normpath(os.path.join(base_dir, *relative_parts))
return path if path.startswith("/Workspace") else os.path.join("/Workspace", path)
else:
return os.path.join(*relative_parts)
csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
df = pd.read_csv(csv_file)
display(df)
```

View File

@ -0,0 +1,3 @@
[pytest]
testpaths = tests
pythonpath = src

View File

@ -0,0 +1,42 @@
# The main job for {{.project_name}}
resources:
jobs:
{{.project_name}}_job:
name: {{.project_name}}_job
schedule:
quartz_cron_expression: '44 37 8 * * ?'
timezone_id: Europe/Amsterdam
{{- if not is_service_principal}}
email_notifications:
on_failure:
- {{user_name}}
{{end -}}
tasks:
- task_key: notebook_task
job_cluster_key: job_cluster
notebook_task:
notebook_path: ../src/notebook.ipynb
- task_key: python_wheel_task
depends_on:
- task_key: notebook_task
job_cluster_key: job_cluster
python_wheel_task:
package_name: {{.project_name}}
entry_point: main
libraries:
- whl: ../dist/*.whl
job_clusters:
- job_cluster_key: job_cluster
new_cluster:
{{- /* we should always use an LTS version in our templates */}}
spark_version: 13.3.x-scala2.12
node_type_id: {{smallest_node_type}}
autoscale:
min_workers: 1
max_workers: 4

View File

@ -0,0 +1,4 @@
# scratch
This folder is reserved for personal, exploratory notebooks.
By default these are not committed to Git, as 'scratch' is listed in .gitignore.

View File

@ -0,0 +1,50 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('../src')\n",
"from project import main\n",
"\n",
"main.get_taxis().show(10)"
]
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"dashboards": [],
"language": "python",
"notebookMetadata": {
"pythonIndentUnit": 2
},
"notebookName": "ipynb-notebook",
"widgets": {}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@ -0,0 +1,24 @@
"""
Setup script for {{.project_name}}.
This script packages and distributes the associated wheel file(s).
Source code is in ./src/. Run 'python setup.py sdist bdist_wheel' to build.
"""
from setuptools import setup, find_packages
import sys
sys.path.append('./src')
import {{.project_name}}
setup(
name="{{.project_name}}",
version={{.project_name}}.__version__,
url="https://databricks.com",
author="{{.user_name}}",
description="my test wheel",
packages=find_packages(where='./src'),
package_dir={'': 'src'},
entry_points={"entry_points": "main={{.project_name}}.main:main"},
install_requires=["setuptools"],
)

View File

@ -0,0 +1,65 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "ee353e42-ff58-4955-9608-12865bd0950e",
"showTitle": false,
"title": ""
}
},
"source": [
"# Default notebook\n",
"\n",
"This default notebook is executed using Databricks Workflows as defined in resources/{{.project_name}}_job.yml."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"from {{.project_name}} import main\n",
"\n",
"main.get_taxis().show(10)\n"
]
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"dashboards": [],
"language": "python",
"notebookMetadata": {
"pythonIndentUnit": 2
},
"notebookName": "notebook",
"widgets": {}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@ -0,0 +1,16 @@
{{- /*
We use pyspark.sql rather than DatabricksSession.builder.getOrCreate()
for compatibility with older runtimes. With a new runtime, it's
equivalent to DatabricksSession.builder.getOrCreate().
*/ -}}
from pyspark.sql import SparkSession
def get_taxis():
    """Read the ``samples.nyctaxi.trips`` table through the active Spark session.

    The session is obtained with ``SparkSession.builder.getOrCreate()``.
    The table is returned in full; no row limit is applied here.
    """
    session = SparkSession.builder.getOrCreate()
    trips = session.read.table("samples.nyctaxi.trips")
    return trips
def main():
    """Entry point: show the first five rows of the table returned by get_taxis()."""
    taxis = get_taxis()
    taxis.show(5)
# Run main() only when this module is executed as a script, not when imported.
if __name__ == '__main__':
    main()

View File

@ -0,0 +1,5 @@
from {{.project_name}} import main
def test_main():
    """Smoke test: get_taxis() should return a non-trivial number of rows."""
    taxis = main.get_taxis()
    # get_taxis() returns the whole table; only main() limits output via show(5).
    # An exact count of 5 would therefore fail, so assert a lower bound instead.
    assert taxis.count() > 5