Add experimental-jobs-as-code template (#2177)

## Changes

Add the experimental-jobs-as-code template, which allows defining jobs in Python
instead of YAML via the `databricks-bundles` PyPI package.
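
For illustration, a job defined through `databricks-bundles` follows the `Job.from_dict` pattern used by the generated `resources/*_job.py` files in this template. A minimal sketch (the job name, task key, and package name are placeholders, not part of the template):

```
from databricks.bundles.jobs import Job

# Minimal Python-defined job following the pattern the template generates.
# Only the Job.from_dict API used elsewhere in this PR appears here; the
# job name, task key, and package name are illustrative placeholders.
example_job = Job.from_dict(
    {
        "name": "example_job",
        "tasks": [
            {
                "task_key": "main_task",
                "python_wheel_task": {
                    "package_name": "example_package",
                    "entry_point": "main",
                },
            },
        ],
    }
)
```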

## Tests

Tested manually and with acceptance tests.
Gleb Kanterov 2025-01-20 11:15:11 +01:00 committed by GitHub
parent 7034793d1d
commit 31c10c1b82
36 changed files with 1182 additions and 0 deletions

View File

@@ -8,6 +8,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"regexp"
 	"runtime"
 	"slices"
 	"sort"
@@ -393,6 +394,16 @@ func CopyDir(src, dst string, inputs, outputs map[string]bool) error {
 }

 func ListDir(t *testing.T, src string) ([]string, error) {
+	// exclude folders in .gitignore from comparison
+	ignored := []string{
+		"\\.ruff_cache",
+		"\\.venv",
+		".*\\.egg-info",
+		"__pycache__",
+		// depends on uv version
+		"uv.lock",
+	}
+
 	var files []string
 	err := filepath.Walk(src, func(path string, info os.FileInfo, err error) error {
 		if err != nil {
@@ -400,7 +411,19 @@ func ListDir(t *testing.T, src string) ([]string, error) {
 		}

 		if info.IsDir() {
+			for _, ignoredFolder := range ignored {
+				if matched, _ := regexp.MatchString(ignoredFolder, info.Name()); matched {
+					return filepath.SkipDir
+				}
+			}
+
 			return nil
+		} else {
+			for _, ignoredFolder := range ignored {
+				if matched, _ := regexp.MatchString(ignoredFolder, info.Name()); matched {
+					return nil
+				}
+			}
 		}

 		relPath, err := filepath.Rel(src, path)
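
The patterns in `ignored` are unanchored regular expressions matched against each entry's base name (`regexp.MatchString` reports whether the name contains a match). A rough Python equivalent of that check, for reference only:

```
import re

# Same exclusion patterns as the Go code above; re.search mirrors the
# unanchored matching behaviour of Go's regexp.MatchString.
IGNORED = [r"\.ruff_cache", r"\.venv", r".*\.egg-info", r"__pycache__", "uv.lock"]

def is_ignored(name: str) -> bool:
    return any(re.search(pattern, name) for pattern in IGNORED)

assert is_ignored("my_jobs_as_code.egg-info")
assert is_ignored("__pycache__")
assert not is_ignored("src")
```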

View File

@ -0,0 +1,5 @@
{
"project_name": "my_jobs_as_code",
"include_notebook": "yes",
"include_python": "yes"
}

View File

@ -0,0 +1,85 @@
>>> $CLI bundle init experimental-jobs-as-code --config-file ./input.json --output-dir output
Welcome to (EXPERIMENTAL) "Jobs as code" template for Databricks Asset Bundles!
Workspace to use (auto-detected, edit in 'my_jobs_as_code/databricks.yml'): $DATABRICKS_URL
✨ Your new project has been created in the 'my_jobs_as_code' directory!
Please refer to the README.md file for "getting started" instructions.
See also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html.
>>> $CLI bundle validate -t dev --output json
{
"jobs": {
"my_jobs_as_code_job": {
"deployment": {
"kind": "BUNDLE",
"metadata_file_path": "/Workspace/Users/$USERNAME/.bundle/my_jobs_as_code/dev/state/metadata.json"
},
"edit_mode": "UI_LOCKED",
"email_notifications": {
"on_failure": [
"$USERNAME"
]
},
"format": "MULTI_TASK",
"job_clusters": [
{
"job_cluster_key": "job_cluster",
"new_cluster": {
"autoscale": {
"max_workers": 4,
"min_workers": 1
},
"node_type_id": "i3.xlarge",
"spark_version": "15.4.x-scala2.12"
}
}
],
"max_concurrent_runs": 4,
"name": "[dev $USERNAME] my_jobs_as_code_job",
"permissions": [],
"queue": {
"enabled": true
},
"tags": {
"dev": "$USERNAME"
},
"tasks": [
{
"job_cluster_key": "job_cluster",
"notebook_task": {
"notebook_path": "/Workspace/Users/$USERNAME/.bundle/my_jobs_as_code/dev/files/src/notebook"
},
"task_key": "notebook_task"
},
{
"depends_on": [
{
"task_key": "notebook_task"
}
],
"job_cluster_key": "job_cluster",
"libraries": [
{
"whl": "dist/*.whl"
}
],
"python_wheel_task": {
"entry_point": "main",
"package_name": "my_jobs_as_code"
},
"task_key": "main_task"
}
],
"trigger": {
"pause_status": "PAUSED",
"periodic": {
"interval": 1,
"unit": "DAYS"
}
}
}
}
}

View File

@ -0,0 +1,8 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md

View File

@ -0,0 +1,58 @@
# my_jobs_as_code
The 'my_jobs_as_code' project was generated by using the "Jobs as code" template.
## Prerequisites
1. Install Databricks CLI 0.238 or later.
See [Install or update the Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/install.html).
2. Install uv. See [Installing uv](https://docs.astral.sh/uv/getting-started/installation/).
We use uv to create a virtual environment and install the required dependencies.
3. Authenticate to your Databricks workspace if you have not done so already:
```
$ databricks configure
```
4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for
**Databricks Connect** for instructions on running the included Python code from a different IDE.
5. For documentation on the Databricks Asset Bundles format used
for this project, and for CI/CD configuration, see
https://docs.databricks.com/dev-tools/bundles/index.html.
## Deploy and run jobs
1. Create a new virtual environment and install the required dependencies:
```
$ uv sync
```
2. To deploy the bundle to the development target:
```
$ databricks bundle deploy --target dev
```
*(Note that "dev" is the default target, so the `--target` parameter is optional here.)*
This deploys everything that's defined for this project.
For example, the default template would deploy a job called
`[dev yourname] my_jobs_as_code_job` to your workspace.
You can find that job by opening your workspace and clicking on **Workflows**.
3. Similarly, to deploy a production copy, type:
```
$ databricks bundle deploy --target prod
```
Note that the default job from the template has a schedule that runs every day
(defined in resources/my_jobs_as_code_job.py). The schedule
is paused when deploying in development mode (see [Databricks Asset Bundle deployment modes](
https://docs.databricks.com/dev-tools/bundles/deployment-modes.html)).
4. To run a job:
```
$ databricks bundle run
```

View File

@ -0,0 +1,48 @@
# This is a Databricks asset bundle definition for my_jobs_as_code.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
name: my_jobs_as_code
uuid: <UUID>
experimental:
python:
# Activate virtual environment before loading resources defined in Python.
# If disabled, defaults to using the Python interpreter available in the current shell.
venv_path: .venv
# Functions called to load resources defined in Python. See resources/__init__.py
resources:
- "resources:load_resources"
artifacts:
default:
type: whl
path: .
# We use a timestamp as the local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers)
# to ensure that changes to the wheel package are picked up when it is used on all-purpose clusters
build: LOCAL_VERSION=$(date +%Y%m%d.%H%M%S) uv build
include:
- resources/*.yml
targets:
dev:
# The default target uses 'mode: development' to create a development copy.
# - Deployed resources get prefixed with '[dev my_user_name]'
# - Any job schedules and triggers are paused by default.
# See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
mode: development
default: true
workspace:
host: $DATABRICKS_URL
prod:
mode: production
workspace:
host: $DATABRICKS_URL
# We explicitly specify /Workspace/Users/$USERNAME to make sure we only have a single copy.
root_path: /Workspace/Users/$USERNAME/.bundle/${bundle.name}/${bundle.target}
permissions:
- user_name: $USERNAME
level: CAN_MANAGE
run_as:
user_name: $USERNAME

View File

@ -0,0 +1,22 @@
# Fixtures
This folder is reserved for fixtures, such as CSV files.
Below is an example of how to load fixtures as a data frame:
```
import pandas as pd
import os
def get_absolute_path(*relative_parts):
if 'dbutils' in globals():
base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore
path = os.path.normpath(os.path.join(base_dir, *relative_parts))
return path if path.startswith("/Workspace") else "/Workspace" + path
else:
return os.path.join(*relative_parts)
csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
df = pd.read_csv(csv_file)
display(df)
```

View File

@ -0,0 +1,49 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "my_jobs_as_code"
requires-python = ">=3.10"
description = "wheel file based on my_jobs_as_code"
# Dependencies in case the output wheel file is used as a library dependency.
# For defining dependencies when this package is used in Databricks, see:
# https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
#
# Example:
# dependencies = [
# "requests==x.y.z",
# ]
dependencies = [
]
# see setup.py
dynamic = ["version"]
[project.entry-points.packages]
main = "my_jobs_as_code.main:main"
[tool.setuptools]
py-modules = ["resources", "my_jobs_as_code"]
[tool.uv]
## Dependencies for local development
dev-dependencies = [
"databricks-bundles==0.7.0",
## Add code completion support for DLT
# "databricks-dlt",
## databricks-connect can be used to run parts of this project locally.
## See https://docs.databricks.com/dev-tools/databricks-connect.html.
##
## Uncomment line below to install a version of db-connect that corresponds to
## the Databricks Runtime version used for this project.
# "databricks-connect>=15.4,<15.5",
]
override-dependencies = [
# pyspark package conflicts with 'databricks-connect'
"pyspark; sys_platform == 'never'",
]

View File

@ -0,0 +1,16 @@
from databricks.bundles.core import (
Bundle,
Resources,
load_resources_from_current_package_module,
)
def load_resources(bundle: Bundle) -> Resources:
"""
The 'load_resources' function is referenced in databricks.yml and is responsible for loading
bundle resources defined in Python code. This function is called by the Databricks CLI during
bundle deployment. After deployment, this function is not used.
"""
# the default implementation loads all Python files in 'resources' directory
return load_resources_from_current_package_module()
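
Because the default implementation loads every module in the `resources` package, an additional job module dropped into that package is picked up without touching databricks.yml. A hypothetical example reusing the `Job.from_dict` pattern from this commit (the file name, job name, and cluster values are made up for illustration):

```
# resources/nightly_report_job.py (hypothetical file) -- discovered automatically
# by load_resources_from_current_package_module(); no databricks.yml change needed.
from databricks.bundles.jobs import Job

nightly_report_job = Job.from_dict(
    {
        "name": "nightly_report_job",
        "tasks": [
            {
                "task_key": "notebook_task",
                "notebook_task": {"notebook_path": "src/notebook.ipynb"},
                "new_cluster": {
                    "spark_version": "15.4.x-scala2.12",
                    "node_type_id": "i3.xlarge",
                    "num_workers": 1,
                },
            },
        ],
    }
)
```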

View File

@ -0,0 +1,67 @@
from databricks.bundles.jobs import Job
"""
The main job for my_jobs_as_code.
"""
my_jobs_as_code_job = Job.from_dict(
{
"name": "my_jobs_as_code_job",
"trigger": {
# Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger
"periodic": {
"interval": 1,
"unit": "DAYS",
},
},
"email_notifications": {
"on_failure": [
"$USERNAME",
],
},
"tasks": [
{
"task_key": "notebook_task",
"job_cluster_key": "job_cluster",
"notebook_task": {
"notebook_path": "src/notebook.ipynb",
},
},
{
"task_key": "main_task",
"depends_on": [
{
"task_key": "notebook_task",
},
],
"job_cluster_key": "job_cluster",
"python_wheel_task": {
"package_name": "my_jobs_as_code",
"entry_point": "main",
},
"libraries": [
# By default we just include the .whl file generated for the my_jobs_as_code package.
# See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
# for more information on how to add other libraries.
{
"whl": "dist/*.whl",
},
],
},
],
"job_clusters": [
{
"job_cluster_key": "job_cluster",
"new_cluster": {
"spark_version": "15.4.x-scala2.12",
"node_type_id": "i3.xlarge",
"autoscale": {
"min_workers": 1,
"max_workers": 4,
},
},
},
],
}
)

View File

@ -0,0 +1,4 @@
# scratch
This folder is reserved for personal, exploratory notebooks.
By default these are not committed to Git, as 'scratch' is listed in .gitignore.

View File

@ -0,0 +1,18 @@
"""
setup.py configuration script describing how to build and package this project.
This file is primarily used by the setuptools library and typically should not
be executed directly. See README.md for how to deploy, test, and run
the my_jobs_as_code project.
"""
import os
from setuptools import setup
local_version = os.getenv("LOCAL_VERSION")
version = "0.0.1"
setup(
version=f"{version}+{local_version}" if local_version else version,
)
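
As a quick illustration of how the `LOCAL_VERSION` exported by the `build` command in databricks.yml flows into the wheel version (the timestamp value below is just an example):

```
import os

# Example only: reproduce the version string setup.py would compute when
# databricks.yml runs `LOCAL_VERSION=$(date +%Y%m%d.%H%M%S) uv build`.
os.environ["LOCAL_VERSION"] = "20250120.111511"  # example timestamp

local_version = os.getenv("LOCAL_VERSION")
version = "0.0.1"
print(f"{version}+{local_version}" if local_version else version)
# prints: 0.0.1+20250120.111511  (a PEP 440 local version identifier)
```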

View File

@ -0,0 +1,25 @@
from pyspark.sql import SparkSession, DataFrame
def get_taxis(spark: SparkSession) -> DataFrame:
return spark.read.table("samples.nyctaxi.trips")
# Create a new Databricks Connect session. If this fails,
# check that you have configured Databricks Connect correctly.
# See https://docs.databricks.com/dev-tools/databricks-connect.html.
def get_spark() -> SparkSession:
try:
from databricks.connect import DatabricksSession
return DatabricksSession.builder.getOrCreate()
except ImportError:
return SparkSession.builder.getOrCreate()
def main():
get_taxis(get_spark()).show(5)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,75 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "<UUID>",
"showTitle": false,
"title": ""
}
},
"source": [
"# Default notebook\n",
"\n",
"This default notebook is executed using Databricks Workflows as defined in resources/my_jobs_as_code.job.yml."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "<UUID>",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"from my_jobs_as_code import main\n",
"\n",
"main.get_taxis(spark).show(10)"
]
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"dashboards": [],
"language": "python",
"notebookMetadata": {
"pythonIndentUnit": 2
},
"notebookName": "notebook",
"widgets": {}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@ -0,0 +1,8 @@
from my_jobs_as_code.main import get_taxis, get_spark
# running tests requires installing databricks-connect, e.g. by uncommenting it in pyproject.toml
def test_main():
taxis = get_taxis(get_spark())
assert taxis.count() > 5

View File

@ -0,0 +1,12 @@
trace $CLI bundle init experimental-jobs-as-code --config-file ./input.json --output-dir output
cd output/my_jobs_as_code
# silence uv output because it's non-deterministic
uv sync 2> /dev/null
# remove version constraint because it always creates a warning on dev builds
cat databricks.yml | grep -v databricks_cli_version > databricks.yml.new
mv databricks.yml.new databricks.yml
trace $CLI bundle validate -t dev --output json | jq ".resources"

View File

@@ -59,6 +59,11 @@ var nativeTemplates = []nativeTemplate{
 		hidden:      true,
 		description: "The default PyDABs template",
 	},
+	{
+		name:        "experimental-jobs-as-code",
+		hidden:      true,
+		description: "Jobs as code template (experimental)",
+	},
 	{
 		name:        customTemplate,
 		description: "Bring your own template",

View File

@ -0,0 +1,28 @@
{
"welcome_message": "\nWelcome to (EXPERIMENTAL) \"Jobs as code\" template for Databricks Asset Bundles!",
"properties": {
"project_name": {
"type": "string",
"default": "jobs_as_code_project",
"description": "Please provide the following details to tailor the template to your preferences.\n\nUnique name for this project",
"order": 1,
"pattern": "^[A-Za-z0-9_]+$",
"pattern_match_failure_message": "Name must consist of letters, numbers, and underscores."
},
"include_notebook": {
"type": "string",
"default": "yes",
"enum": ["yes", "no"],
"description": "Include a stub (sample) notebook in '{{.project_name}}{{path_separator}}src'",
"order": 2
},
"include_python": {
"type": "string",
"default": "yes",
"enum": ["yes", "no"],
"description": "Include a stub (sample) Python package in '{{.project_name}}/src'",
"order": 3
}
},
"success_message": "Workspace to use (auto-detected, edit in '{{.project_name}}/databricks.yml'): {{workspace_host}}\n\n✨ Your new project has been created in the '{{.project_name}}' directory!\n\nPlease refer to the README.md file for \"getting started\" instructions.\nSee also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html."
}

View File

@ -0,0 +1,7 @@
{{define "latest_lts_dbr_version" -}}
15.4.x-scala2.12
{{- end}}
{{define "latest_lts_db_connect_version_spec" -}}
>=15.4,<15.5
{{- end}}

View File

@ -0,0 +1,30 @@
# Preamble
This file only contains template directives; it is skipped for the actual output.
{{skip "__preamble"}}
# TODO add DLT support, placeholder for now
{{$notDLT := true }}
{{$notNotebook := not (eq .include_notebook "yes")}}
{{$notPython := not (eq .include_python "yes")}}
{{if $notPython}}
{{skip "{{.project_name}}/src/{{.project_name}}"}}
{{skip "{{.project_name}}/tests/main_test.py"}}
{{end}}
{{if $notDLT}}
{{skip "{{.project_name}}/src/dlt_pipeline.ipynb"}}
{{skip "{{.project_name}}/resources/{{.project_name}}_pipeline.py"}}
{{end}}
{{if $notNotebook}}
{{skip "{{.project_name}}/src/notebook.ipynb"}}
{{end}}
{{if (and $notDLT $notNotebook $notPython)}}
{{skip "{{.project_name}}/resources/{{.project_name}}_job.py"}}
{{else}}
{{skip "{{.project_name}}/resources/.gitkeep"}}
{{end}}

View File

@ -0,0 +1,8 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md

View File

@ -0,0 +1,60 @@
# {{.project_name}}
The '{{.project_name}}' project was generated by using the "Jobs as code" template.
## Prerequisites
1. Install Databricks CLI 0.238 or later.
See [Install or update the Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/install.html).
2. Install uv. See [Installing uv](https://docs.astral.sh/uv/getting-started/installation/).
We use uv to create a virtual environment and install the required dependencies.
3. Authenticate to your Databricks workspace if you have not done so already:
```
$ databricks configure
```
4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
https://docs.databricks.com/dev-tools/vscode-ext.html.
{{- if (eq .include_python "yes") }} Or read the "getting started" documentation for
**Databricks Connect** for instructions on running the included Python code from a different IDE.
{{- end}}
5. For documentation on the Databricks Asset Bundles format used
for this project, and for CI/CD configuration, see
https://docs.databricks.com/dev-tools/bundles/index.html.
## Deploy and run jobs
1. Create a new virtual environment and install the required dependencies:
```
$ uv sync
```
2. To deploy the bundle to the development target:
```
$ databricks bundle deploy --target dev
```
*(Note that "dev" is the default target, so the `--target` parameter is optional here.)*
This deploys everything that's defined for this project.
For example, the default template would deploy a job called
`[dev yourname] {{.project_name}}_job` to your workspace.
You can find that job by opening your workspace and clicking on **Workflows**.
3. Similarly, to deploy a production copy, type:
```
$ databricks bundle deploy --target prod
```
Note that the default job from the template has a schedule that runs every day
(defined in resources/{{.project_name}}_job.py). The schedule
is paused when deploying in development mode (see [Databricks Asset Bundle deployment modes](
https://docs.databricks.com/dev-tools/bundles/deployment-modes.html)).
4. To run a job:
```
$ databricks bundle run
```

View File

@ -0,0 +1,51 @@
# This is a Databricks asset bundle definition for {{.project_name}}.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
name: {{.project_name}}
uuid: {{bundle_uuid}}
databricks_cli_version: ">= 0.238.0"
experimental:
python:
# Activate virtual environment before loading resources defined in Python.
# If disabled, defaults to using the Python interpreter available in the current shell.
venv_path: .venv
# Functions called to load resources defined in Python. See resources/__init__.py
resources:
- "resources:load_resources"
{{ if .include_python -}}
artifacts:
default:
type: whl
path: .
# We use a timestamp as the local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers)
# to ensure that changes to the wheel package are picked up when it is used on all-purpose clusters
build: LOCAL_VERSION=$(date +%Y%m%d.%H%M%S) uv build
{{ end -}}
include:
- resources/*.yml
targets:
dev:
# The default target uses 'mode: development' to create a development copy.
# - Deployed resources get prefixed with '[dev my_user_name]'
# - Any job schedules and triggers are paused by default.
# See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
mode: development
default: true
workspace:
host: {{workspace_host}}
prod:
mode: production
workspace:
host: {{workspace_host}}
# We explicitly specify /Workspace/Users/{{user_name}} to make sure we only have a single copy.
root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target}
permissions:
- {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}}
level: CAN_MANAGE
run_as:
{{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}}

View File

@ -0,0 +1,27 @@
# Fixtures
{{- /*
We don't want to have too many README.md files, since they
stand out so much. But we do need to have a file here to make
sure the folder is added to Git.
*/}}
This folder is reserved for fixtures, such as CSV files.
Below is an example of how to load fixtures as a data frame:
```
import pandas as pd
import os
def get_absolute_path(*relative_parts):
if 'dbutils' in globals():
base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore
path = os.path.normpath(os.path.join(base_dir, *relative_parts))
return path if path.startswith("/Workspace") else "/Workspace" + path
else:
return os.path.join(*relative_parts)
csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
df = pd.read_csv(csv_file)
display(df)
```

View File

@ -0,0 +1,57 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "{{.project_name}}"
requires-python = ">=3.10"
description = "wheel file based on {{.project_name}}"
# Dependencies in case the output wheel file is used as a library dependency.
# For defining dependencies when this package is used in Databricks, see:
# https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
#
# Example:
# dependencies = [
# "requests==x.y.z",
# ]
dependencies = [
]
# see setup.py
dynamic = ["version"]
{{ if .include_python -}}
[project.entry-points.packages]
main = "{{.project_name}}.main:main"
{{ end -}}
[tool.setuptools]
{{ if .include_python -}}
py-modules = ["resources", "{{.project_name}}"]
{{ else }}
py-modules = ["resources"]
{{ end -}}
[tool.uv]
## Dependencies for local development
dev-dependencies = [
"databricks-bundles==0.7.0",
## Add code completion support for DLT
# "databricks-dlt",
## databricks-connect can be used to run parts of this project locally.
## See https://docs.databricks.com/dev-tools/databricks-connect.html.
##
## Uncomment line below to install a version of db-connect that corresponds to
## the Databricks Runtime version used for this project.
# "databricks-connect{{template "latest_lts_db_connect_version_spec"}}",
]
override-dependencies = [
# pyspark package conflicts with 'databricks-connect'
"pyspark; sys_platform == 'never'",
]

View File

@ -0,0 +1,16 @@
from databricks.bundles.core import (
Bundle,
Resources,
load_resources_from_current_package_module,
)
def load_resources(bundle: Bundle) -> Resources:
"""
The 'load_resources' function is referenced in databricks.yml and is responsible for loading
bundle resources defined in Python code. This function is called by the Databricks CLI during
bundle deployment. After deployment, this function is not used.
"""
# the default implementation loads all Python files in 'resources' directory
return load_resources_from_current_package_module()

View File

@ -0,0 +1,108 @@
{{$include_dlt := "no" -}}
from databricks.bundles.jobs import Job
"""
The main job for {{.project_name}}.
{{- /* Clarify what this job is for, for DLT-only users. */}}
{{if and (eq $include_dlt "yes") (and (eq .include_notebook "no") (eq .include_python "no")) -}}
This job runs {{.project_name}}_pipeline on a schedule.
{{end -}}
"""
{{.project_name}}_job = Job.from_dict(
{
"name": "{{.project_name}}_job",
"trigger": {
# Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger
"periodic": {
"interval": 1,
"unit": "DAYS",
},
},
{{- if not is_service_principal}}
"email_notifications": {
"on_failure": [
"{{user_name}}",
],
},
{{else}}
{{- end -}}
"tasks": [
{{- if eq .include_notebook "yes" -}}
{{- "\n " -}}
{
"task_key": "notebook_task",
"job_cluster_key": "job_cluster",
"notebook_task": {
"notebook_path": "src/notebook.ipynb",
},
},
{{- end -}}
{{- if (eq $include_dlt "yes") -}}
{{- "\n " -}}
{
"task_key": "refresh_pipeline",
{{- if (eq .include_notebook "yes" )}}
"depends_on": [
{
"task_key": "notebook_task",
},
],
{{- end}}
"pipeline_task": {
{{- /* TODO: we should find a way that doesn't use magics for the below, like ./{{project_name}}.pipeline.yml */}}
"pipeline_id": "${resources.pipelines.{{.project_name}}_pipeline.id}",
},
},
{{- end -}}
{{- if (eq .include_python "yes") -}}
{{- "\n " -}}
{
"task_key": "main_task",
{{- if (eq $include_dlt "yes") }}
"depends_on": [
{
"task_key": "refresh_pipeline",
},
],
{{- else if (eq .include_notebook "yes" )}}
"depends_on": [
{
"task_key": "notebook_task",
},
],
{{- end}}
"job_cluster_key": "job_cluster",
"python_wheel_task": {
"package_name": "{{.project_name}}",
"entry_point": "main",
},
"libraries": [
# By default we just include the .whl file generated for the {{.project_name}} package.
# See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
# for more information on how to add other libraries.
{
"whl": "dist/*.whl",
},
],
},
{{- end -}}
{{""}}
],
"job_clusters": [
{
"job_cluster_key": "job_cluster",
"new_cluster": {
"spark_version": "{{template "latest_lts_dbr_version"}}",
"node_type_id": "{{smallest_node_type}}",
"autoscale": {
"min_workers": 1,
"max_workers": 4,
},
},
},
],
}
)

View File

@ -0,0 +1,24 @@
from databricks.bundles.pipelines import Pipeline
{{.project_name}}_pipeline = Pipeline.from_dict(
{
"name": "{{.project_name}}_pipeline",
"target": "{{.project_name}}_${bundle.target}",
{{- if or (eq default_catalog "") (eq default_catalog "hive_metastore")}}
## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog:
"catalog": "catalog_name",
{{- else}}
"catalog": "{{default_catalog}}",
{{- end}}
"libraries": [
{
"notebook": {
"path": "src/dlt_pipeline.ipynb",
},
},
],
"configuration": {
"bundle.sourcePath": "${workspace.file_path}/src",
},
}
)

View File

@ -0,0 +1,4 @@
# scratch
This folder is reserved for personal, exploratory notebooks.
By default these are not committed to Git, as 'scratch' is listed in .gitignore.

View File

@ -0,0 +1,18 @@
"""
setup.py configuration script describing how to build and package this project.
This file is primarily used by the setuptools library and typically should not
be executed directly. See README.md for how to deploy, test, and run
the {{.project_name}} project.
"""
import os
from setuptools import setup
local_version = os.getenv("LOCAL_VERSION")
version = "0.0.1"
setup(
version=f"{version}+{local_version}" if local_version else version,
)

View File

@ -0,0 +1,104 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec",
"showTitle": false,
"title": ""
}
},
"source": [
"# DLT pipeline\n",
"\n",
"This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/{{.project_name}}.pipeline.yml."
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
{{- if (eq .include_python "yes") }}
"# Import DLT and src/{{.project_name}}\n",
"import dlt\n",
"import sys\n",
"\n",
"sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n",
"from pyspark.sql.functions import expr\n",
"from {{.project_name}} import main"
{{else}}
"import dlt\n",
"from pyspark.sql.functions import expr\n",
"from pyspark.sql import SparkSession\n",
"\n",
"spark = SparkSession.builder.getOrCreate()"
{{end -}}
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "3fc19dba-61fd-4a89-8f8c-24fee63bfb14",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
{{- if (eq .include_python "yes") }}
"@dlt.view\n",
"def taxi_raw():\n",
" return main.get_taxis(spark)\n",
{{else}}
"@dlt.view\n",
"def taxi_raw():\n",
" return spark.read.format(\"json\").load(\"/databricks-datasets/nyctaxi/sample/json/\")\n",
{{end -}}
"\n",
"\n",
"@dlt.table\n",
"def filtered_taxis():\n",
" return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))"
]
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"dashboards": [],
"language": "python",
"notebookMetadata": {
"pythonIndentUnit": 2
},
"notebookName": "dlt_pipeline",
"widgets": {}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@ -0,0 +1,79 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "ee353e42-ff58-4955-9608-12865bd0950e",
"showTitle": false,
"title": ""
}
},
"source": [
"# Default notebook\n",
"\n",
"This default notebook is executed using Databricks Workflows as defined in resources/{{.project_name}}.job.yml."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
{{- if (eq .include_python "yes") }}
"from {{.project_name}} import main\n",
"\n",
"main.get_taxis(spark).show(10)"
{{else}}
"spark.range(10)"
{{end -}}
]
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"dashboards": [],
"language": "python",
"notebookMetadata": {
"pythonIndentUnit": 2
},
"notebookName": "notebook",
"widgets": {}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@ -0,0 +1,25 @@
from pyspark.sql import SparkSession, DataFrame
def get_taxis(spark: SparkSession) -> DataFrame:
return spark.read.table("samples.nyctaxi.trips")
# Create a new Databricks Connect session. If this fails,
# check that you have configured Databricks Connect correctly.
# See https://docs.databricks.com/dev-tools/databricks-connect.html.
def get_spark() -> SparkSession:
try:
from databricks.connect import DatabricksSession
return DatabricksSession.builder.getOrCreate()
except ImportError:
return SparkSession.builder.getOrCreate()
def main():
get_taxis(get_spark()).show(5)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,8 @@
from {{.project_name}}.main import get_taxis, get_spark
# running tests requires installing databricks-connect, e.g. by uncommenting it in pyproject.toml
def test_main():
taxis = get_taxis(get_spark())
assert taxis.count() > 5