mirror of https://github.com/databricks/cli.git
Add experimental-jobs-as-code template (#2177)
## Changes

Add an experimental-jobs-as-code template that allows defining jobs in Python instead of YAML, using the `databricks-bundles` PyPI package.

## Tests

Manually and with acceptance tests.
parent 7034793d1d
commit 31c10c1b82
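For orientation, below is a minimal sketch of what the new "jobs as code" style looks like with the `databricks-bundles` package, condensed from the template files in this diff; the job and task names are illustrative only, and the job is picked up by the `load_resources` function referenced from `databricks.yml` (see `resources/__init__.py` below).

```python
# Minimal sketch, condensed from the template files in this commit.
# "sample_job" and its single task are illustrative names, not part of the template.
from databricks.bundles.jobs import Job

sample_job = Job.from_dict(
    {
        "name": "sample_job",
        "tasks": [
            {
                "task_key": "notebook_task",
                "notebook_task": {"notebook_path": "src/notebook.ipynb"},
            },
        ],
    }
)
```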
@@ -8,6 +8,7 @@ import (
    "os"
    "os/exec"
    "path/filepath"
    "regexp"
    "runtime"
    "slices"
    "sort"

@@ -393,6 +394,16 @@ func CopyDir(src, dst string, inputs, outputs map[string]bool) error {
}

func ListDir(t *testing.T, src string) ([]string, error) {
    // exclude folders in .gitignore from comparison
    ignored := []string{
        "\\.ruff_cache",
        "\\.venv",
        ".*\\.egg-info",
        "__pycache__",
        // depends on uv version
        "uv.lock",
    }

    var files []string
    err := filepath.Walk(src, func(path string, info os.FileInfo, err error) error {
        if err != nil {

@@ -400,7 +411,19 @@ func ListDir(t *testing.T, src string) ([]string, error) {
        }

        if info.IsDir() {
            for _, ignoredFolder := range ignored {
                if matched, _ := regexp.MatchString(ignoredFolder, info.Name()); matched {
                    return filepath.SkipDir
                }
            }

            return nil
        } else {
            for _, ignoredFolder := range ignored {
                if matched, _ := regexp.MatchString(ignoredFolder, info.Name()); matched {
                    return nil
                }
            }
        }

        relPath, err := filepath.Rel(src, path)
@@ -0,0 +1,5 @@
{
  "project_name": "my_jobs_as_code",
  "include_notebook": "yes",
  "include_python": "yes"
}
@@ -0,0 +1,85 @@

>>> $CLI bundle init experimental-jobs-as-code --config-file ./input.json --output-dir output

Welcome to (EXPERIMENTAL) "Jobs as code" template for Databricks Asset Bundles!
Workspace to use (auto-detected, edit in 'my_jobs_as_code/databricks.yml'): $DATABRICKS_URL

✨ Your new project has been created in the 'my_jobs_as_code' directory!

Please refer to the README.md file for "getting started" instructions.
See also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html.

>>> $CLI bundle validate -t dev --output json
{
  "jobs": {
    "my_jobs_as_code_job": {
      "deployment": {
        "kind": "BUNDLE",
        "metadata_file_path": "/Workspace/Users/$USERNAME/.bundle/my_jobs_as_code/dev/state/metadata.json"
      },
      "edit_mode": "UI_LOCKED",
      "email_notifications": {
        "on_failure": [
          "$USERNAME"
        ]
      },
      "format": "MULTI_TASK",
      "job_clusters": [
        {
          "job_cluster_key": "job_cluster",
          "new_cluster": {
            "autoscale": {
              "max_workers": 4,
              "min_workers": 1
            },
            "node_type_id": "i3.xlarge",
            "spark_version": "15.4.x-scala2.12"
          }
        }
      ],
      "max_concurrent_runs": 4,
      "name": "[dev $USERNAME] my_jobs_as_code_job",
      "permissions": [],
      "queue": {
        "enabled": true
      },
      "tags": {
        "dev": "$USERNAME"
      },
      "tasks": [
        {
          "job_cluster_key": "job_cluster",
          "notebook_task": {
            "notebook_path": "/Workspace/Users/$USERNAME/.bundle/my_jobs_as_code/dev/files/src/notebook"
          },
          "task_key": "notebook_task"
        },
        {
          "depends_on": [
            {
              "task_key": "notebook_task"
            }
          ],
          "job_cluster_key": "job_cluster",
          "libraries": [
            {
              "whl": "dist/*.whl"
            }
          ],
          "python_wheel_task": {
            "entry_point": "main",
            "package_name": "my_jobs_as_code"
          },
          "task_key": "main_task"
        }
      ],
      "trigger": {
        "pause_status": "PAUSED",
        "periodic": {
          "interval": 1,
          "unit": "DAYS"
        }
      }
    }
  }
}
acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/.gitignore (vendored, new file, 8 lines)
@@ -0,0 +1,8 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md
@@ -0,0 +1,58 @@
# my_jobs_as_code

The 'my_jobs_as_code' project was generated by using the "Jobs as code" template.

## Prerequisites

1. Install Databricks CLI 0.238 or later.
   See [Install or update the Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/install.html).

2. Install uv. See [Installing uv](https://docs.astral.sh/uv/getting-started/installation/).
   We use uv to create a virtual environment and install the required dependencies.

3. Authenticate to your Databricks workspace if you have not done so already:
   ```
   $ databricks configure
   ```

4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
   https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for
   **Databricks Connect** for instructions on running the included Python code from a different IDE.

5. For documentation on the Databricks Asset Bundles format used
   for this project, and for CI/CD configuration, see
   https://docs.databricks.com/dev-tools/bundles/index.html.

## Deploy and run jobs

1. Create a new virtual environment and install the required dependencies:
   ```
   $ uv sync
   ```

2. To deploy the bundle to the development target:
   ```
   $ databricks bundle deploy --target dev
   ```

   *(Note that "dev" is the default target, so the `--target` parameter is optional here.)*

   This deploys everything that's defined for this project.
   For example, the default template would deploy a job called
   `[dev yourname] my_jobs_as_code_job` to your workspace.
   You can find that job by opening your workspace and clicking on **Workflows**.

3. Similarly, to deploy a production copy, type:
   ```
   $ databricks bundle deploy --target prod
   ```

   Note that the default job from the template has a schedule that runs every day
   (defined in resources/my_jobs_as_code_job.py). The schedule
   is paused when deploying in development mode (see [Databricks Asset Bundle deployment modes](
   https://docs.databricks.com/dev-tools/bundles/deployment-modes.html)).

4. To run a job:
   ```
   $ databricks bundle run
   ```
@@ -0,0 +1,48 @@
# This is a Databricks asset bundle definition for my_jobs_as_code.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: my_jobs_as_code
  uuid: <UUID>

experimental:
  python:
    # Activate virtual environment before loading resources defined in Python.
    # If disabled, defaults to using the Python interpreter available in the current shell.
    venv_path: .venv
    # Functions called to load resources defined in Python. See resources/__init__.py
    resources:
      - "resources:load_resources"

artifacts:
  default:
    type: whl
    path: .
    # We use timestamp as Local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers.)
    # to ensure that changes to wheel package are picked up when used on all-purpose clusters
    build: LOCAL_VERSION=$(date +%Y%m%d.%H%M%S) uv build

include:
  - resources/*.yml

targets:
  dev:
    # The default target uses 'mode: development' to create a development copy.
    # - Deployed resources get prefixed with '[dev my_user_name]'
    # - Any job schedules and triggers are paused by default.
    # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
    mode: development
    default: true
    workspace:
      host: $DATABRICKS_URL

  prod:
    mode: production
    workspace:
      host: $DATABRICKS_URL
      # We explicitly specify /Workspace/Users/$USERNAME to make sure we only have a single copy.
      root_path: /Workspace/Users/$USERNAME/.bundle/${bundle.name}/${bundle.target}
    permissions:
      - user_name: $USERNAME
        level: CAN_MANAGE
    run_as:
      user_name: $USERNAME
@@ -0,0 +1,22 @@
# Fixtures

This folder is reserved for fixtures, such as CSV files.

Below is an example of how to load fixtures as a data frame:

```
import pandas as pd
import os

def get_absolute_path(*relative_parts):
    if 'dbutils' in globals():
        base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore
        path = os.path.normpath(os.path.join(base_dir, *relative_parts))
        return path if path.startswith("/Workspace") else "/Workspace" + path
    else:
        return os.path.join(*relative_parts)

csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
df = pd.read_csv(csv_file)
display(df)
```
@@ -0,0 +1,49 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "my_jobs_as_code"
requires-python = ">=3.10"
description = "wheel file based on my_jobs_as_code"

# Dependencies in case the output wheel file is used as a library dependency.
# For defining dependencies, when this package is used in Databricks, see:
# https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
#
# Example:
# dependencies = [
#   "requests==x.y.z",
# ]
dependencies = [
]

# see setup.py
dynamic = ["version"]

[project.entry-points.packages]
main = "my_jobs_as_code.main:main"

[tool.setuptools]
py-modules = ["resources", "my_jobs_as_code"]

[tool.uv]
## Dependencies for local development
dev-dependencies = [
    "databricks-bundles==0.7.0",

    ## Add code completion support for DLT
    # "databricks-dlt",

    ## databricks-connect can be used to run parts of this project locally.
    ## See https://docs.databricks.com/dev-tools/databricks-connect.html.
    ##
    ## Uncomment line below to install a version of db-connect that corresponds to
    ## the Databricks Runtime version used for this project.
    # "databricks-connect>=15.4,<15.5",
]

override-dependencies = [
    # pyspark package conflicts with 'databricks-connect'
    "pyspark; sys_platform == 'never'",
]
@@ -0,0 +1,16 @@
from databricks.bundles.core import (
    Bundle,
    Resources,
    load_resources_from_current_package_module,
)


def load_resources(bundle: Bundle) -> Resources:
    """
    'load_resources' function is referenced in databricks.yml and is responsible for loading
    bundle resources defined in Python code. This function is called by Databricks CLI during
    bundle deployment. After deployment, this function is not used.
    """

    # the default implementation loads all Python files in 'resources' directory
    return load_resources_from_current_package_module()
@@ -0,0 +1,67 @@
from databricks.bundles.jobs import Job

"""
The main job for my_jobs_as_code.
"""


my_jobs_as_code_job = Job.from_dict(
    {
        "name": "my_jobs_as_code_job",
        "trigger": {
            # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger
            "periodic": {
                "interval": 1,
                "unit": "DAYS",
            },
        },
        "email_notifications": {
            "on_failure": [
                "$USERNAME",
            ],
        },
        "tasks": [
            {
                "task_key": "notebook_task",
                "job_cluster_key": "job_cluster",
                "notebook_task": {
                    "notebook_path": "src/notebook.ipynb",
                },
            },
            {
                "task_key": "main_task",
                "depends_on": [
                    {
                        "task_key": "notebook_task",
                    },
                ],
                "job_cluster_key": "job_cluster",
                "python_wheel_task": {
                    "package_name": "my_jobs_as_code",
                    "entry_point": "main",
                },
                "libraries": [
                    # By default we just include the .whl file generated for the my_jobs_as_code package.
                    # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
                    # for more information on how to add other libraries.
                    {
                        "whl": "dist/*.whl",
                    },
                ],
            },
        ],
        "job_clusters": [
            {
                "job_cluster_key": "job_cluster",
                "new_cluster": {
                    "spark_version": "15.4.x-scala2.12",
                    "node_type_id": "i3.xlarge",
                    "autoscale": {
                        "min_workers": 1,
                        "max_workers": 4,
                    },
                },
            },
        ],
    }
)
@@ -0,0 +1,4 @@
# scratch

This folder is reserved for personal, exploratory notebooks.
By default these are not committed to Git, as 'scratch' is listed in .gitignore.
@@ -0,0 +1,18 @@
"""
setup.py configuration script describing how to build and package this project.

This file is primarily used by the setuptools library and typically should not
be executed directly. See README.md for how to deploy, test, and run
the my_jobs_as_code project.
"""

import os

from setuptools import setup

local_version = os.getenv("LOCAL_VERSION")
version = "0.0.1"

setup(
    version=f"{version}+{local_version}" if local_version else version,
)
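To make the local-version logic in the setup.py above concrete, here is a small illustration of the version string that results when the bundle's build command sets LOCAL_VERSION; the timestamp is an assumed example value, not taken from the diff.

```python
# Illustration only; the timestamp value below is assumed.
local_version = "20250114.093012"  # e.g. produced by LOCAL_VERSION=$(date +%Y%m%d.%H%M%S)
version = "0.0.1"
print(f"{version}+{local_version}" if local_version else version)
# -> 0.0.1+20250114.093012 (a PEP 440 local version identifier)
```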
@@ -0,0 +1,25 @@
from pyspark.sql import SparkSession, DataFrame


def get_taxis(spark: SparkSession) -> DataFrame:
    return spark.read.table("samples.nyctaxi.trips")


# Create a new Databricks Connect session. If this fails,
# check that you have configured Databricks Connect correctly.
# See https://docs.databricks.com/dev-tools/databricks-connect.html.
def get_spark() -> SparkSession:
    try:
        from databricks.connect import DatabricksSession

        return DatabricksSession.builder.getOrCreate()
    except ImportError:
        return SparkSession.builder.getOrCreate()


def main():
    get_taxis(get_spark()).show(5)


if __name__ == "__main__":
    main()
@@ -0,0 +1,75 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "application/vnd.databricks.v1+cell": {
          "cellMetadata": {},
          "inputWidgets": {},
          "nuid": "<UUID>",
          "showTitle": false,
          "title": ""
        }
      },
      "source": [
        "# Default notebook\n",
        "\n",
        "This default notebook is executed using Databricks Workflows as defined in resources/my_jobs_as_code.job.yml."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {},
      "outputs": [],
      "source": [
        "%load_ext autoreload\n",
        "%autoreload 2"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 0,
      "metadata": {
        "application/vnd.databricks.v1+cell": {
          "cellMetadata": {
            "byteLimit": 2048000,
            "rowLimit": 10000
          },
          "inputWidgets": {},
          "nuid": "<UUID>",
          "showTitle": false,
          "title": ""
        }
      },
      "outputs": [],
      "source": [
        "from my_jobs_as_code import main\n",
        "\n",
        "main.get_taxis(spark).show(10)"
      ]
    }
  ],
  "metadata": {
    "application/vnd.databricks.v1+notebook": {
      "dashboards": [],
      "language": "python",
      "notebookMetadata": {
        "pythonIndentUnit": 2
      },
      "notebookName": "notebook",
      "widgets": {}
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.11.4"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
@@ -0,0 +1,8 @@
from my_jobs_as_code.main import get_taxis, get_spark

# running tests requires installing databricks-connect, e.g. by uncommenting it in pyproject.toml


def test_main():
    taxis = get_taxis(get_spark())
    assert taxis.count() > 5
@@ -0,0 +1,12 @@
trace $CLI bundle init experimental-jobs-as-code --config-file ./input.json --output-dir output

cd output/my_jobs_as_code

# silence uv output because it's non-deterministic
uv sync 2> /dev/null

# remove version constraint because it always creates a warning on dev builds
cat databricks.yml | grep -v databricks_cli_version > databricks.yml.new
mv databricks.yml.new databricks.yml

trace $CLI bundle validate -t dev --output json | jq ".resources"
@@ -59,6 +59,11 @@ var nativeTemplates = []nativeTemplate{
        hidden:      true,
        description: "The default PyDABs template",
    },
    {
        name:        "experimental-jobs-as-code",
        hidden:      true,
        description: "Jobs as code template (experimental)",
    },
    {
        name:        customTemplate,
        description: "Bring your own template",
@@ -0,0 +1,28 @@
{
  "welcome_message": "\nWelcome to (EXPERIMENTAL) \"Jobs as code\" template for Databricks Asset Bundles!",
  "properties": {
    "project_name": {
      "type": "string",
      "default": "jobs_as_code_project",
      "description": "Please provide the following details to tailor the template to your preferences.\n\nUnique name for this project",
      "order": 1,
      "pattern": "^[A-Za-z0-9_]+$",
      "pattern_match_failure_message": "Name must consist of letters, numbers, and underscores."
    },
    "include_notebook": {
      "type": "string",
      "default": "yes",
      "enum": ["yes", "no"],
      "description": "Include a stub (sample) notebook in '{{.project_name}}{{path_separator}}src'",
      "order": 2
    },
    "include_python": {
      "type": "string",
      "default": "yes",
      "enum": ["yes", "no"],
      "description": "Include a stub (sample) Python package in '{{.project_name}}/src'",
      "order": 3
    }
  },
  "success_message": "Workspace to use (auto-detected, edit in '{{.project_name}}/databricks.yml'): {{workspace_host}}\n\n✨ Your new project has been created in the '{{.project_name}}' directory!\n\nPlease refer to the README.md file for \"getting started\" instructions.\nSee also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html."
}
@@ -0,0 +1,7 @@
{{define "latest_lts_dbr_version" -}}
    15.4.x-scala2.12
{{- end}}

{{define "latest_lts_db_connect_version_spec" -}}
    >=15.4,<15.5
{{- end}}
@@ -0,0 +1,30 @@
# Preamble

This file only contains template directives; it is skipped for the actual output.

{{skip "__preamble"}}

# TODO add DLT support, placeholder for now
{{$notDLT := true }}
{{$notNotebook := not (eq .include_notebook "yes")}}
{{$notPython := not (eq .include_python "yes")}}

{{if $notPython}}
  {{skip "{{.project_name}}/src/{{.project_name}}"}}
  {{skip "{{.project_name}}/tests/main_test.py"}}
{{end}}

{{if $notDLT}}
  {{skip "{{.project_name}}/src/dlt_pipeline.ipynb"}}
  {{skip "{{.project_name}}/resources/{{.project_name}}_pipeline.py"}}
{{end}}

{{if $notNotebook}}
  {{skip "{{.project_name}}/src/notebook.ipynb"}}
{{end}}

{{if (and $notDLT $notNotebook $notPython)}}
  {{skip "{{.project_name}}/resources/{{.project_name}}_job.py"}}
{{else}}
  {{skip "{{.project_name}}/resources/.gitkeep"}}
{{end}}
libs/template/templates/experimental-jobs-as-code/template/{{.project_name}}/.gitignore (vendored, new file, 8 lines)
@@ -0,0 +1,8 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md
@@ -0,0 +1,60 @@
# {{.project_name}}

The '{{.project_name}}' project was generated by using the "Jobs as code" template.

## Prerequisites

1. Install Databricks CLI 0.238 or later.
   See [Install or update the Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/install.html).

2. Install uv. See [Installing uv](https://docs.astral.sh/uv/getting-started/installation/).
   We use uv to create a virtual environment and install the required dependencies.

3. Authenticate to your Databricks workspace if you have not done so already:
   ```
   $ databricks configure
   ```

4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
   https://docs.databricks.com/dev-tools/vscode-ext.html.
   {{- if (eq .include_python "yes") }} Or read the "getting started" documentation for
   **Databricks Connect** for instructions on running the included Python code from a different IDE.
   {{- end}}

5. For documentation on the Databricks Asset Bundles format used
   for this project, and for CI/CD configuration, see
   https://docs.databricks.com/dev-tools/bundles/index.html.

## Deploy and run jobs

1. Create a new virtual environment and install the required dependencies:
   ```
   $ uv sync
   ```

2. To deploy the bundle to the development target:
   ```
   $ databricks bundle deploy --target dev
   ```

   *(Note that "dev" is the default target, so the `--target` parameter is optional here.)*

   This deploys everything that's defined for this project.
   For example, the default template would deploy a job called
   `[dev yourname] {{.project_name}}_job` to your workspace.
   You can find that job by opening your workspace and clicking on **Workflows**.

3. Similarly, to deploy a production copy, type:
   ```
   $ databricks bundle deploy --target prod
   ```

   Note that the default job from the template has a schedule that runs every day
   (defined in resources/{{.project_name}}_job.py). The schedule
   is paused when deploying in development mode (see [Databricks Asset Bundle deployment modes](
   https://docs.databricks.com/dev-tools/bundles/deployment-modes.html)).

4. To run a job:
   ```
   $ databricks bundle run
   ```
@@ -0,0 +1,51 @@
# This is a Databricks asset bundle definition for {{.project_name}}.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: {{.project_name}}
  uuid: {{bundle_uuid}}
  databricks_cli_version: ">= 0.238.0"

experimental:
  python:
    # Activate virtual environment before loading resources defined in Python.
    # If disabled, defaults to using the Python interpreter available in the current shell.
    venv_path: .venv
    # Functions called to load resources defined in Python. See resources/__init__.py
    resources:
      - "resources:load_resources"

{{ if .include_python -}}
artifacts:
  default:
    type: whl
    path: .
    # We use timestamp as Local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers.)
    # to ensure that changes to wheel package are picked up when used on all-purpose clusters
    build: LOCAL_VERSION=$(date +%Y%m%d.%H%M%S) uv build

{{ end -}}
include:
  - resources/*.yml

targets:
  dev:
    # The default target uses 'mode: development' to create a development copy.
    # - Deployed resources get prefixed with '[dev my_user_name]'
    # - Any job schedules and triggers are paused by default.
    # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
    mode: development
    default: true
    workspace:
      host: {{workspace_host}}

  prod:
    mode: production
    workspace:
      host: {{workspace_host}}
      # We explicitly specify /Workspace/Users/{{user_name}} to make sure we only have a single copy.
      root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target}
    permissions:
      - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}}
        level: CAN_MANAGE
    run_as:
      {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}}
@@ -0,0 +1,27 @@
# Fixtures
{{- /*
We don't want to have too many README.md files, since they
stand out so much. But we do need to have a file here to make
sure the folder is added to Git.
*/}}

This folder is reserved for fixtures, such as CSV files.

Below is an example of how to load fixtures as a data frame:

```
import pandas as pd
import os

def get_absolute_path(*relative_parts):
    if 'dbutils' in globals():
        base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore
        path = os.path.normpath(os.path.join(base_dir, *relative_parts))
        return path if path.startswith("/Workspace") else "/Workspace" + path
    else:
        return os.path.join(*relative_parts)

csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
df = pd.read_csv(csv_file)
display(df)
```
@@ -0,0 +1,57 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "{{.project_name}}"
requires-python = ">=3.10"
description = "wheel file based on {{.project_name}}"

# Dependencies in case the output wheel file is used as a library dependency.
# For defining dependencies, when this package is used in Databricks, see:
# https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
#
# Example:
# dependencies = [
#   "requests==x.y.z",
# ]
dependencies = [
]

# see setup.py
dynamic = ["version"]

{{ if .include_python -}}
[project.entry-points.packages]
main = "{{.project_name}}.main:main"

{{ end -}}

[tool.setuptools]
{{ if .include_python -}}
py-modules = ["resources", "{{.project_name}}"]

{{ else }}
py-modules = ["resources"]

{{ end -}}
[tool.uv]
## Dependencies for local development
dev-dependencies = [
    "databricks-bundles==0.7.0",

    ## Add code completion support for DLT
    # "databricks-dlt",

    ## databricks-connect can be used to run parts of this project locally.
    ## See https://docs.databricks.com/dev-tools/databricks-connect.html.
    ##
    ## Uncomment line below to install a version of db-connect that corresponds to
    ## the Databricks Runtime version used for this project.
    # "databricks-connect{{template "latest_lts_db_connect_version_spec"}}",
]

override-dependencies = [
    # pyspark package conflicts with 'databricks-connect'
    "pyspark; sys_platform == 'never'",
]
@@ -0,0 +1,16 @@
from databricks.bundles.core import (
    Bundle,
    Resources,
    load_resources_from_current_package_module,
)


def load_resources(bundle: Bundle) -> Resources:
    """
    'load_resources' function is referenced in databricks.yml and is responsible for loading
    bundle resources defined in Python code. This function is called by Databricks CLI during
    bundle deployment. After deployment, this function is not used.
    """

    # the default implementation loads all Python files in 'resources' directory
    return load_resources_from_current_package_module()
@@ -0,0 +1,108 @@
{{$include_dlt := "no" -}}
from databricks.bundles.jobs import Job

"""
The main job for {{.project_name}}.

{{- /* Clarify what this job is for, for DLT-only users. */}}
{{if and (eq $include_dlt "yes") (and (eq .include_notebook "no") (eq .include_python "no")) -}}
This job runs {{.project_name}}_pipeline on a schedule.
{{end -}}
"""


{{.project_name}}_job = Job.from_dict(
    {
        "name": "{{.project_name}}_job",
        "trigger": {
            # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger
            "periodic": {
                "interval": 1,
                "unit": "DAYS",
            },
        },
        {{- if not is_service_principal}}
        "email_notifications": {
            "on_failure": [
                "{{user_name}}",
            ],
        },
        {{else}}
        {{- end -}}
        "tasks": [
            {{- if eq .include_notebook "yes" -}}
            {{- "\n " -}}
            {
                "task_key": "notebook_task",
                "job_cluster_key": "job_cluster",
                "notebook_task": {
                    "notebook_path": "src/notebook.ipynb",
                },
            },
            {{- end -}}
            {{- if (eq $include_dlt "yes") -}}
            {{- "\n " -}}
            {
                "task_key": "refresh_pipeline",
                {{- if (eq .include_notebook "yes" )}}
                "depends_on": [
                    {
                        "task_key": "notebook_task",
                    },
                ],
                {{- end}}
                "pipeline_task": {
                    {{- /* TODO: we should find a way that doesn't use magics for the below, like ./{{project_name}}.pipeline.yml */}}
                    "pipeline_id": "${resources.pipelines.{{.project_name}}_pipeline.id}",
                },
            },
            {{- end -}}
            {{- if (eq .include_python "yes") -}}
            {{- "\n " -}}
            {
                "task_key": "main_task",
                {{- if (eq $include_dlt "yes") }}
                "depends_on": [
                    {
                        "task_key": "refresh_pipeline",
                    },
                ],
                {{- else if (eq .include_notebook "yes" )}}
                "depends_on": [
                    {
                        "task_key": "notebook_task",
                    },
                ],
                {{- end}}
                "job_cluster_key": "job_cluster",
                "python_wheel_task": {
                    "package_name": "{{.project_name}}",
                    "entry_point": "main",
                },
                "libraries": [
                    # By default we just include the .whl file generated for the {{.project_name}} package.
                    # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
                    # for more information on how to add other libraries.
                    {
                        "whl": "dist/*.whl",
                    },
                ],
            },
            {{- end -}}
            {{""}}
        ],
        "job_clusters": [
            {
                "job_cluster_key": "job_cluster",
                "new_cluster": {
                    "spark_version": "{{template "latest_lts_dbr_version"}}",
                    "node_type_id": "{{smallest_node_type}}",
                    "autoscale": {
                        "min_workers": 1,
                        "max_workers": 4,
                    },
                },
            },
        ],
    }
)
@@ -0,0 +1,24 @@
from databricks.bundles.pipelines import Pipeline

{{.project_name}}_pipeline = Pipeline.from_dict(
    {
        "name": "{{.project_name}}_pipeline",
        "target": "{{.project_name}}_${bundle.target}",
        {{- if or (eq default_catalog "") (eq default_catalog "hive_metastore")}}
        ## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog:
        "catalog": "catalog_name",
        {{- else}}
        "catalog": "{{default_catalog}}",
        {{- end}}
        "libraries": [
            {
                "notebook": {
                    "path": "src/dlt_pipeline.ipynb",
                },
            },
        ],
        "configuration": {
            "bundle.sourcePath": "${workspace.file_path}/src",
        },
    }
)
@@ -0,0 +1,4 @@
# scratch

This folder is reserved for personal, exploratory notebooks.
By default these are not committed to Git, as 'scratch' is listed in .gitignore.
@@ -0,0 +1,18 @@
"""
setup.py configuration script describing how to build and package this project.

This file is primarily used by the setuptools library and typically should not
be executed directly. See README.md for how to deploy, test, and run
the {{.project_name}} project.
"""

import os

from setuptools import setup

local_version = os.getenv("LOCAL_VERSION")
version = "0.0.1"

setup(
    version=f"{version}+{local_version}" if local_version else version,
)
@@ -0,0 +1,104 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "application/vnd.databricks.v1+cell": {
          "cellMetadata": {},
          "inputWidgets": {},
          "nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec",
          "showTitle": false,
          "title": ""
        }
      },
      "source": [
        "# DLT pipeline\n",
        "\n",
        "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/{{.project_name}}.pipeline.yml."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 0,
      "metadata": {
        "application/vnd.databricks.v1+cell": {
          "cellMetadata": {},
          "inputWidgets": {},
          "nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f",
          "showTitle": false,
          "title": ""
        }
      },
      "outputs": [],
      "source": [
        {{- if (eq .include_python "yes") }}
        "# Import DLT and src/{{.project_name}}\n",
        "import dlt\n",
        "import sys\n",
        "\n",
        "sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n",
        "from pyspark.sql.functions import expr\n",
        "from {{.project_name}} import main"
        {{else}}
        "import dlt\n",
        "from pyspark.sql.functions import expr\n",
        "from pyspark.sql import SparkSession\n",
        "\n",
        "spark = SparkSession.builder.getOrCreate()"
        {{end -}}
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 0,
      "metadata": {
        "application/vnd.databricks.v1+cell": {
          "cellMetadata": {},
          "inputWidgets": {},
          "nuid": "3fc19dba-61fd-4a89-8f8c-24fee63bfb14",
          "showTitle": false,
          "title": ""
        }
      },
      "outputs": [],
      "source": [
        {{- if (eq .include_python "yes") }}
        "@dlt.view\n",
        "def taxi_raw():\n",
        "  return main.get_taxis(spark)\n",
        {{else}}
        "@dlt.view\n",
        "def taxi_raw():\n",
        "  return spark.read.format(\"json\").load(\"/databricks-datasets/nyctaxi/sample/json/\")\n",
        {{end -}}
        "\n",
        "\n",
        "@dlt.table\n",
        "def filtered_taxis():\n",
        "  return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))"
      ]
    }
  ],
  "metadata": {
    "application/vnd.databricks.v1+notebook": {
      "dashboards": [],
      "language": "python",
      "notebookMetadata": {
        "pythonIndentUnit": 2
      },
      "notebookName": "dlt_pipeline",
      "widgets": {}
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.11.4"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
@@ -0,0 +1,79 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "application/vnd.databricks.v1+cell": {
          "cellMetadata": {},
          "inputWidgets": {},
          "nuid": "ee353e42-ff58-4955-9608-12865bd0950e",
          "showTitle": false,
          "title": ""
        }
      },
      "source": [
        "# Default notebook\n",
        "\n",
        "This default notebook is executed using Databricks Workflows as defined in resources/{{.project_name}}.job.yml."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {},
      "outputs": [],
      "source": [
        "%load_ext autoreload\n",
        "%autoreload 2"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 0,
      "metadata": {
        "application/vnd.databricks.v1+cell": {
          "cellMetadata": {
            "byteLimit": 2048000,
            "rowLimit": 10000
          },
          "inputWidgets": {},
          "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
          "showTitle": false,
          "title": ""
        }
      },
      "outputs": [],
      "source": [
        {{- if (eq .include_python "yes") }}
        "from {{.project_name}} import main\n",
        "\n",
        "main.get_taxis(spark).show(10)"
        {{else}}
        "spark.range(10)"
        {{end -}}
      ]
    }
  ],
  "metadata": {
    "application/vnd.databricks.v1+notebook": {
      "dashboards": [],
      "language": "python",
      "notebookMetadata": {
        "pythonIndentUnit": 2
      },
      "notebookName": "notebook",
      "widgets": {}
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.11.4"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
@@ -0,0 +1,25 @@
from pyspark.sql import SparkSession, DataFrame


def get_taxis(spark: SparkSession) -> DataFrame:
    return spark.read.table("samples.nyctaxi.trips")


# Create a new Databricks Connect session. If this fails,
# check that you have configured Databricks Connect correctly.
# See https://docs.databricks.com/dev-tools/databricks-connect.html.
def get_spark() -> SparkSession:
    try:
        from databricks.connect import DatabricksSession

        return DatabricksSession.builder.getOrCreate()
    except ImportError:
        return SparkSession.builder.getOrCreate()


def main():
    get_taxis(get_spark()).show(5)


if __name__ == "__main__":
    main()
@@ -0,0 +1,8 @@
from {{.project_name}}.main import get_taxis, get_spark

# running tests requires installing databricks-connect, e.g. by uncommenting it in pyproject.toml


def test_main():
    taxis = get_taxis(get_spark())
    assert taxis.count() > 5