mirror of https://github.com/databricks/cli.git
Add experimental-jobs-as-code template (#2177)
## Changes

Add the experimental-jobs-as-code template, which allows defining jobs in Python instead of YAML through the `databricks-bundles` PyPI package.

## Tests

Manual and acceptance tests.
Parent: 7034793d1d
Commit: 31c10c1b82
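For orientation, here is a minimal, illustrative sketch of the approach this template introduces (names below are made up; the actual generated files appear in the diff): a resource module builds a `Job` value with `databricks-bundles`, and `databricks.yml` points `experimental.python.resources` at a loader function such as the `load_resources` helper generated in `resources/__init__.py`.

```python
# resources/sample_job.py -- illustrative sketch only; see the generated template files below
from databricks.bundles.jobs import Job

# The job is declared as a plain Python value, built from the same dict shape used in YAML.
sample_job = Job.from_dict(
    {
        "name": "sample_job",
        "tasks": [
            {
                "task_key": "main_task",
                "python_wheel_task": {
                    "package_name": "sample_project",  # hypothetical package name
                    "entry_point": "main",
                },
            },
        ],
    }
)
```

The CLI discovers such values through the `resources: ["resources:load_resources"]` entry in `databricks.yml`, as shown in the generated `databricks.yml` and `resources/__init__.py` below.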
@@ -8,6 +8,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"regexp"
 	"runtime"
 	"slices"
 	"sort"
@@ -393,6 +394,16 @@ func CopyDir(src, dst string, inputs, outputs map[string]bool) error {
 }
 
 func ListDir(t *testing.T, src string) ([]string, error) {
+	// exclude folders in .gitignore from comparison
+	ignored := []string{
+		"\\.ruff_cache",
+		"\\.venv",
+		".*\\.egg-info",
+		"__pycache__",
+		// depends on uv version
+		"uv.lock",
+	}
+
 	var files []string
 	err := filepath.Walk(src, func(path string, info os.FileInfo, err error) error {
 		if err != nil {
@@ -400,7 +411,19 @@ func ListDir(t *testing.T, src string) ([]string, error) {
 		}
 
 		if info.IsDir() {
+			for _, ignoredFolder := range ignored {
+				if matched, _ := regexp.MatchString(ignoredFolder, info.Name()); matched {
+					return filepath.SkipDir
+				}
+			}
+
 			return nil
+		} else {
+			for _, ignoredFolder := range ignored {
+				if matched, _ := regexp.MatchString(ignoredFolder, info.Name()); matched {
+					return nil
+				}
+			}
 		}
 
 		relPath, err := filepath.Rel(src, path)
@@ -0,0 +1,5 @@
{
  "project_name": "my_jobs_as_code",
  "include_notebook": "yes",
  "include_python": "yes"
}
@@ -0,0 +1,85 @@

>>> $CLI bundle init experimental-jobs-as-code --config-file ./input.json --output-dir output

Welcome to (EXPERIMENTAL) "Jobs as code" template for Databricks Asset Bundles!
Workspace to use (auto-detected, edit in 'my_jobs_as_code/databricks.yml'): $DATABRICKS_URL

✨ Your new project has been created in the 'my_jobs_as_code' directory!

Please refer to the README.md file for "getting started" instructions.
See also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html.

>>> $CLI bundle validate -t dev --output json
{
  "jobs": {
    "my_jobs_as_code_job": {
      "deployment": {
        "kind": "BUNDLE",
        "metadata_file_path": "/Workspace/Users/$USERNAME/.bundle/my_jobs_as_code/dev/state/metadata.json"
      },
      "edit_mode": "UI_LOCKED",
      "email_notifications": {
        "on_failure": [
          "$USERNAME"
        ]
      },
      "format": "MULTI_TASK",
      "job_clusters": [
        {
          "job_cluster_key": "job_cluster",
          "new_cluster": {
            "autoscale": {
              "max_workers": 4,
              "min_workers": 1
            },
            "node_type_id": "i3.xlarge",
            "spark_version": "15.4.x-scala2.12"
          }
        }
      ],
      "max_concurrent_runs": 4,
      "name": "[dev $USERNAME] my_jobs_as_code_job",
      "permissions": [],
      "queue": {
        "enabled": true
      },
      "tags": {
        "dev": "$USERNAME"
      },
      "tasks": [
        {
          "job_cluster_key": "job_cluster",
          "notebook_task": {
            "notebook_path": "/Workspace/Users/$USERNAME/.bundle/my_jobs_as_code/dev/files/src/notebook"
          },
          "task_key": "notebook_task"
        },
        {
          "depends_on": [
            {
              "task_key": "notebook_task"
            }
          ],
          "job_cluster_key": "job_cluster",
          "libraries": [
            {
              "whl": "dist/*.whl"
            }
          ],
          "python_wheel_task": {
            "entry_point": "main",
            "package_name": "my_jobs_as_code"
          },
          "task_key": "main_task"
        }
      ],
      "trigger": {
        "pause_status": "PAUSED",
        "periodic": {
          "interval": 1,
          "unit": "DAYS"
        }
      }
    }
  }
}
acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/.gitignore (vendored, new file, 8 lines)
@@ -0,0 +1,8 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md
@@ -0,0 +1,58 @@
# my_jobs_as_code

The 'my_jobs_as_code' project was generated by using the "Jobs as code" template.

## Prerequisites

1. Install Databricks CLI 0.238 or later.
   See [Install or update the Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/install.html).

2. Install uv. See [Installing uv](https://docs.astral.sh/uv/getting-started/installation/).
   We use uv to create a virtual environment and install the required dependencies.

3. Authenticate to your Databricks workspace if you have not done so already:
   ```
   $ databricks configure
   ```

4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
   https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for
   **Databricks Connect** for instructions on running the included Python code from a different IDE.

5. For documentation on the Databricks Asset Bundles format used
   for this project, and for CI/CD configuration, see
   https://docs.databricks.com/dev-tools/bundles/index.html.

## Deploy and run jobs

1. Create a new virtual environment and install the required dependencies:
   ```
   $ uv sync
   ```

2. To deploy the bundle to the development target:
   ```
   $ databricks bundle deploy --target dev
   ```

   *(Note that "dev" is the default target, so the `--target` parameter is optional here.)*

   This deploys everything that's defined for this project.
   For example, the default template would deploy a job called
   `[dev yourname] my_jobs_as_code_job` to your workspace.
   You can find that job by opening your workspace and clicking on **Workflows**.

3. Similarly, to deploy a production copy, type:
   ```
   $ databricks bundle deploy --target prod
   ```

   Note that the default job from the template has a schedule that runs every day
   (defined in resources/my_jobs_as_code_job.py). The schedule
   is paused when deploying in development mode (see [Databricks Asset Bundle deployment modes](
   https://docs.databricks.com/dev-tools/bundles/deployment-modes.html)).

4. To run a job:
   ```
   $ databricks bundle run
   ```
@@ -0,0 +1,48 @@
# This is a Databricks asset bundle definition for my_jobs_as_code.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: my_jobs_as_code
  uuid: <UUID>

experimental:
  python:
    # Activate virtual environment before loading resources defined in Python.
    # If disabled, defaults to using the Python interpreter available in the current shell.
    venv_path: .venv
    # Functions called to load resources defined in Python. See resources/__init__.py
    resources:
      - "resources:load_resources"

artifacts:
  default:
    type: whl
    path: .
    # We use timestamp as Local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers.)
    # to ensure that changes to wheel package are picked up when used on all-purpose clusters
    build: LOCAL_VERSION=$(date +%Y%m%d.%H%M%S) uv build

include:
  - resources/*.yml

targets:
  dev:
    # The default target uses 'mode: development' to create a development copy.
    # - Deployed resources get prefixed with '[dev my_user_name]'
    # - Any job schedules and triggers are paused by default.
    # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
    mode: development
    default: true
    workspace:
      host: $DATABRICKS_URL

  prod:
    mode: production
    workspace:
      host: $DATABRICKS_URL
      # We explicitly specify /Workspace/Users/$USERNAME to make sure we only have a single copy.
      root_path: /Workspace/Users/$USERNAME/.bundle/${bundle.name}/${bundle.target}
    permissions:
      - user_name: $USERNAME
        level: CAN_MANAGE
    run_as:
      user_name: $USERNAME
@@ -0,0 +1,22 @@
# Fixtures

This folder is reserved for fixtures, such as CSV files.

Below is an example of how to load fixtures as a data frame:

```
import pandas as pd
import os

def get_absolute_path(*relative_parts):
    if 'dbutils' in globals():
        base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore
        path = os.path.normpath(os.path.join(base_dir, *relative_parts))
        return path if path.startswith("/Workspace") else "/Workspace" + path
    else:
        return os.path.join(*relative_parts)

csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
df = pd.read_csv(csv_file)
display(df)
```
@@ -0,0 +1,49 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "my_jobs_as_code"
requires-python = ">=3.10"
description = "wheel file based on my_jobs_as_code"

# Dependencies in case the output wheel file is used as a library dependency.
# For defining dependencies, when this package is used in Databricks, see:
# https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
#
# Example:
# dependencies = [
#   "requests==x.y.z",
# ]
dependencies = [
]

# see setup.py
dynamic = ["version"]

[project.entry-points.packages]
main = "my_jobs_as_code.main:main"

[tool.setuptools]
py-modules = ["resources", "my_jobs_as_code"]

[tool.uv]
## Dependencies for local development
dev-dependencies = [
    "databricks-bundles==0.7.0",

    ## Add code completion support for DLT
    # "databricks-dlt",

    ## databricks-connect can be used to run parts of this project locally.
    ## See https://docs.databricks.com/dev-tools/databricks-connect.html.
    ##
    ## Uncomment line below to install a version of db-connect that corresponds to
    ## the Databricks Runtime version used for this project.
    # "databricks-connect>=15.4,<15.5",
]

override-dependencies = [
    # pyspark package conflicts with 'databricks-connect'
    "pyspark; sys_platform == 'never'",
]
@@ -0,0 +1,16 @@
from databricks.bundles.core import (
    Bundle,
    Resources,
    load_resources_from_current_package_module,
)


def load_resources(bundle: Bundle) -> Resources:
    """
    'load_resources' function is referenced in databricks.yml and is responsible for loading
    bundle resources defined in Python code. This function is called by Databricks CLI during
    bundle deployment. After deployment, this function is not used.
    """

    # the default implementation loads all Python files in 'resources' directory
    return load_resources_from_current_package_module()
@@ -0,0 +1,67 @@
from databricks.bundles.jobs import Job

"""
The main job for my_jobs_as_code.
"""


my_jobs_as_code_job = Job.from_dict(
    {
        "name": "my_jobs_as_code_job",
        "trigger": {
            # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger
            "periodic": {
                "interval": 1,
                "unit": "DAYS",
            },
        },
        "email_notifications": {
            "on_failure": [
                "$USERNAME",
            ],
        },
        "tasks": [
            {
                "task_key": "notebook_task",
                "job_cluster_key": "job_cluster",
                "notebook_task": {
                    "notebook_path": "src/notebook.ipynb",
                },
            },
            {
                "task_key": "main_task",
                "depends_on": [
                    {
                        "task_key": "notebook_task",
                    },
                ],
                "job_cluster_key": "job_cluster",
                "python_wheel_task": {
                    "package_name": "my_jobs_as_code",
                    "entry_point": "main",
                },
                "libraries": [
                    # By default we just include the .whl file generated for the my_jobs_as_code package.
                    # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
                    # for more information on how to add other libraries.
                    {
                        "whl": "dist/*.whl",
                    },
                ],
            },
        ],
        "job_clusters": [
            {
                "job_cluster_key": "job_cluster",
                "new_cluster": {
                    "spark_version": "15.4.x-scala2.12",
                    "node_type_id": "i3.xlarge",
                    "autoscale": {
                        "min_workers": 1,
                        "max_workers": 4,
                    },
                },
            },
        ],
    }
)
@@ -0,0 +1,4 @@
# scratch

This folder is reserved for personal, exploratory notebooks.
By default these are not committed to Git, as 'scratch' is listed in .gitignore.
@@ -0,0 +1,18 @@
"""
setup.py configuration script describing how to build and package this project.

This file is primarily used by the setuptools library and typically should not
be executed directly. See README.md for how to deploy, test, and run
the my_jobs_as_code project.
"""

import os

from setuptools import setup

local_version = os.getenv("LOCAL_VERSION")
version = "0.0.1"

setup(
    version=f"{version}+{local_version}" if local_version else version,
)
@@ -0,0 +1,25 @@
from pyspark.sql import SparkSession, DataFrame


def get_taxis(spark: SparkSession) -> DataFrame:
    return spark.read.table("samples.nyctaxi.trips")


# Create a new Databricks Connect session. If this fails,
# check that you have configured Databricks Connect correctly.
# See https://docs.databricks.com/dev-tools/databricks-connect.html.
def get_spark() -> SparkSession:
    try:
        from databricks.connect import DatabricksSession

        return DatabricksSession.builder.getOrCreate()
    except ImportError:
        return SparkSession.builder.getOrCreate()


def main():
    get_taxis(get_spark()).show(5)


if __name__ == "__main__":
    main()
@@ -0,0 +1,75 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "<UUID>",
     "showTitle": false,
     "title": ""
    }
   },
   "source": [
    "# Default notebook\n",
    "\n",
    "This default notebook is executed using Databricks Workflows as defined in resources/my_jobs_as_code.job.yml."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "<UUID>",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "from my_jobs_as_code import main\n",
    "\n",
    "main.get_taxis(spark).show(10)"
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [],
   "language": "python",
   "notebookMetadata": {
    "pythonIndentUnit": 2
   },
   "notebookName": "notebook",
   "widgets": {}
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
@@ -0,0 +1,8 @@
from my_jobs_as_code.main import get_taxis, get_spark

# running tests requires installing databricks-connect, e.g. by uncommenting it in pyproject.toml


def test_main():
    taxis = get_taxis(get_spark())
    assert taxis.count() > 5
@@ -0,0 +1,12 @@
trace $CLI bundle init experimental-jobs-as-code --config-file ./input.json --output-dir output

cd output/my_jobs_as_code

# silence uv output because it's non-deterministic
uv sync 2> /dev/null

# remove version constraint because it always creates a warning on dev builds
cat databricks.yml | grep -v databricks_cli_version > databricks.yml.new
mv databricks.yml.new databricks.yml

trace $CLI bundle validate -t dev --output json | jq ".resources"
@@ -59,6 +59,11 @@ var nativeTemplates = []nativeTemplate{
 		hidden:      true,
 		description: "The default PyDABs template",
 	},
+	{
+		name:        "experimental-jobs-as-code",
+		hidden:      true,
+		description: "Jobs as code template (experimental)",
+	},
 	{
 		name:        customTemplate,
 		description: "Bring your own template",
@@ -0,0 +1,28 @@
{
  "welcome_message": "\nWelcome to (EXPERIMENTAL) \"Jobs as code\" template for Databricks Asset Bundles!",
  "properties": {
    "project_name": {
      "type": "string",
      "default": "jobs_as_code_project",
      "description": "Please provide the following details to tailor the template to your preferences.\n\nUnique name for this project",
      "order": 1,
      "pattern": "^[A-Za-z0-9_]+$",
      "pattern_match_failure_message": "Name must consist of letters, numbers, and underscores."
    },
    "include_notebook": {
      "type": "string",
      "default": "yes",
      "enum": ["yes", "no"],
      "description": "Include a stub (sample) notebook in '{{.project_name}}{{path_separator}}src'",
      "order": 2
    },
    "include_python": {
      "type": "string",
      "default": "yes",
      "enum": ["yes", "no"],
      "description": "Include a stub (sample) Python package in '{{.project_name}}/src'",
      "order": 3
    }
  },
  "success_message": "Workspace to use (auto-detected, edit in '{{.project_name}}/databricks.yml'): {{workspace_host}}\n\n✨ Your new project has been created in the '{{.project_name}}' directory!\n\nPlease refer to the README.md file for \"getting started\" instructions.\nSee also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html."
}
@@ -0,0 +1,7 @@
{{define "latest_lts_dbr_version" -}}
15.4.x-scala2.12
{{- end}}

{{define "latest_lts_db_connect_version_spec" -}}
>=15.4,<15.5
{{- end}}
@@ -0,0 +1,30 @@
# Preamble

This file only contains template directives; it is skipped for the actual output.

{{skip "__preamble"}}

# TODO add DLT support, placeholder for now
{{$notDLT := true }}
{{$notNotebook := not (eq .include_notebook "yes")}}
{{$notPython := not (eq .include_python "yes")}}

{{if $notPython}}
{{skip "{{.project_name}}/src/{{.project_name}}"}}
{{skip "{{.project_name}}/tests/main_test.py"}}
{{end}}

{{if $notDLT}}
{{skip "{{.project_name}}/src/dlt_pipeline.ipynb"}}
{{skip "{{.project_name}}/resources/{{.project_name}}_pipeline.py"}}
{{end}}

{{if $notNotebook}}
{{skip "{{.project_name}}/src/notebook.ipynb"}}
{{end}}

{{if (and $notDLT $notNotebook $notPython)}}
{{skip "{{.project_name}}/resources/{{.project_name}}_job.py"}}
{{else}}
{{skip "{{.project_name}}/resources/.gitkeep"}}
{{end}}
libs/template/templates/experimental-jobs-as-code/template/{{.project_name}}/.gitignore (vendored, new file, 8 lines)
@@ -0,0 +1,8 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md
@@ -0,0 +1,60 @@
# {{.project_name}}

The '{{.project_name}}' project was generated by using the "Jobs as code" template.

## Prerequisites

1. Install Databricks CLI 0.238 or later.
   See [Install or update the Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/install.html).

2. Install uv. See [Installing uv](https://docs.astral.sh/uv/getting-started/installation/).
   We use uv to create a virtual environment and install the required dependencies.

3. Authenticate to your Databricks workspace if you have not done so already:
   ```
   $ databricks configure
   ```

4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
   https://docs.databricks.com/dev-tools/vscode-ext.html.
{{- if (eq .include_python "yes") }} Or read the "getting started" documentation for
   **Databricks Connect** for instructions on running the included Python code from a different IDE.
{{- end}}

5. For documentation on the Databricks Asset Bundles format used
   for this project, and for CI/CD configuration, see
   https://docs.databricks.com/dev-tools/bundles/index.html.

## Deploy and run jobs

1. Create a new virtual environment and install the required dependencies:
   ```
   $ uv sync
   ```

2. To deploy the bundle to the development target:
   ```
   $ databricks bundle deploy --target dev
   ```

   *(Note that "dev" is the default target, so the `--target` parameter is optional here.)*

   This deploys everything that's defined for this project.
   For example, the default template would deploy a job called
   `[dev yourname] {{.project_name}}_job` to your workspace.
   You can find that job by opening your workspace and clicking on **Workflows**.

3. Similarly, to deploy a production copy, type:
   ```
   $ databricks bundle deploy --target prod
   ```

   Note that the default job from the template has a schedule that runs every day
   (defined in resources/{{.project_name}}_job.py). The schedule
   is paused when deploying in development mode (see [Databricks Asset Bundle deployment modes](
   https://docs.databricks.com/dev-tools/bundles/deployment-modes.html)).

4. To run a job:
   ```
   $ databricks bundle run
   ```
@@ -0,0 +1,51 @@
# This is a Databricks asset bundle definition for {{.project_name}}.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: {{.project_name}}
  uuid: {{bundle_uuid}}
  databricks_cli_version: ">= 0.238.0"

experimental:
  python:
    # Activate virtual environment before loading resources defined in Python.
    # If disabled, defaults to using the Python interpreter available in the current shell.
    venv_path: .venv
    # Functions called to load resources defined in Python. See resources/__init__.py
    resources:
      - "resources:load_resources"

{{ if .include_python -}}
artifacts:
  default:
    type: whl
    path: .
    # We use timestamp as Local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers.)
    # to ensure that changes to wheel package are picked up when used on all-purpose clusters
    build: LOCAL_VERSION=$(date +%Y%m%d.%H%M%S) uv build

{{ end -}}
include:
  - resources/*.yml

targets:
  dev:
    # The default target uses 'mode: development' to create a development copy.
    # - Deployed resources get prefixed with '[dev my_user_name]'
    # - Any job schedules and triggers are paused by default.
    # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
    mode: development
    default: true
    workspace:
      host: {{workspace_host}}

  prod:
    mode: production
    workspace:
      host: {{workspace_host}}
      # We explicitly specify /Workspace/Users/{{user_name}} to make sure we only have a single copy.
      root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target}
    permissions:
      - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}}
        level: CAN_MANAGE
    run_as:
      {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}}
@@ -0,0 +1,27 @@
# Fixtures
{{- /*
We don't want to have too many README.md files, since they
stand out so much. But we do need to have a file here to make
sure the folder is added to Git.
*/}}

This folder is reserved for fixtures, such as CSV files.

Below is an example of how to load fixtures as a data frame:

```
import pandas as pd
import os

def get_absolute_path(*relative_parts):
    if 'dbutils' in globals():
        base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore
        path = os.path.normpath(os.path.join(base_dir, *relative_parts))
        return path if path.startswith("/Workspace") else "/Workspace" + path
    else:
        return os.path.join(*relative_parts)

csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
df = pd.read_csv(csv_file)
display(df)
```
@@ -0,0 +1,57 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "{{.project_name}}"
requires-python = ">=3.10"
description = "wheel file based on {{.project_name}}"

# Dependencies in case the output wheel file is used as a library dependency.
# For defining dependencies, when this package is used in Databricks, see:
# https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
#
# Example:
# dependencies = [
#   "requests==x.y.z",
# ]
dependencies = [
]

# see setup.py
dynamic = ["version"]

{{ if .include_python -}}
[project.entry-points.packages]
main = "{{.project_name}}.main:main"

{{ end -}}

[tool.setuptools]
{{ if .include_python -}}
py-modules = ["resources", "{{.project_name}}"]

{{ else }}
py-modules = ["resources"]

{{ end -}}
[tool.uv]
## Dependencies for local development
dev-dependencies = [
    "databricks-bundles==0.7.0",

    ## Add code completion support for DLT
    # "databricks-dlt",

    ## databricks-connect can be used to run parts of this project locally.
    ## See https://docs.databricks.com/dev-tools/databricks-connect.html.
    ##
    ## Uncomment line below to install a version of db-connect that corresponds to
    ## the Databricks Runtime version used for this project.
    # "databricks-connect{{template "latest_lts_db_connect_version_spec"}}",
]

override-dependencies = [
    # pyspark package conflicts with 'databricks-connect'
    "pyspark; sys_platform == 'never'",
]
@@ -0,0 +1,16 @@
from databricks.bundles.core import (
    Bundle,
    Resources,
    load_resources_from_current_package_module,
)


def load_resources(bundle: Bundle) -> Resources:
    """
    'load_resources' function is referenced in databricks.yml and is responsible for loading
    bundle resources defined in Python code. This function is called by Databricks CLI during
    bundle deployment. After deployment, this function is not used.
    """

    # the default implementation loads all Python files in 'resources' directory
    return load_resources_from_current_package_module()
@@ -0,0 +1,108 @@
{{$include_dlt := "no" -}}
from databricks.bundles.jobs import Job

"""
The main job for {{.project_name}}.

{{- /* Clarify what this job is for for DLT-only users. */}}
{{if and (eq $include_dlt "yes") (and (eq .include_notebook "no") (eq .include_python "no")) -}}
This job runs {{.project_name}}_pipeline on a schedule.
{{end -}}
"""


{{.project_name}}_job = Job.from_dict(
    {
        "name": "{{.project_name}}_job",
        "trigger": {
            # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger
            "periodic": {
                "interval": 1,
                "unit": "DAYS",
            },
        },
        {{- if not is_service_principal}}
        "email_notifications": {
            "on_failure": [
                "{{user_name}}",
            ],
        },
        {{else}}
        {{- end -}}
        "tasks": [
            {{- if eq .include_notebook "yes" -}}
            {{- "\n " -}}
            {
                "task_key": "notebook_task",
                "job_cluster_key": "job_cluster",
                "notebook_task": {
                    "notebook_path": "src/notebook.ipynb",
                },
            },
            {{- end -}}
            {{- if (eq $include_dlt "yes") -}}
            {{- "\n " -}}
            {
                "task_key": "refresh_pipeline",
                {{- if (eq .include_notebook "yes" )}}
                "depends_on": [
                    {
                        "task_key": "notebook_task",
                    },
                ],
                {{- end}}
                "pipeline_task": {
                    {{- /* TODO: we should find a way that doesn't use magics for the below, like ./{{project_name}}.pipeline.yml */}}
                    "pipeline_id": "${resources.pipelines.{{.project_name}}_pipeline.id}",
                },
            },
            {{- end -}}
            {{- if (eq .include_python "yes") -}}
            {{- "\n " -}}
            {
                "task_key": "main_task",
                {{- if (eq $include_dlt "yes") }}
                "depends_on": [
                    {
                        "task_key": "refresh_pipeline",
                    },
                ],
                {{- else if (eq .include_notebook "yes" )}}
                "depends_on": [
                    {
                        "task_key": "notebook_task",
                    },
                ],
                {{- end}}
                "job_cluster_key": "job_cluster",
                "python_wheel_task": {
                    "package_name": "{{.project_name}}",
                    "entry_point": "main",
                },
                "libraries": [
                    # By default we just include the .whl file generated for the {{.project_name}} package.
                    # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
                    # for more information on how to add other libraries.
                    {
                        "whl": "dist/*.whl",
                    },
                ],
            },
            {{- end -}}
            {{""}}
        ],
        "job_clusters": [
            {
                "job_cluster_key": "job_cluster",
                "new_cluster": {
                    "spark_version": "{{template "latest_lts_dbr_version"}}",
                    "node_type_id": "{{smallest_node_type}}",
                    "autoscale": {
                        "min_workers": 1,
                        "max_workers": 4,
                    },
                },
            },
        ],
    }
)
@@ -0,0 +1,24 @@
from databricks.bundles.pipelines import Pipeline

{{.project_name}}_pipeline = Pipeline.from_dict(
    {
        "name": "{{.project_name}}_pipeline",
        "target": "{{.project_name}}_${bundle.target}",
        {{- if or (eq default_catalog "") (eq default_catalog "hive_metastore")}}
        ## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog:
        "catalog": "catalog_name",
        {{- else}}
        "catalog": "{{default_catalog}}",
        {{- end}}
        "libraries": [
            {
                "notebook": {
                    "path": "src/dlt_pipeline.ipynb",
                },
            },
        ],
        "configuration": {
            "bundle.sourcePath": "${workspace.file_path}/src",
        },
    }
)
@@ -0,0 +1,4 @@
# scratch

This folder is reserved for personal, exploratory notebooks.
By default these are not committed to Git, as 'scratch' is listed in .gitignore.
@@ -0,0 +1,18 @@
"""
setup.py configuration script describing how to build and package this project.

This file is primarily used by the setuptools library and typically should not
be executed directly. See README.md for how to deploy, test, and run
the {{.project_name}} project.
"""

import os

from setuptools import setup

local_version = os.getenv("LOCAL_VERSION")
version = "0.0.1"

setup(
    version=f"{version}+{local_version}" if local_version else version,
)
@@ -0,0 +1,104 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec",
     "showTitle": false,
     "title": ""
    }
   },
   "source": [
    "# DLT pipeline\n",
    "\n",
    "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/{{.project_name}}.pipeline.yml."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    {{- if (eq .include_python "yes") }}
    "# Import DLT and src/{{.project_name}}\n",
    "import dlt\n",
    "import sys\n",
    "\n",
    "sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n",
    "from pyspark.sql.functions import expr\n",
    "from {{.project_name}} import main"
    {{else}}
    "import dlt\n",
    "from pyspark.sql.functions import expr\n",
    "from pyspark.sql import SparkSession\n",
    "\n",
    "spark = SparkSession.builder.getOrCreate()"
    {{end -}}
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "3fc19dba-61fd-4a89-8f8c-24fee63bfb14",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    {{- if (eq .include_python "yes") }}
    "@dlt.view\n",
    "def taxi_raw():\n",
    "    return main.get_taxis(spark)\n",
    {{else}}
    "@dlt.view\n",
    "def taxi_raw():\n",
    "    return spark.read.format(\"json\").load(\"/databricks-datasets/nyctaxi/sample/json/\")\n",
    {{end -}}
    "\n",
    "\n",
    "@dlt.table\n",
    "def filtered_taxis():\n",
    "    return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))"
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [],
   "language": "python",
   "notebookMetadata": {
    "pythonIndentUnit": 2
   },
   "notebookName": "dlt_pipeline",
   "widgets": {}
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
@@ -0,0 +1,79 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "ee353e42-ff58-4955-9608-12865bd0950e",
     "showTitle": false,
     "title": ""
    }
   },
   "source": [
    "# Default notebook\n",
    "\n",
    "This default notebook is executed using Databricks Workflows as defined in resources/{{.project_name}}.job.yml."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    {{- if (eq .include_python "yes") }}
    "from {{.project_name}} import main\n",
    "\n",
    "main.get_taxis(spark).show(10)"
    {{else}}
    "spark.range(10)"
    {{end -}}
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [],
   "language": "python",
   "notebookMetadata": {
    "pythonIndentUnit": 2
   },
   "notebookName": "notebook",
   "widgets": {}
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
@@ -0,0 +1,25 @@
from pyspark.sql import SparkSession, DataFrame


def get_taxis(spark: SparkSession) -> DataFrame:
    return spark.read.table("samples.nyctaxi.trips")


# Create a new Databricks Connect session. If this fails,
# check that you have configured Databricks Connect correctly.
# See https://docs.databricks.com/dev-tools/databricks-connect.html.
def get_spark() -> SparkSession:
    try:
        from databricks.connect import DatabricksSession

        return DatabricksSession.builder.getOrCreate()
    except ImportError:
        return SparkSession.builder.getOrCreate()


def main():
    get_taxis(get_spark()).show(5)


if __name__ == "__main__":
    main()
@@ -0,0 +1,8 @@
from {{.project_name}}.main import get_taxis, get_spark

# running tests requires installing databricks-connect, e.g. by uncommenting it in pyproject.toml


def test_main():
    taxis = get_taxis(get_spark())
    assert taxis.count() > 5