mirror of https://github.com/databricks/cli.git
acc: Include full output for default-python/classic (#2391)
## Tests

Include full output of default-python/classic so it can be used as a base for diffs in cloud tests (#2383).
parent 81606cfcbc
commit 13ac52391d
@@ -0,0 +1,3 @@
# Typings for Pylance in Visual Studio Code
# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md
from databricks.sdk.runtime import *
@@ -0,0 +1,7 @@
{
    "recommendations": [
        "databricks.databricks",
        "ms-python.vscode-pylance",
        "redhat.vscode-yaml"
    ]
}
@@ -0,0 +1,16 @@
{
    "python.analysis.stubPath": ".vscode",
    "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\<codecell\\>|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])",
    "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------",
    "python.testing.pytestArgs": [
        "."
    ],
    "python.testing.unittestEnabled": false,
    "python.testing.pytestEnabled": true,
    "python.analysis.extraPaths": ["src"],
    "files.exclude": {
        "**/*.egg-info": true,
        "**/__pycache__": true,
        ".pytest_cache": true,
    },
}
@@ -0,0 +1,49 @@
# my_default_python

The 'my_default_python' project was generated by using the default-python template.

## Getting started

1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html

2. Authenticate to your Databricks workspace, if you have not done so already:
   ```
   $ databricks configure
   ```

3. To deploy a development copy of this project, type:
   ```
   $ databricks bundle deploy --target dev
   ```
   (Note that "dev" is the default target, so the `--target` parameter
   is optional here.)

   This deploys everything that's defined for this project.
   For example, the default template would deploy a job called
   `[dev yourname] my_default_python_job` to your workspace.
   You can find that job by opening your workspace and clicking on **Workflows**.

4. Similarly, to deploy a production copy, type:
   ```
   $ databricks bundle deploy --target prod
   ```

   Note that the default job from the template has a schedule that runs every day
   (defined in resources/my_default_python.job.yml). The schedule
   is paused when deploying in development mode (see
   https://docs.databricks.com/dev-tools/bundles/deployment-modes.html).

5. To run a job or pipeline, use the "run" command:
   ```
   $ databricks bundle run
   ```
6. Optionally, install the Databricks extension for Visual Studio Code for local development from
   https://docs.databricks.com/dev-tools/vscode-ext.html. It can configure your
   virtual environment and set up Databricks Connect for running unit tests locally.
   When not using these tools, consult your development environment's documentation
   and/or the documentation for Databricks Connect for manually setting up your environment
   (https://docs.databricks.com/en/dev-tools/databricks-connect/python/index.html).

7. For documentation on the Databricks asset bundles format used
   for this project, and for CI/CD configuration, see
   https://docs.databricks.com/dev-tools/bundles/index.html.
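As a quick illustration of step 6 in the README above (an editorial sketch, not part of this commit): once `databricks-connect` from requirements-dev.txt is installed and `databricks configure` has been run, a minimal local check could look like the snippet below. It uses the same `DatabricksSession` API that `src/my_default_python/main.py` falls back on, and reads the same sample table as the template's `get_taxis()`.

```
# Minimal local smoke test for a Databricks Connect setup (illustrative sketch).
# Assumes databricks-connect is installed and workspace authentication is configured.
from databricks.connect import DatabricksSession

spark = DatabricksSession.builder.getOrCreate()

# Prints a handful of rows if the remote connection works; the unit tests under
# tests/ rely on the same connection via get_spark().
spark.read.table("samples.nyctaxi.trips").show(5)
```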
@@ -0,0 +1,29 @@
# This is a Databricks asset bundle definition for my_default_python.
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: my_default_python
  uuid: [UUID]

include:
  - resources/*.yml

targets:
  dev:
    # The default target uses 'mode: development' to create a development copy.
    # - Deployed resources get prefixed with '[dev my_user_name]'
    # - Any job schedules and triggers are paused by default.
    # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html.
    mode: development
    default: true
    workspace:
      host: [DATABRICKS_URL]

  prod:
    mode: production
    workspace:
      host: [DATABRICKS_URL]
      # We explicitly deploy to /Workspace/Users/[USERNAME] to make sure we only have a single copy.
      root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target}
    permissions:
      - user_name: [USERNAME]
        level: CAN_MANAGE
@@ -0,0 +1,22 @@
# Fixtures

This folder is reserved for fixtures, such as CSV files.

Below is an example of how to load fixtures as a data frame:

```
import pandas as pd
import os

def get_absolute_path(*relative_parts):
    if 'dbutils' in globals():
        base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get())  # type: ignore
        path = os.path.normpath(os.path.join(base_dir, *relative_parts))
        return path if path.startswith("/Workspace") else "/Workspace" + path
    else:
        return os.path.join(*relative_parts)

csv_file = get_absolute_path("..", "fixtures", "mycsv.csv")
df = pd.read_csv(csv_file)
display(df)
```
@@ -0,0 +1,8 @@
.databricks/
build/
dist/
__pycache__/
*.egg-info
.venv/
scratch/**
!scratch/README.md
@@ -0,0 +1,3 @@
[pytest]
testpaths = tests
pythonpath = src
@@ -0,0 +1,29 @@
## requirements-dev.txt: dependencies for local development.
##
## For defining dependencies used by jobs in Databricks Workflows, see
## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html

## Add code completion support for DLT
databricks-dlt

## pytest is the default package used for testing
pytest

## Dependencies for building wheel files
setuptools
wheel

## databricks-connect can be used to run parts of this project locally.
## See https://docs.databricks.com/dev-tools/databricks-connect.html.
##
## databricks-connect is automatically installed if you're using the Databricks
## extension for Visual Studio Code
## (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html).
##
## To manually install databricks-connect, either follow the instructions
## at https://docs.databricks.com/dev-tools/databricks-connect.html
## to install the package system-wide, or uncomment the line below to install a
## version of db-connect that corresponds to the Databricks Runtime version used
## for this project.
#
# databricks-connect>=15.4,<15.5
@@ -0,0 +1,50 @@
# The main job for my_default_python.
resources:
  jobs:
    my_default_python_job:
      name: my_default_python_job

      trigger:
        # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger
        periodic:
          interval: 1
          unit: DAYS

      email_notifications:
        on_failure:
          - [USERNAME]

      tasks:
        - task_key: notebook_task
          job_cluster_key: job_cluster
          notebook_task:
            notebook_path: ../src/notebook.ipynb

        - task_key: refresh_pipeline
          depends_on:
            - task_key: notebook_task
          pipeline_task:
            pipeline_id: ${resources.pipelines.my_default_python_pipeline.id}

        - task_key: main_task
          depends_on:
            - task_key: refresh_pipeline
          job_cluster_key: job_cluster
          python_wheel_task:
            package_name: my_default_python
            entry_point: main
          libraries:
            # By default we just include the .whl file generated for the my_default_python package.
            # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
            # for more information on how to add other libraries.
            - whl: ../dist/*.whl

      job_clusters:
        - job_cluster_key: job_cluster
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            data_security_mode: SINGLE_USER
            autoscale:
              min_workers: 1
              max_workers: 4
@@ -0,0 +1,14 @@
# The main pipeline for my_default_python
resources:
  pipelines:
    my_default_python_pipeline:
      name: my_default_python_pipeline
      ## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog:
      # catalog: catalog_name
      target: my_default_python_${bundle.target}
      libraries:
        - notebook:
            path: ../src/dlt_pipeline.ipynb

      configuration:
        bundle.sourcePath: ${workspace.file_path}/src
@@ -0,0 +1,4 @@
# scratch

This folder is reserved for personal, exploratory notebooks.
By default these are not committed to Git, as 'scratch' is listed in .gitignore.
@@ -0,0 +1,61 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "[UUID]",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "sys.path.append(\"../src\")\n",
    "from my_default_python import main\n",
    "\n",
    "main.get_taxis(spark).show(10)"
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [],
   "language": "python",
   "notebookMetadata": {
    "pythonIndentUnit": 2
   },
   "notebookName": "ipynb-notebook",
   "widgets": {}
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
@@ -0,0 +1,41 @@
"""
setup.py configuration script describing how to build and package this project.

This file is primarily used by the setuptools library and typically should not
be executed directly. See README.md for how to deploy, test, and run
the my_default_python project.
"""

from setuptools import setup, find_packages

import sys

sys.path.append("./src")

import datetime
import my_default_python

local_version = datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S")

setup(
    name="my_default_python",
    # We use the timestamp as a local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers)
    # to ensure that changes to the wheel package are picked up when it is used on all-purpose clusters.
    version=my_default_python.__version__ + "+" + local_version,
    url="https://databricks.com",
    author="[USERNAME]",
    description="wheel file based on my_default_python/src",
    packages=find_packages(where="./src"),
    package_dir={"": "src"},
    entry_points={
        "packages": [
            "main=my_default_python.main:main",
        ],
    },
    install_requires=[
        # Dependencies in case the output wheel file is used as a library dependency.
        # For defining dependencies, when this package is used in Databricks, see:
        # https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
        "setuptools"
    ],
)
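Side note on the version scheme in the setup.py above (an editorial illustration, not part of this commit): the `+<timestamp>` suffix is a PEP 440 local version identifier, so every build produces a distinct version even while `__version__` stays at 0.0.1, which is why re-uploaded wheels are picked up on all-purpose clusters. A minimal sketch of the resulting string, assuming the base version 0.0.1 from `__init__.py`:

```
import datetime

# Mirrors the scheme in setup.py: "<base>+<UTC timestamp>" (PEP 440 local version).
base_version = "0.0.1"  # value of my_default_python.__version__ in this template
local_version = datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S")
print(base_version + "+" + local_version)  # e.g. 0.0.1+20250218.141530
```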
@@ -0,0 +1,90 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "[UUID]",
     "showTitle": false,
     "title": ""
    }
   },
   "source": [
    "# DLT pipeline\n",
    "\n",
    "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/my_default_python.pipeline.yml."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "[UUID]",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "# Import DLT and src/my_default_python\n",
    "import dlt\n",
    "import sys\n",
    "\n",
    "sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n",
    "from pyspark.sql.functions import expr\n",
    "from my_default_python import main"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "[UUID]",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "@dlt.view\n",
    "def taxi_raw():\n",
    "    return main.get_taxis(spark)\n",
    "\n",
    "\n",
    "@dlt.table\n",
    "def filtered_taxis():\n",
    "    return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))"
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [],
   "language": "python",
   "notebookMetadata": {
    "pythonIndentUnit": 2
   },
   "notebookName": "dlt_pipeline",
   "widgets": {}
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
@@ -0,0 +1 @@
__version__ = "0.0.1"
@@ -0,0 +1,25 @@
from pyspark.sql import SparkSession, DataFrame


def get_taxis(spark: SparkSession) -> DataFrame:
    return spark.read.table("samples.nyctaxi.trips")


# Create a new Databricks Connect session. If this fails,
# check that you have configured Databricks Connect correctly.
# See https://docs.databricks.com/dev-tools/databricks-connect.html.
def get_spark() -> SparkSession:
    try:
        from databricks.connect import DatabricksSession

        return DatabricksSession.builder.getOrCreate()
    except ImportError:
        return SparkSession.builder.getOrCreate()


def main():
    get_taxis(get_spark()).show(5)


if __name__ == "__main__":
    main()
@@ -0,0 +1,75 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "[UUID]",
     "showTitle": false,
     "title": ""
    }
   },
   "source": [
    "# Default notebook\n",
    "\n",
    "This default notebook is executed using Databricks Workflows as defined in resources/my_default_python.job.yml."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "[UUID]",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "from my_default_python import main\n",
    "\n",
    "main.get_taxis(spark).show(10)"
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [],
   "language": "python",
   "notebookMetadata": {
    "pythonIndentUnit": 2
   },
   "notebookName": "notebook",
   "widgets": {}
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
@@ -0,0 +1,6 @@
from my_default_python.main import get_taxis, get_spark


def test_main():
    taxis = get_taxis(get_spark())
    assert taxis.count() > 5
@@ -11,5 +11,3 @@ cd ../../

# Calculate the difference from the serverless template
diff.py $TESTDIR/../serverless/output output/ > out.compare-vs-serverless.diff

rm -fr output