Pieter Noordhuis 2025-01-15 10:35:58 +01:00
parent 36c08d3cc5
commit c3c4dcd894
No known key found for this signature in database
GPG Key ID: 12ACCCC104CF2930
57 changed files with 193 additions and 47 deletions

View File

@@ -1,30 +0,0 @@
>>> $CLI bundle init default-python --config-file ./input.json --output-dir output
Welcome to the default Python template for Databricks Asset Bundles!
Workspace to use (auto-detected, edit in 'my_default_python/databricks.yml'): http://$DATABRICKS_HOST
✨ Your new project has been created in the 'my_default_python' directory!
Please refer to the README.md file for "getting started" instructions.
See also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html.
>>> $CLI bundle validate -t dev
Name: my_default_python
Target: dev
Workspace:
Host: http://$DATABRICKS_HOST
User: $USERNAME
Path: /Workspace/Users/$USERNAME/.bundle/my_default_python/dev
Validation OK!
>>> $CLI bundle validate -t prod
Name: my_default_python
Target: prod
Workspace:
Host: http://$DATABRICKS_HOST
User: $USERNAME
Path: /Workspace/Users/$USERNAME/.bundle/my_default_python/prod
Validation OK!

View File

@@ -30,3 +30,12 @@ Workspace:
Path: /Workspace/Users/$USERNAME/.bundle/my_dbt_sql/prod
Validation OK!
>>> ruff format --diff
warning: No Python files found under the given path(s)
Exit code: 0
>>> ruff clean
Exit code: 0
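
The added output above comes from a ruff formatting pass over the generated template (the dbt template contains no Python files, so ruff only prints a warning and exits 0). The acceptance-test harness itself is not part of this diff; purely as an illustration, a check like this could be driven from Python roughly as follows (assuming ruff is installed and on PATH; the function name is made up for the sketch):

import subprocess

def check_ruff_formatting(project_dir: str) -> int:
    """Run `ruff format --diff` on a generated template and report the exit code.

    ruff exits 0 when nothing would change, 1 when files would be reformatted,
    and 2 on errors such as unparsable notebook cells.
    """
    result = subprocess.run(
        ["ruff", "format", "--diff", "."],
        cwd=project_dir,
        capture_output=True,
        text=True,
    )
    print(result.stdout, end="")
    print(f"Exit code: {result.returncode}")
    # Remove the .ruff_cache directory afterwards, as `ruff clean` does above.
    subprocess.run(["ruff", "clean"], cwd=project_dir)
    return result.returncode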

View File

@@ -47,7 +47,7 @@ and deployment to production (using Databricks Asset Bundles).
(see https://docs.databricks.com/dev-tools/auth/pat.html).
You can use OAuth as an alternative, but this currently requires manual configuration.
See https://github.com/databricks/dbt-databricks/blob/main/docs/oauth.md
-for general instructions, or https://community.databricks.com/t5/technical-blog/using-dbt-core-with-oauth-on-azure-databricks/ba-p/46605
+for general instructions, or https://community.databricks.com/t5/technical-blog/using-dbt-core-with-oauth-on-azure-databricks/ba-p/<NUMID>
for advice on setting up OAuth for Azure Databricks.
To setup up additional profiles, such as a 'prod' profile,

View File

@@ -3,7 +3,7 @@
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: my_dbt_sql
-  uuid: 9d4ad8f6-850c-45fb-b51a-04fa314564de
+  uuid: <UUID>
include:
  - resources/*.yml

View File

@@ -2,7 +2,7 @@
-- This model file defines a materialized view called 'orders_daily'
--
-- Read more about materialized at https://docs.getdbt.com/reference/resource-configs/databricks-configs#materialized-views-and-streaming-tables
--- Current limitation: a "full refresh" is needed in case the definition below is changed; see https://github.com/databricks/dbt-databricks/issues/561.
+-- Current limitation: a "full refresh" is needed in case the definition below is changed; see https://github.com/databricks/dbt-databricks/issues/<NUMID>.
{{ config(materialized = 'materialized_view') }}
select order_date, count(*) AS number_of_orders
@@ -11,7 +11,7 @@ from {{ ref('orders_raw') }}
-- During development, only process a smaller range of data
{% if target.name != 'prod' %}
-where order_date >= '2019-08-01' and order_date < '2019-09-01'
+where order_date >= '<NUMID>-08-01' and order_date < '<NUMID>-09-01'
{% endif %}
group by order_date

View File

@@ -2,7 +2,7 @@
--
-- The streaming table below ingests all JSON files in /databricks-datasets/retail-org/sales_orders/
-- Read more about streaming tables at https://docs.getdbt.com/reference/resource-configs/databricks-configs#materialized-views-and-streaming-tables
--- Current limitation: a "full refresh" is needed in case the definition below is changed; see https://github.com/databricks/dbt-databricks/issues/561.
+-- Current limitation: a "full refresh" is needed in case the definition below is changed; see https://github.com/databricks/dbt-databricks/issues/<NUMID>.
{{ config(materialized = 'streaming_table') }}
select
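
Not part of this commit, but for context: the model above declares a streaming table over the JSON files in /databricks-datasets/retail-org/sales_orders/. A minimal PySpark sketch for inspecting that source data before wiring it into the dbt model might look like this (it assumes a Spark session that can reach the databricks-datasets path, e.g. on Databricks):

from pyspark.sql import SparkSession

# Exploration only: read the same JSON files the streaming table ingests,
# using a plain batch read rather than a streaming table.
spark = SparkSession.builder.getOrCreate()
sales_orders = spark.read.json("/databricks-datasets/retail-org/sales_orders/")
sales_orders.printSchema()
sales_orders.show(5, truncate=False)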

View File

@@ -0,0 +1,157 @@
>>> $CLI bundle init default-python --config-file ./input.json --output-dir output
Welcome to the default Python template for Databricks Asset Bundles!
Workspace to use (auto-detected, edit in 'my_default_python/databricks.yml'): http://$DATABRICKS_HOST
✨ Your new project has been created in the 'my_default_python' directory!
Please refer to the README.md file for "getting started" instructions.
See also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html.
>>> $CLI bundle validate -t dev
Name: my_default_python
Target: dev
Workspace:
Host: http://$DATABRICKS_HOST
User: $USERNAME
Path: /Workspace/Users/$USERNAME/.bundle/my_default_python/dev
Validation OK!
>>> $CLI bundle validate -t prod
Name: my_default_python
Target: prod
Workspace:
Host: http://$DATABRICKS_HOST
User: $USERNAME
Path: /Workspace/Users/$USERNAME/.bundle/my_default_python/prod
Validation OK!
>>> ruff format --diff
--- scratch/exploration.ipynb:cell 1
+++ scratch/exploration.ipynb:cell 1
--- scratch/exploration.ipynb:cell 2
+++ scratch/exploration.ipynb:cell 2
@@ -1,5 +1,6 @@
import sys
-sys.path.append('../src')
+
+sys.path.append("../src")
from my_default_python import main
main.get_taxis(spark).show(10)
--- setup.py
+++ setup.py
@@ -5,11 +5,13 @@
be executed directly. See README.md for how to deploy, test, and run
the my_default_python project.
"""
+
from setuptools import setup, find_packages
import sys
-sys.path.append('./src')
+sys.path.append("./src")
+
import datetime
import my_default_python
@@ -17,17 +19,15 @@
name="my_default_python",
# We use timestamp as Local version identifier (https://peps.python.org/pep-<NUMID>/#local-version-identifiers.)
# to ensure that changes to wheel package are picked up when used on all-purpose clusters
- version=my_default_python.__version__ + "+" + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S"),
+ version=my_default_python.__version__
+ + "+"
+ + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S"),
url="https://databricks.com",
author="$USERNAME",
description="wheel file based on my_default_python/src",
- packages=find_packages(where='./src'),
- package_dir={'': 'src'},
- entry_points={
- "packages": [
- "main=my_default_python.main:main"
- ]
- },
+ packages=find_packages(where="./src"),
+ package_dir={"": "src"},
+ entry_points={"packages": ["main=my_default_python.main:main"]},
install_requires=[
# Dependencies in case the output wheel file is used as a library dependency.
# For defining dependencies, when this package is used in Databricks, see:
--- src/dlt_pipeline.ipynb:cell 2
+++ src/dlt_pipeline.ipynb:cell 2
@@ -1,6 +1,7 @@
# Import DLT and src/my_default_python
import dlt
import sys
+
sys.path.append(spark.conf.get("bundle.sourcePath", "."))
from pyspark.sql.functions import expr
from my_default_python import main
--- src/dlt_pipeline.ipynb:cell 3
+++ src/dlt_pipeline.ipynb:cell 3
@@ -1,7 +1,8 @@
@dlt.view
def taxi_raw():
- return main.get_taxis(spark)
+ return main.get_taxis(spark)
+
@dlt.table
def filtered_taxis():
- return dlt.read("taxi_raw").filter(expr("fare_amount < 30"))
+ return dlt.read("taxi_raw").filter(expr("fare_amount < 30"))
--- src/my_default_python/main.py
+++ src/my_default_python/main.py
@@ -1,21 +1,25 @@
from pyspark.sql import SparkSession, DataFrame
+
def get_taxis(spark: SparkSession) -> DataFrame:
- return spark.read.table("samples.nyctaxi.trips")
+ return spark.read.table("samples.nyctaxi.trips")
# Create a new Databricks Connect session. If this fails,
# check that you have configured Databricks Connect correctly.
# See https://docs.databricks.com/dev-tools/databricks-connect.html.
def get_spark() -> SparkSession:
- try:
- from databricks.connect import DatabricksSession
- return DatabricksSession.builder.getOrCreate()
- except ImportError:
- return SparkSession.builder.getOrCreate()
+ try:
+ from databricks.connect import DatabricksSession
+
+ return DatabricksSession.builder.getOrCreate()
+ except ImportError:
+ return SparkSession.builder.getOrCreate()
+
def main():
- get_taxis(get_spark()).show(5)
+ get_taxis(get_spark()).show(5)
+
-if __name__ == '__main__':
- main()
+if __name__ == "__main__":
+ main()
4 files would be reformatted, 3 files already formatted
Exit code: 1
>>> ruff clean
Removing cache at: .ruff_cache
Exit code: 0

View File

@@ -2,7 +2,7 @@
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: my_default_python
-  uuid: 3c5cdb6b-9e42-46f3-a33c-54769acda6bf
+  uuid: <UUID>
include:
  - resources/*.yml

View File

@@ -15,7 +15,7 @@ import my_default_python
setup(
    name="my_default_python",
-    # We use timestamp as Local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers.)
+    # We use timestamp as Local version identifier (https://peps.python.org/pep-<NUMID>/#local-version-identifiers.)
    # to ensure that changes to wheel package are picked up when used on all-purpose clusters
    version=my_default_python.__version__ + "+" + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S"),
    url="https://databricks.com",

View File

@@ -6,7 +6,7 @@
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
-"nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec",
+"nuid": "<UUID>",
"showTitle": false,
"title": ""
}
@@ -24,7 +24,7 @@
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
-"nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f",
+"nuid": "<UUID>",
"showTitle": false,
"title": ""
}
@@ -46,7 +46,7 @@
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
-"nuid": "3fc19dba-61fd-4a89-8f8c-24fee63bfb14",
+"nuid": "<UUID>",
"showTitle": false,
"title": ""
}

View File

@@ -6,7 +6,7 @@
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
-"nuid": "ee353e42-ff58-4955-9608-12865bd0950e",
+"nuid": "<UUID>",
"showTitle": false,
"title": ""
}
@@ -33,11 +33,11 @@
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
-"byteLimit": 2048000,
-"rowLimit": 10000
+"byteLimit": <NUMID>,
+"rowLimit": <NUMID>
},
"inputWidgets": {},
-"nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
+"nuid": "<UUID>",
"showTitle": false,
"title": ""
}

View File

@@ -30,3 +30,13 @@ Workspace:
Path: /Workspace/Users/$USERNAME/.bundle/my_default_sql/prod
Validation OK!
>>> ruff format --diff
error: Failed to parse scratch/exploration.ipynb:1:2:15: Simple statements must be separated by newlines or semicolons
Exit code: 2
>>> ruff clean
Removing cache at: .ruff_cache
Exit code: 0

View File

@@ -2,7 +2,7 @@
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
  name: my_default_sql
-  uuid: 631398bf-1d77-42ce-ba4f-9bb29dd64b5a
+  uuid: <UUID>
include:
  - resources/*.yml

View File

@@ -7,7 +7,7 @@
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
-"nuid": "dc8c630c-1ea0-42e4-873f-e4dec4d3d416",
+"nuid": "<UUID>",
"showTitle": false,
"title": ""
}

View File

@@ -15,7 +15,7 @@ WHERE if(
  true,
  -- During development, only process a smaller range of data
-  order_date >= '2019-08-01' AND order_date < '2019-09-01'
+  order_date >= '<NUMID>-08-01' AND order_date < '<NUMID>-09-01'
)
GROUP BY order_date