Update outputs

Pieter Noordhuis 2025-01-15 11:19:12 +01:00
parent 6c3a1fb049
commit 658458f2df
No known key found for this signature in database
GPG Key ID: 12ACCCC104CF2930
7 changed files with 32 additions and 139 deletions

View File

@@ -3,7 +3,7 @@
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
name: my_dbt_sql
-uuid: 77b4c5d0-0d0a-4e1a-a472-c08952b7f38e
+uuid: 92ca153c-da4d-4bc3-aa88-1a468193ba6a
include:
- resources/*.yml
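(For context: the uuid in this and the two databricks.yml files below changes on every regeneration of the template output. The values have the shape of random version-4 UUIDs, so the fixtures differ run to run. A minimal illustrative sketch, not the CLI's own code:

import uuid

# Bundle UUIDs in these fixtures look like random version-4 UUIDs,
# so regenerating the template output yields a fresh value each time.
print(uuid.uuid4())  # e.g. 92ca153c-da4d-4bc3-aa88-1a468193ba6a
)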

View File

@@ -30,126 +30,9 @@ Workspace:
Validation OK!
>>> ruff format --diff
--- scratch/exploration.ipynb:cell 1
+++ scratch/exploration.ipynb:cell 1
--- scratch/exploration.ipynb:cell 2
+++ scratch/exploration.ipynb:cell 2
@@ -1,5 +1,6 @@
import sys
-sys.path.append('../src')
+
+sys.path.append("../src")
from my_default_python import main
main.get_taxis(spark).show(10)
7 files already formatted
--- setup.py
+++ setup.py
@@ -5,11 +5,13 @@
be executed directly. See README.md for how to deploy, test, and run
the my_default_python project.
"""
+
from setuptools import setup, find_packages
import sys
-sys.path.append('./src')
+sys.path.append("./src")
+
import datetime
import my_default_python
@@ -17,17 +19,15 @@
    name="my_default_python",
    # We use timestamp as Local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers.)
    # to ensure that changes to wheel package are picked up when used on all-purpose clusters
-    version=my_default_python.__version__ + "+" + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S"),
+    version=my_default_python.__version__
+    + "+"
+    + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S"),
    url="https://databricks.com",
    author="$USERNAME",
    description="wheel file based on my_default_python/src",
-    packages=find_packages(where='./src'),
-    package_dir={'': 'src'},
-    entry_points={
-        "packages": [
-            "main=my_default_python.main:main"
-        ]
-    },
+    packages=find_packages(where="./src"),
+    package_dir={"": "src"},
+    entry_points={"packages": ["main=my_default_python.main:main"]},
    install_requires=[
        # Dependencies in case the output wheel file is used as a library dependency.
        # For defining dependencies, when this package is used in Databricks, see:
--- src/dlt_pipeline.ipynb:cell 2
+++ src/dlt_pipeline.ipynb:cell 2
@@ -1,6 +1,7 @@
# Import DLT and src/my_default_python
import dlt
import sys
+
sys.path.append(spark.conf.get("bundle.sourcePath", "."))
from pyspark.sql.functions import expr
from my_default_python import main
--- src/dlt_pipeline.ipynb:cell 3
+++ src/dlt_pipeline.ipynb:cell 3
@@ -1,7 +1,8 @@
@dlt.view
def taxi_raw():
-  return main.get_taxis(spark)
+    return main.get_taxis(spark)
+
@dlt.table
def filtered_taxis():
-  return dlt.read("taxi_raw").filter(expr("fare_amount < 30"))
+    return dlt.read("taxi_raw").filter(expr("fare_amount < 30"))
--- src/my_default_python/main.py
+++ src/my_default_python/main.py
@@ -1,21 +1,25 @@
from pyspark.sql import SparkSession, DataFrame
+
def get_taxis(spark: SparkSession) -> DataFrame:
-  return spark.read.table("samples.nyctaxi.trips")
+    return spark.read.table("samples.nyctaxi.trips")
# Create a new Databricks Connect session. If this fails,
# check that you have configured Databricks Connect correctly.
# See https://docs.databricks.com/dev-tools/databricks-connect.html.
def get_spark() -> SparkSession:
-  try:
-    from databricks.connect import DatabricksSession
-    return DatabricksSession.builder.getOrCreate()
-  except ImportError:
-    return SparkSession.builder.getOrCreate()
+    try:
+        from databricks.connect import DatabricksSession
+
+        return DatabricksSession.builder.getOrCreate()
+    except ImportError:
+        return SparkSession.builder.getOrCreate()
+
def main():
-  get_taxis(get_spark()).show(5)
+    get_taxis(get_spark()).show(5)
+
-if __name__ == '__main__':
-  main()
+if __name__ == "__main__":
+    main()
4 files would be reformatted, 3 files already formatted
Exit code: 1
Exit code: 0
>>> ruff clean
Removing cache at: .ruff_cache
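(For context on the Exit code lines above: ruff format --diff prints the would-be changes and exits 1 while any file still needs reformatting, and exits 0 once everything is formatted; ruff clean deletes the .ruff_cache directory. A minimal sketch of reproducing this from Python, assuming ruff is on PATH; the acceptance-test harness that actually produced this output is not shown here:

import subprocess

# Exits 1 and prints a diff while files would be reformatted; exits 0
# once all files are formatted (the "7 files already formatted" case).
result = subprocess.run(["ruff", "format", "--diff", "."], capture_output=True, text=True)
print(result.stdout, end="")
print(f"Exit code: {result.returncode}")

# Removes the local cache, matching "Removing cache at: .ruff_cache" above.
subprocess.run(["ruff", "clean"], check=True)
)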

View File

@@ -2,7 +2,7 @@
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
name: my_default_python
-uuid: f0d73e0a-8483-485c-a7f9-0c1588086268
+uuid: cc022987-d2d7-42c2-9019-3f9615812f23
include:
- resources/*.yml

View File

@@ -5,28 +5,32 @@ This file is primarily used by the setuptools library and typically should not
be executed directly. See README.md for how to deploy, test, and run
the my_default_python project.
"""
from setuptools import setup, find_packages
import sys
-sys.path.append('./src')
+sys.path.append("./src")
import datetime
import my_default_python
+local_version = datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S")
setup(
    name="my_default_python",
    # We use timestamp as Local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers.)
    # to ensure that changes to wheel package are picked up when used on all-purpose clusters
-    version=my_default_python.__version__ + "+" + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S"),
+    version=my_default_python.__version__ + "+" + local_version,
    url="https://databricks.com",
    author="$USERNAME",
    description="wheel file based on my_default_python/src",
-    packages=find_packages(where='./src'),
-    package_dir={'': 'src'},
+    packages=find_packages(where="./src"),
+    package_dir={"": "src"},
    entry_points={
        "packages": [
-            "main=my_default_python.main:main"
-        ]
+            "main=my_default_python.main:main",
+        ],
    },
    install_requires=[
        # Dependencies in case the output wheel file is used as a library dependency.
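(For context: the versioning pattern in this setup.py relies on PEP 440 local version identifiers. Everything after "+" is a local segment, so appending a UTC build timestamp makes every rebuilt wheel a distinct version without bumping the base version, which is what lets all-purpose clusters pick up a changed wheel. A minimal sketch of the resulting value; the "0.0.1" base is a hypothetical stand-in for my_default_python.__version__:

import datetime

base_version = "0.0.1"  # stand-in for my_default_python.__version__
local_version = datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S")
# PEP 440 local version: base version, "+", then the timestamp segment.
print(base_version + "+" + local_version)  # e.g. 0.0.1+20250115.101912
)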

View File

@@ -34,6 +34,7 @@
"# Import DLT and src/my_default_python\n",
"import dlt\n",
"import sys\n",
"\n",
"sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n",
"from pyspark.sql.functions import expr\n",
"from my_default_python import main"
@@ -55,11 +56,12 @@
"source": [
"@dlt.view\n",
"def taxi_raw():\n",
-"  return main.get_taxis(spark)\n",
+"    return main.get_taxis(spark)\n",
"\n",
+"\n",
"@dlt.table\n",
"def filtered_taxis():\n",
-"  return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))"
+"    return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))"
]
}
],

View File

@@ -1,21 +1,25 @@
from pyspark.sql import SparkSession, DataFrame
+
def get_taxis(spark: SparkSession) -> DataFrame:
-  return spark.read.table("samples.nyctaxi.trips")
+    return spark.read.table("samples.nyctaxi.trips")
# Create a new Databricks Connect session. If this fails,
# check that you have configured Databricks Connect correctly.
# See https://docs.databricks.com/dev-tools/databricks-connect.html.
def get_spark() -> SparkSession:
-  try:
-    from databricks.connect import DatabricksSession
-    return DatabricksSession.builder.getOrCreate()
-  except ImportError:
-    return SparkSession.builder.getOrCreate()
+    try:
+        from databricks.connect import DatabricksSession
+
+        return DatabricksSession.builder.getOrCreate()
+    except ImportError:
+        return SparkSession.builder.getOrCreate()
+
def main():
-  get_taxis(get_spark()).show(5)
+    get_taxis(get_spark()).show(5)
+
-if __name__ == '__main__':
-  main()
+if __name__ == "__main__":
+    main()
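(For context: the get_spark() fallback above is what lets the same entry point run both locally and on a cluster. When the databricks-connect package is importable it builds a DatabricksSession; otherwise the ImportError branch falls back to the cluster's plain SparkSession. A hedged usage sketch, assuming the wheel is installed:

from my_default_python.main import get_spark, get_taxis

# Locally this goes through Databricks Connect (DatabricksSession);
# on a cluster the ImportError branch yields a plain SparkSession.
spark = get_spark()
get_taxis(spark).show(5)  # same call path as main()
)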

View File

@@ -2,7 +2,7 @@
# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
bundle:
name: my_default_sql
-uuid: f99204e5-97fa-4ae2-b358-91d3669fcecc
+uuid: 19bd03e9-03e7-462a-a918-a8bcd255e45a
include:
- resources/*.yml