Update outputs

Pieter Noordhuis 2025-01-15 11:19:12 +01:00
parent 6c3a1fb049
commit 658458f2df
7 changed files with 32 additions and 139 deletions


@@ -3,7 +3,7 @@
 # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
 bundle:
   name: my_dbt_sql
-  uuid: 77b4c5d0-0d0a-4e1a-a472-c08952b7f38e
+  uuid: 92ca153c-da4d-4bc3-aa88-1a468193ba6a
 include:
   - resources/*.yml


@@ -30,126 +30,9 @@ Workspace:
 Validation OK!
 
 >>> ruff format --diff
---- scratch/exploration.ipynb:cell 1
-+++ scratch/exploration.ipynb:cell 1
---- scratch/exploration.ipynb:cell 2
-+++ scratch/exploration.ipynb:cell 2
-@@ -1,5 +1,6 @@
- import sys
--sys.path.append('../src')
-+
-+sys.path.append("../src")
- from my_default_python import main
- main.get_taxis(spark).show(10)
---- setup.py
-+++ setup.py
-@@ -5,11 +5,13 @@
- be executed directly. See README.md for how to deploy, test, and run
- the my_default_python project.
- """
-+
- from setuptools import setup, find_packages
- import sys
--sys.path.append('./src')
-+sys.path.append("./src")
-+
- import datetime
- import my_default_python
-@@ -17,17 +19,15 @@
-     name="my_default_python",
-     # We use timestamp as Local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers.)
-     # to ensure that changes to wheel package are picked up when used on all-purpose clusters
--    version=my_default_python.__version__ + "+" + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S"),
-+    version=my_default_python.__version__
-+    + "+"
-+    + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S"),
-     url="https://databricks.com",
-     author="$USERNAME",
-     description="wheel file based on my_default_python/src",
--    packages=find_packages(where='./src'),
--    package_dir={'': 'src'},
--    entry_points={
--        "packages": [
--            "main=my_default_python.main:main"
--        ]
--    },
-+    packages=find_packages(where="./src"),
-+    package_dir={"": "src"},
-+    entry_points={"packages": ["main=my_default_python.main:main"]},
-     install_requires=[
-         # Dependencies in case the output wheel file is used as a library dependency.
-         # For defining dependencies, when this package is used in Databricks, see:
---- src/dlt_pipeline.ipynb:cell 2
-+++ src/dlt_pipeline.ipynb:cell 2
-@@ -1,6 +1,7 @@
- # Import DLT and src/my_default_python
- import dlt
- import sys
-+
- sys.path.append(spark.conf.get("bundle.sourcePath", "."))
- from pyspark.sql.functions import expr
- from my_default_python import main
---- src/dlt_pipeline.ipynb:cell 3
-+++ src/dlt_pipeline.ipynb:cell 3
-@@ -1,7 +1,8 @@
- @dlt.view
- def taxi_raw():
--  return main.get_taxis(spark)
-+    return main.get_taxis(spark)
-+
- @dlt.table
- def filtered_taxis():
--  return dlt.read("taxi_raw").filter(expr("fare_amount < 30"))
-+    return dlt.read("taxi_raw").filter(expr("fare_amount < 30"))
---- src/my_default_python/main.py
-+++ src/my_default_python/main.py
-@@ -1,21 +1,25 @@
- from pyspark.sql import SparkSession, DataFrame
-+
- def get_taxis(spark: SparkSession) -> DataFrame:
--  return spark.read.table("samples.nyctaxi.trips")
-+    return spark.read.table("samples.nyctaxi.trips")
- # Create a new Databricks Connect session. If this fails,
- # check that you have configured Databricks Connect correctly.
- # See https://docs.databricks.com/dev-tools/databricks-connect.html.
- def get_spark() -> SparkSession:
--  try:
--    from databricks.connect import DatabricksSession
--    return DatabricksSession.builder.getOrCreate()
--  except ImportError:
--    return SparkSession.builder.getOrCreate()
-+    try:
-+        from databricks.connect import DatabricksSession
-+
-+        return DatabricksSession.builder.getOrCreate()
-+    except ImportError:
-+        return SparkSession.builder.getOrCreate()
-+
- def main():
--  get_taxis(get_spark()).show(5)
-+    get_taxis(get_spark()).show(5)
-+
--if __name__ == '__main__':
--  main()
-+if __name__ == "__main__":
-+    main()
-4 files would be reformatted, 3 files already formatted
-Exit code: 1
+7 files already formatted
+
+Exit code: 0
 >>> ruff clean
 Removing cache at: .ruff_cache


@@ -2,7 +2,7 @@
 # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
 bundle:
   name: my_default_python
-  uuid: f0d73e0a-8483-485c-a7f9-0c1588086268
+  uuid: cc022987-d2d7-42c2-9019-3f9615812f23
 include:
   - resources/*.yml


@@ -5,28 +5,32 @@ This file is primarily used by the setuptools library and typically should not
 be executed directly. See README.md for how to deploy, test, and run
 the my_default_python project.
 """
+
 from setuptools import setup, find_packages
 import sys
-sys.path.append('./src')
+sys.path.append("./src")
+
 import datetime
 import my_default_python
 
+local_version = datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S")
+
 setup(
     name="my_default_python",
     # We use timestamp as Local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers.)
     # to ensure that changes to wheel package are picked up when used on all-purpose clusters
-    version=my_default_python.__version__ + "+" + datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S"),
+    version=my_default_python.__version__ + "+" + local_version,
     url="https://databricks.com",
     author="$USERNAME",
     description="wheel file based on my_default_python/src",
-    packages=find_packages(where='./src'),
-    package_dir={'': 'src'},
+    packages=find_packages(where="./src"),
+    package_dir={"": "src"},
     entry_points={
         "packages": [
-            "main=my_default_python.main:main"
-        ]
+            "main=my_default_python.main:main",
+        ],
     },
     install_requires=[
         # Dependencies in case the output wheel file is used as a library dependency.


@@ -34,6 +34,7 @@
     "# Import DLT and src/my_default_python\n",
     "import dlt\n",
     "import sys\n",
+    "\n",
     "sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n",
     "from pyspark.sql.functions import expr\n",
     "from my_default_python import main"
@@ -55,11 +56,12 @@
    "source": [
     "@dlt.view\n",
     "def taxi_raw():\n",
-    "  return main.get_taxis(spark)\n",
+    "    return main.get_taxis(spark)\n",
+    "\n",
     "\n",
     "@dlt.table\n",
     "def filtered_taxis():\n",
-    "  return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))"
+    "    return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))"
    ]
   }
 ],


@@ -1,21 +1,25 @@
 from pyspark.sql import SparkSession, DataFrame
 
+
 def get_taxis(spark: SparkSession) -> DataFrame:
-  return spark.read.table("samples.nyctaxi.trips")
+    return spark.read.table("samples.nyctaxi.trips")
 
 # Create a new Databricks Connect session. If this fails,
 # check that you have configured Databricks Connect correctly.
 # See https://docs.databricks.com/dev-tools/databricks-connect.html.
 def get_spark() -> SparkSession:
-  try:
-    from databricks.connect import DatabricksSession
-    return DatabricksSession.builder.getOrCreate()
-  except ImportError:
-    return SparkSession.builder.getOrCreate()
+    try:
+        from databricks.connect import DatabricksSession
+
+        return DatabricksSession.builder.getOrCreate()
+    except ImportError:
+        return SparkSession.builder.getOrCreate()
+
 def main():
-  get_taxis(get_spark()).show(5)
+    get_taxis(get_spark()).show(5)
+
-if __name__ == '__main__':
-  main()
+if __name__ == "__main__":
+    main()


@@ -2,7 +2,7 @@
 # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
 bundle:
   name: my_default_sql
-  uuid: f99204e5-97fa-4ae2-b358-91d3669fcecc
+  uuid: 19bd03e9-03e7-462a-a918-a8bcd255e45a
 include:
   - resources/*.yml