From e61f0e1eb93361d2553037d3616540ecd006f4ca Mon Sep 17 00:00:00 2001 From: Fabian Jakobs Date: Tue, 5 Mar 2024 15:31:27 +0100 Subject: [PATCH] Fix DBConnect support in VS Code (#1253) ## Changes With the current template, we can't execute the Python file and the jobs notebook using DBConnect from VSCode because we import `from pyspark.sql import SparkSession`, which doesn't support Databricks unified auth. This PR fixes this by passing spark into the library code and by explicitly instantiating a spark session where the spark global is not available. Other changes: * add auto-reload to notebooks * add DLT typings for code completion --- .../requirements-dev.txt.tmpl | 3 +++ .../scratch/exploration.ipynb.tmpl | 12 +++++++++- .../src/dlt_pipeline.ipynb.tmpl | 2 +- .../{{.project_name}}/src/notebook.ipynb.tmpl | 12 +++++++++- .../src/{{.project_name}}/main.py.tmpl | 23 +++++++++++-------- .../{{.project_name}}/tests/main_test.py.tmpl | 19 ++------------- 6 files changed, 42 insertions(+), 29 deletions(-) diff --git a/libs/template/templates/default-python/template/{{.project_name}}/requirements-dev.txt.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/requirements-dev.txt.tmpl index 6da40321..93dd4c48 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/requirements-dev.txt.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/requirements-dev.txt.tmpl @@ -3,6 +3,9 @@ ## For defining dependencies used by jobs in Databricks Workflows, see ## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html +## Add code completion support for DLT +databricks-dlt + ## pytest is the default package used for testing pytest diff --git a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl index 04bb261c..42164dff 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl @@ -1,5 +1,15 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, { "cell_type": "code", "execution_count": null, @@ -22,7 +32,7 @@ "sys.path.append('../src')\n", "from {{.project_name}} import main\n", "\n", - "main.get_taxis().show(10)" + "main.get_taxis(spark).show(10)" {{else}} "spark.range(10)" {{end -}} diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/dlt_pipeline.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/dlt_pipeline.ipynb.tmpl index 4f50294f..b152e9a3 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/dlt_pipeline.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/dlt_pipeline.ipynb.tmpl @@ -63,7 +63,7 @@ {{- if (eq .include_python "yes") }} "@dlt.view\n", "def taxi_raw():\n", - " return main.get_taxis()\n", + " return main.get_taxis(spark)\n", {{else}} "\n", "@dlt.view\n", diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl index 0ab61db2..a228f8d1 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl @@ -17,6 +17,16 @@ "This default notebook is executed using Databricks Workflows as defined in resources/{{.project_name}}_job.yml." ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, { "cell_type": "code", "execution_count": 0, @@ -37,7 +47,7 @@ {{- if (eq .include_python "yes") }} "from {{.project_name}} import main\n", "\n", - "main.get_taxis().show(10)" + "main.get_taxis(spark).show(10)" {{else}} "spark.range(10)" {{end -}} diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl index 4fe5ac8f..c514c6dc 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl @@ -1,16 +1,21 @@ -{{- /* -We use pyspark.sql rather than DatabricksSession.builder.getOrCreate() -for compatibility with older runtimes. With a new runtime, it's -equivalent to DatabricksSession.builder.getOrCreate(). -*/ -}} -from pyspark.sql import SparkSession +from pyspark.sql import SparkSession, DataFrame -def get_taxis(): - spark = SparkSession.builder.getOrCreate() +def get_taxis(spark: SparkSession) -> DataFrame: return spark.read.table("samples.nyctaxi.trips") + +# Create a new Databricks Connect session. If this fails, +# check that you have configured Databricks Connect correctly. +# See https://docs.databricks.com/dev-tools/databricks-connect.html. +def get_spark() -> SparkSession: + try: + from databricks.connect import DatabricksSession + return DatabricksSession.builder.getOrCreate() + except ImportError: + return SparkSession.builder.getOrCreate() + def main(): - get_taxis().show(5) + get_taxis(get_spark()).show(5) if __name__ == '__main__': main() diff --git a/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl index a7a6afe0..fea2f3f6 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl @@ -1,21 +1,6 @@ -from databricks.connect import DatabricksSession -from pyspark.sql import SparkSession -from {{.project_name}} import main +from {{.project_name}}.main import get_taxis, get_spark -# Create a new Databricks Connect session. If this fails, -# check that you have configured Databricks Connect correctly. -# See https://docs.databricks.com/dev-tools/databricks-connect.html. -{{/* - The below works around a problematic error message from Databricks Connect. - The standard SparkSession is supported in all configurations (workspace, IDE, - all runtime versions, CLI). But on the CLI it currently gives a confusing - error message if SPARK_REMOTE is not set. We can't directly use - DatabricksSession.builder in main.py, so we're re-assigning it here so - everything works out of the box, even for CLI users who don't set SPARK_REMOTE. -*/}} -SparkSession.builder = DatabricksSession.builder -SparkSession.builder.getOrCreate() def test_main(): - taxis = main.get_taxis() + taxis = get_taxis(get_spark()) assert taxis.count() > 5