From e61f0e1eb93361d2553037d3616540ecd006f4ca Mon Sep 17 00:00:00 2001
From: Fabian Jakobs <fabian.jakobs@databricks.com>
Date: Tue, 5 Mar 2024 15:31:27 +0100
Subject: [PATCH] Fix DBConnect support in VS Code (#1253)

## Changes

With the current template, we can't execute the Python file and the jobs
notebook using DBConnect from VSCode because we import `from pyspark.sql
import SparkSession`, which doesn't support Databricks unified auth.
This PR fixes this by passing spark into the library code and by
explicitly instantiating a spark session where the spark global is not
available.

Other changes:

* add auto-reload to notebooks
* add DLT typings for code completion
---
 .../requirements-dev.txt.tmpl                 |  3 +++
 .../scratch/exploration.ipynb.tmpl            | 12 +++++++++-
 .../src/dlt_pipeline.ipynb.tmpl               |  2 +-
 .../{{.project_name}}/src/notebook.ipynb.tmpl | 12 +++++++++-
 .../src/{{.project_name}}/main.py.tmpl        | 23 +++++++++++--------
 .../{{.project_name}}/tests/main_test.py.tmpl | 19 ++-------------
 6 files changed, 42 insertions(+), 29 deletions(-)

diff --git a/libs/template/templates/default-python/template/{{.project_name}}/requirements-dev.txt.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/requirements-dev.txt.tmpl
index 6da40321..93dd4c48 100644
--- a/libs/template/templates/default-python/template/{{.project_name}}/requirements-dev.txt.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/requirements-dev.txt.tmpl
@@ -3,6 +3,9 @@
 ## For defining dependencies used by jobs in Databricks Workflows, see
 ## https://docs.databricks.com/dev-tools/bundles/library-dependencies.html
 
+## Add code completion support for DLT
+databricks-dlt
+
 ## pytest is the default package used for testing
 pytest
 
diff --git a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl
index 04bb261c..42164dff 100644
--- a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl
@@ -1,5 +1,15 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -22,7 +32,7 @@
     "sys.path.append('../src')\n",
     "from {{.project_name}} import main\n",
     "\n",
-    "main.get_taxis().show(10)"
+    "main.get_taxis(spark).show(10)"
    {{else}}
     "spark.range(10)"
    {{end -}}
diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/dlt_pipeline.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/dlt_pipeline.ipynb.tmpl
index 4f50294f..b152e9a3 100644
--- a/libs/template/templates/default-python/template/{{.project_name}}/src/dlt_pipeline.ipynb.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/src/dlt_pipeline.ipynb.tmpl
@@ -63,7 +63,7 @@
     {{- if (eq .include_python "yes") }}
     "@dlt.view\n",
     "def taxi_raw():\n",
-    "  return main.get_taxis()\n",
+    "  return main.get_taxis(spark)\n",
     {{else}}
     "\n",
     "@dlt.view\n",
diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl
index 0ab61db2..a228f8d1 100644
--- a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl
@@ -17,6 +17,16 @@
     "This default notebook is executed using Databricks Workflows as defined in resources/{{.project_name}}_job.yml."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 0,
@@ -37,7 +47,7 @@
    {{- if (eq .include_python "yes") }}
     "from {{.project_name}} import main\n",
     "\n",
-    "main.get_taxis().show(10)"
+    "main.get_taxis(spark).show(10)"
    {{else}}
     "spark.range(10)"
    {{end -}}
diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl
index 4fe5ac8f..c514c6dc 100644
--- a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl
@@ -1,16 +1,21 @@
-{{- /*
-We use pyspark.sql rather than DatabricksSession.builder.getOrCreate()
-for compatibility with older runtimes. With a new runtime, it's
-equivalent to DatabricksSession.builder.getOrCreate().
-*/ -}}
-from pyspark.sql import SparkSession
+from pyspark.sql import SparkSession, DataFrame
 
-def get_taxis():
-  spark = SparkSession.builder.getOrCreate()
+def get_taxis(spark: SparkSession) -> DataFrame:
   return spark.read.table("samples.nyctaxi.trips")
 
+
+# Create a new Databricks Connect session. If this fails,
+# check that you have configured Databricks Connect correctly.
+# See https://docs.databricks.com/dev-tools/databricks-connect.html.
+def get_spark() -> SparkSession:
+  try:
+    from databricks.connect import DatabricksSession
+    return DatabricksSession.builder.getOrCreate()
+  except ImportError:
+    return SparkSession.builder.getOrCreate()
+
 def main():
-  get_taxis().show(5)
+  get_taxis(get_spark()).show(5)
 
 if __name__ == '__main__':
   main()
diff --git a/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl
index a7a6afe0..fea2f3f6 100644
--- a/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl
+++ b/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl
@@ -1,21 +1,6 @@
-from databricks.connect import DatabricksSession
-from pyspark.sql import SparkSession
-from {{.project_name}} import main
+from {{.project_name}}.main import get_taxis, get_spark
 
-# Create a new Databricks Connect session. If this fails,
-# check that you have configured Databricks Connect correctly.
-# See https://docs.databricks.com/dev-tools/databricks-connect.html.
-{{/*
-  The below works around a problematic error message from Databricks Connect.
-  The standard SparkSession is supported in all configurations (workspace, IDE,
-  all runtime versions, CLI). But on the CLI it currently gives a confusing
-  error message if SPARK_REMOTE is not set. We can't directly use
-  DatabricksSession.builder in main.py, so we're re-assigning it here so
-  everything works out of the box, even for CLI users who don't set SPARK_REMOTE.
-*/}}
-SparkSession.builder = DatabricksSession.builder
-SparkSession.builder.getOrCreate()
 
 def test_main():
-    taxis = main.get_taxis()
+    taxis = get_taxis(get_spark())
     assert taxis.count() > 5