Add support for non-Python ipynb notebooks to DABs (#1827)

## Changes

### Background

The workspace import APIs recently added support for importing Jupyter
notebooks written in R, Scala, or SQL, i.e. non-Python notebooks. This
now works with the `/import-file` API, which we leverage in the CLI.

Note: We do not need any changes in `databricks sync`. It works out of
the box because any state mapping of local names to remote names that we
store is only scoped to the notebook extension (i.e., `.ipynb` in this
case) and is agnostic of the notebook's specific language.
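
As a minimal illustration of why that mapping is language-agnostic (a
hypothetical helper, not the actual sync code): the remote name is derived
purely from the file name and its notebook extension, so the notebook's
language never enters the mapping.

```go
package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// remoteName sketches how a local notebook name maps to its remote
// workspace name: the notebook extension is stripped, and nothing about
// the mapping depends on the notebook's language.
func remoteName(local string) string {
	ext := filepath.Ext(local)
	switch ext {
	case ".py", ".r", ".scala", ".sql", ".ipynb":
		return strings.TrimSuffix(local, ext)
	}
	return local
}

func main() {
	fmt.Println(remoteName("analysis.ipynb")) // "analysis", whatever the language
	fmt.Println(remoteName("data.csv"))       // "data.csv", not a notebook
}
```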

### Problem this PR addresses

The extension-aware filer previously did not support non-Python `.ipynb`
notebooks because it asserted that a `.ipynb` notebook must be written in
Python. This PR relaxes that constraint and adds integration tests for
both the normal workspace filer and the extension-aware filer writing and
reading non-Python `.ipynb` notebooks.
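
In sketch form, the relaxed check looks like this. It is a condensed
restatement of the filer change in the diff below, not the exact code;
`languageMatches` is a hypothetical helper, while the map mirrors the
`notebook.ExtensionToLanguage` map this PR introduces.

```go
package main

import (
	"fmt"

	"github.com/databricks/databricks-sdk-go/service/workspace"
)

const extensionJupyter = ".ipynb"

// Mirrors notebook.ExtensionToLanguage from this PR: each source-notebook
// extension maps to exactly one language, and .ipynb is deliberately
// absent because a Jupyter notebook may be in any supported language.
var extensionToLanguage = map[string]workspace.Language{
	".py":    workspace.LanguagePython,
	".r":     workspace.LanguageR,
	".scala": workspace.LanguageScala,
	".sql":   workspace.LanguageSql,
}

// languageMatches reports whether a workspace notebook's language is
// consistent with the extension the caller asked for.
func languageMatches(ext string, lang workspace.Language) bool {
	if ext == extensionJupyter {
		// Relaxed by this PR: any language is acceptable for .ipynb.
		return true
	}
	return extensionToLanguage[ext] == lang
}

func main() {
	fmt.Println(languageMatches(".ipynb", workspace.LanguageScala)) // true (previously required Python)
	fmt.Println(languageMatches(".py", workspace.LanguageScala))    // false
}
```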

This means that, after this PR, DABs used from the workspace / the CLI on
DBR will work with non-Python notebooks as well. Deploying non-Python
notebooks in DABs from local machines already works since the
platform-side changes to the API landed; this PR just adds integration
tests for that bit of functionality.

Note: Any platform-side changes we needed for the import API have
already been rolled out to production.

### Before

DABs deployments from local machines would work fine for non-Python
notebooks, but DABs deployments from DBR would not.

### After

DABs deployments of non-Python notebooks work fine both from local machines and from DBR.

## Testing

I created the `.ipynb` notebook fixtures used in the integration tests
directly from the VSCode UI. This ensures high fidelity with how users
will create their non-Python notebooks locally. For Python notebooks this
is supported out of the box by VSCode, but for R and Scala notebooks it
required installing the Jupyter kernels for R and Scala on my local
machine and using them from VSCode.

For SQL, I ended up directly modifying the `language_info` field in the
Jupyter metadata to create the test fixture.
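
Concretely, the field I edited is the notebook-level `language_info`. The
resulting fixture metadata (excerpted from the `sql1.ipynb` fixture added
in this PR, cells omitted) looks like:

```
{
  "metadata": {
    "language_info": {
      "name": "sql"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}
```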

### Discussion: Issues with configuring language at the cell level

The language metadata for a Jupyter notebook is standardized at the
notebook level (in the `language_info` field). Unfortunately, it's not
standardized at the cell level. Thus, for example, if a user changes the
language of a cell in VSCode (which the standard Jupyter VSCode
integration supports), it'll cause a runtime error when the user actually
attempts to run the notebook. This is because the cell-level metadata is
encoded in a format specific to VSCode:
```
"cells": [
  {
    "metadata": {
      "vscode": {
        "languageId": "sql"
      }
    }
  }
]
```
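
By contrast, the notebook-level language can be read reliably because
`language_info` is standardized. A minimal sketch of doing so, with
hypothetical types (this is not the CLI's actual detection code):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// jupyterNotebook captures only the standardized notebook-level language
// field; the struct name and shape are illustrative.
type jupyterNotebook struct {
	Metadata struct {
		LanguageInfo struct {
			Name string `json:"name"`
		} `json:"language_info"`
	} `json:"metadata"`
}

func main() {
	raw := []byte(`{"metadata": {"language_info": {"name": "sql"}}, "nbformat": 4}`)

	var nb jupyterNotebook
	if err := json.Unmarshal(raw, &nb); err != nil {
		panic(err)
	}
	fmt.Println(nb.Metadata.LanguageInfo.Name) // sql
}
```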

Supporting cell-level languages is thus out of scope for this PR; it can
be revisited with the workspace files team if there's strong customer
interest.
shreyas-goenka 2024-11-14 03:09:51 +05:30 committed by GitHub
parent 25838ee0af
commit e1978fa429
14 changed files with 637 additions and 271 deletions


@@ -39,7 +39,7 @@ func (f filerTest) assertContents(ctx context.Context, name string, contents str
assert.Equal(f, contents, body.String())
}
func (f filerTest) assertContentsJupyter(ctx context.Context, name string) {
func (f filerTest) assertContentsJupyter(ctx context.Context, name string, language string) {
reader, err := f.Read(ctx, name)
if !assert.NoError(f, err) {
return
@@ -62,6 +62,7 @@ func (f filerTest) assertContentsJupyter(ctx context.Context, name string) {
// Since a roundtrip to the workspace changes a Jupyter notebook's payload,
// the best we can do is assert that the nbformat is correct.
assert.EqualValues(f, 4, actual["nbformat"])
assert.Equal(f, language, actual["metadata"].(map[string]any)["language_info"].(map[string]any)["name"])
}
func (f filerTest) assertNotExists(ctx context.Context, name string) {
@@ -360,146 +361,114 @@ func TestAccFilerReadDir(t *testing.T) {
}
}
var jupyterNotebookContent1 = `
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Jupyter Notebook Version 1\")"
]
}
],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
`
var jupyterNotebookContent2 = `
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Jupyter Notebook Version 2\")"
]
}
],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
`
func TestAccFilerWorkspaceNotebookConflict(t *testing.T) {
func TestAccFilerWorkspaceNotebook(t *testing.T) {
t.Parallel()
f, _ := setupWsfsFiler(t)
ctx := context.Background()
var err error
// Upload the notebooks
err = f.Write(ctx, "pyNb.py", strings.NewReader("# Databricks notebook source\nprint('first upload'))"))
require.NoError(t, err)
err = f.Write(ctx, "rNb.r", strings.NewReader("# Databricks notebook source\nprint('first upload'))"))
require.NoError(t, err)
err = f.Write(ctx, "sqlNb.sql", strings.NewReader("-- Databricks notebook source\n SELECT \"first upload\""))
require.NoError(t, err)
err = f.Write(ctx, "scalaNb.scala", strings.NewReader("// Databricks notebook source\n println(\"first upload\"))"))
require.NoError(t, err)
err = f.Write(ctx, "jupyterNb.ipynb", strings.NewReader(jupyterNotebookContent1))
tcases := []struct {
name string
nameWithoutExt string
content1 string
expected1 string
content2 string
expected2 string
}{
{
name: "pyNb.py",
nameWithoutExt: "pyNb",
content1: "# Databricks notebook source\nprint('first upload')",
expected1: "# Databricks notebook source\nprint('first upload')",
content2: "# Databricks notebook source\nprint('second upload')",
expected2: "# Databricks notebook source\nprint('second upload')",
},
{
name: "rNb.r",
nameWithoutExt: "rNb",
content1: "# Databricks notebook source\nprint('first upload')",
expected1: "# Databricks notebook source\nprint('first upload')",
content2: "# Databricks notebook source\nprint('second upload')",
expected2: "# Databricks notebook source\nprint('second upload')",
},
{
name: "sqlNb.sql",
nameWithoutExt: "sqlNb",
content1: "-- Databricks notebook source\n SELECT \"first upload\"",
expected1: "-- Databricks notebook source\n SELECT \"first upload\"",
content2: "-- Databricks notebook source\n SELECT \"second upload\"",
expected2: "-- Databricks notebook source\n SELECT \"second upload\"",
},
{
name: "scalaNb.scala",
nameWithoutExt: "scalaNb",
content1: "// Databricks notebook source\n println(\"first upload\")",
expected1: "// Databricks notebook source\n println(\"first upload\")",
content2: "// Databricks notebook source\n println(\"second upload\")",
expected2: "// Databricks notebook source\n println(\"second upload\")",
},
{
name: "pythonJupyterNb.ipynb",
nameWithoutExt: "pythonJupyterNb",
content1: readFile(t, "testdata/notebooks/py1.ipynb"),
expected1: "# Databricks notebook source\nprint(1)",
content2: readFile(t, "testdata/notebooks/py2.ipynb"),
expected2: "# Databricks notebook source\nprint(2)",
},
{
name: "rJupyterNb.ipynb",
nameWithoutExt: "rJupyterNb",
content1: readFile(t, "testdata/notebooks/r1.ipynb"),
expected1: "# Databricks notebook source\nprint(1)",
content2: readFile(t, "testdata/notebooks/r2.ipynb"),
expected2: "# Databricks notebook source\nprint(2)",
},
{
name: "scalaJupyterNb.ipynb",
nameWithoutExt: "scalaJupyterNb",
content1: readFile(t, "testdata/notebooks/scala1.ipynb"),
expected1: "// Databricks notebook source\nprintln(1)",
content2: readFile(t, "testdata/notebooks/scala2.ipynb"),
expected2: "// Databricks notebook source\nprintln(2)",
},
{
name: "sqlJupyterNotebook.ipynb",
nameWithoutExt: "sqlJupyterNotebook",
content1: readFile(t, "testdata/notebooks/sql1.ipynb"),
expected1: "-- Databricks notebook source\nselect 1",
content2: readFile(t, "testdata/notebooks/sql2.ipynb"),
expected2: "-- Databricks notebook source\nselect 2",
},
}
for _, tc := range tcases {
f, _ := setupWsfsFiler(t)
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
// Upload the notebook
err = f.Write(ctx, tc.name, strings.NewReader(tc.content1))
require.NoError(t, err)
// Assert contents after initial upload
filerTest{t, f}.assertContents(ctx, "pyNb", "# Databricks notebook source\nprint('first upload'))")
filerTest{t, f}.assertContents(ctx, "rNb", "# Databricks notebook source\nprint('first upload'))")
filerTest{t, f}.assertContents(ctx, "sqlNb", "-- Databricks notebook source\n SELECT \"first upload\"")
filerTest{t, f}.assertContents(ctx, "scalaNb", "// Databricks notebook source\n println(\"first upload\"))")
filerTest{t, f}.assertContents(ctx, "jupyterNb", "# Databricks notebook source\nprint(\"Jupyter Notebook Version 1\")")
// Assert contents after initial upload. Note that we expect the content
// for jupyter notebooks to be of type source because the workspace files
// client always uses the source format to read notebooks from the workspace.
filerTest{t, f}.assertContents(ctx, tc.nameWithoutExt, tc.expected1)
// Assert uploading a second time fails due to overwrite mode missing
err = f.Write(ctx, "pyNb.py", strings.NewReader("# Databricks notebook source\nprint('second upload'))"))
err = f.Write(ctx, tc.name, strings.NewReader(tc.content2))
assert.ErrorIs(t, err, fs.ErrExist)
assert.Regexp(t, regexp.MustCompile(`file already exists: .*/pyNb$`), err.Error())
assert.Regexp(t, regexp.MustCompile(`file already exists: .*/`+tc.nameWithoutExt+`$`), err.Error())
err = f.Write(ctx, "rNb.r", strings.NewReader("# Databricks notebook source\nprint('second upload'))"))
assert.ErrorIs(t, err, fs.ErrExist)
assert.Regexp(t, regexp.MustCompile(`file already exists: .*/rNb$`), err.Error())
err = f.Write(ctx, "sqlNb.sql", strings.NewReader("# Databricks notebook source\n SELECT \"second upload\")"))
assert.ErrorIs(t, err, fs.ErrExist)
assert.Regexp(t, regexp.MustCompile(`file already exists: .*/sqlNb$`), err.Error())
err = f.Write(ctx, "scalaNb.scala", strings.NewReader("# Databricks notebook source\n println(\"second upload\"))"))
assert.ErrorIs(t, err, fs.ErrExist)
assert.Regexp(t, regexp.MustCompile(`file already exists: .*/scalaNb$`), err.Error())
err = f.Write(ctx, "jupyterNb.ipynb", strings.NewReader(jupyterNotebookContent2))
assert.ErrorIs(t, err, fs.ErrExist)
assert.Regexp(t, regexp.MustCompile(`file already exists: .*/jupyterNb$`), err.Error())
}
func TestAccFilerWorkspaceNotebookWithOverwriteFlag(t *testing.T) {
t.Parallel()
f, _ := setupWsfsFiler(t)
ctx := context.Background()
var err error
// Upload notebooks
err = f.Write(ctx, "pyNb.py", strings.NewReader("# Databricks notebook source\nprint('first upload'))"))
require.NoError(t, err)
err = f.Write(ctx, "rNb.r", strings.NewReader("# Databricks notebook source\nprint('first upload'))"))
require.NoError(t, err)
err = f.Write(ctx, "sqlNb.sql", strings.NewReader("-- Databricks notebook source\n SELECT \"first upload\""))
require.NoError(t, err)
err = f.Write(ctx, "scalaNb.scala", strings.NewReader("// Databricks notebook source\n println(\"first upload\"))"))
require.NoError(t, err)
err = f.Write(ctx, "jupyterNb.ipynb", strings.NewReader(jupyterNotebookContent1))
// Try uploading the notebook again with overwrite flag. This time it should succeed.
err = f.Write(ctx, tc.name, strings.NewReader(tc.content2), filer.OverwriteIfExists)
require.NoError(t, err)
// Assert contents after initial upload
filerTest{t, f}.assertContents(ctx, "pyNb", "# Databricks notebook source\nprint('first upload'))")
filerTest{t, f}.assertContents(ctx, "rNb", "# Databricks notebook source\nprint('first upload'))")
filerTest{t, f}.assertContents(ctx, "sqlNb", "-- Databricks notebook source\n SELECT \"first upload\"")
filerTest{t, f}.assertContents(ctx, "scalaNb", "// Databricks notebook source\n println(\"first upload\"))")
filerTest{t, f}.assertContents(ctx, "jupyterNb", "# Databricks notebook source\nprint(\"Jupyter Notebook Version 1\")")
// Assert contents after second upload
filerTest{t, f}.assertContents(ctx, tc.nameWithoutExt, tc.expected2)
})
}
// Upload notebooks a second time, overwriting the initial uploads
err = f.Write(ctx, "pyNb.py", strings.NewReader("# Databricks notebook source\nprint('second upload'))"), filer.OverwriteIfExists)
require.NoError(t, err)
err = f.Write(ctx, "rNb.r", strings.NewReader("# Databricks notebook source\nprint('second upload'))"), filer.OverwriteIfExists)
require.NoError(t, err)
err = f.Write(ctx, "sqlNb.sql", strings.NewReader("-- Databricks notebook source\n SELECT \"second upload\""), filer.OverwriteIfExists)
require.NoError(t, err)
err = f.Write(ctx, "scalaNb.scala", strings.NewReader("// Databricks notebook source\n println(\"second upload\"))"), filer.OverwriteIfExists)
require.NoError(t, err)
err = f.Write(ctx, "jupyterNb.ipynb", strings.NewReader(jupyterNotebookContent2), filer.OverwriteIfExists)
require.NoError(t, err)
// Assert contents have been overwritten
filerTest{t, f}.assertContents(ctx, "pyNb", "# Databricks notebook source\nprint('second upload'))")
filerTest{t, f}.assertContents(ctx, "rNb", "# Databricks notebook source\nprint('second upload'))")
filerTest{t, f}.assertContents(ctx, "sqlNb", "-- Databricks notebook source\n SELECT \"second upload\"")
filerTest{t, f}.assertContents(ctx, "scalaNb", "// Databricks notebook source\n println(\"second upload\"))")
filerTest{t, f}.assertContents(ctx, "jupyterNb", "# Databricks notebook source\nprint(\"Jupyter Notebook Version 2\")")
}
func TestAccFilerWorkspaceFilesExtensionsReadDir(t *testing.T) {
@@ -515,11 +484,13 @@ func TestAccFilerWorkspaceFilesExtensionsReadDir(t *testing.T) {
{"foo.r", "print('foo')"},
{"foo.scala", "println('foo')"},
{"foo.sql", "SELECT 'foo'"},
{"jupyterNb.ipynb", jupyterNotebookContent1},
{"jupyterNb2.ipynb", jupyterNotebookContent2},
{"py1.ipynb", readFile(t, "testdata/notebooks/py1.ipynb")},
{"pyNb.py", "# Databricks notebook source\nprint('first upload'))"},
{"r1.ipynb", readFile(t, "testdata/notebooks/r1.ipynb")},
{"rNb.r", "# Databricks notebook source\nprint('first upload'))"},
{"scala1.ipynb", readFile(t, "testdata/notebooks/scala1.ipynb")},
{"scalaNb.scala", "// Databricks notebook source\n println(\"first upload\"))"},
{"sql1.ipynb", readFile(t, "testdata/notebooks/sql1.ipynb")},
{"sqlNb.sql", "-- Databricks notebook source\n SELECT \"first upload\""},
}
@@ -554,11 +525,13 @@ func TestAccFilerWorkspaceFilesExtensionsReadDir(t *testing.T) {
"foo.r",
"foo.scala",
"foo.sql",
"jupyterNb.ipynb",
"jupyterNb2.ipynb",
"py1.ipynb",
"pyNb.py",
"r1.ipynb",
"rNb.r",
"scala1.ipynb",
"scalaNb.scala",
"sql1.ipynb",
"sqlNb.sql",
}, names)
@@ -582,7 +555,10 @@ func setupFilerWithExtensionsTest(t *testing.T) filer.Filer {
}{
{"foo.py", "# Databricks notebook source\nprint('first upload'))"},
{"bar.py", "print('foo')"},
{"jupyter.ipynb", jupyterNotebookContent1},
{"p1.ipynb", readFile(t, "testdata/notebooks/py1.ipynb")},
{"r1.ipynb", readFile(t, "testdata/notebooks/r1.ipynb")},
{"scala1.ipynb", readFile(t, "testdata/notebooks/scala1.ipynb")},
{"sql1.ipynb", readFile(t, "testdata/notebooks/sql1.ipynb")},
{"pretender", "not a notebook"},
{"dir/file.txt", "file content"},
{"scala-notebook.scala", "// Databricks notebook source\nprintln('first upload')"},
@@ -608,11 +584,15 @@ func TestAccFilerWorkspaceFilesExtensionsRead(t *testing.T) {
// Read contents of test fixtures as a sanity check.
filerTest{t, wf}.assertContents(ctx, "foo.py", "# Databricks notebook source\nprint('first upload'))")
filerTest{t, wf}.assertContents(ctx, "bar.py", "print('foo')")
filerTest{t, wf}.assertContentsJupyter(ctx, "jupyter.ipynb")
filerTest{t, wf}.assertContents(ctx, "dir/file.txt", "file content")
filerTest{t, wf}.assertContents(ctx, "scala-notebook.scala", "// Databricks notebook source\nprintln('first upload')")
filerTest{t, wf}.assertContents(ctx, "pretender", "not a notebook")
filerTest{t, wf}.assertContentsJupyter(ctx, "p1.ipynb", "python")
filerTest{t, wf}.assertContentsJupyter(ctx, "r1.ipynb", "r")
filerTest{t, wf}.assertContentsJupyter(ctx, "scala1.ipynb", "scala")
filerTest{t, wf}.assertContentsJupyter(ctx, "sql1.ipynb", "sql")
// Read non-existent file
_, err := wf.Read(ctx, "non-existent.py")
assert.ErrorIs(t, err, fs.ErrNotExist)
@@ -638,35 +618,41 @@ func TestAccFilerWorkspaceFilesExtensionsDelete(t *testing.T) {
ctx := context.Background()
wf := setupFilerWithExtensionsTest(t)
// Delete notebook
err := wf.Delete(ctx, "foo.py")
for _, fileName := range []string{
// notebook
"foo.py",
// file
"bar.py",
// python jupyter notebook
"p1.ipynb",
// R jupyter notebook
"r1.ipynb",
// Scala jupyter notebook
"scala1.ipynb",
// SQL jupyter notebook
"sql1.ipynb",
} {
err := wf.Delete(ctx, fileName)
require.NoError(t, err)
filerTest{t, wf}.assertNotExists(ctx, "foo.py")
filerTest{t, wf}.assertNotExists(ctx, fileName)
}
// Delete file
err = wf.Delete(ctx, "bar.py")
require.NoError(t, err)
filerTest{t, wf}.assertNotExists(ctx, "bar.py")
// Delete jupyter notebook
err = wf.Delete(ctx, "jupyter.ipynb")
require.NoError(t, err)
filerTest{t, wf}.assertNotExists(ctx, "jupyter.ipynb")
// Delete non-existent file
err = wf.Delete(ctx, "non-existent.py")
assert.ErrorIs(t, err, fs.ErrNotExist)
// Ensure we do not delete a file as a notebook
err = wf.Delete(ctx, "pretender.py")
assert.ErrorIs(t, err, fs.ErrNotExist)
// Ensure we do not delete a Scala notebook as a Python notebook
_, err = wf.Read(ctx, "scala-notebook.py")
for _, fileName := range []string{
// do not delete non-existent file
"non-existent.py",
// do not delete a file assuming it is a notebook and stripping the extension
"pretender.py",
// do not delete a Scala notebook as a Python notebook
"scala-notebook.py",
// do not delete a file assuming it is a Jupyter notebook and stripping the extension
"pretender.ipynb",
} {
err := wf.Delete(ctx, fileName)
assert.ErrorIs(t, err, fs.ErrNotExist)
}
// Delete directory
err = wf.Delete(ctx, "dir")
err := wf.Delete(ctx, "dir")
assert.ErrorIs(t, err, fs.ErrInvalid)
// Delete directory recursively
@@ -681,44 +667,45 @@ func TestAccFilerWorkspaceFilesExtensionsStat(t *testing.T) {
ctx := context.Background()
wf := setupFilerWithExtensionsTest(t)
// Stat on a notebook
info, err := wf.Stat(ctx, "foo.py")
for _, fileName := range []string{
// notebook
"foo.py",
// file
"bar.py",
// python jupyter notebook
"p1.ipynb",
// R jupyter notebook
"r1.ipynb",
// Scala jupyter notebook
"scala1.ipynb",
// SQL jupyter notebook
"sql1.ipynb",
} {
info, err := wf.Stat(ctx, fileName)
require.NoError(t, err)
assert.Equal(t, "foo.py", info.Name())
assert.False(t, info.IsDir())
// Stat on a file
info, err = wf.Stat(ctx, "bar.py")
require.NoError(t, err)
assert.Equal(t, "bar.py", info.Name())
assert.False(t, info.IsDir())
// Stat on a Jupyter notebook
info, err = wf.Stat(ctx, "jupyter.ipynb")
require.NoError(t, err)
assert.Equal(t, "jupyter.ipynb", info.Name())
assert.Equal(t, fileName, info.Name())
assert.False(t, info.IsDir())
}
// Stat on a directory
info, err = wf.Stat(ctx, "dir")
info, err := wf.Stat(ctx, "dir")
require.NoError(t, err)
assert.Equal(t, "dir", info.Name())
assert.True(t, info.IsDir())
// Stat on a non-existent file
_, err = wf.Stat(ctx, "non-existent.py")
assert.ErrorIs(t, err, fs.ErrNotExist)
// Ensure we do not stat a file as a notebook
_, err = wf.Stat(ctx, "pretender.py")
assert.ErrorIs(t, err, fs.ErrNotExist)
// Ensure we do not stat a Scala notebook as a Python notebook
_, err = wf.Stat(ctx, "scala-notebook.py")
assert.ErrorIs(t, err, fs.ErrNotExist)
_, err = wf.Stat(ctx, "pretender.ipynb")
for _, fileName := range []string{
// non-existent file
"non-existent.py",
// do not stat a file assuming it is a notebook and stripping the extension
"pretender.py",
// do not stat a Scala notebook as a Python notebook
"scala-notebook.py",
// do not read a regular file assuming it is a Jupyter notebook and stripping the extension
"pretender.ipynb",
} {
_, err := wf.Stat(ctx, fileName)
assert.ErrorIs(t, err, fs.ErrNotExist)
}
}
func TestAccWorkspaceFilesExtensionsDirectoriesAreNotNotebooks(t *testing.T) {
@@ -739,32 +726,115 @@ func TestAccWorkspaceFilesExtensionsDirectoriesAreNotNotebooks(t *testing.T) {
func TestAccWorkspaceFilesExtensions_ExportFormatIsPreserved(t *testing.T) {
t.Parallel()
// Case 1: Writing source notebooks.
for _, tc := range []struct {
language string
sourceName string
sourceContent string
jupyterName string
jupyterContent string
}{
{
language: "python",
sourceName: "foo.py",
sourceContent: "# Databricks notebook source\nprint('foo')",
jupyterName: "foo.ipynb",
},
{
language: "r",
sourceName: "foo.r",
sourceContent: "# Databricks notebook source\nprint('foo')",
jupyterName: "foo.ipynb",
},
{
language: "scala",
sourceName: "foo.scala",
sourceContent: "// Databricks notebook source\nprintln('foo')",
jupyterName: "foo.ipynb",
},
{
language: "sql",
sourceName: "foo.sql",
sourceContent: "-- Databricks notebook source\nselect 'foo'",
jupyterName: "foo.ipynb",
},
} {
t.Run("source_"+tc.language, func(t *testing.T) {
t.Parallel()
ctx := context.Background()
wf, _ := setupWsfsExtensionsFiler(t)
// Case 1: Source Notebook
err := wf.Write(ctx, "foo.py", strings.NewReader("# Databricks notebook source\nprint('foo')"))
err := wf.Write(ctx, tc.sourceName, strings.NewReader(tc.sourceContent))
require.NoError(t, err)
// The source notebook should exist but not the Jupyter notebook
filerTest{t, wf}.assertContents(ctx, "foo.py", "# Databricks notebook source\nprint('foo')")
_, err = wf.Stat(ctx, "foo.ipynb")
assert.ErrorIs(t, err, fs.ErrNotExist)
_, err = wf.Read(ctx, "foo.ipynb")
assert.ErrorIs(t, err, fs.ErrNotExist)
err = wf.Delete(ctx, "foo.ipynb")
assert.ErrorIs(t, err, fs.ErrNotExist)
// Assert on the content of the source notebook that's been written.
filerTest{t, wf}.assertContents(ctx, tc.sourceName, tc.sourceContent)
// Case 2: Jupyter Notebook
err = wf.Write(ctx, "bar.ipynb", strings.NewReader(jupyterNotebookContent1))
// Ensure that the source notebook is not read when the name contains
// the .ipynb extension.
_, err = wf.Stat(ctx, tc.jupyterName)
assert.ErrorIs(t, err, fs.ErrNotExist)
_, err = wf.Read(ctx, tc.jupyterName)
assert.ErrorIs(t, err, fs.ErrNotExist)
err = wf.Delete(ctx, tc.jupyterName)
assert.ErrorIs(t, err, fs.ErrNotExist)
})
}
// Case 2: Writing Jupyter notebooks.
for _, tc := range []struct {
language string
sourceName string
jupyterName string
jupyterContent string
}{
{
language: "python",
sourceName: "foo.py",
jupyterName: "foo.ipynb",
jupyterContent: readFile(t, "testdata/notebooks/py1.ipynb"),
},
{
language: "r",
sourceName: "foo.r",
jupyterName: "foo.ipynb",
jupyterContent: readFile(t, "testdata/notebooks/r1.ipynb"),
},
{
language: "scala",
sourceName: "foo.scala",
jupyterName: "foo.ipynb",
jupyterContent: readFile(t, "testdata/notebooks/scala1.ipynb"),
},
{
language: "sql",
sourceName: "foo.sql",
jupyterName: "foo.ipynb",
jupyterContent: readFile(t, "testdata/notebooks/sql1.ipynb"),
},
} {
t.Run("jupyter_"+tc.language, func(t *testing.T) {
t.Parallel()
ctx := context.Background()
wf, _ := setupWsfsExtensionsFiler(t)
err := wf.Write(ctx, tc.jupyterName, strings.NewReader(tc.jupyterContent))
require.NoError(t, err)
// The Jupyter notebook should exist but not the source notebook
filerTest{t, wf}.assertContentsJupyter(ctx, "bar.ipynb")
_, err = wf.Stat(ctx, "bar.py")
// Assert that the written notebook is jupyter and has the correct
// language_info metadata set.
filerTest{t, wf}.assertContentsJupyter(ctx, tc.jupyterName, tc.language)
// Ensure that the Jupyter notebook is not read when the name does not
// contain the .ipynb extension.
_, err = wf.Stat(ctx, tc.sourceName)
assert.ErrorIs(t, err, fs.ErrNotExist)
_, err = wf.Read(ctx, "bar.py")
_, err = wf.Read(ctx, tc.sourceName)
assert.ErrorIs(t, err, fs.ErrNotExist)
err = wf.Delete(ctx, "bar.py")
err = wf.Delete(ctx, tc.sourceName)
assert.ErrorIs(t, err, fs.ErrNotExist)
})
}
}


@@ -352,6 +352,13 @@ func RequireErrorRun(t *testing.T, args ...string) (bytes.Buffer, bytes.Buffer,
return stdout, stderr, err
}
func readFile(t *testing.T, name string) string {
b, err := os.ReadFile(name)
require.NoError(t, err)
return string(b)
}
func writeFile(t *testing.T, name string, body string) string {
f, err := os.Create(filepath.Join(t.TempDir(), name))
require.NoError(t, err)
@@ -562,12 +569,10 @@ func setupLocalFiler(t *testing.T) (filer.Filer, string) {
}
func setupWsfsFiler(t *testing.T) (filer.Filer, string) {
t.Log(GetEnvOrSkipTest(t, "CLOUD_ENV"))
ctx, wt := acc.WorkspaceTest(t)
ctx := context.Background()
w := databricks.Must(databricks.NewWorkspaceClient())
tmpdir := TemporaryWorkspaceDir(t, w)
f, err := filer.NewWorkspaceFilesClient(w, tmpdir)
tmpdir := TemporaryWorkspaceDir(t, wt.W)
f, err := filer.NewWorkspaceFilesClient(wt.W, tmpdir)
require.NoError(t, err)
// Check if we can use this API here, skip test if we cannot.
@@ -581,11 +586,10 @@ func setupWsfsFiler(t *testing.T) (filer.Filer, string) {
}
func setupWsfsExtensionsFiler(t *testing.T) (filer.Filer, string) {
t.Log(GetEnvOrSkipTest(t, "CLOUD_ENV"))
_, wt := acc.WorkspaceTest(t)
w := databricks.Must(databricks.NewWorkspaceClient())
tmpdir := TemporaryWorkspaceDir(t, w)
f, err := filer.NewWorkspaceFilesExtensionsClient(w, tmpdir)
tmpdir := TemporaryWorkspaceDir(t, wt.W)
f, err := filer.NewWorkspaceFilesExtensionsClient(wt.W, tmpdir)
require.NoError(t, err)
return f, tmpdir

internal/testdata/notebooks/py1.ipynb (new file)

@@ -0,0 +1,27 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(1)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.13"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

internal/testdata/notebooks/py2.ipynb (new file)

@@ -0,0 +1,27 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.13"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

internal/testdata/notebooks/r1.ipynb (new file)

@@ -0,0 +1,25 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(1)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"name": "R"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

internal/testdata/notebooks/r2.ipynb (new file)

@@ -0,0 +1,29 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"print(2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"name": "R"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

internal/testdata/notebooks/scala1.ipynb (new file)

@@ -0,0 +1,38 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n"
]
}
],
"source": [
"println(1)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Scala",
"language": "scala",
"name": "scala"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".sc",
"mimetype": "text/x-scala",
"name": "scala",
"nbconvert_exporter": "script",
"version": "2.13.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

internal/testdata/notebooks/scala2.ipynb (new file)

@@ -0,0 +1,38 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n"
]
}
],
"source": [
"println(2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Scala",
"language": "scala",
"name": "scala"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".sc",
"mimetype": "text/x-scala",
"name": "scala",
"nbconvert_exporter": "script",
"version": "2.13.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

internal/testdata/notebooks/sql1.ipynb (new file)

@@ -0,0 +1,20 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"select 1"
]
}
],
"metadata": {
"language_info": {
"name": "sql"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

internal/testdata/notebooks/sql2.ipynb (new file)

@@ -0,0 +1,20 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"select 2"
]
}
],
"metadata": {
"language_info": {
"name": "sql"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -7,6 +7,7 @@ import (
"io"
"io/fs"
"path"
"slices"
"strings"
"github.com/databricks/cli/libs/log"
@@ -23,14 +24,6 @@ type workspaceFilesExtensionsClient struct {
readonly bool
}
var extensionsToLanguages = map[string]workspace.Language{
".py": workspace.LanguagePython,
".r": workspace.LanguageR,
".scala": workspace.LanguageScala,
".sql": workspace.LanguageSql,
".ipynb": workspace.LanguagePython,
}
type workspaceFileStatus struct {
wsfsFileInfo
@@ -54,7 +47,12 @@ func (w *workspaceFilesExtensionsClient) getNotebookStatByNameWithExt(ctx contex
nameWithoutExt := strings.TrimSuffix(name, ext)
// File name does not have an extension associated with Databricks notebooks, return early.
if _, ok := extensionsToLanguages[ext]; !ok {
if !slices.Contains([]string{
notebook.ExtensionPython,
notebook.ExtensionR,
notebook.ExtensionScala,
notebook.ExtensionSql,
notebook.ExtensionJupyter}, ext) {
return nil, nil
}
@@ -75,22 +73,23 @@ func (w *workspaceFilesExtensionsClient) getNotebookStatByNameWithExt(ctx contex
return nil, nil
}
// Not the correct language. Return early.
if stat.Language != extensionsToLanguages[ext] {
log.Debugf(ctx, "attempting to determine if %s could be a notebook. Found a notebook at %s but it is not of the correct language. Expected %s but found %s.", name, path.Join(w.root, nameWithoutExt), extensionsToLanguages[ext], stat.Language)
// Not the correct language. Return early. Note: All languages are supported
// for Jupyter notebooks.
if ext != notebook.ExtensionJupyter && stat.Language != notebook.ExtensionToLanguage[ext] {
log.Debugf(ctx, "attempting to determine if %s could be a notebook. Found a notebook at %s but it is not of the correct language. Expected %s but found %s.", name, path.Join(w.root, nameWithoutExt), notebook.ExtensionToLanguage[ext], stat.Language)
return nil, nil
}
// When the extension is .py we expect the export format to be source.
// For non-jupyter notebooks the export format should be source.
// If it's not, return early.
if ext == ".py" && stat.ReposExportFormat != workspace.ExportFormatSource {
if ext != notebook.ExtensionJupyter && stat.ReposExportFormat != workspace.ExportFormatSource {
log.Debugf(ctx, "attempting to determine if %s could be a notebook. Found a notebook at %s but it is not exported as a source notebook. Its export format is %s.", name, path.Join(w.root, nameWithoutExt), stat.ReposExportFormat)
return nil, nil
}
// When the extension is .ipynb we expect the export format to be Jupyter.
// If it's not, return early.
if ext == ".ipynb" && stat.ReposExportFormat != workspace.ExportFormatJupyter {
if ext == notebook.ExtensionJupyter && stat.ReposExportFormat != workspace.ExportFormatJupyter {
log.Debugf(ctx, "attempting to determine if %s could be a notebook. Found a notebook at %s but it is not exported as a Jupyter notebook. Its export format is %s.", name, path.Join(w.root, nameWithoutExt), stat.ReposExportFormat)
return nil, nil
}
@@ -120,8 +119,8 @@ func (w *workspaceFilesExtensionsClient) getNotebookStatByNameWithoutExt(ctx con
ext := notebook.GetExtensionByLanguage(&stat.ObjectInfo)
// If the notebook was exported as a Jupyter notebook, the extension should be .ipynb.
if stat.Language == workspace.LanguagePython && stat.ReposExportFormat == workspace.ExportFormatJupyter {
ext = ".ipynb"
if stat.ReposExportFormat == workspace.ExportFormatJupyter {
ext = notebook.ExtensionJupyter
}
// Modify the stat object path to include the extension. This stat object will be used


@@ -37,7 +37,7 @@ func TestFilerWorkspaceFilesExtensionsErrorsOnDupName(t *testing.T) {
expectedError string
}{
{
name: "python source notebook and file",
name: "python source notebook and file with source extension",
language: workspace.LanguagePython,
notebookExportFormat: workspace.ExportFormatSource,
notebookPath: "/dir/foo",
@@ -45,7 +45,31 @@
expectedError: "failed to read files from the workspace file system. Duplicate paths encountered. Both NOTEBOOK at /dir/foo and FILE at /dir/foo.py resolve to the same name /foo.py. Changing the name of one of these objects will resolve this issue",
},
{
name: "python jupyter notebook and file",
name: "scala source notebook and file with source extension",
language: workspace.LanguageScala,
notebookExportFormat: workspace.ExportFormatSource,
notebookPath: "/dir/foo",
filePath: "/dir/foo.scala",
expectedError: "failed to read files from the workspace file system. Duplicate paths encountered. Both NOTEBOOK at /dir/foo and FILE at /dir/foo.scala resolve to the same name /foo.scala. Changing the name of one of these objects will resolve this issue",
},
{
name: "r source notebook and file with source extension",
language: workspace.LanguageR,
notebookExportFormat: workspace.ExportFormatSource,
notebookPath: "/dir/foo",
filePath: "/dir/foo.r",
expectedError: "failed to read files from the workspace file system. Duplicate paths encountered. Both NOTEBOOK at /dir/foo and FILE at /dir/foo.r resolve to the same name /foo.r. Changing the name of one of these objects will resolve this issue",
},
{
name: "sql source notebook and file with source extension",
language: workspace.LanguageSql,
notebookExportFormat: workspace.ExportFormatSource,
notebookPath: "/dir/foo",
filePath: "/dir/foo.sql",
expectedError: "failed to read files from the workspace file system. Duplicate paths encountered. Both NOTEBOOK at /dir/foo and FILE at /dir/foo.sql resolve to the same name /foo.sql. Changing the name of one of these objects will resolve this issue",
},
{
name: "python jupyter notebook and file with source extension",
language: workspace.LanguagePython,
notebookExportFormat: workspace.ExportFormatJupyter,
notebookPath: "/dir/foo",
@@ -54,37 +78,64 @@
expectedError: "",
},
{
name: "scala source notebook and file",
name: "scala jupyter notebook and file with source extension",
language: workspace.LanguageScala,
notebookExportFormat: workspace.ExportFormatSource,
notebookExportFormat: workspace.ExportFormatJupyter,
notebookPath: "/dir/foo",
filePath: "/dir/foo.scala",
expectedError: "failed to read files from the workspace file system. Duplicate paths encountered. Both NOTEBOOK at /dir/foo and FILE at /dir/foo.scala resolve to the same name /foo.scala. Changing the name of one of these objects will resolve this issue",
// Jupyter notebooks would correspond to foo.ipynb so an error is not expected.
expectedError: "",
},
{
name: "r source notebook and file",
language: workspace.LanguageR,
notebookExportFormat: workspace.ExportFormatSource,
notebookPath: "/dir/foo",
filePath: "/dir/foo.r",
expectedError: "failed to read files from the workspace file system. Duplicate paths encountered. Both NOTEBOOK at /dir/foo and FILE at /dir/foo.r resolve to the same name /foo.r. Changing the name of one of these objects will resolve this issue",
},
{
name: "sql source notebook and file",
name: "sql jupyter notebook and file with source extension",
language: workspace.LanguageSql,
notebookExportFormat: workspace.ExportFormatSource,
notebookExportFormat: workspace.ExportFormatJupyter,
notebookPath: "/dir/foo",
filePath: "/dir/foo.sql",
expectedError: "failed to read files from the workspace file system. Duplicate paths encountered. Both NOTEBOOK at /dir/foo and FILE at /dir/foo.sql resolve to the same name /foo.sql. Changing the name of one of these objects will resolve this issue",
// Jupyter notebooks would correspond to foo.ipynb so an error is not expected.
expectedError: "",
},
{
name: "python jupyter notebook and file",
name: "r jupyter notebook and file with source extension",
language: workspace.LanguageR,
notebookExportFormat: workspace.ExportFormatJupyter,
notebookPath: "/dir/foo",
filePath: "/dir/foo.sql",
// Jupyter notebooks would correspond to foo.ipynb so an error is not expected.
expectedError: "",
},
{
name: "python jupyter notebook and file with .ipynb extension",
language: workspace.LanguagePython,
notebookExportFormat: workspace.ExportFormatJupyter,
notebookPath: "/dir/foo",
filePath: "/dir/foo.ipynb",
expectedError: "failed to read files from the workspace file system. Duplicate paths encountered. Both NOTEBOOK at /dir/foo and FILE at /dir/foo.ipynb resolve to the same name /foo.ipynb. Changing the name of one of these objects will resolve this issue",
},
{
name: "scala jupyter notebook and file with .ipynb extension",
language: workspace.LanguageScala,
notebookExportFormat: workspace.ExportFormatJupyter,
notebookPath: "/dir/foo",
filePath: "/dir/foo.ipynb",
expectedError: "failed to read files from the workspace file system. Duplicate paths encountered. Both NOTEBOOK at /dir/foo and FILE at /dir/foo.ipynb resolve to the same name /foo.ipynb. Changing the name of one of these objects will resolve this issue",
},
{
name: "r jupyter notebook and file with .ipynb extension",
language: workspace.LanguageR,
notebookExportFormat: workspace.ExportFormatJupyter,
notebookPath: "/dir/foo",
filePath: "/dir/foo.ipynb",
expectedError: "failed to read files from the workspace file system. Duplicate paths encountered. Both NOTEBOOK at /dir/foo and FILE at /dir/foo.ipynb resolve to the same name /foo.ipynb. Changing the name of one of these objects will resolve this issue",
},
{
name: "sql jupyter notebook and file with .ipynb extension",
language: workspace.LanguageSql,
notebookExportFormat: workspace.ExportFormatJupyter,
notebookPath: "/dir/foo",
filePath: "/dir/foo.ipynb",
expectedError: "failed to read files from the workspace file system. Duplicate paths encountered. Both NOTEBOOK at /dir/foo and FILE at /dir/foo.ipynb resolve to the same name /foo.ipynb. Changing the name of one of these objects will resolve this issue",
},
} {
t.Run(tc.name, func(t *testing.T) {
mockedWorkspaceClient := mocks.NewMockWorkspaceClient(t)


@@ -107,19 +107,19 @@ func DetectWithFS(fsys fs.FS, name string) (notebook bool, language workspace.La
// Determine which header to expect based on filename extension.
ext := strings.ToLower(filepath.Ext(name))
switch ext {
case ".py":
case ExtensionPython:
header = `# Databricks notebook source`
language = workspace.LanguagePython
case ".r":
case ExtensionR:
header = `# Databricks notebook source`
language = workspace.LanguageR
case ".scala":
case ExtensionScala:
header = "// Databricks notebook source"
language = workspace.LanguageScala
case ".sql":
case ExtensionSql:
header = "-- Databricks notebook source"
language = workspace.LanguageSql
case ".ipynb":
case ExtensionJupyter:
return DetectJupyterWithFS(fsys, name)
default:
return false, "", nil


@@ -2,22 +2,40 @@ package notebook
import "github.com/databricks/databricks-sdk-go/service/workspace"
const (
ExtensionNone string = ""
ExtensionPython string = ".py"
ExtensionR string = ".r"
ExtensionScala string = ".scala"
ExtensionSql string = ".sql"
ExtensionJupyter string = ".ipynb"
)
var ExtensionToLanguage = map[string]workspace.Language{
ExtensionPython: workspace.LanguagePython,
ExtensionR: workspace.LanguageR,
ExtensionScala: workspace.LanguageScala,
ExtensionSql: workspace.LanguageSql,
// The platform supports all languages (Python, R, Scala, and SQL) for Jupyter notebooks.
}
func GetExtensionByLanguage(objectInfo *workspace.ObjectInfo) string {
if objectInfo.ObjectType != workspace.ObjectTypeNotebook {
return ""
return ExtensionNone
}
switch objectInfo.Language {
case workspace.LanguagePython:
return ".py"
return ExtensionPython
case workspace.LanguageR:
return ".r"
return ExtensionR
case workspace.LanguageScala:
return ".scala"
return ExtensionScala
case workspace.LanguageSql:
return ".sql"
return ExtensionSql
default:
// Do not add any extension to the file name
return ""
return ExtensionNone
}
}