databricks-cli/libs/notebook/detect.go

package notebook

import (
	"bufio"
	"bytes"
	"io"
	"io/fs"
	"os"
	"path/filepath"
	"strings"

	"github.com/databricks/databricks-sdk-go/service/workspace"
)

// FileInfoWithWorkspaceObjectInfo is an interface implemented by [fs.FileInfo] values that
// contain a file's underlying [workspace.ObjectInfo].
//
// This may be the case when working with a [filer.Filer] backed by the workspace API.
// For these files we do not need to read a file's header to know if it is a notebook;
// we can use the [workspace.ObjectInfo] value directly.
type FileInfoWithWorkspaceObjectInfo interface {
	WorkspaceObjectInfo() workspace.ObjectInfo
}
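
// Illustrative sketch (assumption, not part of the original file): a minimal
// fs.FileInfo wrapper that would satisfy FileInfoWithWorkspaceObjectInfo. A
// filer backed by the workspace API could return a value like this from its
// Stat method, which lets DetectWithFS below skip reading the file header.
type exampleWorkspaceFileInfo struct {
	fs.FileInfo
	oi workspace.ObjectInfo
}

// WorkspaceObjectInfo returns the embedded workspace.ObjectInfo.
func (fi exampleWorkspaceFileInfo) WorkspaceObjectInfo() workspace.ObjectInfo {
	return fi.oi
}
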
// Maximum length in bytes of the notebook header.
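// The longest header this package checks for ("-- Databricks notebook source")
// is 29 bytes, so 32 bytes is enough to capture it with a little padding.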
const headerLength = 32

// file wraps an fs.File and implements a few helper methods such that
// they don't need to be inlined in the [DetectWithFS] function below.
type file struct {
	f fs.File
}

func openFile(fsys fs.FS, name string) (*file, error) {
	f, err := fsys.Open(name)
	if err != nil {
		return nil, err
	}
	return &file{f: f}, nil
}

func (f file) close() error {
	return f.f.Close()
}

func (f file) readHeader() (string, error) {
	// Scan header line with some padding.
	buf := make([]byte, headerLength)
	n, err := f.f.Read(buf)
	if err != nil && err != io.EOF {
		return "", err
	}

	// Trim buffer to actual read bytes.
	buf = buf[:n]

	// Read the first line from the buffer.
	scanner := bufio.NewScanner(bytes.NewReader(buf))
	scanner.Scan()
	return scanner.Text(), nil
}

// getObjectInfo returns the [workspace.ObjectInfo] for the file if it is
// carried by the [fs.FileInfo] value returned by the file's Stat call.
func (f file) getObjectInfo() (oi workspace.ObjectInfo, ok bool, err error) {
	stat, err := f.f.Stat()
	if err != nil {
		return workspace.ObjectInfo{}, false, err
	}

	// Use object info if available.
	if i, ok := stat.(FileInfoWithWorkspaceObjectInfo); ok {
		return i.WorkspaceObjectInfo(), true, nil
	}

	return workspace.ObjectInfo{}, false, nil
}

// DetectWithFS returns whether the file at name, resolved within fsys, is a
// Databricks notebook. If it is, it also returns the notebook language.
func DetectWithFS(fsys fs.FS, name string) (notebook bool, language workspace.Language, err error) {
	header := ""

	f, err := openFile(fsys, name)
	if err != nil {
		return false, "", err
	}
	defer f.close()

	// Use object info if available.
	oi, ok, err := f.getObjectInfo()
	if err != nil {
		return false, "", err
	}
	if ok {
		return oi.ObjectType == workspace.ObjectTypeNotebook, oi.Language, nil
	}

	// Read the first line of the file.
	fileHeader, err := f.readHeader()
	if err != nil {
		return false, "", err
	}

	// Determine which header to expect based on filename extension.
	ext := strings.ToLower(filepath.Ext(name))
	switch ext {
	case ExtensionPython:
		header = `# Databricks notebook source`
		language = workspace.LanguagePython
	case ExtensionR:
		header = `# Databricks notebook source`
		language = workspace.LanguageR
	case ExtensionScala:
		header = "// Databricks notebook source"
		language = workspace.LanguageScala
	case ExtensionSql:
		header = "-- Databricks notebook source"
		language = workspace.LanguageSql
	case ExtensionJupyter:
		return DetectJupyterWithFS(fsys, name)
	default:
		return false, "", nil
	}

	if fileHeader != header {
		return false, "", nil
	}

	return true, language, nil
}
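
// Illustrative usage sketch (assumption, not part of the original file): the
// header-based detection path can be exercised against an in-memory filesystem.
// This would typically live in a _test.go file and needs the extra import
// "testing/fstest"; the file name and contents below are hypothetical.
//
//	fsys := fstest.MapFS{
//		"notebook.py": {Data: []byte("# Databricks notebook source\nprint(1)\n")},
//	}
//	nb, lang, err := DetectWithFS(fsys, "notebook.py")
//	// Expected on success: err == nil, nb == true, lang == workspace.LanguagePython.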

// Detect calls DetectWithFS with the local filesystem.
// The name argument may be a local relative path or a local absolute path.
func Detect(name string) (notebook bool, language workspace.Language, err error) {
	d := filepath.ToSlash(filepath.Dir(name))
	b := filepath.Base(name)
	return DetectWithFS(os.DirFS(d), b)
}
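
// Illustrative usage sketch (assumption, not part of the original file): calling
// Detect on a local path from application code. The path and error handling are
// hypothetical, and "fmt" would need to be imported.
//
//	nb, lang, err := Detect("./notebooks/report.sql")
//	if err != nil {
//		return err
//	}
//	if nb {
//		fmt.Printf("detected a %s notebook\n", lang)
//	}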