Let notebook detection code use underlying metadata if available (#1574)

## Changes

If we're using a `vfs.Path` backed by a workspace filesystem filer, we
have access to the `workspace.ObjectInfo` value for every file. By
providing access to this value we can use it directly and avoid reading
the first line of the underlying file.

A follow-up change will implement the interface defined in this change
for the workspace filesystem filer.
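
For illustration, a minimal sketch of what that follow-up implementation could look like; the `wsfsFileInfo` type and its `oi` field are hypothetical names and are not part of this change:

```go
package filer // hypothetical location; the follow-up change targets the workspace filesystem filer

import (
    "io/fs"

    "github.com/databricks/databricks-sdk-go/service/workspace"
)

// wsfsFileInfo is a hypothetical fs.FileInfo as the workspace filesystem
// filer might return it. It embeds a regular fs.FileInfo and carries the
// workspace.ObjectInfo the filer already received from the workspace API.
type wsfsFileInfo struct {
    fs.FileInfo
    oi workspace.ObjectInfo
}

// WorkspaceObjectInfo satisfies the FileInfoWithWorkspaceObjectInfo interface
// introduced in this change, so notebook detection can use the object type
// and language directly instead of sniffing the file header.
func (i wsfsFileInfo) WorkspaceObjectInfo() workspace.ObjectInfo {
    return i.oi
}
```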

## Tests

Unit tests.
Commit 8f56ca39a2 (parent 5bc5c3c26a), authored by Pieter Noordhuis on 2024-07-10 08:37:47 +02:00 and committed via GitHub.
3 changed files with 161 additions and 10 deletions

Changed file: notebook detection code (package `notebook`):

@@ -12,27 +12,69 @@ import (
 	"github.com/databricks/databricks-sdk-go/service/workspace"
 )
 
+// FileInfoWithWorkspaceObjectInfo is an interface implemented by [fs.FileInfo] values that
+// contain a file's underlying [workspace.ObjectInfo].
+//
+// This may be the case when working with a [filer.Filer] backed by the workspace API.
+// For these files we do not need to read a file's header to know if it is a notebook;
+// we can use the [workspace.ObjectInfo] value directly.
+type FileInfoWithWorkspaceObjectInfo interface {
+	WorkspaceObjectInfo() workspace.ObjectInfo
+}
+
 // Maximum length in bytes of the notebook header.
 const headerLength = 32
 
-// readHeader reads the first N bytes from a file.
-func readHeader(fsys fs.FS, name string) ([]byte, error) {
+// file wraps an fs.File and implements a few helper methods such that
+// they don't need to be inlined in the [DetectWithFS] function below.
+type file struct {
+	f fs.File
+}
+
+func openFile(fsys fs.FS, name string) (*file, error) {
 	f, err := fsys.Open(name)
 	if err != nil {
 		return nil, err
 	}
 
-	defer f.Close()
+	return &file{f: f}, nil
+}
 
+func (f file) close() error {
+	return f.f.Close()
+}
+
+func (f file) readHeader() (string, error) {
 	// Scan header line with some padding.
 	var buf = make([]byte, headerLength)
-	n, err := f.Read([]byte(buf))
+	n, err := f.f.Read([]byte(buf))
 	if err != nil && err != io.EOF {
-		return nil, err
+		return "", err
 	}
 
 	// Trim buffer to actual read bytes.
-	return buf[:n], nil
+	buf = buf[:n]
+
+	// Read the first line from the buffer.
+	scanner := bufio.NewScanner(bytes.NewReader(buf))
+	scanner.Scan()
+	return scanner.Text(), nil
+}
+
+// getObjectInfo returns the [workspace.ObjectInfo] for the file if it is
+// part of the [fs.FileInfo] value returned by the [fs.Stat] call.
+func (f file) getObjectInfo() (oi workspace.ObjectInfo, ok bool, err error) {
+	stat, err := f.f.Stat()
+	if err != nil {
+		return workspace.ObjectInfo{}, false, err
+	}
+
+	// Use object info if available.
+	if i, ok := stat.(FileInfoWithWorkspaceObjectInfo); ok {
+		return i.WorkspaceObjectInfo(), true, nil
+	}
+
+	return workspace.ObjectInfo{}, false, nil
 }
 
 // Detect returns whether the file at path is a Databricks notebook.
@@ -40,13 +82,27 @@ func readHeader(fsys fs.FS, name string) ([]byte, error) {
 func DetectWithFS(fsys fs.FS, name string) (notebook bool, language workspace.Language, err error) {
 	header := ""
 
-	buf, err := readHeader(fsys, name)
+	f, err := openFile(fsys, name)
+	if err != nil {
+		return false, "", err
+	}
+
+	defer f.close()
+
+	// Use object info if available.
+	oi, ok, err := f.getObjectInfo()
+	if err != nil {
+		return false, "", err
+	}
+	if ok {
+		return oi.ObjectType == workspace.ObjectTypeNotebook, oi.Language, nil
+	}
+
+	// Read the first line of the file.
+	fileHeader, err := f.readHeader()
 	if err != nil {
 		return false, "", err
 	}
-	scanner := bufio.NewScanner(bytes.NewReader(buf))
-	scanner.Scan()
-	fileHeader := scanner.Text()
 
 	// Determine which header to expect based on filename extension.
 	ext := strings.ToLower(filepath.Ext(name))
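
For context, callers of `DetectWithFS` are unchanged by this refactor. A minimal usage sketch, assuming the package import path `github.com/databricks/cli/libs/notebook`; with a plain `os.DirFS` there is no `workspace.ObjectInfo` attached to the file, so detection falls back to reading the header:

```go
package main

import (
    "fmt"
    "log"
    "os"

    "github.com/databricks/cli/libs/notebook"
)

func main() {
    // os.DirFS yields fs.FileInfo values without workspace.ObjectInfo,
    // so DetectWithFS reads the first line of the file to decide.
    nb, lang, err := notebook.DetectWithFS(os.DirFS("."), "foo.py")
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("notebook=%v language=%s\n", nb, lang)
}
```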

Changed file: notebook detection tests:

@@ -99,3 +99,21 @@ func TestDetectFileWithLongHeader(t *testing.T) {
 	require.NoError(t, err)
 	assert.False(t, nb)
 }
+
+func TestDetectWithObjectInfo(t *testing.T) {
+	fakeFS := &fakeFS{
+		fakeFile{
+			fakeFileInfo{
+				workspace.ObjectInfo{
+					ObjectType: workspace.ObjectTypeNotebook,
+					Language:   workspace.LanguagePython,
+				},
+			},
+		},
+	}
+
+	nb, lang, err := DetectWithFS(fakeFS, "doesntmatter")
+	require.NoError(t, err)
+	assert.True(t, nb)
+	assert.Equal(t, workspace.LanguagePython, lang)
+}

New file (@@ -0,0 +1,77 @@): fake filesystem and fs.FileInfo implementations used by the tests:

package notebook

import (
	"fmt"
	"io/fs"
	"time"

	"github.com/databricks/databricks-sdk-go/service/workspace"
)

type fakeFS struct {
	fakeFile
}

type fakeFile struct {
	fakeFileInfo
}

func (f fakeFile) Close() error {
	return nil
}

func (f fakeFile) Read(p []byte) (n int, err error) {
	return 0, fmt.Errorf("not implemented")
}

func (f fakeFile) Stat() (fs.FileInfo, error) {
	return f.fakeFileInfo, nil
}

type fakeFileInfo struct {
	oi workspace.ObjectInfo
}

func (f fakeFileInfo) WorkspaceObjectInfo() workspace.ObjectInfo {
	return f.oi
}

func (f fakeFileInfo) Name() string {
	return ""
}

func (f fakeFileInfo) Size() int64 {
	return 0
}

func (f fakeFileInfo) Mode() fs.FileMode {
	return 0
}

func (f fakeFileInfo) ModTime() time.Time {
	return time.Time{}
}

func (f fakeFileInfo) IsDir() bool {
	return false
}

func (f fakeFileInfo) Sys() any {
	return nil
}

func (f fakeFS) Open(name string) (fs.File, error) {
	return f.fakeFile, nil
}

func (f fakeFS) Stat(name string) (fs.FileInfo, error) {
	panic("not implemented")
}

func (f fakeFS) ReadDir(name string) ([]fs.DirEntry, error) {
	panic("not implemented")
}

func (f fakeFS) ReadFile(name string) ([]byte, error) {
	panic("not implemented")
}