mirror of https://github.com/databricks/cli.git
Use the `/dbfs/put` API endpoint to upload smaller DBFS files
This commit is contained in:
parent
e86a949d99
commit
c4cea1aeff
|
@ -1,11 +1,15 @@
|
||||||
package filer
|
package filer
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"io/fs"
|
"io/fs"
|
||||||
|
"mime/multipart"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"os"
|
||||||
"path"
|
"path"
|
||||||
"slices"
|
"slices"
|
||||||
"sort"
|
"sort"
|
||||||
|
@ -14,6 +18,7 @@ import (
|
||||||
|
|
||||||
"github.com/databricks/databricks-sdk-go"
|
"github.com/databricks/databricks-sdk-go"
|
||||||
"github.com/databricks/databricks-sdk-go/apierr"
|
"github.com/databricks/databricks-sdk-go/apierr"
|
||||||
|
"github.com/databricks/databricks-sdk-go/client"
|
||||||
"github.com/databricks/databricks-sdk-go/service/files"
|
"github.com/databricks/databricks-sdk-go/service/files"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -79,6 +84,80 @@ func NewDbfsClient(w *databricks.WorkspaceClient, root string) (Filer, error) {
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (w *DbfsClient) uploadUsingDbfsPutApi(ctx context.Context, path string, overwrite bool, file *os.File) error {
|
||||||
|
apiClient, err := client.New(w.workspaceClient.Config)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to create API client: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
buf := &bytes.Buffer{}
|
||||||
|
writer := multipart.NewWriter(buf)
|
||||||
|
err = writer.WriteField("path", path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
err = writer.WriteField("overwrite", "True")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
contents, err := writer.CreateFormFile("contents", "")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err = io.Copy(contents, file)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
err = writer.Close()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Request bodies of Content-Type multipart/form-data must are not supported by
|
||||||
|
// the Go SDK directly for DBFS. So we use the Do method directly.
|
||||||
|
return apiClient.Do(ctx, http.MethodPost, "/api/2.0/dbfs/put", map[string]string{
|
||||||
|
"Content-Type": writer.FormDataContentType(),
|
||||||
|
}, buf.Bytes(), nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *DbfsClient) uploadUsingDbfsStreamingApi(ctx context.Context, path string, overwrite bool, reader io.Reader) error {
|
||||||
|
fileMode := files.FileModeWrite
|
||||||
|
if overwrite {
|
||||||
|
fileMode |= files.FileModeOverwrite
|
||||||
|
}
|
||||||
|
|
||||||
|
handle, err := w.workspaceClient.Dbfs.Open(ctx, path, fileMode)
|
||||||
|
if err != nil {
|
||||||
|
var aerr *apierr.APIError
|
||||||
|
if !errors.As(err, &aerr) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// This API returns a 400 if the file already exists.
|
||||||
|
if aerr.StatusCode == http.StatusBadRequest {
|
||||||
|
if aerr.ErrorCode == "RESOURCE_ALREADY_EXISTS" {
|
||||||
|
return FileAlreadyExistsError{path}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err = io.Copy(handle, reader)
|
||||||
|
cerr := handle.Close()
|
||||||
|
if err == nil {
|
||||||
|
err = cerr
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// MaxUploadLimitForPutApi is the maximum size in bytes of a file that can be uploaded
|
||||||
|
// using the /dbfs/put API. If the file is larger than this limit, the streaming
|
||||||
|
// API (/dbfs/create and /dbfs/add-block) will be used instead.
|
||||||
|
var MaxUploadLimitForPutApi int64 = 2 * 1024 * 1024
|
||||||
|
|
||||||
func (w *DbfsClient) Write(ctx context.Context, name string, reader io.Reader, mode ...WriteMode) error {
|
func (w *DbfsClient) Write(ctx context.Context, name string, reader io.Reader, mode ...WriteMode) error {
|
||||||
absPath, err := w.root.Join(name)
|
absPath, err := w.root.Join(name)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -114,30 +193,27 @@ func (w *DbfsClient) Write(ctx context.Context, name string, reader io.Reader, m
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
handle, err := w.workspaceClient.Dbfs.Open(ctx, absPath, fileMode)
|
localFile, ok := reader.(*os.File)
|
||||||
|
|
||||||
|
// If the source is not a local file, we'll always use the streaming API endpoint.
|
||||||
|
if !ok {
|
||||||
|
return w.uploadUsingDbfsStreamingApi(ctx, absPath, slices.Contains(mode, OverwriteIfExists), reader)
|
||||||
|
}
|
||||||
|
|
||||||
|
stat, err := localFile.Stat()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
var aerr *apierr.APIError
|
return fmt.Errorf("failed to stat file: %w", err)
|
||||||
if !errors.As(err, &aerr) {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
// This API returns a 400 if the file already exists.
|
|
||||||
if aerr.StatusCode == http.StatusBadRequest {
|
|
||||||
if aerr.ErrorCode == "RESOURCE_ALREADY_EXISTS" {
|
|
||||||
return FileAlreadyExistsError{absPath}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
_, err = io.Copy(handle, reader)
|
// If the source is a local file, but is too large then we'll use the streaming API endpoint.
|
||||||
cerr := handle.Close()
|
if stat.Size() > MaxUploadLimitForPutApi {
|
||||||
if err == nil {
|
return w.uploadUsingDbfsStreamingApi(ctx, absPath, slices.Contains(mode, OverwriteIfExists), localFile)
|
||||||
err = cerr
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return err
|
// Use the /dbfs/put API when the file is on the local filesystem
|
||||||
|
// and is small enough. This is the most common case when users use the
|
||||||
|
// `databricks fs cp` command.
|
||||||
|
return w.uploadUsingDbfsPutApi(ctx, absPath, slices.Contains(mode, OverwriteIfExists), localFile)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w *DbfsClient) Read(ctx context.Context, name string) (io.ReadCloser, error) {
|
func (w *DbfsClient) Read(ctx context.Context, name string) (io.ReadCloser, error) {
|
||||||
|
|
Loading…
Reference in New Issue