2022-07-07 18:56:59 +00:00
|
|
|
package sync
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2022-09-14 15:50:29 +00:00
|
|
|
"fmt"
|
|
|
|
"io"
|
2022-07-07 18:56:59 +00:00
|
|
|
"log"
|
2022-11-24 20:41:57 +00:00
|
|
|
"net/http"
|
2022-09-14 15:50:29 +00:00
|
|
|
"os"
|
|
|
|
"path"
|
|
|
|
"path/filepath"
|
|
|
|
"strings"
|
2022-07-07 18:56:59 +00:00
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
2022-09-14 15:50:29 +00:00
|
|
|
"github.com/databricks/bricks/project"
|
2022-11-24 20:41:57 +00:00
|
|
|
"github.com/databricks/databricks-sdk-go"
|
|
|
|
"github.com/databricks/databricks-sdk-go/apierr"
|
|
|
|
"github.com/databricks/databricks-sdk-go/client"
|
2022-09-14 15:50:29 +00:00
|
|
|
"github.com/databricks/databricks-sdk-go/service/workspace"
|
2022-10-04 22:12:57 +00:00
|
|
|
"golang.org/x/sync/errgroup"
|
2022-07-07 18:56:59 +00:00
|
|
|
)
|
|
|
|
|
2022-10-19 14:22:55 +00:00
|
|
|
// TODO: add .databricks to .gitignore on bricks init
|
2022-07-07 18:56:59 +00:00
|
|
|
// watchdog holds the state of one background sync loop: the ticker that
// paces it, the wait group used to wait for the loop goroutine to exit, and
// the error (if any) that made the loop stop.
type watchdog struct {
	// ticker fires once per polling interval; the loop in main reads from
	// ticker.C.
	ticker *time.Ticker

	// wg tracks the goroutine running main. spawnSyncRoutine calls Add(1)
	// before starting it and Wait() before reading failure.
	wg sync.WaitGroup

	// failure is the error that terminated the loop, or nil. As used in this
	// file it is only written by main (which signals wg.Done on return) and
	// only read after wg.Wait() returns, so the write is ordered before the
	// read. Any reader that does not go through wg.Wait() first would need
	// extra synchronization.
	failure error
}
|
|
|
|
|
2022-10-04 22:12:57 +00:00
|
|
|
const (
	// MaxRequestsInFlight is the upper bound on the number of workspace API
	// calls that the sync callback issues concurrently.
	//
	// See https://docs.databricks.com/resources/limits.html#limits-api-rate-limits
	// for the per-API rate limits.
	MaxRequestsInFlight = 20
)
|
|
|
|
|
2022-11-21 22:42:09 +00:00
|
|
|
// path: The local path of the file in the local file system
|
|
|
|
//
|
|
|
|
// The API calls for a python script foo.py would be
|
|
|
|
// `PUT foo.py`
|
|
|
|
// `DELETE foo.py`
|
|
|
|
//
|
|
|
|
// The API calls for a python notebook foo.py would be
|
|
|
|
// `PUT foo.py`
|
|
|
|
// `DELETE foo`
|
|
|
|
//
|
|
|
|
// The workspace file system backend strips .py from the file name if the python
|
|
|
|
// file is a notebook
|
2022-11-25 13:13:15 +00:00
|
|
|
func putFile(ctx context.Context, remotePath string, content io.Reader) error {
|
2022-09-16 09:06:58 +00:00
|
|
|
wsc := project.Get(ctx).WorkspacesClient()
|
2022-09-14 15:50:29 +00:00
|
|
|
// workspace mkdirs is idempotent
|
2022-11-25 13:13:15 +00:00
|
|
|
err := wsc.Workspace.MkdirsByPath(ctx, path.Dir(remotePath))
|
2022-09-14 15:50:29 +00:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("could not mkdir to put file: %s", err)
|
|
|
|
}
|
2022-11-09 14:01:47 +00:00
|
|
|
apiClient, err := client.New(wsc.Config)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2022-09-14 15:50:29 +00:00
|
|
|
apiPath := fmt.Sprintf(
|
|
|
|
"/api/2.0/workspace-files/import-file/%s?overwrite=true",
|
2022-11-25 13:13:15 +00:00
|
|
|
strings.TrimLeft(remotePath, "/"))
|
2022-11-24 20:41:57 +00:00
|
|
|
return apiClient.Do(ctx, http.MethodPost, apiPath, content, nil)
|
2022-09-14 15:50:29 +00:00
|
|
|
}
|
|
|
|
|
2022-11-21 22:42:09 +00:00
|
|
|
// path: The remote path of the file in the workspace
|
2022-11-24 20:41:57 +00:00
|
|
|
func deleteFile(ctx context.Context, path string, w *databricks.WorkspaceClient) error {
|
|
|
|
err := w.Workspace.Delete(ctx,
|
2022-10-27 15:32:10 +00:00
|
|
|
workspace.Delete{
|
|
|
|
Path: path,
|
2022-11-30 12:56:52 +00:00
|
|
|
Recursive: false,
|
2022-10-27 15:32:10 +00:00
|
|
|
},
|
|
|
|
)
|
|
|
|
// We explictly ignore RESOURCE_DOES_NOT_EXIST errors for deletion of files
|
|
|
|
// This makes deletion operation idempotent and allows us to not crash syncing on
|
|
|
|
// edge cases for eg: this api fails to delete notebooks, and returns a
|
|
|
|
// RESOURCE_DOES_NOT_EXIST error instead
|
|
|
|
if val, ok := err.(apierr.APIError); ok && val.ErrorCode == "RESOURCE_DOES_NOT_EXIST" {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2022-11-24 20:41:57 +00:00
|
|
|
func getRemoteSyncCallback(ctx context.Context, root, remoteDir string, w *databricks.WorkspaceClient) func(localDiff diff) error {
|
2022-09-14 15:50:29 +00:00
|
|
|
return func(d diff) error {
|
2022-10-04 22:12:57 +00:00
|
|
|
|
|
|
|
// Abstraction over wait groups which allows you to get the errors
|
|
|
|
// returned in goroutines
|
|
|
|
var g errgroup.Group
|
|
|
|
|
|
|
|
// Allow MaxRequestLimit maxiumum concurrent api calls
|
|
|
|
g.SetLimit(MaxRequestsInFlight)
|
|
|
|
|
2022-11-21 22:42:09 +00:00
|
|
|
for _, remoteName := range d.delete {
|
|
|
|
// Copy of remoteName created to make this safe for concurrent use.
|
|
|
|
// directly using remoteName can cause race conditions since the loop
|
|
|
|
// might iterate over to the next remoteName before the go routine function
|
2022-10-04 22:12:57 +00:00
|
|
|
// is evaluated
|
2022-11-21 22:42:09 +00:00
|
|
|
remoteNameCopy := remoteName
|
2022-10-04 22:12:57 +00:00
|
|
|
g.Go(func() error {
|
2022-11-24 20:41:57 +00:00
|
|
|
err := deleteFile(ctx, path.Join(remoteDir, remoteNameCopy), w)
|
2022-10-04 22:12:57 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2022-11-21 22:42:09 +00:00
|
|
|
log.Printf("[INFO] Deleted %s", remoteNameCopy)
|
2022-10-04 22:12:57 +00:00
|
|
|
return nil
|
|
|
|
})
|
|
|
|
}
|
2022-11-21 22:42:09 +00:00
|
|
|
for _, localName := range d.put {
|
|
|
|
// Copy of localName created to make this safe for concurrent use.
|
|
|
|
localNameCopy := localName
|
2022-10-04 22:12:57 +00:00
|
|
|
g.Go(func() error {
|
2022-11-21 22:42:09 +00:00
|
|
|
f, err := os.Open(filepath.Join(root, localNameCopy))
|
2022-10-04 22:12:57 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2022-11-21 22:42:09 +00:00
|
|
|
err = putFile(ctx, path.Join(remoteDir, localNameCopy), f)
|
2022-10-04 22:12:57 +00:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to upload file: %s", err)
|
|
|
|
}
|
|
|
|
err = f.Close()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2022-11-21 22:42:09 +00:00
|
|
|
log.Printf("[INFO] Uploaded %s", localNameCopy)
|
2022-10-04 22:12:57 +00:00
|
|
|
return nil
|
|
|
|
})
|
2022-09-14 15:50:29 +00:00
|
|
|
}
|
2022-10-04 22:12:57 +00:00
|
|
|
// wait for goroutines to finish and return first non-nil error return
|
|
|
|
// if any
|
|
|
|
if err := g.Wait(); err != nil {
|
|
|
|
return err
|
2022-09-14 15:50:29 +00:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func spawnSyncRoutine(ctx context.Context,
|
|
|
|
interval time.Duration,
|
2022-10-19 14:22:55 +00:00
|
|
|
applyDiff func(diff) error,
|
|
|
|
remotePath string) error {
|
2022-07-07 18:56:59 +00:00
|
|
|
w := &watchdog{
|
|
|
|
ticker: time.NewTicker(interval),
|
|
|
|
}
|
|
|
|
w.wg.Add(1)
|
2022-10-19 14:22:55 +00:00
|
|
|
go w.main(ctx, applyDiff, remotePath)
|
2022-07-07 18:56:59 +00:00
|
|
|
w.wg.Wait()
|
|
|
|
return w.failure
|
|
|
|
}
|
|
|
|
|
|
|
|
// tradeoff: doing portable monitoring only due to macOS max descriptor manual ulimit setting requirement
|
|
|
|
// https://github.com/gorakhargosh/watchdog/blob/master/src/watchdog/observers/kqueue.py#L394-L418
|
2022-10-19 14:22:55 +00:00
|
|
|
func (w *watchdog) main(ctx context.Context, applyDiff func(diff) error, remotePath string) {
|
2022-07-07 18:56:59 +00:00
|
|
|
defer w.wg.Done()
|
2022-10-19 14:22:55 +00:00
|
|
|
snapshot, err := newSnapshot(ctx, remotePath)
|
|
|
|
if err != nil {
|
|
|
|
log.Printf("[ERROR] cannot create snapshot: %s", err)
|
|
|
|
w.failure = err
|
|
|
|
return
|
|
|
|
}
|
2022-09-19 14:47:55 +00:00
|
|
|
if *persistSnapshot {
|
2022-10-19 14:22:55 +00:00
|
|
|
err := snapshot.loadSnapshot(ctx)
|
2022-09-19 14:47:55 +00:00
|
|
|
if err != nil {
|
|
|
|
log.Printf("[ERROR] cannot load snapshot: %s", err)
|
|
|
|
w.failure = err
|
|
|
|
return
|
|
|
|
}
|
2022-09-14 15:50:29 +00:00
|
|
|
}
|
2022-10-19 14:22:55 +00:00
|
|
|
prj := project.Get(ctx)
|
2022-10-27 13:41:18 +00:00
|
|
|
var onlyOnceInitLog sync.Once
|
2022-07-07 18:56:59 +00:00
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
return
|
|
|
|
case <-w.ticker.C:
|
2022-10-19 14:22:55 +00:00
|
|
|
all, err := prj.GetFileSet().All()
|
2022-07-07 18:56:59 +00:00
|
|
|
if err != nil {
|
|
|
|
log.Printf("[ERROR] cannot list files: %s", err)
|
|
|
|
w.failure = err
|
|
|
|
return
|
|
|
|
}
|
2022-11-21 22:42:09 +00:00
|
|
|
change, err := snapshot.diff(all)
|
|
|
|
if err != nil {
|
|
|
|
w.failure = err
|
|
|
|
return
|
|
|
|
}
|
2022-07-07 18:56:59 +00:00
|
|
|
if change.IsEmpty() {
|
2022-10-27 13:41:18 +00:00
|
|
|
onlyOnceInitLog.Do(func() {
|
|
|
|
log.Printf("[INFO] Initial Sync Complete")
|
|
|
|
})
|
2022-07-07 18:56:59 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
log.Printf("[INFO] Action: %v", change)
|
2022-09-14 15:50:29 +00:00
|
|
|
err = applyDiff(change)
|
|
|
|
if err != nil {
|
|
|
|
w.failure = err
|
|
|
|
return
|
|
|
|
}
|
2022-09-19 14:47:55 +00:00
|
|
|
if *persistSnapshot {
|
2022-10-19 14:22:55 +00:00
|
|
|
err = snapshot.storeSnapshot(ctx)
|
2022-09-19 14:47:55 +00:00
|
|
|
if err != nil {
|
|
|
|
log.Printf("[ERROR] cannot store snapshot: %s", err)
|
|
|
|
w.failure = err
|
|
|
|
return
|
|
|
|
}
|
2022-07-07 18:56:59 +00:00
|
|
|
}
|
2022-10-27 13:41:18 +00:00
|
|
|
onlyOnceInitLog.Do(func() {
|
|
|
|
log.Printf("[INFO] Initial Sync Complete")
|
|
|
|
})
|
2022-07-07 18:56:59 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|