Deal with notebook remote filenames stripping .py (#90)

This PR introduces tracking of remote names and local names of files in snapshots to disambiguate between files which might have the same remote name and handle clean deleting of files whose remote name changes due (eg. python notebook getting converted to a python notebook)
This commit is contained in:
shreyas-goenka 2022-11-21 23:42:09 +01:00 committed by GitHub
parent ab1df558a2
commit b25bd318e3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 304 additions and 54 deletions

View File

@ -1,12 +1,14 @@
package sync package sync
import ( import (
"bufio"
"context" "context"
"encoding/json" "encoding/json"
"fmt" "fmt"
"io" "io"
"os" "os"
"path/filepath" "path/filepath"
"regexp"
"strings" "strings"
"time" "time"
@ -38,6 +40,13 @@ type Snapshot struct {
// key: relative file path from project root // key: relative file path from project root
// value: last time the remote instance of this file was updated // value: last time the remote instance of this file was updated
LastUpdatedTimes map[string]time.Time `json:"last_modified_times"` LastUpdatedTimes map[string]time.Time `json:"last_modified_times"`
// This map maps local file names to their remote names
// eg. notebook named "foo.py" locally would be stored as "foo", thus this
// map will contain an entry "foo.py" -> "foo"
LocalToRemoteNames map[string]string `json:"local_to_remote_names"`
// Inverse of localToRemoteNames. Together the form a bijective mapping (ie
// there is a 1:1 unique mapping between local and remote name)
RemoteToLocalNames map[string]string `json:"remote_to_local_names"`
} }
type diff struct { type diff struct {
@ -87,9 +96,11 @@ func newSnapshot(ctx context.Context, remotePath string) (*Snapshot, error) {
} }
return &Snapshot{ return &Snapshot{
Host: host, Host: host,
RemotePath: remotePath, RemotePath: remotePath,
LastUpdatedTimes: make(map[string]time.Time), LastUpdatedTimes: make(map[string]time.Time),
LocalToRemoteNames: make(map[string]string),
RemoteToLocalNames: make(map[string]string),
}, nil }, nil
} }
@ -161,9 +172,37 @@ func (d diff) String() string {
return strings.Join(changes, ", ") return strings.Join(changes, ", ")
} }
func (s Snapshot) diff(all []git.File) (change diff) { func getNotebookDetails(path string) (isNotebook bool, typeOfNotebook string, err error) {
isNotebook = false
typeOfNotebook = ""
isPythonFile, err := regexp.Match(`\.py$`, []byte(path))
if err != nil {
return
}
if isPythonFile {
f, err := os.Open(path)
if err != nil {
return false, "", err
}
defer f.Close()
scanner := bufio.NewScanner(f)
ok := scanner.Scan()
if !ok {
return false, "", scanner.Err()
}
// A python file is a notebook if it starts with the following magic string
isNotebook = strings.Contains(scanner.Text(), "# Databricks notebook source")
return isNotebook, "PYTHON", nil
}
return false, "", nil
}
func (s Snapshot) diff(all []git.File) (change diff, err error) {
currentFilenames := map[string]bool{} currentFilenames := map[string]bool{}
lastModifiedTimes := s.LastUpdatedTimes lastModifiedTimes := s.LastUpdatedTimes
remoteToLocalNames := s.RemoteToLocalNames
localToRemoteNames := s.LocalToRemoteNames
for _, f := range all { for _, f := range all {
// create set of current files to figure out if removals are needed // create set of current files to figure out if removals are needed
currentFilenames[f.Relative] = true currentFilenames[f.Relative] = true
@ -174,22 +213,52 @@ func (s Snapshot) diff(all []git.File) (change diff) {
if !seen || modified.After(lastSeenModified) { if !seen || modified.After(lastSeenModified) {
change.put = append(change.put, f.Relative) change.put = append(change.put, f.Relative)
lastModifiedTimes[f.Relative] = modified lastModifiedTimes[f.Relative] = modified
// Keep track of remote names of local files so we can delete them later
isNotebook, typeOfNotebook, err := getNotebookDetails(f.Absolute)
if err != nil {
return change, err
}
remoteName := f.Relative
if isNotebook && typeOfNotebook == "PYTHON" {
remoteName = strings.TrimSuffix(f.Relative, `.py`)
}
// If the remote handle of a file changes, we want to delete the old
// remote version of that file to avoid duplicates.
// This can happen if a python notebook is converted to a python
// script or vice versa
oldRemoteName, ok := localToRemoteNames[f.Relative]
if ok && oldRemoteName != remoteName {
change.delete = append(change.delete, oldRemoteName)
delete(remoteToLocalNames, oldRemoteName)
}
// We cannot allow two local files in the project to point to the same
// remote path
oldLocalName, ok := remoteToLocalNames[remoteName]
if ok && oldLocalName != f.Relative {
return change, fmt.Errorf("both %s and %s point to the same remote file location %s. Please remove one of them from your local project", oldLocalName, f.Relative, remoteName)
}
localToRemoteNames[f.Relative] = remoteName
remoteToLocalNames[remoteName] = f.Relative
} }
} }
// figure out files in the snapshot, but not on local filesystem // figure out files in the snapshot.lastModifiedTimes, but not on local
for relative := range lastModifiedTimes { // filesystem. These will be deleted
_, exists := currentFilenames[relative] for localName := range lastModifiedTimes {
_, exists := currentFilenames[localName]
if exists { if exists {
continue continue
} }
// add them to a delete batch // add them to a delete batch
change.delete = append(change.delete, relative) change.delete = append(change.delete, localToRemoteNames[localName])
// remove the file from snapshot
delete(lastModifiedTimes, relative)
} }
// and remove them from the snapshot // and remove them from the snapshot
for _, v := range change.delete { for _, remoteName := range change.delete {
delete(lastModifiedTimes, v) localName := remoteToLocalNames[remoteName]
delete(lastModifiedTimes, localName)
delete(remoteToLocalNames, remoteName)
delete(localToRemoteNames, localName)
} }
return return
} }

View File

@ -10,57 +10,220 @@ import (
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
) )
type testFile struct {
mtime time.Time
fd *os.File
path string
}
func createFile(t *testing.T, path string) *testFile {
f, err := os.Create(path)
assert.NoError(t, err)
fileInfo, err := os.Stat(path)
assert.NoError(t, err)
return &testFile{
path: path,
fd: f,
mtime: fileInfo.ModTime(),
}
}
func (f *testFile) close(t *testing.T) {
err := f.fd.Close()
assert.NoError(t, err)
}
func (f *testFile) overwrite(t *testing.T, s string) {
err := os.Truncate(f.path, 0)
assert.NoError(t, err)
_, err = f.fd.Seek(0, 0)
assert.NoError(t, err)
_, err = f.fd.WriteString(s)
assert.NoError(t, err)
// We manually update mtime after write because github actions file
// system does not :')
err = os.Chtimes(f.path, f.mtime.Add(time.Minute), f.mtime.Add(time.Minute))
assert.NoError(t, err)
f.mtime = f.mtime.Add(time.Minute)
}
func (f *testFile) remove(t *testing.T) {
err := os.Remove(f.path)
assert.NoError(t, err)
}
func assertKeysOfMap(t *testing.T, m map[string]time.Time, expectedKeys []string) {
keys := make([]string, len(m))
i := 0
for k := range m {
keys[i] = k
i++
}
assert.ElementsMatch(t, expectedKeys, keys)
}
func TestDiff(t *testing.T) { func TestDiff(t *testing.T) {
// Create temp project dir // Create temp project dir
projectDir := t.TempDir() projectDir := t.TempDir()
f1, err := os.Create(filepath.Join(projectDir, "hello.txt"))
assert.NoError(t, err)
defer f1.Close()
worldFilePath := filepath.Join(projectDir, "world.txt")
f2, err := os.Create(worldFilePath)
assert.NoError(t, err)
defer f2.Close()
fileSet := git.NewFileSet(projectDir) fileSet := git.NewFileSet(projectDir)
state := Snapshot{
LastUpdatedTimes: make(map[string]time.Time),
LocalToRemoteNames: make(map[string]string),
RemoteToLocalNames: make(map[string]string),
}
f1 := createFile(t, filepath.Join(projectDir, "hello.txt"))
defer f1.close(t)
worldFilePath := filepath.Join(projectDir, "world.txt")
f2 := createFile(t, worldFilePath)
defer f2.close(t)
// New files are put
files, err := fileSet.All() files, err := fileSet.All()
assert.NoError(t, err) assert.NoError(t, err)
state := Snapshot{ change, err := state.diff(files)
LastUpdatedTimes: make(map[string]time.Time), assert.NoError(t, err)
}
change := state.diff(files)
// New files are added to put
assert.Len(t, change.delete, 0) assert.Len(t, change.delete, 0)
assert.Len(t, change.put, 2) assert.Len(t, change.put, 2)
assert.Contains(t, change.put, "hello.txt") assert.Contains(t, change.put, "hello.txt")
assert.Contains(t, change.put, "world.txt") assert.Contains(t, change.put, "world.txt")
assertKeysOfMap(t, state.LastUpdatedTimes, []string{"hello.txt", "world.txt"})
assert.Equal(t, map[string]string{"hello.txt": "hello.txt", "world.txt": "world.txt"}, state.LocalToRemoteNames)
assert.Equal(t, map[string]string{"hello.txt": "hello.txt", "world.txt": "world.txt"}, state.RemoteToLocalNames)
// Edited files are added to put. // world.txt is editted
// File system in the github actions env does not update f2.overwrite(t, "bunnies are cute.")
// mtime on writes to a file. So we are manually editting it
// instead of writing to world.txt
worldFileInfo, err := os.Stat(worldFilePath)
assert.NoError(t, err)
os.Chtimes(worldFilePath,
worldFileInfo.ModTime().Add(time.Nanosecond),
worldFileInfo.ModTime().Add(time.Nanosecond))
assert.NoError(t, err) assert.NoError(t, err)
files, err = fileSet.All() files, err = fileSet.All()
assert.NoError(t, err) assert.NoError(t, err)
change = state.diff(files) change, err = state.diff(files)
assert.NoError(t, err)
assert.Len(t, change.delete, 0) assert.Len(t, change.delete, 0)
assert.Len(t, change.put, 1) assert.Len(t, change.put, 1)
assert.Contains(t, change.put, "world.txt") assert.Contains(t, change.put, "world.txt")
assertKeysOfMap(t, state.LastUpdatedTimes, []string{"hello.txt", "world.txt"})
assert.Equal(t, map[string]string{"hello.txt": "hello.txt", "world.txt": "world.txt"}, state.LocalToRemoteNames)
assert.Equal(t, map[string]string{"hello.txt": "hello.txt", "world.txt": "world.txt"}, state.RemoteToLocalNames)
// Removed files are added to delete // hello.txt is deleted
err = os.Remove(filepath.Join(projectDir, "hello.txt")) f1.remove(t)
assert.NoError(t, err) assert.NoError(t, err)
files, err = fileSet.All() files, err = fileSet.All()
assert.NoError(t, err) assert.NoError(t, err)
change = state.diff(files) change, err = state.diff(files)
assert.NoError(t, err)
assert.Len(t, change.delete, 1) assert.Len(t, change.delete, 1)
assert.Len(t, change.put, 0) assert.Len(t, change.put, 0)
assert.Contains(t, change.delete, "hello.txt") assert.Contains(t, change.delete, "hello.txt")
assertKeysOfMap(t, state.LastUpdatedTimes, []string{"world.txt"})
assert.Equal(t, map[string]string{"world.txt": "world.txt"}, state.LocalToRemoteNames)
assert.Equal(t, map[string]string{"world.txt": "world.txt"}, state.RemoteToLocalNames)
}
func TestPythonNotebookDiff(t *testing.T) {
// Create temp project dir
projectDir := t.TempDir()
fileSet := git.NewFileSet(projectDir)
state := Snapshot{
LastUpdatedTimes: make(map[string]time.Time),
LocalToRemoteNames: make(map[string]string),
RemoteToLocalNames: make(map[string]string),
}
foo := createFile(t, filepath.Join(projectDir, "foo.py"))
defer foo.close(t)
// Case 1: notebook foo.py is uploaded
files, err := fileSet.All()
assert.NoError(t, err)
foo.overwrite(t, "# Databricks notebook source\nprint(\"abc\")")
change, err := state.diff(files)
assert.NoError(t, err)
assert.Len(t, change.delete, 0)
assert.Len(t, change.put, 1)
assert.Contains(t, change.put, "foo.py")
assertKeysOfMap(t, state.LastUpdatedTimes, []string{"foo.py"})
assert.Equal(t, map[string]string{"foo.py": "foo"}, state.LocalToRemoteNames)
assert.Equal(t, map[string]string{"foo": "foo.py"}, state.RemoteToLocalNames)
// Case 2: notebook foo.py is converted to python script by removing
// magic keyword
foo.overwrite(t, "print(\"abc\")")
files, err = fileSet.All()
assert.NoError(t, err)
change, err = state.diff(files)
assert.NoError(t, err)
assert.Len(t, change.delete, 1)
assert.Len(t, change.put, 1)
assert.Contains(t, change.put, "foo.py")
assert.Contains(t, change.delete, "foo")
assertKeysOfMap(t, state.LastUpdatedTimes, []string{"foo.py"})
assert.Equal(t, map[string]string{"foo.py": "foo.py"}, state.LocalToRemoteNames)
assert.Equal(t, map[string]string{"foo.py": "foo.py"}, state.RemoteToLocalNames)
// Case 3: Python script foo.py is converted to a databricks notebook
foo.overwrite(t, "# Databricks notebook source\nprint(\"def\")")
files, err = fileSet.All()
assert.NoError(t, err)
change, err = state.diff(files)
assert.NoError(t, err)
assert.Len(t, change.delete, 1)
assert.Len(t, change.put, 1)
assert.Contains(t, change.put, "foo.py")
assert.Contains(t, change.delete, "foo.py")
assertKeysOfMap(t, state.LastUpdatedTimes, []string{"foo.py"})
assert.Equal(t, map[string]string{"foo.py": "foo"}, state.LocalToRemoteNames)
assert.Equal(t, map[string]string{"foo": "foo.py"}, state.RemoteToLocalNames)
// Case 4: Python notebook foo.py is deleted, and its remote name is used in change.delete
foo.remove(t)
assert.NoError(t, err)
files, err = fileSet.All()
assert.NoError(t, err)
change, err = state.diff(files)
assert.NoError(t, err)
assert.Len(t, change.delete, 1)
assert.Len(t, change.put, 0)
assert.Contains(t, change.delete, "foo")
assert.Len(t, state.LastUpdatedTimes, 0)
assert.Equal(t, map[string]string{}, state.LocalToRemoteNames)
assert.Equal(t, map[string]string{}, state.RemoteToLocalNames)
}
func TestErrorWhenIdenticalRemoteName(t *testing.T) {
// Create temp project dir
projectDir := t.TempDir()
fileSet := git.NewFileSet(projectDir)
state := Snapshot{
LastUpdatedTimes: make(map[string]time.Time),
LocalToRemoteNames: make(map[string]string),
RemoteToLocalNames: make(map[string]string),
}
// upload should work since they point to different destinations
pythonFoo := createFile(t, filepath.Join(projectDir, "foo.py"))
defer pythonFoo.close(t)
vanillaFoo := createFile(t, filepath.Join(projectDir, "foo"))
defer vanillaFoo.close(t)
files, err := fileSet.All()
assert.NoError(t, err)
change, err := state.diff(files)
assert.NoError(t, err)
assert.Len(t, change.delete, 0)
assert.Len(t, change.put, 2)
assert.Contains(t, change.put, "foo.py")
assert.Contains(t, change.put, "foo")
// errors out because they point to the same destination
pythonFoo.overwrite(t, "# Databricks notebook source\nprint(\"def\")")
files, err = fileSet.All()
assert.NoError(t, err)
change, err = state.diff(files)
assert.ErrorContains(t, err, "both foo and foo.py point to the same remote file location foo. Please remove one of them from your local project")
} }

View File

@ -31,6 +31,18 @@ type watchdog struct {
// rate limits // rate limits
const MaxRequestsInFlight = 20 const MaxRequestsInFlight = 20
// path: The local path of the file in the local file system
//
// The API calls for a python script foo.py would be
// `PUT foo.py`
// `DELETE foo.py`
//
// The API calls for a python notebook foo.py would be
// `PUT foo.py`
// `DELETE foo`
//
// The workspace file system backend strips .py from the file name if the python
// file is a notebook
func putFile(ctx context.Context, path string, content io.Reader) error { func putFile(ctx context.Context, path string, content io.Reader) error {
wsc := project.Get(ctx).WorkspacesClient() wsc := project.Get(ctx).WorkspacesClient()
// workspace mkdirs is idempotent // workspace mkdirs is idempotent
@ -48,6 +60,7 @@ func putFile(ctx context.Context, path string, content io.Reader) error {
return apiClient.Post(ctx, apiPath, content, nil) return apiClient.Post(ctx, apiPath, content, nil)
} }
// path: The remote path of the file in the workspace
func deleteFile(ctx context.Context, path string, wsc *workspaces.WorkspacesClient) error { func deleteFile(ctx context.Context, path string, wsc *workspaces.WorkspacesClient) error {
err := wsc.Workspace.Delete(ctx, err := wsc.Workspace.Delete(ctx,
workspace.Delete{ workspace.Delete{
@ -75,29 +88,30 @@ func getRemoteSyncCallback(ctx context.Context, root, remoteDir string, wsc *wor
// Allow MaxRequestLimit maxiumum concurrent api calls // Allow MaxRequestLimit maxiumum concurrent api calls
g.SetLimit(MaxRequestsInFlight) g.SetLimit(MaxRequestsInFlight)
for _, fileName := range d.delete { for _, remoteName := range d.delete {
// Copy of fileName created to make this safe for concurrent use. // Copy of remoteName created to make this safe for concurrent use.
// directly using fileName can cause race conditions since the loop // directly using remoteName can cause race conditions since the loop
// might iterate over to the next fileName before the go routine function // might iterate over to the next remoteName before the go routine function
// is evaluated // is evaluated
localFileName := fileName remoteNameCopy := remoteName
g.Go(func() error { g.Go(func() error {
err := deleteFile(ctx, path.Join(remoteDir, localFileName), wsc) err := deleteFile(ctx, path.Join(remoteDir, remoteNameCopy), wsc)
if err != nil { if err != nil {
return err return err
} }
log.Printf("[INFO] Deleted %s", localFileName) log.Printf("[INFO] Deleted %s", remoteNameCopy)
return nil return nil
}) })
} }
for _, fileName := range d.put { for _, localName := range d.put {
localFileName := fileName // Copy of localName created to make this safe for concurrent use.
localNameCopy := localName
g.Go(func() error { g.Go(func() error {
f, err := os.Open(filepath.Join(root, localFileName)) f, err := os.Open(filepath.Join(root, localNameCopy))
if err != nil { if err != nil {
return err return err
} }
err = putFile(ctx, path.Join(remoteDir, localFileName), f) err = putFile(ctx, path.Join(remoteDir, localNameCopy), f)
if err != nil { if err != nil {
return fmt.Errorf("failed to upload file: %s", err) return fmt.Errorf("failed to upload file: %s", err)
} }
@ -105,7 +119,7 @@ func getRemoteSyncCallback(ctx context.Context, root, remoteDir string, wsc *wor
if err != nil { if err != nil {
return err return err
} }
log.Printf("[INFO] Uploaded %s", localFileName) log.Printf("[INFO] Uploaded %s", localNameCopy)
return nil return nil
}) })
} }
@ -162,7 +176,11 @@ func (w *watchdog) main(ctx context.Context, applyDiff func(diff) error, remoteP
w.failure = err w.failure = err
return return
} }
change := snapshot.diff(all) change, err := snapshot.diff(all)
if err != nil {
w.failure = err
return
}
if change.IsEmpty() { if change.IsEmpty() {
onlyOnceInitLog.Do(func() { onlyOnceInitLog.Do(func() {
log.Printf("[INFO] Initial Sync Complete") log.Printf("[INFO] Initial Sync Complete")