// Package sync keeps track of which local project files have been synced to a
// remote path in a Databricks workspace.
package sync
import (
	"bufio"
	"context"
	"crypto/md5"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"strings"
	"time"

	"github.com/databricks/bricks/git"
)
// LatestSnapshotVersion is the schema version stamped on newly created
// snapshots. Bump it every time a potentially breaking change is made to the
// snapshot schema.
const LatestSnapshotVersion = "v1"
// Snapshot is a persistent store of the knowledge bricks cli has about the
// state of files in the remote repo. We use the last modified times (mtime) of
// files to determine whether a file needs to be updated in the remote repo.
//
// 1. Any stale files in the remote repo are updated. That is, if the last
// modified time recorded in the snapshot is less than the actual last modified
// time of the file.
//
// 2. Any files present in the snapshot but absent locally are deleted from the
// remote path.
//
// Changing either the databricks workspace (ie Host) or the remote path (ie
// RemotePath) local files are being synced to will make bricks cli switch to a
// different snapshot for persisting/loading sync state.
type Snapshot struct {
	// Path where this snapshot was loaded from and will be saved to.
	// Intentionally not part of the snapshot state because it may be moved by the user.
	SnapshotPath string `json:"-"`

	// New indicates if this is a fresh snapshot or if it was loaded from disk.
	New bool `json:"-"`

	// Version of the snapshot schema. Only snapshots matching the latest
	// snapshot schema version are used; others are ignored when loading.
	Version string `json:"version"`

	// Hostname of the workspace this snapshot is for.
	Host string `json:"host"`

	// Path in workspace for project repo.
	RemotePath string `json:"remote_path"`

	// Map of all files tracked by this snapshot with the:
	// key: relative file path from project root
	// value: modification time recorded when the file was last picked up for upload
	LastUpdatedTimes map[string]time.Time `json:"last_modified_times"`

	// This map maps local file names to their remote names
	// eg. notebook named "foo.py" locally would be stored as "foo", thus this
	// map will contain an entry "foo.py" -> "foo"
	LocalToRemoteNames map[string]string `json:"local_to_remote_names"`

	// Inverse of LocalToRemoteNames. Together they form a bijective mapping (ie
	// there is a 1:1 unique mapping between local and remote name).
	RemoteToLocalNames map[string]string `json:"remote_to_local_names"`
}
// syncSnapshotDirName is the name of the directory, created under the snapshot
// base path, in which snapshot files are stored.
const syncSnapshotDirName = "sync-snapshots"
// GetFileName derives the snapshot file name for a (host, remotePath) pair.
// The name is the first 16 hex characters (ie the first 8 bytes) of
// md5(host + remotePath), followed by the ".json" extension.
func GetFileName(host, remotePath string) string {
	digest := md5.Sum([]byte(host + remotePath))
	// 8 bytes hex-encode to exactly 16 characters.
	return hex.EncodeToString(digest[:8]) + ".json"
}
// Compute path of the snapshot file on the local machine
// The file name for unique for a tuple of (host, remotePath)
// precisely it's the first 16 characters of md5(concat(host, remotePath))
2023-01-24 07:30:10 +00:00
func SnapshotPath ( opts * SyncOptions ) ( string , error ) {
snapshotDir := filepath . Join ( opts . SnapshotBasePath , syncSnapshotDirName )
2022-10-19 14:22:55 +00:00
if _ , err := os . Stat ( snapshotDir ) ; os . IsNotExist ( err ) {
err = os . Mkdir ( snapshotDir , os . ModeDir | os . ModePerm )
2022-09-14 15:50:29 +00:00
if err != nil {
2022-10-19 14:22:55 +00:00
return "" , fmt . Errorf ( "failed to create config directory: %s" , err )
2022-09-14 15:50:29 +00:00
}
}
2023-01-24 07:30:10 +00:00
fileName := GetFileName ( opts . Host , opts . RemotePath )
2022-10-19 14:22:55 +00:00
return filepath . Join ( snapshotDir , fileName ) , nil
}
2023-01-24 07:30:10 +00:00
func newSnapshot ( opts * SyncOptions ) ( * Snapshot , error ) {
path , err := SnapshotPath ( opts )
if err != nil {
return nil , err
2022-10-19 14:22:55 +00:00
}
return & Snapshot {
2023-01-24 07:30:10 +00:00
SnapshotPath : path ,
New : true ,
2022-12-12 13:31:06 +00:00
Version : LatestSnapshotVersion ,
2023-01-24 07:30:10 +00:00
Host : opts . Host ,
RemotePath : opts . RemotePath ,
2022-11-21 22:42:09 +00:00
LastUpdatedTimes : make ( map [ string ] time . Time ) ,
LocalToRemoteNames : make ( map [ string ] string ) ,
RemoteToLocalNames : make ( map [ string ] string ) ,
2022-10-19 14:22:55 +00:00
} , nil
}
2023-01-24 07:30:10 +00:00
func ( s * Snapshot ) Save ( ctx context . Context ) error {
f , err := os . OpenFile ( s . SnapshotPath , os . O_CREATE | os . O_WRONLY | os . O_TRUNC , 0644 )
2022-09-14 15:50:29 +00:00
if err != nil {
return fmt . Errorf ( "failed to create/open persisted sync snapshot file: %s" , err )
}
defer f . Close ( )
// persist snapshot to disk
bytes , err := json . MarshalIndent ( s , "" , " " )
if err != nil {
return fmt . Errorf ( "failed to json marshal in-memory snapshot: %s" , err )
}
_ , err = f . Write ( bytes )
if err != nil {
return fmt . Errorf ( "failed to write sync snapshot to disk: %s" , err )
}
return nil
}
2023-01-24 07:30:10 +00:00
func loadOrNewSnapshot ( opts * SyncOptions ) ( * Snapshot , error ) {
snapshot , err := newSnapshot ( opts )
2022-10-19 14:22:55 +00:00
if err != nil {
2023-01-24 07:30:10 +00:00
return nil , err
2022-09-14 15:50:29 +00:00
}
2023-01-24 07:30:10 +00:00
// Snapshot file not found. We return the new copy.
if _ , err := os . Stat ( snapshot . SnapshotPath ) ; os . IsNotExist ( err ) {
return snapshot , nil
}
2022-09-14 15:50:29 +00:00
2023-01-24 07:30:10 +00:00
bytes , err := os . ReadFile ( snapshot . SnapshotPath )
2022-09-14 15:50:29 +00:00
if err != nil {
2023-01-24 07:30:10 +00:00
return nil , fmt . Errorf ( "failed to read sync snapshot from disk: %s" , err )
2022-09-14 15:50:29 +00:00
}
2023-01-24 07:30:10 +00:00
var fromDisk Snapshot
err = json . Unmarshal ( bytes , & fromDisk )
2022-09-14 15:50:29 +00:00
if err != nil {
2023-01-24 07:30:10 +00:00
return nil , fmt . Errorf ( "failed to json unmarshal persisted snapshot: %s" , err )
2022-09-14 15:50:29 +00:00
}
2023-01-24 07:30:10 +00:00
2022-12-12 13:31:06 +00:00
// invalidate old snapshot with schema versions
2023-01-24 07:30:10 +00:00
if fromDisk . Version != LatestSnapshotVersion {
log . Printf ( "Did not load existing snapshot because its version is %s while the latest version is %s" , snapshot . Version , LatestSnapshotVersion )
return newSnapshot ( opts )
}
2022-12-12 13:31:06 +00:00
2023-01-24 07:30:10 +00:00
// unmarshal again over the existing snapshot instance
err = json . Unmarshal ( bytes , & snapshot )
if err != nil {
return nil , fmt . Errorf ( "failed to json unmarshal persisted snapshot: %s" , err )
2022-12-12 13:31:06 +00:00
}
2023-01-24 07:30:10 +00:00
snapshot . New = false
return snapshot , nil
2022-09-14 15:50:29 +00:00
}
// pythonFilePattern matches paths that end in ".py". It is compiled once at
// package initialisation instead of on every call.
var pythonFilePattern = regexp.MustCompile(`\.py$`)

// getNotebookDetails reports whether the file at path is a notebook and, if
// the file type is recognised, which kind.
//
// Only files ending in ".py" are inspected; for every other path it returns
// (false, "", nil) without touching the file. For a ".py" file the first line
// is read: the file is a notebook if that line contains the magic string
// "# Databricks notebook source". typeOfNotebook is "PYTHON" for any ".py" file
// whose first line could be read, whether or not it is a notebook.
//
// An error is returned if the file cannot be opened or its first line cannot
// be read. An empty ".py" file yields (false, "", nil).
func getNotebookDetails(path string) (isNotebook bool, typeOfNotebook string, err error) {
	if !pythonFilePattern.MatchString(path) {
		return false, "", nil
	}

	f, err := os.Open(path)
	if err != nil {
		return false, "", err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	if !scanner.Scan() {
		// Either the file is empty (Err is nil) or reading the first line failed.
		return false, "", scanner.Err()
	}

	// A python file is a notebook if its first line contains the following magic string
	isNotebook = strings.Contains(scanner.Text(), "# Databricks notebook source")
	return isNotebook, "PYTHON", nil
}
2022-12-12 13:31:06 +00:00
func ( s * Snapshot ) diff ( all [ ] git . File ) ( change diff , err error ) {
2022-07-07 18:56:59 +00:00
currentFilenames := map [ string ] bool { }
2022-10-19 14:22:55 +00:00
lastModifiedTimes := s . LastUpdatedTimes
2022-11-21 22:42:09 +00:00
remoteToLocalNames := s . RemoteToLocalNames
localToRemoteNames := s . LocalToRemoteNames
2022-07-07 18:56:59 +00:00
for _ , f := range all {
// create set of current files to figure out if removals are needed
currentFilenames [ f . Relative ] = true
// get current modified timestamp
modified := f . Modified ( )
2022-10-19 14:22:55 +00:00
lastSeenModified , seen := lastModifiedTimes [ f . Relative ]
2022-09-14 15:50:29 +00:00
if ! seen || modified . After ( lastSeenModified ) {
2022-10-19 14:22:55 +00:00
lastModifiedTimes [ f . Relative ] = modified
2022-11-21 22:42:09 +00:00
2022-12-22 10:35:32 +00:00
// change separators to '/' for file paths in remote store
unixFileName := filepath . ToSlash ( f . Relative )
// put file in databricks workspace
change . put = append ( change . put , unixFileName )
// get file metadata about whether it's a notebook
2022-11-21 22:42:09 +00:00
isNotebook , typeOfNotebook , err := getNotebookDetails ( f . Absolute )
if err != nil {
return change , err
}
2022-12-22 10:35:32 +00:00
// strip `.py` for python notebooks
remoteName := unixFileName
2022-11-21 22:42:09 +00:00
if isNotebook && typeOfNotebook == "PYTHON" {
2022-12-22 10:35:32 +00:00
remoteName = strings . TrimSuffix ( remoteName , ` .py ` )
2022-11-21 22:42:09 +00:00
}
// If the remote handle of a file changes, we want to delete the old
// remote version of that file to avoid duplicates.
// This can happen if a python notebook is converted to a python
// script or vice versa
oldRemoteName , ok := localToRemoteNames [ f . Relative ]
if ok && oldRemoteName != remoteName {
change . delete = append ( change . delete , oldRemoteName )
delete ( remoteToLocalNames , oldRemoteName )
}
// We cannot allow two local files in the project to point to the same
// remote path
oldLocalName , ok := remoteToLocalNames [ remoteName ]
if ok && oldLocalName != f . Relative {
return change , fmt . Errorf ( "both %s and %s point to the same remote file location %s. Please remove one of them from your local project" , oldLocalName , f . Relative , remoteName )
}
localToRemoteNames [ f . Relative ] = remoteName
remoteToLocalNames [ remoteName ] = f . Relative
2022-07-07 18:56:59 +00:00
}
}
2022-11-21 22:42:09 +00:00
// figure out files in the snapshot.lastModifiedTimes, but not on local
// filesystem. These will be deleted
for localName := range lastModifiedTimes {
_ , exists := currentFilenames [ localName ]
2022-07-07 18:56:59 +00:00
if exists {
continue
}
2022-12-12 13:31:06 +00:00
// TODO: https://databricks.atlassian.net/browse/DECO-429
// Add error wrapper giving instructions like this for all errors here :)
remoteName , ok := localToRemoteNames [ localName ]
if ! ok {
return change , fmt . Errorf ( "missing remote path for local path: %s. Please try syncing again after deleting .databricks/sync-snapshots dir from your project root" , localName )
}
2022-09-14 15:50:29 +00:00
// add them to a delete batch
2022-12-12 13:31:06 +00:00
change . delete = append ( change . delete , remoteName )
2022-07-07 18:56:59 +00:00
}
// and remove them from the snapshot
2022-11-21 22:42:09 +00:00
for _ , remoteName := range change . delete {
2022-12-12 13:31:06 +00:00
// we do note assert that remoteName exists in remoteToLocalNames since it
// will be missing for files with remote name changed
2022-11-21 22:42:09 +00:00
localName := remoteToLocalNames [ remoteName ]
2022-12-12 13:31:06 +00:00
2022-11-21 22:42:09 +00:00
delete ( lastModifiedTimes , localName )
delete ( remoteToLocalNames , remoteName )
delete ( localToRemoteNames , localName )
2022-07-07 18:56:59 +00:00
}
return
}