From fa0a734b3cb891f55babfc5f03c831b070cb4fdf Mon Sep 17 00:00:00 2001 From: Hari Selvarajan <105197202+HariGS-DB@users.noreply.github.com> Date: Mon, 10 Mar 2025 10:01:17 +0000 Subject: [PATCH] Enable offline install of labs projects (#2049) ## Changes This PR changes the labs code base to allow offline installation of labs projects (like UCX). When the flag --offline=true is passed, the code skips checking for project versions and downloading code from GitHub, and instead loads the project from the local installation folder. This command is useful on systems with restricted internet access. The user should follow these set-up steps: - install a labs project on a machine which has internet access - zip the installation folder and copy it to the intended machine - run databricks labs install NAME --offline=true The command will look for the code in the same install directory and, if present, load it from there. Closes #1646 related to https://github.com/databrickslabs/ucx/issues/3418 ## Tests Added a unit test case and tested. NO_CHANGELOG=true --------- Signed-off-by: dependabot[bot] Co-authored-by: Pieter Noordhuis Co-authored-by: Lennart Kats (databricks) Co-authored-by: Denis Bilenko Co-authored-by: Julia Crawford (Databricks) Co-authored-by: Ilya Kuznetsov Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Andrew Nester Co-authored-by: Anton Nekipelov <226657+anton-107@users.noreply.github.com> Co-authored-by: shreyas-goenka <88374338+shreyas-goenka@users.noreply.github.com> --- cmd/labs/github/releases.go | 26 ++++++++++------- cmd/labs/github/releases_test.go | 2 +- cmd/labs/install.go | 26 +++++++++-------- cmd/labs/localcache/jsonfile.go | 4 +-- cmd/labs/project/fetcher.go | 45 ++++++++++++++++++++---------- cmd/labs/project/installer.go | 15 +++++++--- cmd/labs/project/installer_test.go | 39 ++++++++++++++++++++++++++ cmd/labs/project/project.go | 2 +- 8 files changed, 116 insertions(+), 43 deletions(-) diff --git a/cmd/labs/github/releases.go 
b/cmd/labs/github/releases.go index 0dae0317d..d0e2cb796 100644 --- a/cmd/labs/github/releases.go +++ b/cmd/labs/github/releases.go @@ -13,25 +13,31 @@ const cacheTTL = 1 * time.Hour // NewReleaseCache creates a release cache for a repository in the GitHub org. // Caller has to provide different cache directories for different repositories. -func NewReleaseCache(org, repo, cacheDir string) *ReleaseCache { +func NewReleaseCache(org, repo, cacheDir string, offlineInstall bool) *ReleaseCache { pattern := fmt.Sprintf("%s-%s-releases", org, repo) return &ReleaseCache{ - cache: localcache.NewLocalCache[Versions](cacheDir, pattern, cacheTTL), - Org: org, - Repo: repo, + cache: localcache.NewLocalCache[Versions](cacheDir, pattern, cacheTTL), + Org: org, + Repo: repo, + Offline: offlineInstall, } } type ReleaseCache struct { - cache localcache.LocalCache[Versions] - Org string - Repo string + cache localcache.LocalCache[Versions] + Org string + Repo string + Offline bool } func (r *ReleaseCache) Load(ctx context.Context) (Versions, error) { - return r.cache.Load(ctx, func() (Versions, error) { - return getVersions(ctx, r.Org, r.Repo) - }) + if !r.Offline { + return r.cache.Load(ctx, func() (Versions, error) { + return getVersions(ctx, r.Org, r.Repo) + }) + } + cached, err := r.cache.LoadCache() + return cached.Data, err } // getVersions is considered to be a private API, as we want the usage go through a cache diff --git a/cmd/labs/github/releases_test.go b/cmd/labs/github/releases_test.go index 93ac33aee..24d13fd60 100644 --- a/cmd/labs/github/releases_test.go +++ b/cmd/labs/github/releases_test.go @@ -26,7 +26,7 @@ func TestLoadsReleasesForCLI(t *testing.T) { ctx := context.Background() ctx = WithApiOverride(ctx, server.URL) - r := NewReleaseCache("databricks", "cli", t.TempDir()) + r := NewReleaseCache("databricks", "cli", t.TempDir(), false) all, err := r.Load(ctx) assert.NoError(t, err) assert.Len(t, all, 2) diff --git a/cmd/labs/install.go b/cmd/labs/install.go index 
6ed6b2e91..b78793b97 100644 --- a/cmd/labs/install.go +++ b/cmd/labs/install.go @@ -7,16 +7,20 @@ import ( ) func newInstallCommand() *cobra.Command { - return &cobra.Command{ - Use: "install NAME", - Args: root.ExactArgs(1), - Short: "Installs project", - RunE: func(cmd *cobra.Command, args []string) error { - inst, err := project.NewInstaller(cmd, args[0]) - if err != nil { - return err - } - return inst.Install(cmd.Context()) - }, + cmd := &cobra.Command{} + var offlineInstall bool + + cmd.Flags().BoolVar(&offlineInstall, "offline", offlineInstall, `If installing in offline mode, set this flag to true.`) + + cmd.Use = "install NAME" + cmd.Args = root.ExactArgs(1) + cmd.Short = "Installs project" + cmd.RunE = func(cmd *cobra.Command, args []string) error { + inst, err := project.NewInstaller(cmd, args[0], offlineInstall) + if err != nil { + return err + } + return inst.Install(cmd.Context()) } + return cmd } diff --git a/cmd/labs/localcache/jsonfile.go b/cmd/labs/localcache/jsonfile.go index 50ed372f5..1a7c6b940 100644 --- a/cmd/labs/localcache/jsonfile.go +++ b/cmd/labs/localcache/jsonfile.go @@ -35,7 +35,7 @@ type LocalCache[T any] struct { } func (r *LocalCache[T]) Load(ctx context.Context, refresh func() (T, error)) (T, error) { - cached, err := r.loadCache() + cached, err := r.LoadCache() if errors.Is(err, fs.ErrNotExist) { return r.refreshCache(ctx, refresh, r.zero) } else if err != nil { @@ -96,7 +96,7 @@ func (r *LocalCache[T]) FileName() string { return filepath.Join(r.dir, r.name+".json") } -func (r *LocalCache[T]) loadCache() (*cached[T], error) { +func (r *LocalCache[T]) LoadCache() (*cached[T], error) { jsonFile := r.FileName() raw, err := os.ReadFile(r.FileName()) if err != nil { diff --git a/cmd/labs/project/fetcher.go b/cmd/labs/project/fetcher.go index 8f4fafde6..9ec2a5028 100644 --- a/cmd/labs/project/fetcher.go +++ b/cmd/labs/project/fetcher.go @@ -54,7 +54,7 @@ func (d *devInstallation) Install(ctx context.Context) error { return 
d.Installer.runHook(d.Command) } -func NewInstaller(cmd *cobra.Command, name string) (installable, error) { +func NewInstaller(cmd *cobra.Command, name string, offlineInstall bool) (installable, error) { if name == "." { wd, err := os.Getwd() if err != nil { @@ -75,28 +75,32 @@ func NewInstaller(cmd *cobra.Command, name string) (installable, error) { version = "latest" } f := &fetcher{name} - version, err := f.checkReleasedVersions(cmd, version) + + version, err := f.checkReleasedVersions(cmd, version, offlineInstall) if err != nil { return nil, fmt.Errorf("version: %w", err) } - prj, err := f.loadRemoteProjectDefinition(cmd, version) + + prj, err := f.loadRemoteProjectDefinition(cmd, version, offlineInstall) if err != nil { return nil, fmt.Errorf("remote: %w", err) } + return &installer{ - Project: prj, - version: version, - cmd: cmd, + Project: prj, + version: version, + cmd: cmd, + offlineInstall: offlineInstall, }, nil } func NewUpgrader(cmd *cobra.Command, name string) (*installer, error) { f := &fetcher{name} - version, err := f.checkReleasedVersions(cmd, "latest") + version, err := f.checkReleasedVersions(cmd, "latest", false) if err != nil { return nil, fmt.Errorf("version: %w", err) } - prj, err := f.loadRemoteProjectDefinition(cmd, version) + prj, err := f.loadRemoteProjectDefinition(cmd, version, false) if err != nil { return nil, fmt.Errorf("remote: %w", err) } @@ -115,7 +119,7 @@ type fetcher struct { name string } -func (f *fetcher) checkReleasedVersions(cmd *cobra.Command, version string) (string, error) { +func (f *fetcher) checkReleasedVersions(cmd *cobra.Command, version string, offlineInstall bool) (string, error) { ctx := cmd.Context() cacheDir, err := PathInLabs(ctx, f.name, "cache") if err != nil { @@ -123,7 +127,8 @@ func (f *fetcher) checkReleasedVersions(cmd *cobra.Command, version string) (str } // `databricks labs isntall X` doesn't know which exact version to fetch, so first // we fetch all versions and then pick the latest one 
dynamically. - versions, err := github.NewReleaseCache("databrickslabs", f.name, cacheDir).Load(ctx) + var versions github.Versions + versions, err = github.NewReleaseCache("databrickslabs", f.name, cacheDir, offlineInstall).Load(ctx) if err != nil { return "", fmt.Errorf("versions: %w", err) } @@ -140,11 +145,23 @@ func (f *fetcher) checkReleasedVersions(cmd *cobra.Command, version string) (str return version, nil } -func (i *fetcher) loadRemoteProjectDefinition(cmd *cobra.Command, version string) (*Project, error) { +func (i *fetcher) loadRemoteProjectDefinition(cmd *cobra.Command, version string, offlineInstall bool) (*Project, error) { ctx := cmd.Context() - raw, err := github.ReadFileFromRef(ctx, "databrickslabs", i.name, version, "labs.yml") - if err != nil { - return nil, fmt.Errorf("read labs.yml from GitHub: %w", err) + var raw []byte + var err error + if !offlineInstall { + raw, err = github.ReadFileFromRef(ctx, "databrickslabs", i.name, version, "labs.yml") + if err != nil { + return nil, fmt.Errorf("read labs.yml from GitHub: %w", err) + } + } else { + libDir, _ := PathInLabs(ctx, i.name, "lib") + fileName := filepath.Join(libDir, "labs.yml") + raw, err = os.ReadFile(fileName) + if err != nil { + return nil, fmt.Errorf("read labs.yml from local path %s: %w", libDir, err) + } } + return readFromBytes(ctx, raw) } diff --git a/cmd/labs/project/installer.go b/cmd/labs/project/installer.go index 2e42ce43d..e1abc91d2 100644 --- a/cmd/labs/project/installer.go +++ b/cmd/labs/project/installer.go @@ -76,7 +76,8 @@ type installer struct { // command instance is used for: // - auth profile flag override // - standard input, output, and error streams - cmd *cobra.Command + cmd *cobra.Command + offlineInstall bool } func (i *installer) Install(ctx context.Context) error { @@ -101,9 +102,15 @@ func (i *installer) Install(ctx context.Context) error { } else if err != nil { return fmt.Errorf("login: %w", err) } - err = i.downloadLibrary(ctx) - if err != nil { - return 
fmt.Errorf("lib: %w", err) + if !i.offlineInstall { + err = i.downloadLibrary(ctx) + if err != nil { + return fmt.Errorf("lib: %w", err) + } + } + + if _, err := os.Stat(i.LibDir()); os.IsNotExist(err) { + return fmt.Errorf("no local installation found: %w", err) } err = i.setupPythonVirtualEnvironment(ctx, w) if err != nil { diff --git a/cmd/labs/project/installer_test.go b/cmd/labs/project/installer_test.go index a01ba864a..ddeb018ea 100644 --- a/cmd/labs/project/installer_test.go +++ b/cmd/labs/project/installer_test.go @@ -241,6 +241,45 @@ func TestInstallerWorksForReleases(t *testing.T) { r.RunAndExpectOutput("setting up important infrastructure") } +func TestOfflineInstallerWorksForReleases(t *testing.T) { + // This cmd is useful in systems where there is internet restriction, the user should follow a set-up as follows: + // install a labs project on a machine which has internet + // zip and copy the file to the intended machine and + // run databricks labs install --offline=true + // it will look for the code in the same install directory and if present, install from there. 
+ server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/api/2.1/clusters/get" { + respondWithJSON(t, w, &compute.ClusterDetails{ + State: compute.StateRunning, + }) + return + } + t.Logf("Requested: %s", r.URL.Path) + t.FailNow() + })) + defer server.Close() + + ctx := installerContext(t, server) + newHome := copyTestdata(t, "testdata/installed-in-home") + ctx = env.WithUserHomeDir(ctx, newHome) + + ctx, stub := process.WithStub(ctx) + stub.WithStdoutFor(`python[\S]+ --version`, "Python 3.10.5") + // on Unix, we call `python3`, but on Windows it is `python.exe` + stub.WithStderrFor(`python[\S]+ -m venv .*/.databricks/labs/blueprint/state/venv`, "[mock venv create]") + stub.WithStderrFor(`python[\S]+ -m pip install --upgrade --upgrade-strategy eager .`, "[mock pip install]") + stub.WithStdoutFor(`python[\S]+ install.py`, "setting up important infrastructure") + + // simulate the case of GitHub Actions + ctx = env.Set(ctx, "DATABRICKS_HOST", server.URL) + ctx = env.Set(ctx, "DATABRICKS_TOKEN", "...") + ctx = env.Set(ctx, "DATABRICKS_CLUSTER_ID", "installer-cluster") + ctx = env.Set(ctx, "DATABRICKS_WAREHOUSE_ID", "installer-warehouse") + + r := testcli.NewRunner(t, ctx, "labs", "install", "blueprint", "--offline=true", "--debug") + r.RunAndExpectOutput("setting up important infrastructure") +} + func TestInstallerWorksForDevelopment(t *testing.T) { defer func() { if !t.Failed() { diff --git a/cmd/labs/project/project.go b/cmd/labs/project/project.go index 75f5e584f..a9f5f45c2 100644 --- a/cmd/labs/project/project.go +++ b/cmd/labs/project/project.go @@ -307,7 +307,7 @@ func (p *Project) checkUpdates(cmd *cobra.Command) error { // might not be installed yet return nil } - r := github.NewReleaseCache("databrickslabs", p.Name, p.CacheDir()) + r := github.NewReleaseCache("databrickslabs", p.Name, p.CacheDir(), false) versions, err := r.Load(ctx) if err != nil { return err