Enable offline install of labs projects (#2049)

## Changes
<!-- Summary of your changes that are easy to understand -->
This PR changes the labs code base to allow offline installation of
labs projects (like UCX). When the flag --offline=true is passed, the
code skips checking for project versions and downloading code from
GitHub, and instead looks in the local installation folder. This
command is useful on systems with restricted internet access. The user
should follow these steps:

- install a labs project on a machine which has internet access
- zip the installation folder and copy it to the intended machine
- run `databricks labs install <project name> --offline=true`

The command looks for the code in the same install directory and, if
present, loads it from there.


Closes #1646 

related to https://github.com/databrickslabs/ucx/issues/3418
## Tests
<!-- How is this tested? -->

Added unit test case and tested.

NO_CHANGELOG=true

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Pieter Noordhuis <pieter.noordhuis@databricks.com>
Co-authored-by: Lennart Kats (databricks) <lennart.kats@databricks.com>
Co-authored-by: Denis Bilenko <denis.bilenko@databricks.com>
Co-authored-by: Julia Crawford (Databricks) <julia.crawford@databricks.com>
Co-authored-by: Ilya Kuznetsov <ilya.kuznetsov@databricks.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Andrew Nester <andrew.nester@databricks.com>
Co-authored-by: Anton Nekipelov <226657+anton-107@users.noreply.github.com>
Co-authored-by: shreyas-goenka <88374338+shreyas-goenka@users.noreply.github.com>
This commit is contained in:
Hari Selvarajan 2025-03-10 10:01:17 +00:00 committed by GitHub
parent b5a7023ef1
commit fa0a734b3c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 116 additions and 43 deletions

View File

@ -13,25 +13,31 @@ const cacheTTL = 1 * time.Hour
// NewReleaseCache creates a release cache for a repository in the GitHub org.
// Caller has to provide different cache directories for different repositories.
func NewReleaseCache(org, repo, cacheDir string) *ReleaseCache {
func NewReleaseCache(org, repo, cacheDir string, offlineInstall bool) *ReleaseCache {
pattern := fmt.Sprintf("%s-%s-releases", org, repo)
return &ReleaseCache{
cache: localcache.NewLocalCache[Versions](cacheDir, pattern, cacheTTL),
Org: org,
Repo: repo,
cache: localcache.NewLocalCache[Versions](cacheDir, pattern, cacheTTL),
Org: org,
Repo: repo,
Offline: offlineInstall,
}
}
type ReleaseCache struct {
cache localcache.LocalCache[Versions]
Org string
Repo string
cache localcache.LocalCache[Versions]
Org string
Repo string
Offline bool
}
func (r *ReleaseCache) Load(ctx context.Context) (Versions, error) {
return r.cache.Load(ctx, func() (Versions, error) {
return getVersions(ctx, r.Org, r.Repo)
})
if !r.Offline {
return r.cache.Load(ctx, func() (Versions, error) {
return getVersions(ctx, r.Org, r.Repo)
})
}
cached, err := r.cache.LoadCache()
return cached.Data, err
}
// getVersions is considered to be a private API, as we want the usage go through a cache

View File

@ -26,7 +26,7 @@ func TestLoadsReleasesForCLI(t *testing.T) {
ctx := context.Background()
ctx = WithApiOverride(ctx, server.URL)
r := NewReleaseCache("databricks", "cli", t.TempDir())
r := NewReleaseCache("databricks", "cli", t.TempDir(), false)
all, err := r.Load(ctx)
assert.NoError(t, err)
assert.Len(t, all, 2)

View File

@ -7,16 +7,20 @@ import (
)
func newInstallCommand() *cobra.Command {
return &cobra.Command{
Use: "install NAME",
Args: root.ExactArgs(1),
Short: "Installs project",
RunE: func(cmd *cobra.Command, args []string) error {
inst, err := project.NewInstaller(cmd, args[0])
if err != nil {
return err
}
return inst.Install(cmd.Context())
},
cmd := &cobra.Command{}
var offlineInstall bool
cmd.Flags().BoolVar(&offlineInstall, "offline", offlineInstall, `If installing in offline mode, set this flag to true.`)
cmd.Use = "install NAME"
cmd.Args = root.ExactArgs(1)
cmd.Short = "Installs project"
cmd.RunE = func(cmd *cobra.Command, args []string) error {
inst, err := project.NewInstaller(cmd, args[0], offlineInstall)
if err != nil {
return err
}
return inst.Install(cmd.Context())
}
return cmd
}

View File

@ -35,7 +35,7 @@ type LocalCache[T any] struct {
}
func (r *LocalCache[T]) Load(ctx context.Context, refresh func() (T, error)) (T, error) {
cached, err := r.loadCache()
cached, err := r.LoadCache()
if errors.Is(err, fs.ErrNotExist) {
return r.refreshCache(ctx, refresh, r.zero)
} else if err != nil {
@ -96,7 +96,7 @@ func (r *LocalCache[T]) FileName() string {
return filepath.Join(r.dir, r.name+".json")
}
func (r *LocalCache[T]) loadCache() (*cached[T], error) {
func (r *LocalCache[T]) LoadCache() (*cached[T], error) {
jsonFile := r.FileName()
raw, err := os.ReadFile(r.FileName())
if err != nil {

View File

@ -54,7 +54,7 @@ func (d *devInstallation) Install(ctx context.Context) error {
return d.Installer.runHook(d.Command)
}
func NewInstaller(cmd *cobra.Command, name string) (installable, error) {
func NewInstaller(cmd *cobra.Command, name string, offlineInstall bool) (installable, error) {
if name == "." {
wd, err := os.Getwd()
if err != nil {
@ -75,28 +75,32 @@ func NewInstaller(cmd *cobra.Command, name string) (installable, error) {
version = "latest"
}
f := &fetcher{name}
version, err := f.checkReleasedVersions(cmd, version)
version, err := f.checkReleasedVersions(cmd, version, offlineInstall)
if err != nil {
return nil, fmt.Errorf("version: %w", err)
}
prj, err := f.loadRemoteProjectDefinition(cmd, version)
prj, err := f.loadRemoteProjectDefinition(cmd, version, offlineInstall)
if err != nil {
return nil, fmt.Errorf("remote: %w", err)
}
return &installer{
Project: prj,
version: version,
cmd: cmd,
Project: prj,
version: version,
cmd: cmd,
offlineInstall: offlineInstall,
}, nil
}
func NewUpgrader(cmd *cobra.Command, name string) (*installer, error) {
f := &fetcher{name}
version, err := f.checkReleasedVersions(cmd, "latest")
version, err := f.checkReleasedVersions(cmd, "latest", false)
if err != nil {
return nil, fmt.Errorf("version: %w", err)
}
prj, err := f.loadRemoteProjectDefinition(cmd, version)
prj, err := f.loadRemoteProjectDefinition(cmd, version, false)
if err != nil {
return nil, fmt.Errorf("remote: %w", err)
}
@ -115,7 +119,7 @@ type fetcher struct {
name string
}
func (f *fetcher) checkReleasedVersions(cmd *cobra.Command, version string) (string, error) {
func (f *fetcher) checkReleasedVersions(cmd *cobra.Command, version string, offlineInstall bool) (string, error) {
ctx := cmd.Context()
cacheDir, err := PathInLabs(ctx, f.name, "cache")
if err != nil {
@ -123,7 +127,8 @@ func (f *fetcher) checkReleasedVersions(cmd *cobra.Command, version string) (str
}
// `databricks labs install X` doesn't know which exact version to fetch, so first
// we fetch all versions and then pick the latest one dynamically.
versions, err := github.NewReleaseCache("databrickslabs", f.name, cacheDir).Load(ctx)
var versions github.Versions
versions, err = github.NewReleaseCache("databrickslabs", f.name, cacheDir, offlineInstall).Load(ctx)
if err != nil {
return "", fmt.Errorf("versions: %w", err)
}
@ -140,11 +145,23 @@ func (f *fetcher) checkReleasedVersions(cmd *cobra.Command, version string) (str
return version, nil
}
func (i *fetcher) loadRemoteProjectDefinition(cmd *cobra.Command, version string) (*Project, error) {
func (i *fetcher) loadRemoteProjectDefinition(cmd *cobra.Command, version string, offlineInstall bool) (*Project, error) {
ctx := cmd.Context()
raw, err := github.ReadFileFromRef(ctx, "databrickslabs", i.name, version, "labs.yml")
if err != nil {
return nil, fmt.Errorf("read labs.yml from GitHub: %w", err)
var raw []byte
var err error
if !offlineInstall {
raw, err = github.ReadFileFromRef(ctx, "databrickslabs", i.name, version, "labs.yml")
if err != nil {
return nil, fmt.Errorf("read labs.yml from GitHub: %w", err)
}
} else {
libDir, _ := PathInLabs(ctx, i.name, "lib")
fileName := filepath.Join(libDir, "labs.yml")
raw, err = os.ReadFile(fileName)
if err != nil {
return nil, fmt.Errorf("read labs.yml from local path %s: %w", libDir, err)
}
}
return readFromBytes(ctx, raw)
}

View File

@ -76,7 +76,8 @@ type installer struct {
// command instance is used for:
// - auth profile flag override
// - standard input, output, and error streams
cmd *cobra.Command
cmd *cobra.Command
offlineInstall bool
}
func (i *installer) Install(ctx context.Context) error {
@ -101,9 +102,15 @@ func (i *installer) Install(ctx context.Context) error {
} else if err != nil {
return fmt.Errorf("login: %w", err)
}
err = i.downloadLibrary(ctx)
if err != nil {
return fmt.Errorf("lib: %w", err)
if !i.offlineInstall {
err = i.downloadLibrary(ctx)
if err != nil {
return fmt.Errorf("lib: %w", err)
}
}
if _, err := os.Stat(i.LibDir()); os.IsNotExist(err) {
return fmt.Errorf("no local installation found: %w", err)
}
err = i.setupPythonVirtualEnvironment(ctx, w)
if err != nil {

View File

@ -241,6 +241,45 @@ func TestInstallerWorksForReleases(t *testing.T) {
r.RunAndExpectOutput("setting up important infrastructure")
}
// TestOfflineInstallerWorksForReleases runs `labs install blueprint --offline=true`
// against a home directory copied from testdata/installed-in-home and expects
// the install hook output to appear.
//
// Offline installation is meant for machines with restricted internet access.
// The intended workflow is:
//   - install the labs project on a machine that has internet access,
//   - zip the installation and copy it to the restricted machine,
//   - run `databricks labs install <project name> --offline=true` there.
//
// The command then looks for the code in the same install directory and, if
// present, installs from there.
func TestOfflineInstallerWorksForReleases(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/api/2.1/clusters/get" {
			respondWithJSON(t, w, &compute.ClusterDetails{
				State: compute.StateRunning,
			})
			return
		}
		// Any other request is unexpected. The handler runs on a goroutine
		// owned by the HTTP server, and t.FailNow may only be called from the
		// goroutine running the test, so record the failure with t.Errorf and
		// answer the client with an error status instead.
		t.Errorf("Requested: %s", r.URL.Path)
		http.NotFound(w, r)
	}))
	defer server.Close()

	ctx := installerContext(t, server)
	// Use a copy of the testdata as the home directory, so the test does not
	// write into the checked-in fixture.
	newHome := copyTestdata(t, "testdata/installed-in-home")
	ctx = env.WithUserHomeDir(ctx, newHome)

	ctx, stub := process.WithStub(ctx)
	stub.WithStdoutFor(`python[\S]+ --version`, "Python 3.10.5")
	// on Unix, we call `python3`, but on Windows it is `python.exe`
	stub.WithStderrFor(`python[\S]+ -m venv .*/.databricks/labs/blueprint/state/venv`, "[mock venv create]")
	stub.WithStderrFor(`python[\S]+ -m pip install --upgrade --upgrade-strategy eager .`, "[mock pip install]")
	stub.WithStdoutFor(`python[\S]+ install.py`, "setting up important infrastructure")

	// simulate the case of GitHub Actions
	ctx = env.Set(ctx, "DATABRICKS_HOST", server.URL)
	ctx = env.Set(ctx, "DATABRICKS_TOKEN", "...")
	ctx = env.Set(ctx, "DATABRICKS_CLUSTER_ID", "installer-cluster")
	ctx = env.Set(ctx, "DATABRICKS_WAREHOUSE_ID", "installer-warehouse")

	r := testcli.NewRunner(t, ctx, "labs", "install", "blueprint", "--offline=true", "--debug")
	r.RunAndExpectOutput("setting up important infrastructure")
}
func TestInstallerWorksForDevelopment(t *testing.T) {
defer func() {
if !t.Failed() {

View File

@ -307,7 +307,7 @@ func (p *Project) checkUpdates(cmd *cobra.Command) error {
// might not be installed yet
return nil
}
r := github.NewReleaseCache("databrickslabs", p.Name, p.CacheDir())
r := github.NewReleaseCache("databrickslabs", p.Name, p.CacheDir(), false)
versions, err := r.Load(ctx)
if err != nil {
return err