databricks-cli/lib/flavor/py/setup_py.go

308 lines
8.7 KiB
Go
Raw Normal View History

2022-10-31 11:09:52 +00:00
package py
import (
"context"
"encoding/json"
"errors"
"fmt"
"io/fs"
"os"
"path/filepath"
"strings"
"golang.org/x/exp/slices"
"github.com/databricks/bricks/lib/dbr"
"github.com/databricks/bricks/lib/flavor"
"github.com/databricks/bricks/lib/spawn"
"github.com/databricks/bricks/python"
"github.com/databricks/databricks-sdk-go/databricks/apierr"
"github.com/databricks/databricks-sdk-go/service/commands"
"github.com/databricks/databricks-sdk-go/service/libraries"
)
// SetupDotPy is the flavor for Python projects that are packaged through a
// setuptools setup.py script.
type SetupDotPy struct {
	// SetupPy is the location of the setup script relative to the project
	// root. setupPyLoc fills in "setup.py" when it is left empty.
	SetupPy string `json:"setup_py,omitempty"`

	// MirrorLibraries makes Prepare also install, into the local virtualenv,
	// the PyPI libraries reported for the development cluster.
	MirrorLibraries bool `json:"mirror_libraries,omitempty"`

	// venv is the virtualenv directory. It is set by Prepare and required
	// by Py.
	venv string

	// wheelName is the wheel file name. It is set by LocalArtifacts and shown
	// by Build in its status message.
	wheelName string
}
// RequiresCluster always reports true for the setup.py flavor.
func (s *SetupDotPy) RequiresCluster() bool { return true }
// RequiresRestart always reports false: Python libraries do not require
// a restart.
func (s *SetupDotPy) RequiresRestart() bool { return false }
// setupPyLoc returns the setup script path joined onto the project root.
//
// As a side effect, an empty SetupPy is replaced with the default "setup.py"
// and stays that way on the receiver.
func (s *SetupDotPy) setupPyLoc(prj flavor.Project) string {
	if len(s.SetupPy) == 0 {
		s.SetupPy = "setup.py"
	}
	root := prj.Root()
	return filepath.Join(root, s.SetupPy)
}
// Detected reports whether the setup script of the project can be stat'ed.
//
// Only the setuptools build backend is detected for now. Hatchling, PDM,
// and Flit _might_ be added in some distant future.
//
// See: https://packaging.python.org/en/latest/tutorials/packaging-projects/
func (s *SetupDotPy) Detected(prj flavor.Project) bool {
	if _, err := os.Stat(s.setupPyLoc(prj)); err != nil {
		// any stat failure, not only a missing file, counts as "not detected"
		return false
	}
	return true
}
// readDistribution "parses" the metadata of the setup script by running it
// with the Python interpreter, from the directory that holds the script.
//
// The Python snippet replaces setuptools.setup with a function that records
// its keyword arguments, imports the `setup` module so that the script calls
// the replacement, and prints the recorded arguments as JSON on stdout. That
// JSON is decoded into the returned Distribution.
//
// NOTE(review): the snippet always imports a module named `setup`, whatever
// file name SetupPy holds - confirm this is intended.
func (s *SetupDotPy) readDistribution(ctx context.Context, prj flavor.Project) (d Distribution, err error) {
	script := commands.TrimLeadingWhitespace(`
import setuptools, json, sys
setup_config = {} # actual args for setuptools.dist.Distribution
def capture(**kwargs): global setup_config; setup_config = kwargs
setuptools.setup = capture
import setup
json.dump(setup_config, sys.stdout)`)
	scriptDir := filepath.Dir(s.setupPyLoc(prj))
	ctx = spawn.WithRoot(ctx, scriptDir)
	raw, err := python.Py(ctx, "-c", script)
	if err != nil {
		return d, err
	}
	err = json.Unmarshal([]byte(raw), &d)
	return d, err
}
// Prepare makes sure the project has a virtualenv and installs into it the
// dependencies that `pip freeze` does not list yet.
//
// Steps, in order:
//  1. reuse a virtualenv found in a top-level directory of the project root,
//     or create one in <root>/.databricks and upgrade pip and wheel in it;
//  2. when MirrorLibraries is set and a development cluster is available,
//     collect the PyPI libraries reported for that cluster;
//  3. install the collected remote libraries, install_requires and
//     tests_require, skipping everything that is already installed.
//
// Progress messages are reported through status. The first failing step
// aborts the preparation and its error is returned; remote libraries for
// which pip cannot find a version are skipped rather than treated as errors.
func (s *SetupDotPy) Prepare(ctx context.Context, prj flavor.Project, status func(string)) error {
	venv, err := s.fastDetectVirtualEnv(prj.Root())
	if err != nil {
		return err
	}
	s.venv = venv
	if s.venv == "" {
		// nothing was detected, so create a project-local virtualenv; a
		// detected one is reused as is, which keeps the CLI usable in
		// existing projects with existing virtualenvs
		venv = filepath.Join(prj.Root(), ".databricks") // TODO: integrate with pipenv
		if err := os.MkdirAll(venv, 0o700); err != nil {
			return fmt.Errorf("mk venv: %w", err)
		}
		status(fmt.Sprintf("Creating virtualenv in %s", venv))
		if _, err := python.Py(ctx, "-m", "venv", venv); err != nil {
			return fmt.Errorf("create venv: %w", err)
		}
		// Pip runs through Py, which refuses to work until s.venv is set
		s.venv = venv
		status("Upgrading pip")
		if _, err := s.Pip(ctx, "install", "--upgrade", "pip", "wheel"); err != nil {
			return fmt.Errorf("upgrade pip: %w", err)
		}
	}
	env, err := s.Freeze(ctx)
	if err != nil {
		return fmt.Errorf("pip freeze: %w", err)
	}
	d, err := s.readDistribution(ctx, prj)
	if err != nil {
		return fmt.Errorf("setup.py: %w", err)
	}
	var remotePkgs []string
	if s.MirrorLibraries {
		// TODO: name `MirrorLibraries` is TBD
		// TODO: must be part of init command survey
		status("Fetching remote libraries")
		remoteInfo, err := s.runtimeInfo(ctx, prj, status)
		// a missing development cluster only disables mirroring; every other
		// failure is returned, because remoteInfo is not usable in that case
		if err != nil && !errors.Is(err, errNoCluster) {
			return err
		}
		// TODO: check Spark compatibility with local install_requires
		// TODO: check Python version compatibility with local virtualenv
		if err == nil && remoteInfo != nil {
			// packages that are never mirrored from the cluster
			// NOTE(review): presumably these are provided by the cluster
			// image and are not installable through pip - confirm
			skipLibs := []string{
				"dbus-python",
				"distro-info",
				"pip",
				"psycopg2",
				"pygobject",
				"python-apt",
				"requests-unixsocket",
				"setuptools",
				"unattended-upgrades",
				"wheel",
			}
		PYPI:
			for _, pkg := range remoteInfo.PyPI {
				if env.Has(pkg.PyPiName()) {
					continue
				}
				if pkg.Name == d.NormalizedName() {
					// skip installing self
					continue
				}
				for _, skip := range skipLibs {
					if skip == pkg.Name {
						continue PYPI
					}
				}
				remotePkgs = append(remotePkgs, pkg.PyPiName())
			}
		}
	}
	type depList struct {
		name     string
		packages []string
	}
	dbrDepsName := "remote cluster"
	dependencyLists := []depList{
		{dbrDepsName, remotePkgs},
		{"install_requires", d.InstallRequires},
		{"tests_require", d.TestsRequire},
	}
	for _, deps := range dependencyLists {
		for _, dep := range deps.packages {
			if env.Has(dep) {
				continue
			}
			status(fmt.Sprintf("Installing %s in virtualenv (%s)", dep, deps.name))
			_, err = s.Pip(ctx, "install", "--prefer-binary", dep)
			if err != nil && deps.name == dbrDepsName &&
				strings.Contains(err.Error(), "Could not find a version") {
				// mirroring is best effort: a cluster library that pip cannot
				// resolve locally is skipped
				continue
			}
			if err != nil {
				return fmt.Errorf("%s: %w", dep, err)
			}
			// repeatedly run pip freeze so that we potentially have less installs
			env, err = s.Freeze(ctx)
			if err != nil {
				return fmt.Errorf("pip freeze: %w", err)
			}
		}
	}
	return nil
}
// errNoCluster is returned by runtimeInfo when the development cluster ID
// cannot be resolved for a reason other than an API error.
var errNoCluster = errors.New("no development cluster")

// runtimeInfo resolves the development cluster of the project and fetches
// the runtime information for it through dbr.GetRuntimeInfo.
//
// An API error from the cluster lookup is returned unchanged; any other
// lookup failure is reported as errNoCluster.
func (s *SetupDotPy) runtimeInfo(ctx context.Context, prj flavor.Project,
	status func(string)) (*dbr.RuntimeInfo, error) {
	clusterID, err := prj.GetDevelopmentClusterId(ctx)
	switch {
	case err == nil:
		return dbr.GetRuntimeInfo(ctx, prj.WorkspacesClient(), clusterID, status)
	case errors.As(err, &apierr.APIError{}):
		return nil, err
	default:
		return nil, errNoCluster
	}
}
// Freeze runs `pip freeze` in the virtualenv and returns one dependency per
// non-blank output line, in output order.
//
// Lines are trimmed of surrounding whitespace (including a trailing "\r")
// before they are parsed, and blank lines are skipped, so an empty
// `pip freeze` output yields an empty Environment. On a pip failure the
// returned Environment is nil.
func (s *SetupDotPy) Freeze(ctx context.Context) (Environment, error) {
	out, err := s.Pip(ctx, "freeze")
	if err != nil {
		return nil, err
	}
	lines := strings.Split(out, "\n")
	env := make(Environment, 0, len(lines))
	for _, line := range lines {
		spec := strings.TrimSpace(line)
		if spec == "" {
			// strings.Split returns one empty element for empty output
			continue
		}
		env = append(env, DependencyFromSpec(spec))
	}
	return env, nil
}
// LocalArtifacts lists the libraries of the project: one PyPI library per
// install_requires entry, followed by the wheel of the project itself,
// located in <root>/.databricks/dist.
//
// Entries that start with "pyspark" are left out. As a side effect, the
// wheel name read from the setup script is remembered in s.wheelName.
func (s *SetupDotPy) LocalArtifacts(ctx context.Context, prj flavor.Project) (flavor.Artifacts, error) {
	dist, err := s.readDistribution(ctx, prj)
	if err != nil {
		return nil, err
	}
	artifacts := flavor.Artifacts{}
	// install dependencies for the wheel to run
	for _, requirement := range dist.InstallRequires {
		// pyspark will conflict with DBR
		if strings.HasPrefix(requirement, "pyspark") {
			continue
		}
		pypi := &libraries.PythonPyPiLibrary{Package: requirement}
		artifacts = append(artifacts, flavor.Artifact{
			Flavor:  s,
			Library: libraries.Library{Pypi: pypi},
		})
	}
	s.wheelName = dist.WheelName()
	wheel := fmt.Sprintf("%s/.databricks/dist/%s", prj.Root(), s.wheelName)
	artifacts = append(artifacts, flavor.Artifact{
		Flavor:  s,
		Library: libraries.Library{Whl: wheel},
	})
	return artifacts, nil
}
// Build creates a python wheel with the virtualenv interpreter, running from
// the directory of the setup script. All intermediate and final outputs are
// redirected below .databricks, which keeps the project root in a clean
// state and removes the need to execute rm -fr dist build *.egg-info after
// each build.
//
// NOTE(review): the script name passed to the interpreter is always
// "setup.py", whatever file name SetupPy holds - confirm this is intended.
func (s *SetupDotPy) Build(ctx context.Context, prj flavor.Project, status func(string)) error {
	status("Building " + s.wheelName)
	args := []string{
		"setup.py",
		// see https://github.com/pypa/setuptools/blob/main/setuptools/_distutils/command/build.py#L23-L31
		"build", "--build-lib=.databricks/build/lib", "--build-base=.databricks/build",
		// see https://github.com/pypa/setuptools/blob/main/setuptools/command/egg_info.py#L167-L168
		"egg_info", "--egg-base=.databricks",
		// see https://github.com/pypa/wheel/blob/main/src/wheel/bdist_wheel.py#L140
		"bdist_wheel", "--dist-dir=.databricks/dist",
	}
	scriptDir := filepath.Dir(s.setupPyLoc(prj))
	_, err := s.Py(spawn.WithRoot(ctx, scriptDir), args...)
	return err
}
// Py runs the Python interpreter of the project virtualenv
// (<venv>/bin/python3) with the given arguments and returns its output with
// leading and trailing line breaks removed.
//
// It fails with "virtualenv not detected" while no virtualenv is known.
func (s *SetupDotPy) Py(ctx context.Context, args ...string) (string, error) {
	if s.venv == "" {
		return "", errors.New("virtualenv not detected")
	}
	interpreter := s.venv + "/bin/python3"
	out, err := spawn.ExecAndPassErr(ctx, interpreter, args...)
	if err == nil {
		return strings.Trim(string(out), "\n\r"), nil
	}
	// current error message chain is longer:
	// failed to call {pyExec} __non_existing__.py: {pyExec}: can't open
	// ... file '{pwd}/__non_existing__.py': [Errno 2] No such file or directory"
	// probably we'll need to make it shorter:
	// can't open file '$PWD/__non_existing__.py': [Errno 2] No such file or directory
	return "", err
}
// Pip runs `python -m pip` with the given arguments through Py and returns
// whatever Py returns.
func (s *SetupDotPy) Pip(ctx context.Context, args ...string) (string, error) {
	pipArgs := make([]string, 0, len(args)+2)
	pipArgs = append(pipArgs, "-m", "pip")
	pipArgs = append(pipArgs, args...)
	return s.Py(ctx, pipArgs...)
}
// fastDetectVirtualEnv performs very quick detection, by running over top
// level directories of root only. The first directory, in name order, that
// contains a pyvenv.cfg file is returned as the virtualenv location.
//
// It returns an empty path and a nil error when no such directory exists.
// Failures to list root, or to stat a candidate for any reason other than
// the file not existing, are returned as errors.
func (s *SetupDotPy) fastDetectVirtualEnv(root string) (string, error) {
	wdf, err := os.Open(root)
	if err != nil {
		return "", err
	}
	// the handle is only needed for the listing; release it on every path
	defer wdf.Close()
	files, err := wdf.ReadDir(0)
	if err != nil {
		return "", err
	}
	// virtual env is most likely in dot-directory
	slices.SortFunc(files, func(a, b fs.DirEntry) bool {
		return a.Name() < b.Name()
	})
	for _, v := range files {
		if !v.IsDir() {
			continue
		}
		candidate := filepath.Join(root, v.Name())
		_, err = os.Stat(filepath.Join(candidate, "pyvenv.cfg"))
		if errors.Is(err, os.ErrNotExist) {
			continue
		}
		if err != nil {
			return "", err
		}
		return candidate, nil
	}
	return "", nil
}