Fix race condition when restarting continuous jobs (#1849)

## Changes

We don't need to cancel existing runs when the job is continuous and unpaused. The `/jobs/run-now` API will cancel the existing run and trigger a new one automatically. Cancelling the job manually can cause a race condition where both the manual trigger from the CLI and the continuous trigger from the job configuration happen at the same time. This PR prevents that from happening.

## Tests

Unit tests and manual testing.
2024-10-22 20:29:17 +05:30 · 2024-10-22 20:29:17 +05:30 · 3bab21e72e
parent 68d69d6e0b
commit 3bab21e72e
6 changed files with 247 additions and 8 deletions
--- a/bundle/run/job.go
+++ b/bundle/run/job.go
@ -317,6 +317,29 @@ func (r *jobRunner) Cancel(ctx context.Context) error {
 	return errGroup.Wait()
 }

+// Restart starts a new run of the job, first cancelling the active runs when
+// that is required. It returns the output of the new run, or the error from
+// cancelling the existing runs or from Run.
+func (r *jobRunner) Restart(ctx context.Context, opts *Options) (output.RunOutput, error) {
+	// We don't need to cancel existing runs if the job is continuous and unpaused.
+	// The /jobs/run-now API will automatically cancel any existing runs before starting a new one.
+	//
+	// /jobs/run-now will not cancel existing runs if the job is continuous and paused.
+	// New job runs will be queued instead and will wait for existing runs to finish.
+	// In this case, we need to cancel the existing runs before starting a new one.
+	//
+	// JobSettings is a pointer, so guard against nil: a job without settings
+	// takes the cancel-then-run path below instead of panicking here.
+	if settings := r.job.JobSettings; settings != nil {
+		continuous := settings.Continuous
+		if continuous != nil && continuous.PauseStatus == jobs.PauseStatusUnpaused {
+			return r.Run(ctx, opts)
+		}
+	}
+
+	s := cmdio.Spinner(ctx)
+	s <- "Cancelling all active job runs"
+	err := r.Cancel(ctx)
+	// Close the spinner before looking at the error so it is stopped on both paths.
+	close(s)
+	if err != nil {
+		return nil, err
+	}
+
+	return r.Run(ctx, opts)
+}
+
 func (r *jobRunner) ParseArgs(args []string, opts *Options) error {
 	return r.posArgsHandler().ParseArgs(args, opts)
 }
--- a/bundle/run/job_test.go
+++ b/bundle/run/job_test.go
@ -1,6 +1,7 @@
 package run

 import (
+	"bytes"
 	"context"
 	"testing"
 	"time"
@ -8,6 +9,8 @@ import (
 	"github.com/databricks/cli/bundle"
 	"github.com/databricks/cli/bundle/config"
 	"github.com/databricks/cli/bundle/config/resources"
+	"github.com/databricks/cli/libs/cmdio"
+	"github.com/databricks/cli/libs/flags"
 	"github.com/databricks/databricks-sdk-go/experimental/mocks"
 	"github.com/databricks/databricks-sdk-go/service/jobs"
 	"github.com/stretchr/testify/mock"
@ -126,3 +129,132 @@ func TestJobRunnerCancelWithNoActiveRuns(t *testing.T) {
 	err := runner.Cancel(context.Background())
 	require.NoError(t, err)
 }
+
+// TestJobRunnerRestart exercises Restart for jobs that are not continuous and
+// for continuous jobs that are paused. In both cases the mocks expect the two
+// active runs to be cancelled before a new run is triggered.
+func TestJobRunnerRestart(t *testing.T) {
+	settingsCases := []*jobs.JobSettings{
+		{},
+		{
+			Continuous: &jobs.Continuous{
+				PauseStatus: jobs.PauseStatusPaused,
+			},
+		},
+	}
+
+	for _, settings := range settingsCases {
+		job := &resources.Job{
+			ID:          "123",
+			JobSettings: settings,
+		}
+		b := &bundle.Bundle{
+			Config: config.Root{
+				Resources: config.Resources{
+					Jobs: map[string]*resources.Job{
+						"test_job": job,
+					},
+				},
+			},
+		}
+
+		runner := jobRunner{key: "test", bundle: b, job: job}
+
+		// A fresh mock client per case keeps the expectations independent.
+		client := mocks.NewMockWorkspaceClient(t)
+		b.SetWorkpaceClient(client.WorkspaceClient)
+
+		// Restart reports progress through cmdio, so the context needs an IO and a logger.
+		ctx := cmdio.InContext(
+			context.Background(),
+			cmdio.NewIO(flags.OutputText, &bytes.Buffer{}, &bytes.Buffer{}, &bytes.Buffer{}, "", ""),
+		)
+		ctx = cmdio.NewContext(ctx, cmdio.NewLogger(flags.ModeAppend))
+
+		api := client.GetMockJobsAPI()
+
+		// Two active runs are reported for the job.
+		activeRuns := []jobs.BaseRun{
+			{RunId: 1},
+			{RunId: 2},
+		}
+		api.EXPECT().ListRunsAll(mock.Anything, jobs.ListRunsRequest{
+			ActiveOnly: true,
+			JobId:      123,
+		}).Return(activeRuns, nil)
+
+		// Each active run is expected to be cancelled; the waiter returns immediately.
+		cancelWaiter := &jobs.WaitGetRunJobTerminatedOrSkipped[struct{}]{
+			Poll: func(_ time.Duration, _ func(*jobs.Run)) (*jobs.Run, error) {
+				return nil, nil
+			},
+		}
+		api.EXPECT().CancelRun(mock.Anything, jobs.CancelRun{
+			RunId: 1,
+		}).Return(cancelWaiter, nil)
+		api.EXPECT().CancelRun(mock.Anything, jobs.CancelRun{
+			RunId: 2,
+		}).Return(cancelWaiter, nil)
+
+		// The new run is triggered and its waiter reports a successful result.
+		runWaiter := &jobs.WaitGetRunJobTerminatedOrSkipped[jobs.RunNowResponse]{
+			Poll: func(_ time.Duration, _ func(*jobs.Run)) (*jobs.Run, error) {
+				succeeded := &jobs.Run{
+					State: &jobs.RunState{
+						ResultState: jobs.RunResultStateSuccess,
+					},
+				}
+				return succeeded, nil
+			},
+		}
+		api.EXPECT().RunNow(mock.Anything, jobs.RunNow{
+			JobId: 123,
+		}).Return(runWaiter, nil)
+
+		// The runner fetches the run to build its output.
+		api.EXPECT().GetRun(mock.Anything, jobs.GetRunRequest{}).Return(&jobs.Run{}, nil)
+
+		_, err := runner.Restart(ctx, &Options{})
+		require.NoError(t, err)
+	}
+}
+
+// TestJobRunnerRestartForContinuousUnpausedJobs checks that Restart triggers a
+// new run without listing or cancelling existing runs when the job is
+// continuous and unpaused.
+func TestJobRunnerRestartForContinuousUnpausedJobs(t *testing.T) {
+	job := &resources.Job{
+		ID: "123",
+		JobSettings: &jobs.JobSettings{
+			Continuous: &jobs.Continuous{
+				PauseStatus: jobs.PauseStatusUnpaused,
+			},
+		},
+	}
+	b := &bundle.Bundle{
+		Config: config.Root{
+			Resources: config.Resources{
+				Jobs: map[string]*resources.Job{
+					"test_job": job,
+				},
+			},
+		},
+	}
+
+	runner := jobRunner{key: "test", bundle: b, job: job}
+
+	m := mocks.NewMockWorkspaceClient(t)
+	b.SetWorkpaceClient(m.WorkspaceClient)
+	ctx := context.Background()
+	ctx = cmdio.InContext(ctx, cmdio.NewIO(flags.OutputText, &bytes.Buffer{}, &bytes.Buffer{}, &bytes.Buffer{}, "", "..."))
+	ctx = cmdio.NewContext(ctx, cmdio.NewLogger(flags.ModeAppend))
+
+	jobApi := m.GetMockJobsAPI()
+
+	// Mock the runner triggering a job run
+	mockWaitForRun := &jobs.WaitGetRunJobTerminatedOrSkipped[jobs.RunNowResponse]{
+		Poll: func(d time.Duration, f func(*jobs.Run)) (*jobs.Run, error) {
+			return &jobs.Run{
+				State: &jobs.RunState{
+					ResultState: jobs.RunResultStateSuccess,
+				},
+			}, nil
+		},
+	}
+	jobApi.EXPECT().RunNow(mock.Anything, jobs.RunNow{
+		JobId: 123,
+	}).Return(mockWaitForRun, nil)
+
+	// Mock the runner getting the job output
+	jobApi.EXPECT().GetRun(mock.Anything, jobs.GetRunRequest{}).Return(&jobs.Run{}, nil)
+
+	_, err := runner.Restart(ctx, &Options{})
+	require.NoError(t, err)
+
+	// The runner should not try and cancel existing job runs for unpaused continuous jobs.
+	// AssertNotCalled only inspects calls recorded so far, so it must run after Restart;
+	// asserting before the call under test can never fail.
+	jobApi.AssertNotCalled(t, "ListRunsAll")
+	jobApi.AssertNotCalled(t, "CancelRun")
+}
--- a/bundle/run/pipeline.go
+++ b/bundle/run/pipeline.go
@ -183,6 +183,18 @@ func (r *pipelineRunner) Cancel(ctx context.Context) error {
 	return err
 }

+// Restart cancels the pipeline via Cancel while showing a spinner, and then
+// starts a new run via Run. If cancelling fails, that error is returned and
+// no new run is started.
+func (r *pipelineRunner) Restart(ctx context.Context, opts *Options) (output.RunOutput, error) {
+	spinner := cmdio.Spinner(ctx)
+	spinner <- "Cancelling the active pipeline update"
+	cancelErr := r.Cancel(ctx)
+	// The spinner is closed before the error is inspected, so it stops on both paths.
+	close(spinner)
+
+	if cancelErr == nil {
+		return r.Run(ctx, opts)
+	}
+	return nil, cancelErr
+}
+
 func (r *pipelineRunner) ParseArgs(args []string, opts *Options) error {
 	if len(args) == 0 {
 		return nil
--- a/bundle/run/pipeline_test.go
+++ b/bundle/run/pipeline_test.go
@ -1,6 +1,7 @@
 package run

 import (
+	"bytes"
 	"context"
 	"testing"
 	"time"
@ -8,8 +9,12 @@ import (
 	"github.com/databricks/cli/bundle"
 	"github.com/databricks/cli/bundle/config"
 	"github.com/databricks/cli/bundle/config/resources"
+	"github.com/databricks/cli/libs/cmdio"
+	"github.com/databricks/cli/libs/flags"
+	sdk_config "github.com/databricks/databricks-sdk-go/config"
 	"github.com/databricks/databricks-sdk-go/experimental/mocks"
 	"github.com/databricks/databricks-sdk-go/service/pipelines"
+	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
 )

@ -47,3 +52,68 @@ func TestPipelineRunnerCancel(t *testing.T) {
 	err := runner.Cancel(context.Background())
 	require.NoError(t, err)
 }
+
+// TestPipelineRunnerRestart checks that Restart on a pipeline runner succeeds
+// against a mocked pipelines API that expects the pipeline to be stopped and
+// a new update to be started and polled until it completes.
+func TestPipelineRunnerRestart(t *testing.T) {
+	pipeline := &resources.Pipeline{
+		ID: "123",
+	}
+
+	b := &bundle.Bundle{
+		Config: config.Root{
+			Resources: config.Resources{
+				Pipelines: map[string]*resources.Pipeline{
+					"test_pipeline": pipeline,
+				},
+			},
+		},
+	}
+
+	runner := pipelineRunner{key: "test", bundle: b, pipeline: pipeline}
+
+	// The mocked workspace client is given an explicit host in its config.
+	client := mocks.NewMockWorkspaceClient(t)
+	client.WorkspaceClient.Config = &sdk_config.Config{
+		Host: "https://test.com",
+	}
+	b.SetWorkpaceClient(client.WorkspaceClient)
+
+	// Restart reports progress through cmdio, so the context needs an IO and a logger.
+	ctx := cmdio.InContext(
+		context.Background(),
+		cmdio.NewIO(flags.OutputText, &bytes.Buffer{}, &bytes.Buffer{}, &bytes.Buffer{}, "", "..."),
+	)
+	ctx = cmdio.NewContext(ctx, cmdio.NewLogger(flags.ModeAppend))
+
+	api := client.GetMockPipelinesAPI()
+
+	// Stopping the pipeline returns a waiter that finishes immediately.
+	stopWaiter := &pipelines.WaitGetPipelineIdle[struct{}]{
+		Poll: func(time.Duration, func(*pipelines.GetPipelineResponse)) (*pipelines.GetPipelineResponse, error) {
+			return nil, nil
+		},
+	}
+	api.EXPECT().Stop(mock.Anything, pipelines.StopRequest{
+		PipelineId: "123",
+	}).Return(stopWaiter, nil)
+
+	// The pipeline is expected to be looked up by its ID.
+	api.EXPECT().GetByPipelineId(mock.Anything, "123").Return(&pipelines.GetPipelineResponse{}, nil)
+
+	// A new update is started and is assigned ID "456".
+	startResponse := &pipelines.StartUpdateResponse{
+		UpdateId: "456",
+	}
+	api.EXPECT().StartUpdate(mock.Anything, pipelines.StartUpdate{
+		PipelineId: "123",
+	}).Return(startResponse, nil)
+
+	// Events for the new update are requested; none are returned.
+	api.EXPECT().ListPipelineEventsAll(mock.Anything, pipelines.ListPipelineEventsRequest{
+		Filter:     `update_id = '456'`,
+		MaxResults: 100,
+		PipelineId: "123",
+	}).Return([]pipelines.PipelineEvent{}, nil)
+
+	// The update status is reported as completed.
+	completed := &pipelines.GetUpdateResponse{
+		Update: &pipelines.UpdateInfo{
+			State: pipelines.UpdateInfoStateCompleted,
+		},
+	}
+	api.EXPECT().GetUpdateByPipelineIdAndUpdateId(mock.Anything, "123", "456").
+		Return(completed, nil)
+
+	_, err := runner.Restart(ctx, &Options{})
+	require.NoError(t, err)
+}
--- a/bundle/run/runner.go
+++ b/bundle/run/runner.go
@ -27,6 +27,10 @@ type Runner interface {
 	// Run the underlying worklow.
 	Run(ctx context.Context, opts *Options) (output.RunOutput, error)

+	// Restart the underlying workflow by cancelling any existing runs before
+	// starting a new one.
+	Restart(ctx context.Context, opts *Options) (output.RunOutput, error)
+
 	// Cancel the underlying workflow.
 	Cancel(ctx context.Context) error

--- a/cmd/bundle/run.go
+++ b/cmd/bundle/run.go
@ -8,6 +8,7 @@ import (
 	"github.com/databricks/cli/bundle/deploy/terraform"
 	"github.com/databricks/cli/bundle/phases"
 	"github.com/databricks/cli/bundle/run"
+	"github.com/databricks/cli/bundle/run/output"
 	"github.com/databricks/cli/cmd/bundle/utils"
 	"github.com/databricks/cli/cmd/root"
 	"github.com/databricks/cli/libs/cmdio"
@ -100,19 +101,16 @@ task or a Python wheel task, the second example applies.
 		}

 		runOptions.NoWait = noWait
+		var output output.RunOutput
 		if restart {
-			s := cmdio.Spinner(ctx)
-			s <- "Cancelling all runs"
-			err := runner.Cancel(ctx)
-			close(s)
-			if err != nil {
-				return err
-			}
+			output, err = runner.Restart(ctx, &runOptions)
+		} else {
+			output, err = runner.Run(ctx, &runOptions)
 		}
-		output, err := runner.Run(ctx, &runOptions)
 		if err != nil {
 			return err
 		}
+
 		if output != nil {
 			switch root.OutputType(cmd) {
 			case flags.OutputText: