From fd271d33f86ca968d5ed78cc80138804d64115a3 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Fri, 28 Feb 2025 11:39:01 +0100 Subject: [PATCH] add max retries --- libs/telemetry/upload.go | 17 +++++++++++++++++ libs/telemetry/upload_test.go | 22 ++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/libs/telemetry/upload.go b/libs/telemetry/upload.go index 75ff3daa4..e6afc6c22 100644 --- a/libs/telemetry/upload.go +++ b/libs/telemetry/upload.go @@ -83,6 +83,9 @@ func Upload(ctx context.Context) (*ResponseBody, error) { ctx, cancel := context.WithTimeout(ctx, maxUploadTime) defer cancel() + maxRetries := 3 + count := 0 + resp := &ResponseBody{} for { select { @@ -93,6 +96,8 @@ func Upload(ctx context.Context) (*ResponseBody, error) { // Proceed } + count++ + // Log the CLI telemetry events. err := apiClient.Do(ctx, http.MethodPost, "/telemetry-ext", nil, nil, RequestBody{ UploadTime: time.Now().UnixMilli(), @@ -110,5 +115,17 @@ func Upload(ctx context.Context) (*ResponseBody, error) { if resp.NumProtoSuccess == int64(len(in.Logs)) { return resp, nil } + + // We retry if the logs were partially uploaded. Subsequent retries have + // a chance of uploading all logs successfully. However we limit the number + // of retries to avoid excessive load on the telemetry endpoint. + if count > maxRetries { + return nil, fmt.Errorf("Failed to upload all telemetry logs after 4 tries. Only %d/%d logs uploaded", resp.NumProtoSuccess, len(in.Logs)) + } + + // Add a delay of 1 second before retrying. We avoid retrying immediately + // to avoid overwhelming the telemetry endpoint. + time.Sleep(1 * time.Second) + } } diff --git a/libs/telemetry/upload_test.go b/libs/telemetry/upload_test.go index ec6e8fe37..aefc626b1 100644 --- a/libs/telemetry/upload_test.go +++ b/libs/telemetry/upload_test.go @@ -102,3 +102,25 @@ func TestTelemetryUploadCanceled(t *testing.T) { // with a timeout error. assert.ErrorContains(t, err, "Failed to flush telemetry log due to timeout") } + +func TestTelemetryUploadMaxRetries(t *testing.T) { + server := testserver.New(t) + t.Cleanup(server.Close) + count := 0 + + server.Handle("POST", "/telemetry-ext", func(req testserver.Request) any { + count++ + return ResponseBody{ + NumProtoSuccess: 1, + } + }) + + t.Setenv("DATABRICKS_HOST", server.URL) + t.Setenv("DATABRICKS_TOKEN", "token") + + configureStdin(t) + + _, err := Upload(context.Background()) + assert.EqualError(t, err, "Failed to upload all telemetry logs after 4 tries. Only 1/2 logs uploaded") + assert.Equal(t, 4, count) +}