2025-03-05 13:54:30 +00:00
|
|
|
package telemetry
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2025-03-05 14:21:12 +00:00
|
|
|
"encoding/json"
|
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
"net/http"
|
|
|
|
"time"
|
2025-03-05 13:54:30 +00:00
|
|
|
|
2025-03-06 13:14:16 +00:00
|
|
|
"github.com/databricks/cli/libs/env"
|
2025-03-05 14:21:12 +00:00
|
|
|
"github.com/databricks/cli/libs/log"
|
2025-03-05 13:54:30 +00:00
|
|
|
"github.com/databricks/cli/libs/telemetry/protos"
|
2025-03-05 14:21:12 +00:00
|
|
|
"github.com/databricks/databricks-sdk-go/apierr"
|
|
|
|
"github.com/databricks/databricks-sdk-go/client"
|
|
|
|
"github.com/databricks/databricks-sdk-go/config"
|
2025-03-05 13:54:30 +00:00
|
|
|
"github.com/google/uuid"
|
|
|
|
)
|
|
|
|
|
2025-03-05 14:57:41 +00:00
|
|
|
// Environment variable to disable telemetry. If this is set to any value, telemetry
|
|
|
|
// will be disabled.
|
2025-03-06 13:14:16 +00:00
|
|
|
const disableEnvVar = "DATABRICKS_CLI_DISABLE_TELEMETRY"
|
2025-03-05 14:57:41 +00:00
|
|
|
|
2025-03-05 13:54:30 +00:00
|
|
|
func Log(ctx context.Context, event protos.DatabricksCliLog) {
|
|
|
|
fromContext(ctx).log(event)
|
|
|
|
}
|
|
|
|
|
|
|
|
func HasLogs(ctx context.Context) bool {
|
2025-03-06 13:14:16 +00:00
|
|
|
return len(fromContext(ctx).logs) > 0
|
2025-03-05 13:54:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
type logger struct {
|
|
|
|
logs []protos.FrontendLog
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *logger) log(event protos.DatabricksCliLog) {
|
|
|
|
if l.logs == nil {
|
|
|
|
l.logs = make([]protos.FrontendLog, 0)
|
|
|
|
}
|
|
|
|
l.logs = append(l.logs, protos.FrontendLog{
|
|
|
|
FrontendLogEventID: uuid.New().String(),
|
|
|
|
Entry: protos.FrontendLogEntry{
|
|
|
|
DatabricksCliLog: event,
|
|
|
|
},
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2025-03-06 10:36:21 +00:00
|
|
|
const (
|
|
|
|
uploadTimeout = 3 * time.Second
|
|
|
|
waitBetweenRetries = 200 * time.Millisecond
|
|
|
|
)
|
|
|
|
|
2025-03-06 13:14:16 +00:00
|
|
|
func Upload(ctx context.Context, cfg *config.Config, ec protos.ExecutionContext) error {
|
2025-03-05 14:21:12 +00:00
|
|
|
l := fromContext(ctx)
|
|
|
|
if len(l.logs) == 0 {
|
2025-03-06 13:14:16 +00:00
|
|
|
log.Debugf(ctx, "no telemetry logs to upload")
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Telemetry is disabled. We don't upload logs.
|
|
|
|
if env.Get(ctx, disableEnvVar) != "" {
|
|
|
|
log.Debugf(ctx, "telemetry upload is disabled. Not uploading any logs.")
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set the execution context for all logs.
|
|
|
|
for i := range l.logs {
|
|
|
|
l.logs[i].Entry.DatabricksCliLog.ExecutionContext = &ec
|
2025-03-05 14:21:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
protoLogs := make([]string, len(l.logs))
|
|
|
|
for i, log := range l.logs {
|
|
|
|
b, err := json.Marshal(log)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to marshal log: %s", err)
|
|
|
|
}
|
|
|
|
protoLogs[i] = string(b)
|
|
|
|
}
|
|
|
|
|
|
|
|
apiClient, err := client.New(cfg)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to create API client: %w", err)
|
|
|
|
}
|
|
|
|
|
2025-03-06 10:36:21 +00:00
|
|
|
ctx, cancel := context.WithTimeout(ctx, uploadTimeout)
|
2025-03-05 15:27:26 +00:00
|
|
|
defer cancel()
|
2025-03-05 14:21:12 +00:00
|
|
|
|
|
|
|
// Only try uploading logs for a maximum of 3 times.
|
|
|
|
for i := range 3 {
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
return fmt.Errorf("uploading telemetry logs timed out: %w", ctx.Err())
|
|
|
|
default:
|
|
|
|
// proceed
|
|
|
|
}
|
|
|
|
|
2025-03-05 15:27:26 +00:00
|
|
|
resp, err := attempt(ctx, apiClient, protoLogs)
|
2025-03-05 14:21:12 +00:00
|
|
|
|
|
|
|
// All logs were uploaded successfully.
|
|
|
|
if err == nil && resp.NumProtoSuccess >= int64(len(protoLogs)) {
|
2025-03-05 15:21:14 +00:00
|
|
|
log.Debugf(ctx, "All %d logs uploaded successfully", len(protoLogs))
|
2025-03-05 14:21:12 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Partial success. Retry.
|
|
|
|
if err == nil && resp.NumProtoSuccess < int64(len(protoLogs)) {
|
2025-03-06 13:31:13 +00:00
|
|
|
log.Debugf(ctx, "Attempt %d was a partial success. Number of logs uploaded: %d out of %d", i+1, resp.NumProtoSuccess, len(protoLogs))
|
2025-03-06 10:36:21 +00:00
|
|
|
time.Sleep(waitBetweenRetries)
|
2025-03-05 14:21:12 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2025-03-06 13:31:13 +00:00
|
|
|
// Do not retry if the context deadline was exceeded. This means that our
|
|
|
|
// timeout of three seconds was triggered and we should not try again.
|
|
|
|
if errors.Is(err, context.DeadlineExceeded) {
|
|
|
|
log.Debugf(ctx, "Attempt %d failed due to a timeout. Will not retry", i+1)
|
|
|
|
return fmt.Errorf("uploading telemetry logs timed out: %w", err)
|
|
|
|
}
|
|
|
|
|
2025-03-06 10:04:52 +00:00
|
|
|
// We retry for all 5xx responses. Note that the SDK only retries for 503 and 429
|
|
|
|
// (as of 6th March 2025) so we need some additional logic here to retry for other
|
|
|
|
// 5xx responses.
|
|
|
|
// SDK ref: https://github.com/databricks/databricks-sdk-go/blob/cdb28002afacb8b762348534a4c4040a9f19c24b/apierr/errors.go#L91
|
|
|
|
//
|
|
|
|
// The UI infra team (who owns the /telemetry-ext API) recommends retrying for
|
|
|
|
// all 5xx responses.
|
2025-03-05 14:21:12 +00:00
|
|
|
var apiErr *apierr.APIError
|
2025-03-06 10:04:52 +00:00
|
|
|
if errors.As(err, &apiErr) && apiErr.StatusCode >= 500 {
|
2025-03-06 13:31:13 +00:00
|
|
|
log.Debugf(ctx, "Attempt %d failed due to a server side error. Retrying status code: %d", i+1, apiErr.StatusCode)
|
2025-03-06 10:36:21 +00:00
|
|
|
time.Sleep(waitBetweenRetries)
|
2025-03-05 14:21:12 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return errors.New("failed to upload telemetry logs after three attempts")
|
|
|
|
}
|
|
|
|
|
|
|
|
func attempt(ctx context.Context, apiClient *client.DatabricksClient, protoLogs []string) (*ResponseBody, error) {
|
|
|
|
resp := &ResponseBody{}
|
|
|
|
err := apiClient.Do(ctx, http.MethodPost, "/telemetry-ext", nil, nil, RequestBody{
|
|
|
|
UploadTime: time.Now().UnixMilli(),
|
|
|
|
// There is a bug in the `/telemetry-ext` API which requires us to
|
|
|
|
// send an empty array for the `Items` field. Otherwise the API returns
|
|
|
|
// a 500.
|
|
|
|
Items: []string{},
|
|
|
|
ProtoLogs: protoLogs,
|
|
|
|
}, resp)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(resp.Errors) > 0 {
|
|
|
|
return nil, fmt.Errorf("uploading telemetry failed: %v", resp.Errors)
|
|
|
|
}
|
|
|
|
|
|
|
|
return resp, nil
|
|
|
|
}
|