2024-11-29 00:57:48 +00:00
|
|
|
package telemetry
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"encoding/json"
|
|
|
|
"net/http"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/databricks/cli/libs/log"
|
|
|
|
"github.com/google/uuid"
|
|
|
|
)
|
|
|
|
|
|
|
|
// DatabricksApiClient abstracts the Databricks client behind an interface so
// that it can be replaced by a mock in tests.
type DatabricksApiClient interface {
	Do(
		ctx context.Context,
		method, path string,
		headers map[string]string,
		request, response any,
		visitors ...func(*http.Request) error,
	) error
}
|
|
|
|
|
2024-12-30 06:59:18 +00:00
|
|
|
func Log(ctx context.Context, event DatabricksCliLog) {
|
2024-11-29 00:57:48 +00:00
|
|
|
l := fromContext(ctx)
|
|
|
|
|
2024-12-30 06:59:18 +00:00
|
|
|
l.logs = append(l.logs, FrontendLog{
|
2024-11-29 00:57:48 +00:00
|
|
|
// The telemetry endpoint deduplicates logs based on the FrontendLogEventID.
|
|
|
|
// This it's important to generate a unique ID for each log event.
|
|
|
|
FrontendLogEventID: uuid.New().String(),
|
2024-12-30 06:50:14 +00:00
|
|
|
Entry: FrontendLogEntry{
|
|
|
|
DatabricksCliLog: event,
|
|
|
|
},
|
2024-12-30 06:59:18 +00:00
|
|
|
})
|
2024-11-29 00:57:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
type logger struct {
|
2024-12-30 06:59:18 +00:00
|
|
|
logs []FrontendLog
|
2024-11-29 00:57:48 +00:00
|
|
|
}
|
|
|
|
|
2024-12-27 08:12:06 +00:00
|
|
|
// This function is meant to be only to be used in tests to introspect the telemetry logs
|
|
|
|
// that have been logged so far.
|
2024-12-30 06:59:18 +00:00
|
|
|
func GetLogs(ctx context.Context) []FrontendLog {
|
2024-12-27 07:57:24 +00:00
|
|
|
l := fromContext(ctx)
|
2024-12-30 06:59:18 +00:00
|
|
|
return l.logs
|
2024-12-27 07:57:24 +00:00
|
|
|
}
|
|
|
|
|
2024-11-29 00:57:48 +00:00
|
|
|
// MaxAdditionalWaitTime is the maximum additional time to wait for the
// telemetry events to flush. The flush is expected to happen when the CLI
// command is about to exit, so this value caps the extra latency a user
// experiences because of CLI telemetry.
var MaxAdditionalWaitTime = 5 * time.Second
|
2024-11-29 00:57:48 +00:00
|
|
|
|
|
|
|
// We make the API call to the /telemetry-ext endpoint to log the CLI telemetry events
|
|
|
|
// right about as the CLI command is about to exit. The API endpoint can handle
|
|
|
|
// payloads with ~1000 events easily. Thus we log all the events at once instead of
|
2024-11-29 01:02:18 +00:00
|
|
|
// batching the logs across multiple API calls.
|
2024-11-29 00:57:48 +00:00
|
|
|
func Flush(ctx context.Context, apiClient DatabricksApiClient) {
|
|
|
|
// Set a maximum time to wait for the telemetry event to flush.
|
2024-12-20 06:46:22 +00:00
|
|
|
ctx, cancel := context.WithTimeout(ctx, MaxAdditionalWaitTime)
|
|
|
|
defer cancel()
|
2024-11-29 00:57:48 +00:00
|
|
|
l := fromContext(ctx)
|
|
|
|
|
2024-12-30 06:59:18 +00:00
|
|
|
if len(l.logs) == 0 {
|
2024-12-27 06:05:04 +00:00
|
|
|
log.Debugf(ctx, "No telemetry events to flush")
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2024-12-30 06:59:18 +00:00
|
|
|
var protoLogs []string
|
|
|
|
for _, event := range l.logs {
|
|
|
|
s, err := json.Marshal(event)
|
|
|
|
if err != nil {
|
|
|
|
log.Debugf(ctx, "Error marshalling the telemetry event %v: %v", event, err)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
protoLogs = append(protoLogs, string(s))
|
|
|
|
}
|
|
|
|
|
2024-11-29 00:57:48 +00:00
|
|
|
resp := &ResponseBody{}
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
log.Debugf(ctx, "Timed out before flushing telemetry events")
|
|
|
|
return
|
|
|
|
default:
|
|
|
|
// Proceed
|
|
|
|
}
|
|
|
|
|
|
|
|
// Log the CLI telemetry events.
|
|
|
|
err := apiClient.Do(ctx, http.MethodPost, "/telemetry-ext", nil, RequestBody{
|
|
|
|
UploadTime: time.Now().Unix(),
|
2024-12-30 06:59:18 +00:00
|
|
|
ProtoLogs: protoLogs,
|
2024-11-29 00:57:48 +00:00
|
|
|
|
|
|
|
// A bug in the telemetry API requires us to send an empty items array.
|
|
|
|
// Otherwise we get an opaque 500 internal server error.
|
|
|
|
Items: []string{},
|
|
|
|
}, resp)
|
|
|
|
if err != nil {
|
|
|
|
// The SDK automatically performs retries for 429s and 503s. Thus if we
|
|
|
|
// see an error here, do not retry logging the telemetry.
|
|
|
|
log.Debugf(ctx, "Error making the API request to /telemetry-ext: %v", err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
// If not all the logs were successfully sent, we'll retry and log everything
|
|
|
|
// again.
|
|
|
|
//
|
|
|
|
// Note: This will result in server side duplications but that's fine since
|
|
|
|
// we can always deduplicate in the data pipeline itself.
|
2024-12-30 06:59:18 +00:00
|
|
|
if len(l.logs) > int(resp.NumProtoSuccess) {
|
2024-11-29 00:57:48 +00:00
|
|
|
log.Debugf(ctx, "Not all logs were successfully sent. Retrying...")
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// All logs were successfully sent. We can exit the function.
|
|
|
|
log.Debugf(ctx, "Successfully flushed telemetry events")
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|