From 06b71fe5da2785244719ebf76695eb1469552ba0 Mon Sep 17 00:00:00 2001 From: shreyas-goenka <88374338+shreyas-goenka@users.noreply.github.com> Date: Wed, 12 Mar 2025 18:35:10 +0530 Subject: [PATCH] Add synchronous logger for telemetry (#2432) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Changes This PR adds a synchronous telemetry logger for the CLI with a max timeout of 3 seconds. Due to the 3-second timeout configuration, this is only meant to be used in long-running commands. This is a short-term solution. Eventually, we'd like to transition to a daemon process to upload the telemetry logs to amortise the costs of configuring authentication and maintaining a warm pool of HTTP connections, as well as a better UX for the end user. Note that users can set the `DATABRICKS_CLI_DISABLE_TELEMETRY` environment variable to disable telemetry collection. ## Why To collect telemetry, which was previously inaccessible to us, and answer questions like which templates customers like to use and which DABs features would be safe to deprecate. ## Tests Unit and acceptance tests. Also manually verified that the telemetry upload works: ``` (artifact-playground) ➜ cli git:(sync-logger) cli selftest send-telemetry --debug 15:58:20 Info: start pid=40386 version=0.0.0-dev+a2825ca89a23 args="cli, selftest, send-telemetry, --debug" 15:58:20 Debug: Loading DEFAULT profile from /Users/shreyas.goenka/.databrickscfg pid=40386 sdk=true 15:58:20 Info: completed execution pid=40386 exit_code=0 15:58:21 Debug: POST /telemetry-ext > { > "items": null, > "protoLogs": [ > "{\"frontend_log_event_id\":\"82d29b3a-d5ff-48f3-8a21-dae6e08d2999\",\"entry\":{\"databricks_cli_log\":{\"... (232 more bytes)", > "{\"frontend_log_event_id\":\"d6be8220-7db8-45d9-97d6-4c09c25e2664\",\"entry\":{\"databricks_cli_log\":{\"... (232 more bytes)" > ], > "uploadTime": 1741186700967 > } < HTTP/2.0 200 OK < { < "errors": null, < "numProtoSuccess": 2, < "numSuccess": 0 < } pid=40386 sdk=true ``` --- acceptance/acceptance_test.go | 4 + acceptance/bundle/debug/out.stderr.txt | 1 + acceptance/config_test.go | 3 + acceptance/telemetry/failure/out.requests.txt | 51 +++++ acceptance/telemetry/failure/output.txt | 53 +++++ acceptance/telemetry/failure/script | 1 + acceptance/telemetry/failure/test.toml | 9 + .../partial-success/out.requests.txt | 51 +++++ .../telemetry/partial-success/output.txt | 50 +++++ acceptance/telemetry/partial-success/script | 1 + .../telemetry/partial-success/test.toml | 8 + acceptance/telemetry/skipped/output.txt | 5 + acceptance/telemetry/skipped/script | 3 + acceptance/telemetry/success/out.requests.txt | 17 ++ acceptance/telemetry/success/output.txt | 19 ++ acceptance/telemetry/success/script | 1 + acceptance/telemetry/test.toml | 38 ++++ acceptance/telemetry/timeout/out.requests.txt | 17 ++ acceptance/telemetry/timeout/output.txt | 17 ++ acceptance/telemetry/timeout/script | 4 + acceptance/telemetry/timeout/test.toml | 9 + cmd/root/root.go | 34 +++- cmd/selftest/selftest.go | 1 + cmd/selftest/send_telemetry.go | 29 +++ libs/telemetry/context.go | 25 +++ libs/telemetry/logger.go | 188 ++++++++++++++++++ libs/telemetry/logger_test.go | 155 +++++++++++++++ 27 files changed, 791 insertions(+), 3 deletions(-) create mode 100644 acceptance/telemetry/failure/out.requests.txt create mode 100644 acceptance/telemetry/failure/output.txt create mode 100644 acceptance/telemetry/failure/script create mode 100644 acceptance/telemetry/failure/test.toml create mode 100644 acceptance/telemetry/partial-success/out.requests.txt create mode 100644 acceptance/telemetry/partial-success/output.txt create mode 100644 acceptance/telemetry/partial-success/script create mode 100644 acceptance/telemetry/partial-success/test.toml create mode 100644 acceptance/telemetry/skipped/output.txt create mode 100644 acceptance/telemetry/skipped/script create mode 100644 acceptance/telemetry/success/out.requests.txt create mode 100644 acceptance/telemetry/success/output.txt create mode 100644 acceptance/telemetry/success/script create mode 100644 acceptance/telemetry/test.toml create mode 100644 acceptance/telemetry/timeout/out.requests.txt create mode 100644 acceptance/telemetry/timeout/output.txt create mode 100644 acceptance/telemetry/timeout/script create mode 100644 acceptance/telemetry/timeout/test.toml create mode 100644 cmd/selftest/send_telemetry.go create mode 100644 libs/telemetry/context.go create mode 100644 libs/telemetry/logger.go create mode 100644 libs/telemetry/logger_test.go diff --git a/acceptance/acceptance_test.go b/acceptance/acceptance_test.go index 4a38e6f13..489abb09b 100644 --- a/acceptance/acceptance_test.go +++ b/acceptance/acceptance_test.go @@ -338,6 +338,10 @@ func runTest(t *testing.T, dir, coverDir string, repls testdiff.ReplacementsCont items := strings.Split(stub.Pattern, " ") require.Len(t, items, 2) server.Handle(items[0], items[1], func(req testserver.Request) any { + if stub.DelaySeconds != nil { + time.Sleep(time.Duration((*stub.DelaySeconds) * float64(time.Second))) + } + return stub.Response }) } diff --git a/acceptance/bundle/debug/out.stderr.txt b/acceptance/bundle/debug/out.stderr.txt index b9ac5c4d9..631f51990 100644 --- a/acceptance/bundle/debug/out.stderr.txt +++ b/acceptance/bundle/debug/out.stderr.txt @@ -99,3 +99,4 @@ < } pid=12345 mutator=validate:files_to_sync sdk=true 10:07:59 Debug: Path /Workspace/Users/[USERNAME]/.bundle/debug/default/files has type directory (ID: 0) pid=12345 mutator=validate:files_to_sync 10:07:59 Info: completed execution pid=12345 exit_code=0 +10:07:59 Debug: no telemetry logs to upload pid=12345 diff --git a/acceptance/config_test.go b/acceptance/config_test.go index e61d91ad3..4b73082f5 100644 --- a/acceptance/config_test.go +++ b/acceptance/config_test.go @@ -74,6 +74,9 @@ type ServerStub struct { // The response body to return. Response testserver.Response + + // Artificial delay in seconds to simulate slow responses. + DelaySeconds *float64 } // FindConfigs finds all the config relevant for this test, diff --git a/acceptance/telemetry/failure/out.requests.txt b/acceptance/telemetry/failure/out.requests.txt new file mode 100644 index 000000000..95b21f5c3 --- /dev/null +++ b/acceptance/telemetry/failure/out.requests.txt @@ -0,0 +1,51 @@ +{ + "headers": { + "Authorization": [ + "Bearer [DATABRICKS_TOKEN]" + ] + }, + "method": "POST", + "path": "/telemetry-ext", + "body": { + "uploadTime": "UNIX_TIME_MILLIS", + "items": [], + "protoLogs": [ + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}", + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}" + ] + } +} +{ + "headers": { + "Authorization": [ + "Bearer [DATABRICKS_TOKEN]" + ] + }, + "method": "POST", + "path": "/telemetry-ext", + "body": { + "uploadTime": "UNIX_TIME_MILLIS", + "items": [], + "protoLogs": [ + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}", + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}" + ] + } +} +{ + "headers": { + "Authorization": [ + "Bearer [DATABRICKS_TOKEN]" + ] + }, + "method": "POST", + "path": "/telemetry-ext", + "body": { + "uploadTime": "UNIX_TIME_MILLIS", + "items": [], + "protoLogs": [ + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}", + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}" + ] + } +} diff --git a/acceptance/telemetry/failure/output.txt b/acceptance/telemetry/failure/output.txt new file mode 100644 index 000000000..50f620829 --- /dev/null +++ b/acceptance/telemetry/failure/output.txt @@ -0,0 +1,53 @@ + +>>> [CLI] selftest send-telemetry --debug +HH:MM:SS Info: start pid=PID version=[DEV_VERSION] args="[CLI], selftest, send-telemetry, --debug" +HH:MM:SS Info: completed execution pid=PID exit_code=0 +HH:MM:SS Debug: POST /telemetry-ext +> { +> "items": null, +> "protoLogs": [ +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)", +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)" +> ], +> "uploadTime": "UNIX_TIME_MILLIS" +> } +< HTTP/1.1 501 Not Implemented +< { +< "error_code": "ERROR_CODE", +< "message": "Endpoint not implemented." +< } pid=PID sdk=true +HH:MM:SS Debug: non-retriable error: Endpoint not implemented. pid=PID sdk=true +HH:MM:SS Info: Attempt 1 failed due to a server side error. Retrying status code: 501 pid=PID +HH:MM:SS Debug: POST /telemetry-ext +> { +> "items": null, +> "protoLogs": [ +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)", +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)" +> ], +> "uploadTime": "UNIX_TIME_MILLIS" +> } +< HTTP/1.1 501 Not Implemented +< { +< "error_code": "ERROR_CODE", +< "message": "Endpoint not implemented." +< } pid=PID sdk=true +HH:MM:SS Debug: non-retriable error: Endpoint not implemented. pid=PID sdk=true +HH:MM:SS Info: Attempt 2 failed due to a server side error. Retrying status code: 501 pid=PID +HH:MM:SS Debug: POST /telemetry-ext +> { +> "items": null, +> "protoLogs": [ +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)", +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)" +> ], +> "uploadTime": "UNIX_TIME_MILLIS" +> } +< HTTP/1.1 501 Not Implemented +< { +< "error_code": "ERROR_CODE", +< "message": "Endpoint not implemented." +< } pid=PID sdk=true +HH:MM:SS Debug: non-retriable error: Endpoint not implemented. pid=PID sdk=true +HH:MM:SS Info: Attempt 3 failed due to a server side error. Retrying status code: 501 pid=PID +HH:MM:SS Info: telemetry upload failed: failed to upload telemetry logs after three attempts pid=PID diff --git a/acceptance/telemetry/failure/script b/acceptance/telemetry/failure/script new file mode 100644 index 000000000..7fe85f0cd --- /dev/null +++ b/acceptance/telemetry/failure/script @@ -0,0 +1 @@ +trace $CLI selftest send-telemetry --debug diff --git a/acceptance/telemetry/failure/test.toml b/acceptance/telemetry/failure/test.toml new file mode 100644 index 000000000..a7fa942e4 --- /dev/null +++ b/acceptance/telemetry/failure/test.toml @@ -0,0 +1,9 @@ +[[Server]] +Pattern = "POST /telemetry-ext" +Response.Body = ''' +{ + "error_code": "ERROR_CODE", + "message": "Endpoint not implemented." +} +''' +Response.StatusCode = 501 diff --git a/acceptance/telemetry/partial-success/out.requests.txt b/acceptance/telemetry/partial-success/out.requests.txt new file mode 100644 index 000000000..95b21f5c3 --- /dev/null +++ b/acceptance/telemetry/partial-success/out.requests.txt @@ -0,0 +1,51 @@ +{ + "headers": { + "Authorization": [ + "Bearer [DATABRICKS_TOKEN]" + ] + }, + "method": "POST", + "path": "/telemetry-ext", + "body": { + "uploadTime": "UNIX_TIME_MILLIS", + "items": [], + "protoLogs": [ + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}", + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}" + ] + } +} +{ + "headers": { + "Authorization": [ + "Bearer [DATABRICKS_TOKEN]" + ] + }, + "method": "POST", + "path": "/telemetry-ext", + "body": { + "uploadTime": "UNIX_TIME_MILLIS", + "items": [], + "protoLogs": [ + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}", + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}" + ] + } +} +{ + "headers": { + "Authorization": [ + "Bearer [DATABRICKS_TOKEN]" + ] + }, + "method": "POST", + "path": "/telemetry-ext", + "body": { + "uploadTime": "UNIX_TIME_MILLIS", + "items": [], + "protoLogs": [ + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}", + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}" + ] + } +} diff --git a/acceptance/telemetry/partial-success/output.txt b/acceptance/telemetry/partial-success/output.txt new file mode 100644 index 000000000..2757377fb --- /dev/null +++ b/acceptance/telemetry/partial-success/output.txt @@ -0,0 +1,50 @@ + +>>> [CLI] selftest send-telemetry --debug +HH:MM:SS Info: start pid=PID version=[DEV_VERSION] args="[CLI], selftest, send-telemetry, --debug" +HH:MM:SS Info: completed execution pid=PID exit_code=0 +HH:MM:SS Debug: POST /telemetry-ext +> { +> "items": null, +> "protoLogs": [ +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)", +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)" +> ], +> "uploadTime": "UNIX_TIME_MILLIS" +> } +< HTTP/1.1 200 OK +< { +< "errors": null, +< "numProtoSuccess": 1 +< } pid=PID sdk=true +HH:MM:SS Debug: Attempt 1 was a partial success. Number of logs uploaded: 1 out of 2 pid=PID +HH:MM:SS Debug: POST /telemetry-ext +> { +> "items": null, +> "protoLogs": [ +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)", +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)" +> ], +> "uploadTime": "UNIX_TIME_MILLIS" +> } +< HTTP/1.1 200 OK +< { +< "errors": null, +< "numProtoSuccess": 1 +< } pid=PID sdk=true +HH:MM:SS Debug: Attempt 2 was a partial success. Number of logs uploaded: 1 out of 2 pid=PID +HH:MM:SS Debug: POST /telemetry-ext +> { +> "items": null, +> "protoLogs": [ +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)", +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)" +> ], +> "uploadTime": "UNIX_TIME_MILLIS" +> } +< HTTP/1.1 200 OK +< { +< "errors": null, +< "numProtoSuccess": 1 +< } pid=PID sdk=true +HH:MM:SS Debug: Attempt 3 was a partial success. Number of logs uploaded: 1 out of 2 pid=PID +HH:MM:SS Info: telemetry upload failed: failed to upload telemetry logs after three attempts pid=PID diff --git a/acceptance/telemetry/partial-success/script b/acceptance/telemetry/partial-success/script new file mode 100644 index 000000000..7fe85f0cd --- /dev/null +++ b/acceptance/telemetry/partial-success/script @@ -0,0 +1 @@ +trace $CLI selftest send-telemetry --debug diff --git a/acceptance/telemetry/partial-success/test.toml b/acceptance/telemetry/partial-success/test.toml new file mode 100644 index 000000000..4e2bd9fab --- /dev/null +++ b/acceptance/telemetry/partial-success/test.toml @@ -0,0 +1,8 @@ +[[Server]] +Pattern = "POST /telemetry-ext" +Response.Body = ''' +{ + "errors": [], + "numProtoSuccess": 1 +} +''' diff --git a/acceptance/telemetry/skipped/output.txt b/acceptance/telemetry/skipped/output.txt new file mode 100644 index 000000000..640e03d54 --- /dev/null +++ b/acceptance/telemetry/skipped/output.txt @@ -0,0 +1,5 @@ + +>>> [CLI] selftest send-telemetry --debug +HH:MM:SS Info: start pid=PID version=[DEV_VERSION] args="[CLI], selftest, send-telemetry, --debug" +HH:MM:SS Info: completed execution pid=PID exit_code=0 +HH:MM:SS Debug: telemetry upload is disabled. Not uploading any logs. pid=PID diff --git a/acceptance/telemetry/skipped/script b/acceptance/telemetry/skipped/script new file mode 100644 index 000000000..247171a6f --- /dev/null +++ b/acceptance/telemetry/skipped/script @@ -0,0 +1,3 @@ +export DATABRICKS_CLI_DISABLE_TELEMETRY="true" + +trace $CLI selftest send-telemetry --debug diff --git a/acceptance/telemetry/success/out.requests.txt b/acceptance/telemetry/success/out.requests.txt new file mode 100644 index 000000000..d8260f562 --- /dev/null +++ b/acceptance/telemetry/success/out.requests.txt @@ -0,0 +1,17 @@ +{ + "headers": { + "Authorization": [ + "Bearer [DATABRICKS_TOKEN]" + ] + }, + "method": "POST", + "path": "/telemetry-ext", + "body": { + "uploadTime": "UNIX_TIME_MILLIS", + "items": [], + "protoLogs": [ + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}", + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}" + ] + } +} diff --git a/acceptance/telemetry/success/output.txt b/acceptance/telemetry/success/output.txt new file mode 100644 index 000000000..0afbd768c --- /dev/null +++ b/acceptance/telemetry/success/output.txt @@ -0,0 +1,19 @@ + +>>> [CLI] selftest send-telemetry --debug +HH:MM:SS Info: start pid=PID version=[DEV_VERSION] args="[CLI], selftest, send-telemetry, --debug" +HH:MM:SS Info: completed execution pid=PID exit_code=0 +HH:MM:SS Debug: POST /telemetry-ext +> { +> "items": null, +> "protoLogs": [ +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)", +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)" +> ], +> "uploadTime": "UNIX_TIME_MILLIS" +> } +< HTTP/1.1 200 OK +< { +< "errors": null, +< "numProtoSuccess": 2 +< } pid=PID sdk=true +HH:MM:SS Debug: All 2 logs uploaded successfully pid=PID diff --git a/acceptance/telemetry/success/script b/acceptance/telemetry/success/script new file mode 100644 index 000000000..7fe85f0cd --- /dev/null +++ b/acceptance/telemetry/success/script @@ -0,0 +1 @@ +trace $CLI selftest send-telemetry --debug diff --git a/acceptance/telemetry/test.toml b/acceptance/telemetry/test.toml new file mode 100644 index 000000000..169fe21d5 --- /dev/null +++ b/acceptance/telemetry/test.toml @@ -0,0 +1,38 @@ +IncludeRequestHeaders = ["Authorization"] +RecordRequests = true + +Local = true +Cloud = false + +[[Repls]] +Old = '17\d{11}' +New = '"UNIX_TIME_MILLIS"' + +[[Repls]] +Old = 'darwin|linux|windows' +New = '[OS]' + +[[Repls]] +Old = 'execution_time_ms\\\":\d{1,5},' +New = 'execution_time_ms\":\"SMALL_INT\",' + +[[Server]] +Pattern = "POST /telemetry-ext" +Response.Body = ''' +{ + "errors": [], + "numProtoSuccess": 2 +} +''' + +[[Repls]] +Old = "(?:[01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9]" +New = "HH:MM:SS" + +[[Repls]] +Old = "pid=[0-9]+" +New = "pid=PID" + +[[Repls]] +Old = "\\([0-9]+ more bytes\\)" +New = "(N more bytes)" diff --git a/acceptance/telemetry/timeout/out.requests.txt b/acceptance/telemetry/timeout/out.requests.txt new file mode 100644 index 000000000..d8260f562 --- /dev/null +++ b/acceptance/telemetry/timeout/out.requests.txt @@ -0,0 +1,17 @@ +{ + "headers": { + "Authorization": [ + "Bearer [DATABRICKS_TOKEN]" + ] + }, + "method": "POST", + "path": "/telemetry-ext", + "body": { + "uploadTime": "UNIX_TIME_MILLIS", + "items": [], + "protoLogs": [ + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}", + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}" + ] + } +} diff --git a/acceptance/telemetry/timeout/output.txt b/acceptance/telemetry/timeout/output.txt new file mode 100644 index 000000000..aff859481 --- /dev/null +++ b/acceptance/telemetry/timeout/output.txt @@ -0,0 +1,17 @@ + +>>> [CLI] selftest send-telemetry --debug +HH:MM:SS Info: start pid=PID version=[DEV_VERSION] args="[CLI], selftest, send-telemetry, --debug" +HH:MM:SS Info: completed execution pid=PID exit_code=0 +HH:MM:SS Debug: POST /telemetry-ext +> { +> "items": null, +> "protoLogs": [ +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)", +> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)" +> ], +> "uploadTime": "UNIX_TIME_MILLIS" +> } +< Error: Post "[DATABRICKS_URL]/telemetry-ext": context deadline exceeded pid=PID sdk=true +HH:MM:SS Debug: non-retriable error: Post "[DATABRICKS_URL]/telemetry-ext": context deadline exceeded pid=PID sdk=true +HH:MM:SS Debug: Attempt 1 failed due to a timeout. Will not retry pid=PID +HH:MM:SS Info: telemetry upload failed: uploading telemetry logs timed out: Post "[DATABRICKS_URL]/telemetry-ext": context deadline exceeded pid=PID diff --git a/acceptance/telemetry/timeout/script b/acceptance/telemetry/timeout/script new file mode 100644 index 000000000..0fc86234f --- /dev/null +++ b/acceptance/telemetry/timeout/script @@ -0,0 +1,4 @@ +# Set timeout to 0.1 second to make this test faster +export DATABRICKS_CLI_TELEMETRY_TIMEOUT=0.1 + +trace $CLI selftest send-telemetry --debug diff --git a/acceptance/telemetry/timeout/test.toml b/acceptance/telemetry/timeout/test.toml new file mode 100644 index 000000000..7e1b4d5b2 --- /dev/null +++ b/acceptance/telemetry/timeout/test.toml @@ -0,0 +1,9 @@ +[[Server]] +Pattern = "POST /telemetry-ext" +Response.Body = ''' +{ + "errors": [], + "numProtoSuccess": 2 +} +''' +DelaySeconds = 5 diff --git a/cmd/root/root.go b/cmd/root/root.go index 9e3fa4d2e..ca1fccbf1 100644 --- a/cmd/root/root.go +++ b/cmd/root/root.go @@ -6,14 +6,18 @@ import ( "fmt" "log/slog" "os" + "runtime" "runtime/debug" "strings" + "time" "github.com/databricks/cli/internal/build" "github.com/databricks/cli/libs/cmdio" "github.com/databricks/cli/libs/command" "github.com/databricks/cli/libs/dbr" "github.com/databricks/cli/libs/log" + "github.com/databricks/cli/libs/telemetry" + "github.com/databricks/cli/libs/telemetry/protos" "github.com/spf13/cobra" ) @@ -75,9 +79,6 @@ func New(ctx context.Context) *cobra.Command { // get the context back ctx = cmd.Context() - // Detect if the CLI is running on DBR and store this on the context. - ctx = dbr.DetectRuntime(ctx) - // Configure our user agent with the command that's about to be executed. ctx = withCommandInUserAgent(ctx, cmd) ctx = withCommandExecIdInUserAgent(ctx) @@ -125,9 +126,17 @@ Stack Trace: %s`, version, r, string(trace)) }() + // Configure a telemetry logger and store it in the context. + ctx = telemetry.WithNewLogger(ctx) + + // Detect if the CLI is running on DBR and store this on the context. + ctx = dbr.DetectRuntime(ctx) + // Set a command execution ID value in the context ctx = command.GenerateExecId(ctx) + startTime := time.Now() + // Run the command cmd, err = cmd.ExecuteContextC(ctx) if err != nil && !errors.Is(err, ErrAlreadyPrinted) { @@ -155,5 +164,24 @@ Stack Trace: } } + exitCode := 0 + if err != nil { + exitCode = 1 + } + + ctx = cmd.Context() + telemetryErr := telemetry.Upload(ctx, protos.ExecutionContext{ + CmdExecID: command.ExecId(ctx), + Version: build.GetInfo().Version, + Command: commandString(cmd), + OperatingSystem: runtime.GOOS, + DbrVersion: dbr.RuntimeVersion(ctx), + ExecutionTimeMs: time.Since(startTime).Milliseconds(), + ExitCode: int64(exitCode), + }) + if telemetryErr != nil { + log.Infof(ctx, "telemetry upload failed: %s", telemetryErr) + } + return err } diff --git a/cmd/selftest/selftest.go b/cmd/selftest/selftest.go index 5e2a2c30e..039285399 100644 --- a/cmd/selftest/selftest.go +++ b/cmd/selftest/selftest.go @@ -11,6 +11,7 @@ func New() *cobra.Command { Hidden: true, } + cmd.AddCommand(newSendTelemetry()) cmd.AddCommand(newPanic()) cmd.AddCommand(newPatchWhl()) return cmd diff --git a/cmd/selftest/send_telemetry.go b/cmd/selftest/send_telemetry.go new file mode 100644 index 000000000..b091058e7 --- /dev/null +++ b/cmd/selftest/send_telemetry.go @@ -0,0 +1,29 @@ +package selftest + +import ( + "github.com/databricks/cli/cmd/root" + "github.com/databricks/cli/libs/telemetry" + "github.com/databricks/cli/libs/telemetry/protos" + "github.com/spf13/cobra" +) + +func newSendTelemetry() *cobra.Command { + cmd := &cobra.Command{ + Use: "send-telemetry", + Short: "log some test telemetry events", + PreRunE: root.MustWorkspaceClient, + } + + cmd.RunE = func(cmd *cobra.Command, args []string) error { + for _, v := range []string{"VALUE1", "VALUE2"} { + telemetry.Log(cmd.Context(), protos.DatabricksCliLog{ + CliTestEvent: &protos.CliTestEvent{ + Name: protos.DummyCliEnum(v), + }, + }) + } + return nil + } + + return cmd +} diff --git a/libs/telemetry/context.go b/libs/telemetry/context.go new file mode 100644 index 000000000..e556e462c --- /dev/null +++ b/libs/telemetry/context.go @@ -0,0 +1,25 @@ +package telemetry + +import ( + "context" + "errors" +) + +// Private type to store the telemetry logger in the context +type telemetryLogger int + +// Key to store the telemetry logger in the context +var telemetryLoggerKey telemetryLogger + +func WithNewLogger(ctx context.Context) context.Context { + return context.WithValue(ctx, telemetryLoggerKey, &logger{}) +} + +func fromContext(ctx context.Context) *logger { + v := ctx.Value(telemetryLoggerKey) + if v == nil { + panic(errors.New("telemetry logger not found in the context")) + } + + return v.(*logger) +} diff --git a/libs/telemetry/logger.go b/libs/telemetry/logger.go new file mode 100644 index 000000000..b9b0faa1b --- /dev/null +++ b/libs/telemetry/logger.go @@ -0,0 +1,188 @@ +package telemetry + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "strconv" + "time" + + "github.com/databricks/cli/libs/command" + "github.com/databricks/cli/libs/env" + "github.com/databricks/cli/libs/log" + "github.com/databricks/cli/libs/telemetry/protos" + "github.com/databricks/databricks-sdk-go/apierr" + "github.com/databricks/databricks-sdk-go/client" + "github.com/google/uuid" +) + +const ( + // Environment variable to disable telemetry. If this is set to any value, telemetry + // will be disabled. + disableEnvVar = "DATABRICKS_CLI_DISABLE_TELEMETRY" + + // Timeout in seconds for uploading telemetry logs. + timeoutEnvVar = "DATABRICKS_CLI_TELEMETRY_TIMEOUT" +) + +func Log(ctx context.Context, event protos.DatabricksCliLog) { + fromContext(ctx).log(event) +} + +type logger struct { + logs []protos.FrontendLog +} + +func (l *logger) log(event protos.DatabricksCliLog) { + l.logs = append(l.logs, protos.FrontendLog{ + FrontendLogEventID: uuid.New().String(), + Entry: protos.FrontendLogEntry{ + DatabricksCliLog: event, + }, + }) +} + +const ( + defaultUploadTimeout = 3 * time.Second + waitBetweenRetries = 200 * time.Millisecond +) + +func Upload(ctx context.Context, ec protos.ExecutionContext) error { + l := fromContext(ctx) + if len(l.logs) == 0 { + log.Debugf(ctx, "no telemetry logs to upload") + return nil + } + + // Telemetry is disabled. We don't upload logs. + if env.Get(ctx, disableEnvVar) != "" { + log.Debugf(ctx, "telemetry upload is disabled. Not uploading any logs.") + return nil + } + + // Set the execution context for all logs. + for i := range l.logs { + l.logs[i].Entry.DatabricksCliLog.ExecutionContext = &ec + } + + protoLogs := make([]string, len(l.logs)) + for i, log := range l.logs { + b, err := json.Marshal(log) + if err != nil { + return fmt.Errorf("failed to marshal log: %s", err) + } + protoLogs[i] = string(b) + } + + apiClient, err := client.New(command.ConfigUsed(ctx)) + if err != nil { + return fmt.Errorf("failed to create API client: %w", err) + } + + uploadTimeout := defaultUploadTimeout + if v := env.Get(ctx, timeoutEnvVar); v != "" { + timeout, err := strconv.ParseFloat(v, 64) + if err != nil { + return fmt.Errorf("failed to parse timeout: %w", err) + } + + uploadTimeout = time.Duration(timeout * float64(time.Second)) + } + + ctx, cancel := context.WithTimeout(ctx, uploadTimeout) + defer cancel() + + deadline, ok := ctx.Deadline() + if !ok { + log.Infof(ctx, "context has no deadline. This is unexpected. Not uploading telemetry logs") + return nil + } + + // Only try uploading logs for a maximum of 3 times. + for i := range 3 { + select { + case <-ctx.Done(): + return fmt.Errorf("uploading telemetry logs timed out: %w", ctx.Err()) + default: + // proceed + } + + resp, err := attempt(ctx, apiClient, protoLogs) + + // All logs were uploaded successfully. + if err == nil && resp.NumProtoSuccess >= int64(len(protoLogs)) { + log.Debugf(ctx, "All %d logs uploaded successfully", len(protoLogs)) + return nil + } + + // Partial success. Retry. + if err == nil && resp.NumProtoSuccess < int64(len(protoLogs)) { + log.Debugf(ctx, "Attempt %d was a partial success. Number of logs uploaded: %d out of %d", i+1, resp.NumProtoSuccess, len(protoLogs)) + + remainingTime := time.Until(deadline) + if remainingTime < waitBetweenRetries { + log.Debugf(ctx, "not enough time to retry before context deadline.") + break + } + + time.Sleep(waitBetweenRetries) + continue + } + + // Do not retry if the context deadline was exceeded. This means that our + // timeout of three seconds was triggered and we should not try again. + if errors.Is(err, context.DeadlineExceeded) { + log.Debugf(ctx, "Attempt %d failed due to a timeout. Will not retry", i+1) + return fmt.Errorf("uploading telemetry logs timed out: %w", err) + } + + // We retry for all 5xx responses. Note that the SDK only retries for 503 and 429 + // (as of 6th March 2025) so we need some additional logic here to retry for other + // 5xx responses. + // Note: We should never see a 503 or 429 here because the SDK default timeout + // of 1 minute is more than the 3 second timeout we set above. + // + // SDK ref: https://github.com/databricks/databricks-sdk-go/blob/cdb28002afacb8b762348534a4c4040a9f19c24b/apierr/errors.go#L91 + // + // The UI infra team (who owns the /telemetry-ext API) recommends retrying for + // all 5xx responses. + var apiErr *apierr.APIError + if errors.As(err, &apiErr) && apiErr.StatusCode >= 500 { + log.Infof(ctx, "Attempt %d failed due to a server side error. Retrying status code: %d", i+1, apiErr.StatusCode) + + remainingTime := time.Until(deadline) + if remainingTime < waitBetweenRetries { + log.Debugf(ctx, "not enough time to retry before context deadline.") + break + } + + time.Sleep(waitBetweenRetries) + continue + } + } + + return errors.New("failed to upload telemetry logs after three attempts") +} + +func attempt(ctx context.Context, apiClient *client.DatabricksClient, protoLogs []string) (*ResponseBody, error) { + resp := &ResponseBody{} + err := apiClient.Do(ctx, http.MethodPost, "/telemetry-ext", nil, nil, RequestBody{ + UploadTime: time.Now().UnixMilli(), + // There is a bug in the `/telemetry-ext` API which requires us to + // send an empty array for the `Items` field. Otherwise the API returns + // a 500. + Items: []string{}, + ProtoLogs: protoLogs, + }, resp) + if err != nil { + return nil, err + } + + if len(resp.Errors) > 0 { + return nil, fmt.Errorf("uploading telemetry failed: %v", resp.Errors) + } + + return resp, nil +} diff --git a/libs/telemetry/logger_test.go b/libs/telemetry/logger_test.go new file mode 100644 index 000000000..b4bbe4cc7 --- /dev/null +++ b/libs/telemetry/logger_test.go @@ -0,0 +1,155 @@ +package telemetry + +import ( + "context" + "testing" + + "github.com/databricks/cli/libs/command" + "github.com/databricks/cli/libs/telemetry/protos" + "github.com/databricks/cli/libs/testserver" + "github.com/databricks/databricks-sdk-go/apierr" + "github.com/databricks/databricks-sdk-go/config" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestTelemetryUploadRetriesOnPartialSuccess(t *testing.T) { + server := testserver.New(t) + t.Cleanup(server.Close) + + count := 0 + server.Handle("POST", "/telemetry-ext", func(req testserver.Request) any { + count++ + if count == 1 { + return ResponseBody{ + NumProtoSuccess: 1, + } + } + if count == 2 { + return ResponseBody{ + NumProtoSuccess: 2, + } + } + return nil + }) + + ctx := WithNewLogger(context.Background()) + + Log(ctx, protos.DatabricksCliLog{ + CliTestEvent: &protos.CliTestEvent{ + Name: protos.DummyCliEnumValue1, + }, + }) + Log(ctx, protos.DatabricksCliLog{ + CliTestEvent: &protos.CliTestEvent{ + Name: protos.DummyCliEnumValue2, + }, + }) + + ctx = command.SetConfigUsed(ctx, &config.Config{ + Host: server.URL, + Token: "token", + }) + + err := Upload(ctx, protos.ExecutionContext{}) + require.NoError(t, err) + assert.Equal(t, 2, count) +} + +func uploadRetriesFor(t *testing.T, statusCode int) { + server := testserver.New(t) + t.Cleanup(server.Close) + + count := 0 + server.Handle("POST", "/telemetry-ext", func(req testserver.Request) any { + count++ + if count == 1 { + return testserver.Response{ + StatusCode: statusCode, + Body: apierr.APIError{ + StatusCode: statusCode, + Message: "Some error", + }, + } + } + if count == 2 { + return ResponseBody{ + NumProtoSuccess: 2, + } + } + return nil + }) + + t.Setenv("DATABRICKS_HOST", server.URL) + t.Setenv("DATABRICKS_TOKEN", "token") + + ctx := WithNewLogger(context.Background()) + + Log(ctx, protos.DatabricksCliLog{ + CliTestEvent: &protos.CliTestEvent{ + Name: protos.DummyCliEnumValue1, + }, + }) + Log(ctx, protos.DatabricksCliLog{ + CliTestEvent: &protos.CliTestEvent{ + Name: protos.DummyCliEnumValue2, + }, + }) + ctx = command.SetConfigUsed(ctx, &config.Config{ + Host: server.URL, + Token: "token", + }) + + err := Upload(ctx, protos.ExecutionContext{}) + require.NoError(t, err) + assert.Equal(t, 2, count) +} + +func TestTelemetryUploadRetriesForStatusCodes(t *testing.T) { + // These retries happen in the CLI itself since the SDK does not automatically + // retry for 5xx errors. + uploadRetriesFor(t, 500) + uploadRetriesFor(t, 504) + + // These retries happen on the SDK layer. + // ref: https://github.com/databricks/databricks-sdk-go/blob/cdb28002afacb8b762348534a4c4040a9f19c24b/apierr/errors.go#L91 + uploadRetriesFor(t, 503) + uploadRetriesFor(t, 429) +} + +func TestTelemetryUploadMaxRetries(t *testing.T) { + server := testserver.New(t) + t.Cleanup(server.Close) + count := 0 + + server.Handle("POST", "/telemetry-ext", func(req testserver.Request) any { + count++ + return ResponseBody{ + NumProtoSuccess: 1, + } + }) + + t.Setenv("DATABRICKS_HOST", server.URL) + t.Setenv("DATABRICKS_TOKEN", "token") + ctx := WithNewLogger(context.Background()) + + Log(ctx, protos.DatabricksCliLog{ + CliTestEvent: &protos.CliTestEvent{ + Name: protos.DummyCliEnumValue1, + }, + }) + Log(ctx, protos.DatabricksCliLog{ + CliTestEvent: &protos.CliTestEvent{ + Name: protos.DummyCliEnumValue2, + }, + }) + + ctx = command.SetConfigUsed(ctx, &config.Config{ + Host: server.URL, + Token: "token", + }) + + err := Upload(ctx, protos.ExecutionContext{}) + assert.EqualError(t, err, "failed to upload telemetry logs after three attempts") + assert.Equal(t, 3, count) +}