mirror of https://github.com/databricks/cli.git
Add synchronous logger for telemetry (#2432)
## Changes This PR adds a synchronous telemetry logger for the CLI with a max timeout of 3 seconds. Due to the 3-second timeout configuration, this is only meant to be used in long-running commands. This is a short-term solution. Eventually, we'd like to transition to a daemon process to upload the telemetry logs to amortise the costs of configuring authentication and maintaining a warm pool of HTTP connections, as well as a better UX for the end user. Note that users can set the `DATABRICKS_CLI_DISABLE_TELEMETRY` environment variable to disable telemetry collection. ## Why To collect telemetry, which was previously inaccessible to us, and answer questions like which templates customers like to use and which DABs features would be safe to deprecate. ## Tests Unit and acceptance tests. Also manually verified that the telemetry upload works: ``` (artifact-playground) ➜ cli git:(sync-logger) cli selftest send-telemetry --debug 15:58:20 Info: start pid=40386 version=0.0.0-dev+a2825ca89a23 args="cli, selftest, send-telemetry, --debug" 15:58:20 Debug: Loading DEFAULT profile from /Users/shreyas.goenka/.databrickscfg pid=40386 sdk=true 15:58:20 Info: completed execution pid=40386 exit_code=0 15:58:21 Debug: POST /telemetry-ext > { > "items": null, > "protoLogs": [ > "{\"frontend_log_event_id\":\"82d29b3a-d5ff-48f3-8a21-dae6e08d2999\",\"entry\":{\"databricks_cli_log\":{\"... (232 more bytes)", > "{\"frontend_log_event_id\":\"d6be8220-7db8-45d9-97d6-4c09c25e2664\",\"entry\":{\"databricks_cli_log\":{\"... (232 more bytes)" > ], > "uploadTime": 1741186700967 > } < HTTP/2.0 200 OK < { < "errors": null, < "numProtoSuccess": 2, < "numSuccess": 0 < } pid=40386 sdk=true ```
This commit is contained in:
parent
8b51eeb57a
commit
06b71fe5da
|
@ -338,6 +338,10 @@ func runTest(t *testing.T, dir, coverDir string, repls testdiff.ReplacementsCont
|
|||
items := strings.Split(stub.Pattern, " ")
|
||||
require.Len(t, items, 2)
|
||||
server.Handle(items[0], items[1], func(req testserver.Request) any {
|
||||
if stub.DelaySeconds != nil {
|
||||
time.Sleep(time.Duration((*stub.DelaySeconds) * float64(time.Second)))
|
||||
}
|
||||
|
||||
return stub.Response
|
||||
})
|
||||
}
|
||||
|
|
|
@ -99,3 +99,4 @@
|
|||
< } pid=12345 mutator=validate:files_to_sync sdk=true
|
||||
10:07:59 Debug: Path /Workspace/Users/[USERNAME]/.bundle/debug/default/files has type directory (ID: 0) pid=12345 mutator=validate:files_to_sync
|
||||
10:07:59 Info: completed execution pid=12345 exit_code=0
|
||||
10:07:59 Debug: no telemetry logs to upload pid=12345
|
||||
|
|
|
@ -74,6 +74,9 @@ type ServerStub struct {
|
|||
|
||||
// The response body to return.
|
||||
Response testserver.Response
|
||||
|
||||
// Artificial delay in seconds to simulate slow responses.
|
||||
DelaySeconds *float64
|
||||
}
|
||||
|
||||
// FindConfigs finds all the config relevant for this test,
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"headers": {
|
||||
"Authorization": [
|
||||
"Bearer [DATABRICKS_TOKEN]"
|
||||
]
|
||||
},
|
||||
"method": "POST",
|
||||
"path": "/telemetry-ext",
|
||||
"body": {
|
||||
"uploadTime": "UNIX_TIME_MILLIS",
|
||||
"items": [],
|
||||
"protoLogs": [
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}",
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}"
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
"headers": {
|
||||
"Authorization": [
|
||||
"Bearer [DATABRICKS_TOKEN]"
|
||||
]
|
||||
},
|
||||
"method": "POST",
|
||||
"path": "/telemetry-ext",
|
||||
"body": {
|
||||
"uploadTime": "UNIX_TIME_MILLIS",
|
||||
"items": [],
|
||||
"protoLogs": [
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}",
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}"
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
"headers": {
|
||||
"Authorization": [
|
||||
"Bearer [DATABRICKS_TOKEN]"
|
||||
]
|
||||
},
|
||||
"method": "POST",
|
||||
"path": "/telemetry-ext",
|
||||
"body": {
|
||||
"uploadTime": "UNIX_TIME_MILLIS",
|
||||
"items": [],
|
||||
"protoLogs": [
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}",
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}"
|
||||
]
|
||||
}
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
|
||||
>>> [CLI] selftest send-telemetry --debug
|
||||
HH:MM:SS Info: start pid=PID version=[DEV_VERSION] args="[CLI], selftest, send-telemetry, --debug"
|
||||
HH:MM:SS Info: completed execution pid=PID exit_code=0
|
||||
HH:MM:SS Debug: POST /telemetry-ext
|
||||
> {
|
||||
> "items": null,
|
||||
> "protoLogs": [
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)",
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)"
|
||||
> ],
|
||||
> "uploadTime": "UNIX_TIME_MILLIS"
|
||||
> }
|
||||
< HTTP/1.1 501 Not Implemented
|
||||
< {
|
||||
< "error_code": "ERROR_CODE",
|
||||
< "message": "Endpoint not implemented."
|
||||
< } pid=PID sdk=true
|
||||
HH:MM:SS Debug: non-retriable error: Endpoint not implemented. pid=PID sdk=true
|
||||
HH:MM:SS Info: Attempt 1 failed due to a server side error. Retrying status code: 501 pid=PID
|
||||
HH:MM:SS Debug: POST /telemetry-ext
|
||||
> {
|
||||
> "items": null,
|
||||
> "protoLogs": [
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)",
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)"
|
||||
> ],
|
||||
> "uploadTime": "UNIX_TIME_MILLIS"
|
||||
> }
|
||||
< HTTP/1.1 501 Not Implemented
|
||||
< {
|
||||
< "error_code": "ERROR_CODE",
|
||||
< "message": "Endpoint not implemented."
|
||||
< } pid=PID sdk=true
|
||||
HH:MM:SS Debug: non-retriable error: Endpoint not implemented. pid=PID sdk=true
|
||||
HH:MM:SS Info: Attempt 2 failed due to a server side error. Retrying status code: 501 pid=PID
|
||||
HH:MM:SS Debug: POST /telemetry-ext
|
||||
> {
|
||||
> "items": null,
|
||||
> "protoLogs": [
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)",
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)"
|
||||
> ],
|
||||
> "uploadTime": "UNIX_TIME_MILLIS"
|
||||
> }
|
||||
< HTTP/1.1 501 Not Implemented
|
||||
< {
|
||||
< "error_code": "ERROR_CODE",
|
||||
< "message": "Endpoint not implemented."
|
||||
< } pid=PID sdk=true
|
||||
HH:MM:SS Debug: non-retriable error: Endpoint not implemented. pid=PID sdk=true
|
||||
HH:MM:SS Info: Attempt 3 failed due to a server side error. Retrying status code: 501 pid=PID
|
||||
HH:MM:SS Info: telemetry upload failed: failed to upload telemetry logs after three attempts pid=PID
|
|
@ -0,0 +1 @@
|
|||
trace $CLI selftest send-telemetry --debug
|
|
@ -0,0 +1,9 @@
|
|||
[[Server]]
|
||||
Pattern = "POST /telemetry-ext"
|
||||
Response.Body = '''
|
||||
{
|
||||
"error_code": "ERROR_CODE",
|
||||
"message": "Endpoint not implemented."
|
||||
}
|
||||
'''
|
||||
Response.StatusCode = 501
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"headers": {
|
||||
"Authorization": [
|
||||
"Bearer [DATABRICKS_TOKEN]"
|
||||
]
|
||||
},
|
||||
"method": "POST",
|
||||
"path": "/telemetry-ext",
|
||||
"body": {
|
||||
"uploadTime": "UNIX_TIME_MILLIS",
|
||||
"items": [],
|
||||
"protoLogs": [
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}",
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}"
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
"headers": {
|
||||
"Authorization": [
|
||||
"Bearer [DATABRICKS_TOKEN]"
|
||||
]
|
||||
},
|
||||
"method": "POST",
|
||||
"path": "/telemetry-ext",
|
||||
"body": {
|
||||
"uploadTime": "UNIX_TIME_MILLIS",
|
||||
"items": [],
|
||||
"protoLogs": [
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}",
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}"
|
||||
]
|
||||
}
|
||||
}
|
||||
{
|
||||
"headers": {
|
||||
"Authorization": [
|
||||
"Bearer [DATABRICKS_TOKEN]"
|
||||
]
|
||||
},
|
||||
"method": "POST",
|
||||
"path": "/telemetry-ext",
|
||||
"body": {
|
||||
"uploadTime": "UNIX_TIME_MILLIS",
|
||||
"items": [],
|
||||
"protoLogs": [
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}",
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}"
|
||||
]
|
||||
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
|
||||
>>> [CLI] selftest send-telemetry --debug
|
||||
HH:MM:SS Info: start pid=PID version=[DEV_VERSION] args="[CLI], selftest, send-telemetry, --debug"
|
||||
HH:MM:SS Info: completed execution pid=PID exit_code=0
|
||||
HH:MM:SS Debug: POST /telemetry-ext
|
||||
> {
|
||||
> "items": null,
|
||||
> "protoLogs": [
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)",
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)"
|
||||
> ],
|
||||
> "uploadTime": "UNIX_TIME_MILLIS"
|
||||
> }
|
||||
< HTTP/1.1 200 OK
|
||||
< {
|
||||
< "errors": null,
|
||||
< "numProtoSuccess": 1
|
||||
< } pid=PID sdk=true
|
||||
HH:MM:SS Debug: Attempt 1 was a partial success. Number of logs uploaded: 1 out of 2 pid=PID
|
||||
HH:MM:SS Debug: POST /telemetry-ext
|
||||
> {
|
||||
> "items": null,
|
||||
> "protoLogs": [
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)",
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)"
|
||||
> ],
|
||||
> "uploadTime": "UNIX_TIME_MILLIS"
|
||||
> }
|
||||
< HTTP/1.1 200 OK
|
||||
< {
|
||||
< "errors": null,
|
||||
< "numProtoSuccess": 1
|
||||
< } pid=PID sdk=true
|
||||
HH:MM:SS Debug: Attempt 2 was a partial success. Number of logs uploaded: 1 out of 2 pid=PID
|
||||
HH:MM:SS Debug: POST /telemetry-ext
|
||||
> {
|
||||
> "items": null,
|
||||
> "protoLogs": [
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)",
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)"
|
||||
> ],
|
||||
> "uploadTime": "UNIX_TIME_MILLIS"
|
||||
> }
|
||||
< HTTP/1.1 200 OK
|
||||
< {
|
||||
< "errors": null,
|
||||
< "numProtoSuccess": 1
|
||||
< } pid=PID sdk=true
|
||||
HH:MM:SS Debug: Attempt 3 was a partial success. Number of logs uploaded: 1 out of 2 pid=PID
|
||||
HH:MM:SS Info: telemetry upload failed: failed to upload telemetry logs after three attempts pid=PID
|
|
@ -0,0 +1 @@
|
|||
trace $CLI selftest send-telemetry --debug
|
|
@ -0,0 +1,8 @@
|
|||
[[Server]]
|
||||
Pattern = "POST /telemetry-ext"
|
||||
Response.Body = '''
|
||||
{
|
||||
"errors": [],
|
||||
"numProtoSuccess": 1
|
||||
}
|
||||
'''
|
|
@ -0,0 +1,5 @@
|
|||
|
||||
>>> [CLI] selftest send-telemetry --debug
|
||||
HH:MM:SS Info: start pid=PID version=[DEV_VERSION] args="[CLI], selftest, send-telemetry, --debug"
|
||||
HH:MM:SS Info: completed execution pid=PID exit_code=0
|
||||
HH:MM:SS Debug: telemetry upload is disabled. Not uploading any logs. pid=PID
|
|
@ -0,0 +1,3 @@
|
|||
export DATABRICKS_CLI_DISABLE_TELEMETRY="true"
|
||||
|
||||
trace $CLI selftest send-telemetry --debug
|
|
@ -0,0 +1,17 @@
|
|||
{
|
||||
"headers": {
|
||||
"Authorization": [
|
||||
"Bearer [DATABRICKS_TOKEN]"
|
||||
]
|
||||
},
|
||||
"method": "POST",
|
||||
"path": "/telemetry-ext",
|
||||
"body": {
|
||||
"uploadTime": "UNIX_TIME_MILLIS",
|
||||
"items": [],
|
||||
"protoLogs": [
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}",
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}"
|
||||
]
|
||||
}
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
|
||||
>>> [CLI] selftest send-telemetry --debug
|
||||
HH:MM:SS Info: start pid=PID version=[DEV_VERSION] args="[CLI], selftest, send-telemetry, --debug"
|
||||
HH:MM:SS Info: completed execution pid=PID exit_code=0
|
||||
HH:MM:SS Debug: POST /telemetry-ext
|
||||
> {
|
||||
> "items": null,
|
||||
> "protoLogs": [
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)",
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)"
|
||||
> ],
|
||||
> "uploadTime": "UNIX_TIME_MILLIS"
|
||||
> }
|
||||
< HTTP/1.1 200 OK
|
||||
< {
|
||||
< "errors": null,
|
||||
< "numProtoSuccess": 2
|
||||
< } pid=PID sdk=true
|
||||
HH:MM:SS Debug: All 2 logs uploaded successfully pid=PID
|
|
@ -0,0 +1 @@
|
|||
trace $CLI selftest send-telemetry --debug
|
|
@ -0,0 +1,38 @@
|
|||
IncludeRequestHeaders = ["Authorization"]
|
||||
RecordRequests = true
|
||||
|
||||
Local = true
|
||||
Cloud = false
|
||||
|
||||
[[Repls]]
|
||||
Old = '17\d{11}'
|
||||
New = '"UNIX_TIME_MILLIS"'
|
||||
|
||||
[[Repls]]
|
||||
Old = 'darwin|linux|windows'
|
||||
New = '[OS]'
|
||||
|
||||
[[Repls]]
|
||||
Old = 'execution_time_ms\\\":\d{1,5},'
|
||||
New = 'execution_time_ms\":\"SMALL_INT\",'
|
||||
|
||||
[[Server]]
|
||||
Pattern = "POST /telemetry-ext"
|
||||
Response.Body = '''
|
||||
{
|
||||
"errors": [],
|
||||
"numProtoSuccess": 2
|
||||
}
|
||||
'''
|
||||
|
||||
[[Repls]]
|
||||
Old = "(?:[01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9]"
|
||||
New = "HH:MM:SS"
|
||||
|
||||
[[Repls]]
|
||||
Old = "pid=[0-9]+"
|
||||
New = "pid=PID"
|
||||
|
||||
[[Repls]]
|
||||
Old = "\\([0-9]+ more bytes\\)"
|
||||
New = "(N more bytes)"
|
|
@ -0,0 +1,17 @@
|
|||
{
|
||||
"headers": {
|
||||
"Authorization": [
|
||||
"Bearer [DATABRICKS_TOKEN]"
|
||||
]
|
||||
},
|
||||
"method": "POST",
|
||||
"path": "/telemetry-ext",
|
||||
"body": {
|
||||
"uploadTime": "UNIX_TIME_MILLIS",
|
||||
"items": [],
|
||||
"protoLogs": [
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE1\"}}}}",
|
||||
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[UUID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"selftest_send-telemetry\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"cli_test_event\":{\"name\":\"VALUE2\"}}}}"
|
||||
]
|
||||
}
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
|
||||
>>> [CLI] selftest send-telemetry --debug
|
||||
HH:MM:SS Info: start pid=PID version=[DEV_VERSION] args="[CLI], selftest, send-telemetry, --debug"
|
||||
HH:MM:SS Info: completed execution pid=PID exit_code=0
|
||||
HH:MM:SS Debug: POST /telemetry-ext
|
||||
> {
|
||||
> "items": null,
|
||||
> "protoLogs": [
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)",
|
||||
> "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"... (N more bytes)"
|
||||
> ],
|
||||
> "uploadTime": "UNIX_TIME_MILLIS"
|
||||
> }
|
||||
< Error: Post "[DATABRICKS_URL]/telemetry-ext": context deadline exceeded pid=PID sdk=true
|
||||
HH:MM:SS Debug: non-retriable error: Post "[DATABRICKS_URL]/telemetry-ext": context deadline exceeded pid=PID sdk=true
|
||||
HH:MM:SS Debug: Attempt 1 failed due to a timeout. Will not retry pid=PID
|
||||
HH:MM:SS Info: telemetry upload failed: uploading telemetry logs timed out: Post "[DATABRICKS_URL]/telemetry-ext": context deadline exceeded pid=PID
|
|
@ -0,0 +1,4 @@
|
|||
# Set timeout to 0.1 second to make this test faster
|
||||
export DATABRICKS_CLI_TELEMETRY_TIMEOUT=0.1
|
||||
|
||||
trace $CLI selftest send-telemetry --debug
|
|
@ -0,0 +1,9 @@
|
|||
[[Server]]
|
||||
Pattern = "POST /telemetry-ext"
|
||||
Response.Body = '''
|
||||
{
|
||||
"errors": [],
|
||||
"numProtoSuccess": 2
|
||||
}
|
||||
'''
|
||||
DelaySeconds = 5
|
|
@ -6,14 +6,18 @@ import (
|
|||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"runtime"
|
||||
"runtime/debug"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/databricks/cli/internal/build"
|
||||
"github.com/databricks/cli/libs/cmdio"
|
||||
"github.com/databricks/cli/libs/command"
|
||||
"github.com/databricks/cli/libs/dbr"
|
||||
"github.com/databricks/cli/libs/log"
|
||||
"github.com/databricks/cli/libs/telemetry"
|
||||
"github.com/databricks/cli/libs/telemetry/protos"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
|
@ -75,9 +79,6 @@ func New(ctx context.Context) *cobra.Command {
|
|||
// get the context back
|
||||
ctx = cmd.Context()
|
||||
|
||||
// Detect if the CLI is running on DBR and store this on the context.
|
||||
ctx = dbr.DetectRuntime(ctx)
|
||||
|
||||
// Configure our user agent with the command that's about to be executed.
|
||||
ctx = withCommandInUserAgent(ctx, cmd)
|
||||
ctx = withCommandExecIdInUserAgent(ctx)
|
||||
|
@ -125,9 +126,17 @@ Stack Trace:
|
|||
%s`, version, r, string(trace))
|
||||
}()
|
||||
|
||||
// Configure a telemetry logger and store it in the context.
|
||||
ctx = telemetry.WithNewLogger(ctx)
|
||||
|
||||
// Detect if the CLI is running on DBR and store this on the context.
|
||||
ctx = dbr.DetectRuntime(ctx)
|
||||
|
||||
// Set a command execution ID value in the context
|
||||
ctx = command.GenerateExecId(ctx)
|
||||
|
||||
startTime := time.Now()
|
||||
|
||||
// Run the command
|
||||
cmd, err = cmd.ExecuteContextC(ctx)
|
||||
if err != nil && !errors.Is(err, ErrAlreadyPrinted) {
|
||||
|
@ -155,5 +164,24 @@ Stack Trace:
|
|||
}
|
||||
}
|
||||
|
||||
exitCode := 0
|
||||
if err != nil {
|
||||
exitCode = 1
|
||||
}
|
||||
|
||||
ctx = cmd.Context()
|
||||
telemetryErr := telemetry.Upload(ctx, protos.ExecutionContext{
|
||||
CmdExecID: command.ExecId(ctx),
|
||||
Version: build.GetInfo().Version,
|
||||
Command: commandString(cmd),
|
||||
OperatingSystem: runtime.GOOS,
|
||||
DbrVersion: dbr.RuntimeVersion(ctx),
|
||||
ExecutionTimeMs: time.Since(startTime).Milliseconds(),
|
||||
ExitCode: int64(exitCode),
|
||||
})
|
||||
if telemetryErr != nil {
|
||||
log.Infof(ctx, "telemetry upload failed: %s", telemetryErr)
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@ func New() *cobra.Command {
|
|||
Hidden: true,
|
||||
}
|
||||
|
||||
cmd.AddCommand(newSendTelemetry())
|
||||
cmd.AddCommand(newPanic())
|
||||
cmd.AddCommand(newPatchWhl())
|
||||
return cmd
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
package selftest
|
||||
|
||||
import (
|
||||
"github.com/databricks/cli/cmd/root"
|
||||
"github.com/databricks/cli/libs/telemetry"
|
||||
"github.com/databricks/cli/libs/telemetry/protos"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
func newSendTelemetry() *cobra.Command {
|
||||
cmd := &cobra.Command{
|
||||
Use: "send-telemetry",
|
||||
Short: "log some test telemetry events",
|
||||
PreRunE: root.MustWorkspaceClient,
|
||||
}
|
||||
|
||||
cmd.RunE = func(cmd *cobra.Command, args []string) error {
|
||||
for _, v := range []string{"VALUE1", "VALUE2"} {
|
||||
telemetry.Log(cmd.Context(), protos.DatabricksCliLog{
|
||||
CliTestEvent: &protos.CliTestEvent{
|
||||
Name: protos.DummyCliEnum(v),
|
||||
},
|
||||
})
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
return cmd
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
)
|
||||
|
||||
// Private type to store the telemetry logger in the context
|
||||
type telemetryLogger int
|
||||
|
||||
// Key to store the telemetry logger in the context
|
||||
var telemetryLoggerKey telemetryLogger
|
||||
|
||||
func WithNewLogger(ctx context.Context) context.Context {
|
||||
return context.WithValue(ctx, telemetryLoggerKey, &logger{})
|
||||
}
|
||||
|
||||
func fromContext(ctx context.Context) *logger {
|
||||
v := ctx.Value(telemetryLoggerKey)
|
||||
if v == nil {
|
||||
panic(errors.New("telemetry logger not found in the context"))
|
||||
}
|
||||
|
||||
return v.(*logger)
|
||||
}
|
|
@ -0,0 +1,188 @@
|
|||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/databricks/cli/libs/command"
|
||||
"github.com/databricks/cli/libs/env"
|
||||
"github.com/databricks/cli/libs/log"
|
||||
"github.com/databricks/cli/libs/telemetry/protos"
|
||||
"github.com/databricks/databricks-sdk-go/apierr"
|
||||
"github.com/databricks/databricks-sdk-go/client"
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
const (
|
||||
// Environment variable to disable telemetry. If this is set to any value, telemetry
|
||||
// will be disabled.
|
||||
disableEnvVar = "DATABRICKS_CLI_DISABLE_TELEMETRY"
|
||||
|
||||
// Timeout in seconds for uploading telemetry logs.
|
||||
timeoutEnvVar = "DATABRICKS_CLI_TELEMETRY_TIMEOUT"
|
||||
)
|
||||
|
||||
func Log(ctx context.Context, event protos.DatabricksCliLog) {
|
||||
fromContext(ctx).log(event)
|
||||
}
|
||||
|
||||
type logger struct {
|
||||
logs []protos.FrontendLog
|
||||
}
|
||||
|
||||
func (l *logger) log(event protos.DatabricksCliLog) {
|
||||
l.logs = append(l.logs, protos.FrontendLog{
|
||||
FrontendLogEventID: uuid.New().String(),
|
||||
Entry: protos.FrontendLogEntry{
|
||||
DatabricksCliLog: event,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
const (
|
||||
defaultUploadTimeout = 3 * time.Second
|
||||
waitBetweenRetries = 200 * time.Millisecond
|
||||
)
|
||||
|
||||
func Upload(ctx context.Context, ec protos.ExecutionContext) error {
|
||||
l := fromContext(ctx)
|
||||
if len(l.logs) == 0 {
|
||||
log.Debugf(ctx, "no telemetry logs to upload")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Telemetry is disabled. We don't upload logs.
|
||||
if env.Get(ctx, disableEnvVar) != "" {
|
||||
log.Debugf(ctx, "telemetry upload is disabled. Not uploading any logs.")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Set the execution context for all logs.
|
||||
for i := range l.logs {
|
||||
l.logs[i].Entry.DatabricksCliLog.ExecutionContext = &ec
|
||||
}
|
||||
|
||||
protoLogs := make([]string, len(l.logs))
|
||||
for i, log := range l.logs {
|
||||
b, err := json.Marshal(log)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal log: %s", err)
|
||||
}
|
||||
protoLogs[i] = string(b)
|
||||
}
|
||||
|
||||
apiClient, err := client.New(command.ConfigUsed(ctx))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create API client: %w", err)
|
||||
}
|
||||
|
||||
uploadTimeout := defaultUploadTimeout
|
||||
if v := env.Get(ctx, timeoutEnvVar); v != "" {
|
||||
timeout, err := strconv.ParseFloat(v, 64)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse timeout: %w", err)
|
||||
}
|
||||
|
||||
uploadTimeout = time.Duration(timeout * float64(time.Second))
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(ctx, uploadTimeout)
|
||||
defer cancel()
|
||||
|
||||
deadline, ok := ctx.Deadline()
|
||||
if !ok {
|
||||
log.Infof(ctx, "context has no deadline. This is unexpected. Not uploading telemetry logs")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Only try uploading logs for a maximum of 3 times.
|
||||
for i := range 3 {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return fmt.Errorf("uploading telemetry logs timed out: %w", ctx.Err())
|
||||
default:
|
||||
// proceed
|
||||
}
|
||||
|
||||
resp, err := attempt(ctx, apiClient, protoLogs)
|
||||
|
||||
// All logs were uploaded successfully.
|
||||
if err == nil && resp.NumProtoSuccess >= int64(len(protoLogs)) {
|
||||
log.Debugf(ctx, "All %d logs uploaded successfully", len(protoLogs))
|
||||
return nil
|
||||
}
|
||||
|
||||
// Partial success. Retry.
|
||||
if err == nil && resp.NumProtoSuccess < int64(len(protoLogs)) {
|
||||
log.Debugf(ctx, "Attempt %d was a partial success. Number of logs uploaded: %d out of %d", i+1, resp.NumProtoSuccess, len(protoLogs))
|
||||
|
||||
remainingTime := time.Until(deadline)
|
||||
if remainingTime < waitBetweenRetries {
|
||||
log.Debugf(ctx, "not enough time to retry before context deadline.")
|
||||
break
|
||||
}
|
||||
|
||||
time.Sleep(waitBetweenRetries)
|
||||
continue
|
||||
}
|
||||
|
||||
// Do not retry if the context deadline was exceeded. This means that our
|
||||
// timeout of three seconds was triggered and we should not try again.
|
||||
if errors.Is(err, context.DeadlineExceeded) {
|
||||
log.Debugf(ctx, "Attempt %d failed due to a timeout. Will not retry", i+1)
|
||||
return fmt.Errorf("uploading telemetry logs timed out: %w", err)
|
||||
}
|
||||
|
||||
// We retry for all 5xx responses. Note that the SDK only retries for 503 and 429
|
||||
// (as of 6th March 2025) so we need some additional logic here to retry for other
|
||||
// 5xx responses.
|
||||
// Note: We should never see a 503 or 429 here because the SDK default timeout
|
||||
// of 1 minute is more than the 3 second timeout we set above.
|
||||
//
|
||||
// SDK ref: https://github.com/databricks/databricks-sdk-go/blob/cdb28002afacb8b762348534a4c4040a9f19c24b/apierr/errors.go#L91
|
||||
//
|
||||
// The UI infra team (who owns the /telemetry-ext API) recommends retrying for
|
||||
// all 5xx responses.
|
||||
var apiErr *apierr.APIError
|
||||
if errors.As(err, &apiErr) && apiErr.StatusCode >= 500 {
|
||||
log.Infof(ctx, "Attempt %d failed due to a server side error. Retrying status code: %d", i+1, apiErr.StatusCode)
|
||||
|
||||
remainingTime := time.Until(deadline)
|
||||
if remainingTime < waitBetweenRetries {
|
||||
log.Debugf(ctx, "not enough time to retry before context deadline.")
|
||||
break
|
||||
}
|
||||
|
||||
time.Sleep(waitBetweenRetries)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
return errors.New("failed to upload telemetry logs after three attempts")
|
||||
}
|
||||
|
||||
func attempt(ctx context.Context, apiClient *client.DatabricksClient, protoLogs []string) (*ResponseBody, error) {
|
||||
resp := &ResponseBody{}
|
||||
err := apiClient.Do(ctx, http.MethodPost, "/telemetry-ext", nil, nil, RequestBody{
|
||||
UploadTime: time.Now().UnixMilli(),
|
||||
// There is a bug in the `/telemetry-ext` API which requires us to
|
||||
// send an empty array for the `Items` field. Otherwise the API returns
|
||||
// a 500.
|
||||
Items: []string{},
|
||||
ProtoLogs: protoLogs,
|
||||
}, resp)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(resp.Errors) > 0 {
|
||||
return nil, fmt.Errorf("uploading telemetry failed: %v", resp.Errors)
|
||||
}
|
||||
|
||||
return resp, nil
|
||||
}
|
|
@ -0,0 +1,155 @@
|
|||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/databricks/cli/libs/command"
|
||||
"github.com/databricks/cli/libs/telemetry/protos"
|
||||
"github.com/databricks/cli/libs/testserver"
|
||||
"github.com/databricks/databricks-sdk-go/apierr"
|
||||
"github.com/databricks/databricks-sdk-go/config"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestTelemetryUploadRetriesOnPartialSuccess(t *testing.T) {
|
||||
server := testserver.New(t)
|
||||
t.Cleanup(server.Close)
|
||||
|
||||
count := 0
|
||||
server.Handle("POST", "/telemetry-ext", func(req testserver.Request) any {
|
||||
count++
|
||||
if count == 1 {
|
||||
return ResponseBody{
|
||||
NumProtoSuccess: 1,
|
||||
}
|
||||
}
|
||||
if count == 2 {
|
||||
return ResponseBody{
|
||||
NumProtoSuccess: 2,
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
ctx := WithNewLogger(context.Background())
|
||||
|
||||
Log(ctx, protos.DatabricksCliLog{
|
||||
CliTestEvent: &protos.CliTestEvent{
|
||||
Name: protos.DummyCliEnumValue1,
|
||||
},
|
||||
})
|
||||
Log(ctx, protos.DatabricksCliLog{
|
||||
CliTestEvent: &protos.CliTestEvent{
|
||||
Name: protos.DummyCliEnumValue2,
|
||||
},
|
||||
})
|
||||
|
||||
ctx = command.SetConfigUsed(ctx, &config.Config{
|
||||
Host: server.URL,
|
||||
Token: "token",
|
||||
})
|
||||
|
||||
err := Upload(ctx, protos.ExecutionContext{})
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 2, count)
|
||||
}
|
||||
|
||||
func uploadRetriesFor(t *testing.T, statusCode int) {
|
||||
server := testserver.New(t)
|
||||
t.Cleanup(server.Close)
|
||||
|
||||
count := 0
|
||||
server.Handle("POST", "/telemetry-ext", func(req testserver.Request) any {
|
||||
count++
|
||||
if count == 1 {
|
||||
return testserver.Response{
|
||||
StatusCode: statusCode,
|
||||
Body: apierr.APIError{
|
||||
StatusCode: statusCode,
|
||||
Message: "Some error",
|
||||
},
|
||||
}
|
||||
}
|
||||
if count == 2 {
|
||||
return ResponseBody{
|
||||
NumProtoSuccess: 2,
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
t.Setenv("DATABRICKS_HOST", server.URL)
|
||||
t.Setenv("DATABRICKS_TOKEN", "token")
|
||||
|
||||
ctx := WithNewLogger(context.Background())
|
||||
|
||||
Log(ctx, protos.DatabricksCliLog{
|
||||
CliTestEvent: &protos.CliTestEvent{
|
||||
Name: protos.DummyCliEnumValue1,
|
||||
},
|
||||
})
|
||||
Log(ctx, protos.DatabricksCliLog{
|
||||
CliTestEvent: &protos.CliTestEvent{
|
||||
Name: protos.DummyCliEnumValue2,
|
||||
},
|
||||
})
|
||||
ctx = command.SetConfigUsed(ctx, &config.Config{
|
||||
Host: server.URL,
|
||||
Token: "token",
|
||||
})
|
||||
|
||||
err := Upload(ctx, protos.ExecutionContext{})
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 2, count)
|
||||
}
|
||||
|
||||
func TestTelemetryUploadRetriesForStatusCodes(t *testing.T) {
|
||||
// These retries happen in the CLI itself since the SDK does not automatically
|
||||
// retry for 5xx errors.
|
||||
uploadRetriesFor(t, 500)
|
||||
uploadRetriesFor(t, 504)
|
||||
|
||||
// These retries happen on the SDK layer.
|
||||
// ref: https://github.com/databricks/databricks-sdk-go/blob/cdb28002afacb8b762348534a4c4040a9f19c24b/apierr/errors.go#L91
|
||||
uploadRetriesFor(t, 503)
|
||||
uploadRetriesFor(t, 429)
|
||||
}
|
||||
|
||||
func TestTelemetryUploadMaxRetries(t *testing.T) {
|
||||
server := testserver.New(t)
|
||||
t.Cleanup(server.Close)
|
||||
count := 0
|
||||
|
||||
server.Handle("POST", "/telemetry-ext", func(req testserver.Request) any {
|
||||
count++
|
||||
return ResponseBody{
|
||||
NumProtoSuccess: 1,
|
||||
}
|
||||
})
|
||||
|
||||
t.Setenv("DATABRICKS_HOST", server.URL)
|
||||
t.Setenv("DATABRICKS_TOKEN", "token")
|
||||
ctx := WithNewLogger(context.Background())
|
||||
|
||||
Log(ctx, protos.DatabricksCliLog{
|
||||
CliTestEvent: &protos.CliTestEvent{
|
||||
Name: protos.DummyCliEnumValue1,
|
||||
},
|
||||
})
|
||||
Log(ctx, protos.DatabricksCliLog{
|
||||
CliTestEvent: &protos.CliTestEvent{
|
||||
Name: protos.DummyCliEnumValue2,
|
||||
},
|
||||
})
|
||||
|
||||
ctx = command.SetConfigUsed(ctx, &config.Config{
|
||||
Host: server.URL,
|
||||
Token: "token",
|
||||
})
|
||||
|
||||
err := Upload(ctx, protos.ExecutionContext{})
|
||||
assert.EqualError(t, err, "failed to upload telemetry logs after three attempts")
|
||||
assert.Equal(t, 3, count)
|
||||
}
|
Loading…
Reference in New Issue