Skip to content

Commit

Permalink
count unknown reason for failed pods as system errors (#667)
Browse files Browse the repository at this point in the history
* count unknown errors for failed pods as system errors

Signed-off-by: Paul Dittamo <[email protected]>

* unit test

Signed-off-by: Paul Dittamo <[email protected]>

* update comment

Signed-off-by: Paul Dittamo <[email protected]>

---------

Signed-off-by: Paul Dittamo <[email protected]>
  • Loading branch information
pvditt committed Feb 21, 2025
1 parent e6e8b91 commit 3c474be
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 3 deletions.
7 changes: 7 additions & 0 deletions flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -1157,6 +1157,13 @@ func DemystifyFailure(ctx context.Context, status v1.PodStatus, info pluginsCore
}
}

// If the code is still 'UnknownError' at this point, it indicates that the kubelet did not have a
// chance to record a more specific failure before the node was terminated or preempted.
// In such cases, we classify the error as system-level, accepting that some failures
// may be misclassified as system errors (false positives).
if code == "UnknownError" {
isSystemError = true
}

if isSystemError {
logger.Warnf(ctx, "Pod failed with a system error. Code: %s, Message: %s", code, message)
return pluginsCore.PhaseInfoSystemRetryableFailure(Interrupted, message, &info), nil
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1627,8 +1627,8 @@ func TestDemystifyFailure(t *testing.T) {
phaseInfo, err := DemystifyFailure(ctx, v1.PodStatus{}, pluginsCore.TaskInfo{}, "")
assert.Nil(t, err)
assert.Equal(t, pluginsCore.PhaseRetryableFailure, phaseInfo.Phase())
assert.Equal(t, "UnknownError", phaseInfo.Err().GetCode())
assert.Equal(t, core.ExecutionError_USER, phaseInfo.Err().GetKind())
assert.Equal(t, "Interrupted", phaseInfo.Err().Code)
assert.Equal(t, core.ExecutionError_SYSTEM, phaseInfo.Err().Kind)
})

t.Run("known-error", func(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion flyteplugins/go/tasks/plugins/k8s/pod/container_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,7 @@ func TestContainerTaskExecutor_GetTaskStatus(t *testing.T) {
assert.NoError(t, err)
assert.Equal(t, pluginsCore.PhaseRetryableFailure, phaseInfo.Phase())
ec := phaseInfo.Err().GetCode()
assert.Equal(t, "UnknownError", ec)
assert.Equal(t, "Interrupted", ec)
})

t.Run("failConditionUnschedulable", func(t *testing.T) {
Expand Down

0 comments on commit 3c474be

Please sign in to comment.