diff --git a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go index db7d63de5e..307c288c99 100644 --- a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go +++ b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go @@ -1157,6 +1157,13 @@ func DemystifyFailure(ctx context.Context, status v1.PodStatus, info pluginsCore } } + // If the code remains 'UnknownError', it indicates that the kubelet did not have a chance + // to record a more specific failure before the node was terminated or preempted. + // In such cases, we classify the error as system-level and accept false positives + if code == "UnknownError" { + isSystemError = true + } + if isSystemError { logger.Warnf(ctx, "Pod failed with a system error. Code: %s, Message: %s", code, message) return pluginsCore.PhaseInfoSystemRetryableFailure(Interrupted, message, &info), nil diff --git a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go index ab823ac5a0..74f4b307f7 100644 --- a/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go +++ b/flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go @@ -1627,8 +1627,8 @@ func TestDemystifyFailure(t *testing.T) { phaseInfo, err := DemystifyFailure(ctx, v1.PodStatus{}, pluginsCore.TaskInfo{}, "") assert.Nil(t, err) assert.Equal(t, pluginsCore.PhaseRetryableFailure, phaseInfo.Phase()) - assert.Equal(t, "UnknownError", phaseInfo.Err().GetCode()) - assert.Equal(t, core.ExecutionError_USER, phaseInfo.Err().GetKind()) + assert.Equal(t, "Interrupted", phaseInfo.Err().Code) + assert.Equal(t, core.ExecutionError_SYSTEM, phaseInfo.Err().Kind) }) t.Run("known-error", func(t *testing.T) { diff --git a/flyteplugins/go/tasks/plugins/k8s/pod/container_test.go b/flyteplugins/go/tasks/plugins/k8s/pod/container_test.go index 39a98b3862..dc604e3587 100644 --- a/flyteplugins/go/tasks/plugins/k8s/pod/container_test.go +++ b/flyteplugins/go/tasks/plugins/k8s/pod/container_test.go @@ -446,7 +446,7 @@ func TestContainerTaskExecutor_GetTaskStatus(t *testing.T) { assert.NoError(t, err) assert.Equal(t, pluginsCore.PhaseRetryableFailure, phaseInfo.Phase()) ec := phaseInfo.Err().GetCode() - assert.Equal(t, "UnknownError", ec) + assert.Equal(t, "Interrupted", ec) }) t.Run("failConditionUnschedulable", func(t *testing.T) {