Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: do not delete workspace pod on authz errors #805

Merged
merged 3 commits into from
Feb 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ CHANGELOG
- Enable deleting workspace pod after a successful sync by exposing `WorkspaceReclaimPolicy` [#804](https://github.com/pulumi/pulumi-kubernetes-operator/pull/804)
- Surface Update failures back to the Stack object status [#807](https://github.com/pulumi/pulumi-kubernetes-operator/pull/807)
- Surface update conflict errors when a stack is locked [#807](https://github.com/pulumi/pulumi-kubernetes-operator/pull/807)
- Do not destroy the workspace pod if an authentication error occurs [#805](https://github.com/pulumi/pulumi-kubernetes-operator/pull/805)

## 2.0.0-beta.3 (2024-11-27)

Expand Down
5 changes: 5 additions & 0 deletions agent/pkg/server/pulumi_errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ var knownErrors = knownPulumiErrors{
Reason: "UpdateConflict",
Code: 409,
},
"invalid access token": {
Message: "Invalid access token used to authenticate with Pulumi Cloud",
Reason: "InvalidAccessToken",
Code: 401,
},
}

// withPulumiErrorInfo iterates over known errors and checks if the provided error matches any of them.
Expand Down
4 changes: 3 additions & 1 deletion agent/pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,9 @@ func (s *Server) Cancel() {
func (s *Server) WhoAmI(ctx context.Context, in *pb.WhoAmIRequest) (*pb.WhoAmIResult, error) {
whoami, err := s.ws.WhoAmIDetails(ctx)
if err != nil {
return nil, err
s.log.Errorw("whoami completed with an error", zap.Error(err))
st := status.Newf(codes.Unknown, "whoami failed: %v", err)
return nil, withPulumiErrorInfo(st, err).Err()
}
resp := &pb.WhoAmIResult{
User: whoami.User,
Expand Down
31 changes: 31 additions & 0 deletions operator/e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,37 @@ func TestE2E(t *testing.T) {
assert.NotContains(t, stack.Status.Outputs, "notTargeted")
},
},
{
name: "random-yaml-auth-error",
f: func(t *testing.T) {
t.Parallel()

cmd := exec.Command("kubectl", "apply", "-f", "e2e/testdata/random-yaml-auth-error")
require.NoError(t, run(cmd))
dumpLogs(t, "random-yaml-auth-error", "pod/random-yaml-auth-error-workspace-0")

// Wait for the Workspace pod to be created, so that we can watch/wait on the Workspace object.
retryUntil(t, 30*time.Second, true, func() bool {
found, err := foundEvent("Pod", "random-yaml-auth-error-workspace-0", "random-yaml-auth-error", "Created")
assert.NoError(t, err)
return found
})

// Ensure the Workspace is in a failed state with Unauthenticated.
_, err := waitFor[pulumiv1.Stack](
"workspaces/random-yaml-auth-error",
"random-yaml-auth-error",
5*time.Minute,
`jsonpath={.status.conditions[?(@.type=="Ready")].reason}=Unauthenticated`)
assert.NoError(t, err)

// Ensure that the workspace pod was not deleted after reconciling the failed stack.
time.Sleep(10 * time.Second)
found, err := foundEvent("Pod", "random-yaml-auth-error-workspace-0", "random-yaml-auth-error", "Killing")
assert.NoError(t, err)
assert.False(t, found)
},
},
}

for _, tt := range tests {
Expand Down
89 changes: 89 additions & 0 deletions operator/e2e/testdata/random-yaml-auth-error/manifests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
---
# This NetworkPolicy allows ingress traffic to the source-controller pods
# from specific namespaces and pods managed by pulumi-kubernetes-operator.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-random-yaml-auth-error-fetch
  namespace: flux-system
spec:
  podSelector:
    matchLabels:
      app: source-controller
  ingress:
    - ports:
        - protocol: TCP
          port: http
      from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: random-yaml-auth-error
        - podSelector:
            matchLabels:
              app.kubernetes.io/managed-by: pulumi-kubernetes-operator
              app.kubernetes.io/name: pulumi
              app.kubernetes.io/component: workspace
  policyTypes:
    - Ingress
---
# Namespace to isolate the random-yaml-auth-error test.
apiVersion: v1
kind: Namespace
metadata:
  name: random-yaml-auth-error
---
# ServiceAccount for the random-yaml-auth-error namespace.
# No permissions are granted to this service account.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: random-yaml-auth-error
  namespace: random-yaml-auth-error
---
# Define a Flux Source GitRepository object for syncing Pulumi examples from a GitHub repository
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: pulumi-examples
  namespace: random-yaml-auth-error
spec:
  interval: 10m
  ref:
    branch: master
  timeout: 60s
  url: https://github.com/pulumi/examples
---
apiVersion: pulumi.com/v1
kind: Stack
metadata:
  name: random-yaml-auth-error
  namespace: random-yaml-auth-error
spec:
  fluxSource:
    sourceRef:
      apiVersion: source.toolkit.fluxcd.io/v1
      kind: GitRepository
      name: pulumi-examples
    dir: random-yaml
  stack: dev
  refresh: false
  continueResyncOnCommitMatch: false
  resyncFrequencySeconds: 60
  destroyOnFinalize: true
  # Enable file state for testing.
  envRefs:
    PULUMI_BACKEND_URL:
      type: Literal
      literal:
        value: "file:///state/"
    PULUMI_CONFIG_PASSPHRASE:
      type: Literal
      literal:
        value: "test"
  workspaceTemplate:
    spec:
      serviceAccountName: random-yaml-auth-error
      podTemplate:
        spec:
          containers:
            - name: pulumi
26 changes: 26 additions & 0 deletions operator/internal/controller/auto/workspace_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
autov1alpha1 "github.com/pulumi/pulumi-kubernetes-operator/v2/operator/api/auto/v1alpha1"
autov1alpha1webhook "github.com/pulumi/pulumi-kubernetes-operator/v2/operator/internal/webhook/auto/v1alpha1"
"github.com/pulumi/pulumi-kubernetes-operator/v2/operator/version"
"google.golang.org/grpc/status"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/meta"
Expand Down Expand Up @@ -242,6 +243,31 @@ func (r *WorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
initializedV, ok := pod.Annotations[PodAnnotationInitialized]
initialized, _ := strconv.ParseBool(initializedV)
if !ok || !initialized {
l.Info("Running whoami to ensure authentication is setup correctly with the workspace pod")
_, err = wc.WhoAmI(ctx, &agentpb.WhoAmIRequest{})
if err != nil {
l.Error(err, "unable to run whoami; retaining the workspace pod to retry later")
st := status.Convert(err)

ready.Status = metav1.ConditionFalse
ready.Reason = st.Code().String()
ready.Message = st.Message()

// Override with structured error from PulumiErrorInfo if provided.
if len(st.Details()) > 0 {
if info, ok := st.Details()[0].(*agentpb.PulumiErrorInfo); ok {
ready.Reason = info.Reason
ready.Message = info.Message
}
}

if statusErr := updateStatus(); statusErr != nil {
return ctrl.Result{}, statusErr
}

return ctrl.Result{}, err
}

l.Info("Running pulumi install")
ready.Status = metav1.ConditionFalse
ready.Reason = "Installing"
Expand Down
Loading