Skip to content

Commit 7fa2819

Browse files
mayurkrclaude
andauthored
fix(remediation): honor RemediationPolicy.spec.dryRun (#93)
* fix(remediation): honor RemediationPolicy.spec.dryRun The CRD field spec.dryRun was documented as "generates fixes but does not create actual PRs" but the controller only logged the flag — it always called RemediationEngine.ApplyPlan, so setting dryRun=true still opened real GitOps PRs. The only reliable preview path was ZelyoConfig.spec.mode=audit at the operator level, which contradicts the per-policy contract. Gate ApplyPlan + ResolveIncident in processIncidents: when dryRun is true, generate the plan (so operators see fix count / risk in the log and a DryRunPreview event) but skip PR creation, leave the incident open for a later non-dry-run reconcile, and do not bump status.remediationsApplied. Adds an integration test in the controller envtest suite using a fake llm.Client and fake gitops.Engine: asserts CreatePullRequest is never called when dryRun=true, the seeded incident stays open, and the status counter stays at 0. A counter-case with dryRun=false exercises the same fakes to prove CreatePullRequest is called and the incident is resolved — this guards the dry-run assertion from passing via a broken test harness. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * refactor(remediation): extract gitops engine setup to cut cyclomatic complexity The dry-run gating added one branch to processIncidents, which pushed gocyclo from 15 to 16 and tripped the repo's threshold (15). Extract the PAT-token lookup + GitOps engine wiring into a new helper `maybeSetGitOpsEngineFromSecret` — a flat early-return style that reads better than the previous deeply-nested if/if/if block and drops 3 branches from processIncidents. Pure refactor, no behavior change. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * fix(remediation): cap LLM plan generation per cycle in dry-run mode Gemini PR review (PR #93) flagged that prsCreated never increments on the dryRun path, so a policy with N open incidents fires N LLM calls per reconcile regardless of spec.maxConcurrentPRs. On clusters with many correlated incidents that's unbounded LLM cost and reconcile- timeout risk. Introduce a separate `processed` counter that increments once per incident that makes it to the LLM call (before GeneratePlan, so plan-generation failures still count against the budget). Use it as the loop ceiling; keep prsCreated driving the status counter so status semantics are unchanged. Add a regression test seeding 5 incidents against maxConcurrentPRs=2 with dryRun=true and asserting the LLM is hit exactly 2 times and every incident stays open. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 2c8023d commit 7fa2819

2 files changed

Lines changed: 331 additions & 19 deletions

File tree

internal/controller/remediationpolicy_controller.go

Lines changed: 52 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -247,14 +247,24 @@ func (r *RemediationPolicyReconciler) processIncidents(
247247
}
248248
}
249249

250-
var prsCreated int32
250+
// prsCreated counts real PRs opened this cycle and drives the status
251+
// counter. processed counts every incident that consumed an LLM plan
252+
// generation — whether the outcome was a new PR or a dryRun preview.
253+
// The per-cycle budget must bound BOTH paths: without this, a policy
254+
// with N open incidents and dryRun=true would fire N LLM calls per
255+
// reconcile regardless of maxConcurrentPRs, burning tokens and pushing
256+
// the reconcile toward its timeout.
257+
var prsCreated, processed int32
251258
for _, incident := range incidents {
252-
if prsCreated >= maxPRs {
253-
log.Info("MaxConcurrentPRs limit reached", "limit", maxPRs)
259+
if processed >= maxPRs {
260+
log.Info("MaxConcurrentPRs limit reached", "limit", maxPRs, "dryRun", policy.Spec.DryRun)
254261
break
255262
}
256-
opened := r.remediateIncident(ctx, policy, repo, incident,
263+
opened, charged := r.remediateIncident(ctx, policy, repo, incident,
257264
minSev, repoOwner, repoName, existingBranches)
265+
if charged {
266+
processed++
267+
}
258268
if opened {
259269
prsCreated++
260270
}
@@ -264,10 +274,16 @@ func (r *RemediationPolicyReconciler) processIncidents(
264274
}
265275

266276
// remediateIncident handles the full severity-check → dedup →
267-
// GeneratePlan → ApplyPlan → resolve flow for a single incident. Returns
268-
// true if a new PR was opened (so the caller can tally against
269-
// MaxConcurrentPRs). Factored out of processIncidents to keep each unit
270-
// under the gocyclo threshold.
277+
// GeneratePlan → (dry-run preview | ApplyPlan) → resolve flow for a single
278+
// incident. Factored out of processIncidents to keep each unit under the
279+
// gocyclo threshold.
280+
//
281+
// Returns two flags so the caller can drive independent counters:
282+
// - opened: a real PR was created (counts against status.RemediationsApplied)
283+
// - charged: this incident consumed an LLM plan generation (counts against
284+
// the per-cycle MaxConcurrentPRs budget — covers both real PRs and
285+
// dryRun previews, but NOT incidents skipped by severity or dedup since
286+
// no LLM call is made)
271287
func (r *RemediationPolicyReconciler) remediateIncident(
272288
ctx context.Context,
273289
policy *zelyov1alpha1.RemediationPolicy,
@@ -276,13 +292,13 @@ func (r *RemediationPolicyReconciler) remediateIncident(
276292
minSev int,
277293
repoOwner, repoName string,
278294
existingBranches map[string]string,
279-
) bool {
295+
) (opened, charged bool) {
280296
log := logf.FromContext(ctx)
281297

282298
// Severity filter.
283299
incSev, ok := severityOrder[incident.Severity]
284300
if !ok || incSev > minSev {
285-
return false
301+
return false, false
286302
}
287303

288304
finding := incidentToFinding(incident)
@@ -293,12 +309,17 @@ func (r *RemediationPolicyReconciler) remediateIncident(
293309
branch := gitops.BranchName(finding.ResourceName, finding.ResourceNamespace, finding.Title)
294310
if existingURL, exists := existingBranches[branch]; exists {
295311
log.Info("Skipping remediation — open PR already exists",
296-
"incident", incident.ID, "branch", branch, "prURL", existingURL)
297-
// Still mark the incident resolved so we don't loop on it; a
298-
// future scan will regenerate the incident if the PR is closed
299-
// without merging and the finding remains.
300-
r.CorrelatorEngine.ResolveIncident(incident.ID)
301-
return false
312+
"incident", incident.ID, "branch", branch, "prURL", existingURL,
313+
"dryRun", policy.Spec.DryRun)
314+
// In a real reconcile, mark the incident resolved so we don't
315+
// loop on it; a future scan will regenerate the incident if the
316+
// PR is closed without merging and the finding remains. In
317+
// dryRun we must NOT touch correlator state — leave it for the
318+
// next non-dryRun reconcile.
319+
if !policy.Spec.DryRun {
320+
r.CorrelatorEngine.ResolveIncident(incident.ID)
321+
}
322+
return false, false
302323
}
303324

304325
plan, err := r.RemediationEngine.GeneratePlan(ctx, finding, repo.Spec.Paths)
@@ -308,7 +329,8 @@ func (r *RemediationPolicyReconciler) remediateIncident(
308329
"resource", fmt.Sprintf("%s/%s", incident.Namespace, incident.Resource))
309330
r.Recorder.Event(policy, corev1.EventTypeWarning, zelyov1alpha1.EventReasonReconcileError,
310331
fmt.Sprintf("LLM plan generation failed for incident %s: %v", incident.ID, err))
311-
return false
332+
// Still counts against the budget — the LLM call was made.
333+
return false, true
312334
}
313335

314336
log.Info("Generated remediation plan",
@@ -317,13 +339,24 @@ func (r *RemediationPolicyReconciler) remediateIncident(
317339
"riskScore", plan.RiskScore,
318340
"dryRun", policy.Spec.DryRun)
319341

342+
// spec.dryRun is a per-policy preview switch: generate the plan so
343+
// operators can review fix count / risk, but do not submit a PR and
344+
// do not resolve the incident — a later reconcile with dryRun=false
345+
// should still pick it up and remediate.
346+
if policy.Spec.DryRun {
347+
r.Recorder.Event(policy, corev1.EventTypeNormal, "DryRunPreview",
348+
fmt.Sprintf("Dry-run: would remediate incident %s (fixes=%d, risk=%d) — no PR opened",
349+
incident.ID, len(plan.Fixes), plan.RiskScore))
350+
return false, true
351+
}
352+
320353
result, err := r.RemediationEngine.ApplyPlan(ctx, plan, repoOwner, repoName)
321354
if err != nil {
322355
log.Error(err, "Failed to apply remediation plan",
323356
"incident", incident.ID)
324357
r.Recorder.Event(policy, corev1.EventTypeWarning, zelyov1alpha1.EventReasonReconcileError,
325358
fmt.Sprintf("Failed to apply fix for incident %s: %v", incident.ID, err))
326-
return false
359+
return false, true
327360
}
328361

329362
if result != nil {
@@ -337,7 +370,7 @@ func (r *RemediationPolicyReconciler) remediateIncident(
337370
}
338371

339372
r.CorrelatorEngine.ResolveIncident(incident.ID)
340-
return true
373+
return true, true
341374
}
342375

343376
// incidentToFinding converts a correlator incident to a scanner finding for the

0 commit comments

Comments
 (0)