From 4d6253b108b35a2be6733210790bc97e27f61f12 Mon Sep 17 00:00:00 2001 From: "Nathan A. Ferch" Date: Wed, 4 Feb 2026 17:37:35 -0500 Subject: [PATCH 1/2] feat: stop emitting metrics for closed and stale PRs --- cmd/server.go | 23 ++ cmd/server_test.go | 1 + runatlantis.io/docs/server-configuration.md | 10 + .../controllers/events/events_controller.go | 10 +- server/events/command/project_context.go | 9 +- .../instrumented_project_command_runner.go | 23 +- .../events/project_command_context_builder.go | 5 - server/events/pull_closed_executor.go | 15 + .../events/vcs/common/instrumented_client.go | 38 +- .../events/vcs/github/instrumented_client.go | 14 +- server/metrics/pr_scope_manager.go | 342 +++++++++++++++ .../metrics/pr_scope_manager_inactive_test.go | 172 ++++++++ .../pr_scope_manager_integration_test.go | 196 +++++++++ .../pr_scope_manager_prom_deletion_test.go | 391 ++++++++++++++++++ server/metrics/pr_scope_manager_test.go | 209 ++++++++++ server/server.go | 44 +- server/server_test.go | 39 +- server/user_config.go | 1 + 18 files changed, 1481 insertions(+), 61 deletions(-) create mode 100644 server/metrics/pr_scope_manager.go create mode 100644 server/metrics/pr_scope_manager_inactive_test.go create mode 100644 server/metrics/pr_scope_manager_integration_test.go create mode 100644 server/metrics/pr_scope_manager_prom_deletion_test.go create mode 100644 server/metrics/pr_scope_manager_test.go diff --git a/cmd/server.go b/cmd/server.go index c0d5db4f8f..36d0852ea6 100644 --- a/cmd/server.go +++ b/cmd/server.go @@ -20,6 +20,7 @@ import ( "path/filepath" "slices" "strings" + "time" homedir "github.com/mitchellh/go-homedir" "github.com/moby/patternmatcher" @@ -126,6 +127,7 @@ const ( ParallelPoolSize = "parallel-pool-size" PendingApplyStatusFlag = "pending-apply-status" StatsNamespace = "stats-namespace" + MetricsInactivePRRetention = "metrics-inactive-pr-retention" AllowDraftPRs = "allow-draft-prs" PortFlag = "port" RedisDB = "redis-db" @@ -188,6 +190,7 @@ const ( DefaultMaxCommentsPerCommand = 100 DefaultParallelPoolSize = 15 DefaultStatsNamespace = "atlantis" + DefaultMetricsInactivePRRetention = "24h" DefaultPort = 4141 DefaultRedisDB = 0 DefaultRedisPort = 6379 @@ -414,6 +417,10 @@ var stringFlags = map[string]stringFlag{ description: "Namespace for aggregating stats.", defaultValue: DefaultStatsNamespace, }, + MetricsInactivePRRetention: { + description: "Duration to retain metrics for inactive PRs before cleanup (e.g., '24h', '168h', '7d'). Cleanup runs at this same frequency. Set to 0 to disable cleanup.", + defaultValue: DefaultMetricsInactivePRRetention, + }, RedisHost: { description: "The Redis Hostname for when using a Locking DB type of 'redis'.", }, @@ -956,6 +963,9 @@ func (s *ServerCmd) setDefaults(c *server.UserConfig, v *viper.Viper) { if c.StatsNamespace == "" { c.StatsNamespace = DefaultStatsNamespace } + if c.MetricsInactivePRRetention == "" { + c.MetricsInactivePRRetention = DefaultMetricsInactivePRRetention + } if c.Port == 0 { c.Port = DefaultPort } @@ -1005,6 +1015,19 @@ func (s *ServerCmd) validate(userConfig server.UserConfig) error { TFDistributionTerraform, TFDistributionOpenTofu) } + if userConfig.MetricsInactivePRRetention != "" && userConfig.MetricsInactivePRRetention != "0" { + retention, err := time.ParseDuration(userConfig.MetricsInactivePRRetention) + if err != nil { + return fmt.Errorf("--%s must be a valid duration (e.g., '24h', '168h', '7d'): %w", MetricsInactivePRRetention, err) + } + if retention < 0 { + return fmt.Errorf("--%s must be positive", MetricsInactivePRRetention) + } + if retention > 30*24*time.Hour { + return fmt.Errorf("--%s must be <= 30 days", MetricsInactivePRRetention) + } + } + checkoutStrategy := userConfig.CheckoutStrategy if checkoutStrategy != CheckoutStrategyBranch && checkoutStrategy != CheckoutStrategyMerge { return fmt.Errorf("invalid checkout strategy: not one of %s or %s", diff --git a/cmd/server_test.go b/cmd/server_test.go index d8c2f6e5d9..e0c6a190b5 100644 --- a/cmd/server_test.go +++ b/cmd/server_test.go @@ -118,6 +118,7 @@ var testFlags = map[string]any{ LogLevelFlag: "debug", MarkdownTemplateOverridesDirFlag: "/path2", MaxCommentsPerCommand: 10, + MetricsInactivePRRetention: "72h", StatsNamespace: "atlantis", AllowDraftPRs: true, PortFlag: 8181, diff --git a/runatlantis.io/docs/server-configuration.md b/runatlantis.io/docs/server-configuration.md index bc916e5286..76840420a8 100644 --- a/runatlantis.io/docs/server-configuration.md +++ b/runatlantis.io/docs/server-configuration.md @@ -1047,6 +1047,16 @@ ATLANTIS_MAX_COMMENTS_PER_COMMAND=100 Limit the number of comments published after a command is executed, to prevent spamming your VCS and Atlantis to get throttled as a result. Defaults to `100`. Set this option to `0` to disable log truncation. Note that the truncation will happen on the top of the command output, to preserve the most important parts of the output, often displayed at the end. +### `--metrics-inactive-pr-retention` + +```bash +atlantis server --metrics-inactive-pr-retention=72h +# or +ATLANTIS_METRICS_INACTIVE_PR_RETENTION=72h +``` + +After the duration specified, Atlantis will stop reporting metrics for inactive pull requests. + ### `--parallel-apply` ```bash diff --git a/server/controllers/events/events_controller.go b/server/controllers/events/events_controller.go index a4aefbe3c7..8c52d916b9 100644 --- a/server/controllers/events/events_controller.go +++ b/server/controllers/events/events_controller.go @@ -196,11 +196,17 @@ func (e *VCSEventsController) handleGithubPost(w http.ResponseWriter, r *http.Re case *github.IssueCommentEvent: resp = e.HandleGithubCommentEvent(event, githubReqID, logger) scope = scope.SubScope(fmt.Sprintf("comment_%s", *event.Action)) - scope = common.SetGitScopeTags(scope, event.GetRepo().GetFullName(), event.GetIssue().GetNumber()) + scope = scope.Tagged(map[string]string{ + "base_repo": event.GetRepo().GetFullName(), + "pr_number": strconv.Itoa(event.GetIssue().GetNumber()), + }) case *github.PullRequestEvent: resp = e.HandleGithubPullRequestEvent(logger, event, githubReqID) scope = scope.SubScope(fmt.Sprintf("pr_%s", *event.Action)) - scope = common.SetGitScopeTags(scope, event.GetRepo().GetFullName(), event.GetNumber()) + scope = scope.Tagged(map[string]string{ + "base_repo": event.GetRepo().GetFullName(), + "pr_number": strconv.Itoa(event.GetNumber()), + }) default: resp = HTTPResponse{ body: fmt.Sprintf("Ignoring unsupported event %s", githubReqID), diff --git a/server/events/command/project_context.go b/server/events/command/project_context.go index a15a87d91b..dec3b05208 100644 --- a/server/events/command/project_context.go +++ b/server/events/command/project_context.go @@ -12,6 +12,7 @@ import ( "github.com/runatlantis/atlantis/server/core/config/valid" "github.com/runatlantis/atlantis/server/events/models" "github.com/runatlantis/atlantis/server/logging" + "github.com/runatlantis/atlantis/server/metrics" tally "github.com/uber-go/tally/v4" ) @@ -139,8 +140,9 @@ type ProjectContext struct { TeamAllowlistChecker TeamAllowlistChecker } -// SetProjectScopeTags adds ProjectContext tags to a new returned scope. -func (p ProjectContext) SetProjectScopeTags(scope tally.Scope) tally.Scope { +// SetProjectScopeTags sets project-specific tags on a scope using the PR scope manager. +// Creates a closeable PR-specific root scope with project-level tags. +func (p ProjectContext) SetProjectScopeTags(prScopeManager *metrics.PRScopeManager) tally.Scope { v := "" if p.TerraformVersion != nil { v = p.TerraformVersion.String() @@ -155,7 +157,8 @@ func (p ProjectContext) SetProjectScopeTags(scope tally.Scope) tally.Scope { Workspace: p.Workspace, } - return scope.Tagged(tags.Loadtags()) + // Use PR scope manager to create closeable PR-specific root scope + return prScopeManager.GetOrCreatePRScope(p.BaseRepo.FullName, p.Pull.Num, tags.Loadtags()) } // GetShowResultFileName returns the filename (not the path) to store the tf show result diff --git a/server/events/instrumented_project_command_runner.go b/server/events/instrumented_project_command_runner.go index 468ed79dc4..ed3a14f8d0 100644 --- a/server/events/instrumented_project_command_runner.go +++ b/server/events/instrumented_project_command_runner.go @@ -20,10 +20,11 @@ type IntrumentedCommandRunner interface { type InstrumentedProjectCommandRunner struct { projectCommandRunner ProjectCommandRunner - scope tally.Scope + prScopeManager *metrics.PRScopeManager + scope tally.Scope // fallback scope if PRScopeManager is nil } -func NewInstrumentedProjectCommandRunner(scope tally.Scope, projectCommandRunner ProjectCommandRunner) *InstrumentedProjectCommandRunner { +func NewInstrumentedProjectCommandRunner(scope tally.Scope, projectCommandRunner ProjectCommandRunner, prScopeManager *metrics.PRScopeManager) *InstrumentedProjectCommandRunner { projectTags := command.ProjectScopeTags{} scope = scope.SubScope("project").Tagged(projectTags.Loadtags()) @@ -33,38 +34,40 @@ func NewInstrumentedProjectCommandRunner(scope tally.Scope, projectCommandRunner return &InstrumentedProjectCommandRunner{ projectCommandRunner: projectCommandRunner, + prScopeManager: prScopeManager, scope: scope, } } func (p *InstrumentedProjectCommandRunner) Plan(ctx command.ProjectContext) command.ProjectCommandOutput { - return RunAndEmitStats(ctx, p.projectCommandRunner.Plan, p.scope) + return RunAndEmitStats(ctx, p.projectCommandRunner.Plan, p.prScopeManager, p.scope) } func (p *InstrumentedProjectCommandRunner) PolicyCheck(ctx command.ProjectContext) command.ProjectCommandOutput { - return RunAndEmitStats(ctx, p.projectCommandRunner.PolicyCheck, p.scope) + return RunAndEmitStats(ctx, p.projectCommandRunner.PolicyCheck, p.prScopeManager, p.scope) } func (p *InstrumentedProjectCommandRunner) Apply(ctx command.ProjectContext) command.ProjectCommandOutput { - return RunAndEmitStats(ctx, p.projectCommandRunner.Apply, p.scope) + return RunAndEmitStats(ctx, p.projectCommandRunner.Apply, p.prScopeManager, p.scope) } func (p *InstrumentedProjectCommandRunner) ApprovePolicies(ctx command.ProjectContext) command.ProjectCommandOutput { - return RunAndEmitStats(ctx, p.projectCommandRunner.ApprovePolicies, p.scope) + return RunAndEmitStats(ctx, p.projectCommandRunner.ApprovePolicies, p.prScopeManager, p.scope) } func (p *InstrumentedProjectCommandRunner) Import(ctx command.ProjectContext) command.ProjectCommandOutput { - return RunAndEmitStats(ctx, p.projectCommandRunner.Import, p.scope) + return RunAndEmitStats(ctx, p.projectCommandRunner.Import, p.prScopeManager, p.scope) } func (p *InstrumentedProjectCommandRunner) StateRm(ctx command.ProjectContext) command.ProjectCommandOutput { - return RunAndEmitStats(ctx, p.projectCommandRunner.StateRm, p.scope) + return RunAndEmitStats(ctx, p.projectCommandRunner.StateRm, p.prScopeManager, p.scope) } -func RunAndEmitStats(ctx command.ProjectContext, execute func(ctx command.ProjectContext) command.ProjectCommandOutput, scope tally.Scope) command.ProjectCommandOutput { +func RunAndEmitStats(ctx command.ProjectContext, execute func(ctx command.ProjectContext) command.ProjectCommandOutput, prScopeManager *metrics.PRScopeManager, fallbackScope tally.Scope) command.ProjectCommandOutput { commandName := ctx.CommandName.String() // ensures we are differentiating between project level command and overall command - scope = ctx.SetProjectScopeTags(scope).SubScope(commandName) + + scope := ctx.SetProjectScopeTags(prScopeManager).SubScope(commandName) logger := ctx.Log executionTime := scope.Timer(metrics.ExecutionTimeMetric).Start() diff --git a/server/events/project_command_context_builder.go b/server/events/project_command_context_builder.go index 255b76d773..56bfa346ca 100644 --- a/server/events/project_command_context_builder.go +++ b/server/events/project_command_context_builder.go @@ -74,11 +74,6 @@ func (cb *CommandScopedStatsProjectCommandContextBuilder) BuildProjectContext( projectCmds = []command.ProjectContext{} for _, cmd := range cmds { - - // specifically use the command name in the context instead of the arg - // since we can return multiple commands worth of contexts for a given command name arg - // to effectively pipeline them. - cmd.Scope = cmd.SetProjectScopeTags(cmd.Scope) projectCmds = append(projectCmds, cmd) } diff --git a/server/events/pull_closed_executor.go b/server/events/pull_closed_executor.go index c1f3a38697..4b3848719d 100644 --- a/server/events/pull_closed_executor.go +++ b/server/events/pull_closed_executor.go @@ -46,6 +46,16 @@ type PullCleaner interface { CleanUpPull(logger logging.SimpleLogging, repo models.Repo, pull models.PullRequest) error } +//go:generate pegomock generate github.com/runatlantis/atlantis/server/events --package mocks -o mocks/mock_scope_cleaner.go ScopeCleaner + +// ScopeCleaner tracks and cleans up metric scopes for closed PRs. +type ScopeCleaner interface { + // MarkPRClosed marks a PR as closed for metric cleanup. + MarkPRClosed(repoFullName string, pullNum int) + // CleanupStaleMetrics closes scopes that have exceeded the retention period. + CleanupStaleMetrics() int +} + // PullClosedExecutor executes the tasks required to clean up a closed pull // request. type PullClosedExecutor struct { @@ -56,6 +66,7 @@ type PullClosedExecutor struct { PullClosedTemplate PullCleanupTemplate LogStreamResourceCleaner ResourceCleaner CancellationTracker CancellationTracker + ScopeCleaner ScopeCleaner } type templatedProject struct { @@ -122,6 +133,10 @@ func (p *PullClosedExecutor) CleanUpPull(logger logging.SimpleLogging, repo mode p.CancellationTracker.Clear(pull) } + if p.ScopeCleaner != nil { + p.ScopeCleaner.MarkPRClosed(repo.FullName, pull.Num) + } + // If there are no locks then there's no need to comment. if len(locks) == 0 { return nil diff --git a/server/events/vcs/common/instrumented_client.go b/server/events/vcs/common/instrumented_client.go index 199c2f8d17..6d1905364f 100644 --- a/server/events/vcs/common/instrumented_client.go +++ b/server/events/vcs/common/instrumented_client.go @@ -15,13 +15,13 @@ import ( type InstrumentedClient struct { vcs.Client - StatsScope tally.Scope - Logger logging.SimpleLogging + StatsScope tally.Scope + PRScopeManager *metrics.PRScopeManager + Logger logging.SimpleLogging } func (c *InstrumentedClient) GetModifiedFiles(logger logging.SimpleLogging, repo models.Repo, pull models.PullRequest) ([]string, error) { - scope := c.StatsScope.SubScope("get_modified_files") - scope = SetGitScopeTags(scope, repo.FullName, pull.Num) + scope := SetGitScopeTags(c.PRScopeManager, repo.FullName, pull.Num).SubScope("get_modified_files") executionTime := scope.Timer(metrics.ExecutionTimeMetric).Start() defer executionTime.Stop() @@ -42,8 +42,7 @@ func (c *InstrumentedClient) GetModifiedFiles(logger logging.SimpleLogging, repo } func (c *InstrumentedClient) CreateComment(logger logging.SimpleLogging, repo models.Repo, pullNum int, comment string, command string) error { - scope := c.StatsScope.SubScope("create_comment") - scope = SetGitScopeTags(scope, repo.FullName, pullNum) + scope := SetGitScopeTags(c.PRScopeManager, repo.FullName, pullNum).SubScope("create_comment") executionTime := scope.Timer(metrics.ExecutionTimeMetric).Start() defer executionTime.Stop() @@ -62,7 +61,7 @@ func (c *InstrumentedClient) CreateComment(logger logging.SimpleLogging, repo mo } func (c *InstrumentedClient) ReactToComment(logger logging.SimpleLogging, repo models.Repo, pullNum int, commentID int64, reaction string) error { - scope := c.StatsScope.SubScope("react_to_comment") + scope := SetGitScopeTags(c.PRScopeManager, repo.FullName, pullNum).SubScope("react_to_comment") executionTime := scope.Timer(metrics.ExecutionTimeMetric).Start() defer executionTime.Stop() @@ -81,8 +80,7 @@ func (c *InstrumentedClient) ReactToComment(logger logging.SimpleLogging, repo m } func (c *InstrumentedClient) HidePrevCommandComments(logger logging.SimpleLogging, repo models.Repo, pullNum int, command string, dir string) error { - scope := c.StatsScope.SubScope("hide_prev_plan_comments") - scope = SetGitScopeTags(scope, repo.FullName, pullNum) + scope := SetGitScopeTags(c.PRScopeManager, repo.FullName, pullNum).SubScope("hide_prev_plan_comments") executionTime := scope.Timer(metrics.ExecutionTimeMetric).Start() defer executionTime.Stop() @@ -102,8 +100,7 @@ func (c *InstrumentedClient) HidePrevCommandComments(logger logging.SimpleLoggin } func (c *InstrumentedClient) PullIsApproved(logger logging.SimpleLogging, repo models.Repo, pull models.PullRequest) (models.ApprovalStatus, error) { - scope := c.StatsScope.SubScope("pull_is_approved") - scope = SetGitScopeTags(scope, repo.FullName, pull.Num) + scope := SetGitScopeTags(c.PRScopeManager, repo.FullName, pull.Num).SubScope("pull_is_approved") executionTime := scope.Timer(metrics.ExecutionTimeMetric).Start() defer executionTime.Stop() @@ -124,8 +121,7 @@ func (c *InstrumentedClient) PullIsApproved(logger logging.SimpleLogging, repo m } func (c *InstrumentedClient) PullIsMergeable(logger logging.SimpleLogging, repo models.Repo, pull models.PullRequest, vcsstatusname string, ignoreVCSStatusNames []string) (models.MergeableStatus, error) { - scope := c.StatsScope.SubScope("pull_is_mergeable") - scope = SetGitScopeTags(scope, repo.FullName, pull.Num) + scope := SetGitScopeTags(c.PRScopeManager, repo.FullName, pull.Num).SubScope("pull_is_mergeable") executionTime := scope.Timer(metrics.ExecutionTimeMetric).Start() defer executionTime.Stop() @@ -152,8 +148,7 @@ func (c *InstrumentedClient) UpdateStatus(logger logging.SimpleLogging, repo mod return nil } - scope := c.StatsScope.SubScope("update_status") - scope = SetGitScopeTags(scope, repo.FullName, pull.Num) + scope := SetGitScopeTags(c.PRScopeManager, repo.FullName, pull.Num).SubScope("update_status") executionTime := scope.Timer(metrics.ExecutionTimeMetric).Start() defer executionTime.Stop() @@ -172,8 +167,7 @@ func (c *InstrumentedClient) UpdateStatus(logger logging.SimpleLogging, repo mod } func (c *InstrumentedClient) MergePull(logger logging.SimpleLogging, pull models.PullRequest, pullOptions models.PullRequestOptions) error { - scope := c.StatsScope.SubScope("merge_pull") - scope = SetGitScopeTags(scope, pull.BaseRepo.FullName, pull.Num) + scope := SetGitScopeTags(c.PRScopeManager, pull.BaseRepo.FullName, pull.Num).SubScope("merge_pull") executionTime := scope.Timer(metrics.ExecutionTimeMetric).Start() defer executionTime.Stop() @@ -191,9 +185,13 @@ func (c *InstrumentedClient) MergePull(logger logging.SimpleLogging, pull models return nil } -func SetGitScopeTags(scope tally.Scope, repoFullName string, pullNum int) tally.Scope { - return scope.Tagged(map[string]string{ +// SetGitScopeTags sets git-level tags (repo and PR) on a scope using the PR scope manager. +// Creates a closeable PR-specific root scope with git-level tags. +func SetGitScopeTags(prScopeManager *metrics.PRScopeManager, repoFullName string, pullNum int) tally.Scope { + tags := map[string]string{ "base_repo": repoFullName, "pr_number": strconv.Itoa(pullNum), - }) + } + + return prScopeManager.GetOrCreatePRScope(repoFullName, pullNum, tags) } diff --git a/server/events/vcs/github/instrumented_client.go b/server/events/vcs/github/instrumented_client.go index 9ebd116990..54491c6a8a 100644 --- a/server/events/vcs/github/instrumented_client.go +++ b/server/events/vcs/github/instrumented_client.go @@ -11,19 +11,21 @@ import ( ) // NewInstrumentedGithubClient creates a client proxy responsible for gathering stats and logging -func NewInstrumentedGithubClient(client *Client, statsScope tally.Scope, logger logging.SimpleLogging) IGithubClient { +func NewInstrumentedGithubClient(client *Client, statsScope tally.Scope, logger logging.SimpleLogging, prScopeManager *metrics.PRScopeManager) IGithubClient { scope := statsScope.SubScope("github") instrumentedGHClient := &common.InstrumentedClient{ - Client: client, - StatsScope: scope, - Logger: logger, + Client: client, + StatsScope: scope, + PRScopeManager: prScopeManager, + Logger: logger, } return &InstrumentedGithubClient{ InstrumentedClient: instrumentedGHClient, PullRequestGetter: client, StatsScope: scope, + PRScopeManager: prScopeManager, Logger: logger, } } @@ -47,12 +49,12 @@ type InstrumentedGithubClient struct { *common.InstrumentedClient PullRequestGetter GithubPullRequestGetter StatsScope tally.Scope + PRScopeManager *metrics.PRScopeManager Logger logging.SimpleLogging } func (c *InstrumentedGithubClient) GetPullRequest(logger logging.SimpleLogging, repo models.Repo, pullNum int) (*github.PullRequest, error) { - scope := c.StatsScope.SubScope("get_pull_request") - scope = common.SetGitScopeTags(scope, repo.FullName, pullNum) + scope := common.SetGitScopeTags(c.PRScopeManager, repo.FullName, pullNum).SubScope("get_pull_request") executionTime := scope.Timer(metrics.ExecutionTimeMetric).Start() defer executionTime.Stop() diff --git a/server/metrics/pr_scope_manager.go b/server/metrics/pr_scope_manager.go new file mode 100644 index 0000000000..061af887aa --- /dev/null +++ b/server/metrics/pr_scope_manager.go @@ -0,0 +1,342 @@ +// Copyright 2025 The Atlantis Authors +// SPDX-License-Identifier: Apache-2.0 + +package metrics + +import ( + "fmt" + "io" + "sync" + "time" + + "github.com/runatlantis/atlantis/server/logging" + tally "github.com/uber-go/tally/v4" + tallyprom "github.com/uber-go/tally/v4/prometheus" +) + +// PRScopeManager manages separate root scopes for each PR, allowing them to be +// individually closed when PRs are done, preventing exessive resource consumption over time. +type PRScopeManager struct { + logger logging.SimpleLogging + baseReporter tally.BaseStatsReporter + promReporter tallyprom.Reporter // Optional: for cleaning up Prometheus metrics + scopeOptions tally.ScopeOptions + reportInterval time.Duration + retentionPeriod time.Duration + + mu sync.RWMutex + prScopes map[string]*prScopeEntry // key: "repo/pullnum" +} + +type prScopeEntry struct { + scope *trackingScope + closer io.Closer + lastAccess time.Time + tags map[string]string // Store tags for Prometheus cleanup +} + +// trackingScope wraps a tally.Scope to track which subscopes are created. +// This allows us to clean up Prometheus metrics without hardcoding subscope names. +type trackingScope struct { + tally.Scope + mu sync.RWMutex + subscopes map[string]bool // Set of subscope names that have been created +} + +func newTrackingScope(scope tally.Scope) *trackingScope { + return &trackingScope{ + Scope: scope, + subscopes: make(map[string]bool), + } +} + +// SubScope wraps the underlying SubScope call and tracks the subscope name. +// When Prometheus cleanup runs, only metrics from subscopes that were actually +// created will be targeted for deletion, avoiding unnecessary cleanup attempts. +func (ts *trackingScope) SubScope(name string) tally.Scope { + ts.mu.Lock() + ts.subscopes[name] = true + ts.mu.Unlock() + return ts.Scope.SubScope(name) +} + +// getSubscopes returns a list of all subscope names that were created. +func (ts *trackingScope) getSubscopes() []string { + ts.mu.RLock() + defer ts.mu.RUnlock() + + names := make([]string, 0, len(ts.subscopes)) + for name := range ts.subscopes { + names = append(names, name) + } + return names +} + +// NewPRScopeManager creates a manager for PR-specific root scopes. +func NewPRScopeManager( + logger logging.SimpleLogging, + baseReporter tally.BaseStatsReporter, + scopeOptions tally.ScopeOptions, + reportInterval time.Duration, + retentionPeriod time.Duration, +) *PRScopeManager { + manager := &PRScopeManager{ + logger: logger, + baseReporter: baseReporter, + scopeOptions: scopeOptions, + reportInterval: reportInterval, + retentionPeriod: retentionPeriod, + prScopes: make(map[string]*prScopeEntry), + } + + // If the reporter is a Prometheus reporter, store it for metric cleanup + if promReporter, ok := baseReporter.(tallyprom.Reporter); ok { + manager.promReporter = promReporter + } + + return manager +} + +// GetOrCreatePRScope returns a root scope for the given PR with the specified tags. +// If a scope already exists for this PR, it returns the existing one. +// Each PR gets its own root scope that can be independently closed. +func (m *PRScopeManager) GetOrCreatePRScope(repo string, prNum int, tags map[string]string) tally.Scope { + key := m.prKey(repo, prNum) + + m.mu.Lock() + if entry, exists := m.prScopes[key]; exists { + // Update last access time + entry.lastAccess = time.Now() + m.mu.Unlock() + return entry.scope + } + m.mu.Unlock() + + // Create a new root scope for this PR + m.mu.Lock() + defer m.mu.Unlock() + + // Double-check after acquiring write lock + if entry, exists := m.prScopes[key]; exists { + return entry.scope + } + + // Create new root scope with the PR-specific tags + // IMPORTANT: Each root scope shares the same reporter, so we must + // use tags to differentiate metrics (for CachedStatsReporter like Prometheus) + // or rely on metric name prefixes (for StatsReporter like StatsD). + opts := m.scopeOptions + + // Set the appropriate reporter field based on type + if cachedReporter, ok := m.baseReporter.(tally.CachedStatsReporter); ok { + opts.CachedReporter = cachedReporter + } else if statsReporter, ok := m.baseReporter.(tally.StatsReporter); ok { + opts.Reporter = statsReporter + } + + opts.Tags = mergeTags(opts.Tags, tags) + + scope, closer := tally.NewRootScope(opts, m.reportInterval) + + // Wrap the scope to track subscopes for Prometheus cleanup + ts := newTrackingScope(scope) + + m.prScopes[key] = &prScopeEntry{ + scope: ts, + closer: closer, + lastAccess: time.Now(), + tags: tags, // Store tags for Prometheus cleanup + } + + m.logger.Debug("created new root scope for pr %s", key) + return ts +} + +// MarkPRClosed immediately closes a PR's scope and removes it. +// For explicitly closed PRs, we don't need to wait for a retention period. +func (m *PRScopeManager) MarkPRClosed(repo string, prNum int) { + key := m.prKey(repo, prNum) + + m.mu.Lock() + defer m.mu.Unlock() + + entry, exists := m.prScopes[key] + if !exists { + // PR never had any commands run, no scope to clean up + return + } + + // Clean up Prometheus metrics before closing the scope + m.deletePrometheusMetrics(entry) + + // Close the scope immediately + if err := entry.closer.Close(); err != nil { + m.logger.Err("error closing scope for pr %s: %s", key, err) + } else { + m.logger.Debug("closed scope for pr %s (explicitly closed)", key) + } + + // Remove from active scopes + delete(m.prScopes, key) +} + +// Run implements the scheduled.Job interface for periodic cleanup. +func (m *PRScopeManager) Run() { + m.CleanupStaleScopes() +} + +// CleanupStaleMetrics is an alias for CleanupStaleScopes to satisfy the ScopeCleaner interface. +func (m *PRScopeManager) CleanupStaleMetrics() int { + return m.CleanupStaleScopes() +} + +// CleanupStaleScopes closes and removes scopes for inactive PRs +// (PRs with no activity for longer than retention period - abandoned PRs, deleted repos, etc.) +// Note: Explicitly closed PRs are handled immediately in MarkPRClosed(). +func (m *PRScopeManager) CleanupStaleScopes() int { + // If retention period is 0, cleanup is disabled + if m.retentionPeriod == 0 { + return 0 + } + + m.mu.Lock() + defer m.mu.Unlock() + + now := time.Now() + cleaned := 0 + + // Clean up active PR scopes with no recent activity (abandoned/stale PRs) + for key, entry := range m.prScopes { + if now.Sub(entry.lastAccess) > m.retentionPeriod { + m.logger.Debug("removed stale scope for pr %s %v", key, entry.scope) + + // Clean up Prometheus metrics before closing the scope + m.deletePrometheusMetrics(entry) + + if err := entry.closer.Close(); err != nil { + m.logger.Err("error closing scope for inactive pr %s: %s", key, err) + } else { + m.logger.Debug("closed scope for inactive pr %s (no activity for %s)", key, m.retentionPeriod) + cleaned++ + } + delete(m.prScopes, key) + } + } + + if cleaned > 0 { + m.logger.Info("closed and cleaned up %d inactive pr root scopes", cleaned) + } + + return cleaned +} + +// GetStats returns the number of active PR scopes. +func (m *PRScopeManager) GetStats() int { + m.mu.RLock() + defer m.mu.RUnlock() + return len(m.prScopes) +} + +func (m *PRScopeManager) prKey(repo string, prNum int) string { + return fmt.Sprintf("%s/%d", repo, prNum) +} + +// deletePrometheusMetrics removes Prometheus metric label values for a PR to prevent metric bloat. +// This is called before closing a PR scope to clean up stale metrics from the /metrics endpoint. +func (m *PRScopeManager) deletePrometheusMetrics(prScopeEntry *prScopeEntry) { + if m.promReporter == nil { + return + } + + // Get the list of subscopes that were actually created during this PR's lifetime + subscopes := prScopeEntry.scope.getSubscopes() + if len(subscopes) == 0 { + // No subscopes created, nothing to clean up + return + } + + // Build the full metric name with prefix and subscope + prefix := m.scopeOptions.Prefix + separator := m.scopeOptions.Separator + if separator == "" { + separator = "." + } + + deletedCount := 0 + + // Use Delete(Labels) instead of DeleteLabelValues() to avoid label key ordering issues. + // Tally's keysFromMap() doesn't sort keys, so we can't reliably match the order used + // when the collector was created. Delete(Labels) uses a map and is order-independent. + // + // IMPORTANT: Prometheus sanitizes label values (e.g., "owner/repo" becomes "owner_repo"). + // We must apply the same sanitization to our label values before calling Delete(), + // otherwise the labels won't match and deletion will fail. + labels := make(map[string]string, len(prScopeEntry.tags)) + if m.scopeOptions.SanitizeOptions != nil { + sanitizer := tally.NewSanitizer(*m.scopeOptions.SanitizeOptions) + for k, v := range prScopeEntry.tags { + // Sanitize the label value to match what Prometheus actually stored + labels[k] = sanitizer.Value(v) + } + } else { + // No sanitizer configured - use original values + for k, v := range prScopeEntry.tags { + labels[k] = v + } + } + + // Extract all tag keys for RegisterCounter/RegisterTimer (order doesn't matter for registration) + tagKeys := make([]string, 0, len(prScopeEntry.tags)) + for k := range prScopeEntry.tags { + tagKeys = append(tagKeys, k) + } + + for _, subscope := range subscopes { + // Delete counters: execution_success and execution_error + for _, metricName := range []string{ExecutionSuccessMetric, ExecutionErrorMetric} { + fullName := prefix + if fullName != "" { + fullName += separator + } + fullName += subscope + separator + metricName + + // Try to get the counter and delete using label map + if counterVec, err := m.promReporter.RegisterCounter(fullName, tagKeys, ""); err == nil { + if counterVec.Delete(labels) { + deletedCount++ + } + } + } + + // Delete timer (histogram or summary): execution_time + fullName := prefix + if fullName != "" { + fullName += separator + } + fullName += subscope + separator + ExecutionTimeMetric + + // Try to get the timer and delete using label map + if timerUnion, err := m.promReporter.RegisterTimer(fullName, tagKeys, "", nil); err == nil { + if timerUnion.Histogram != nil && timerUnion.Histogram.Delete(labels) { + deletedCount++ + } else if timerUnion.Summary != nil && timerUnion.Summary.Delete(labels) { + deletedCount++ + } + } + } + + if deletedCount > 0 { + m.logger.Debug("deleted %d Prometheus metric label values for PR", deletedCount) + } +} + +func mergeTags(base, additional map[string]string) map[string]string { + result := make(map[string]string, len(base)+len(additional)) + for k, v := range base { + result[k] = v + } + for k, v := range additional { + result[k] = v + } + return result +} diff --git a/server/metrics/pr_scope_manager_inactive_test.go b/server/metrics/pr_scope_manager_inactive_test.go new file mode 100644 index 0000000000..f290870ca8 --- /dev/null +++ b/server/metrics/pr_scope_manager_inactive_test.go @@ -0,0 +1,172 @@ +// Copyright 2025 The Atlantis Authors +// SPDX-License-Identifier: Apache-2.0 + +package metrics_test + +import ( + "fmt" + "testing" + "time" + + "github.com/runatlantis/atlantis/server/logging" + "github.com/runatlantis/atlantis/server/metrics" + . "github.com/runatlantis/atlantis/testing" + tally "github.com/uber-go/tally/v4" + promreporter "github.com/uber-go/tally/v4/prometheus" +) + +// TestPRScopeManager_InactivePRCleanup verifies that scopes are cleaned up +// even if the PR is never explicitly closed (abandoned PRs, deleted repos, etc.) +func TestPRScopeManager_InactivePRCleanup(t *testing.T) { + logger := logging.NewNoopLogger(t) + reporter := promreporter.NewReporter(promreporter.Options{}) + + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "atlantis_inactive", + Separator: "_", + OmitCardinalityMetrics: true, + }, + 100*time.Millisecond, + 50*time.Millisecond, // Short retention for testing + ) + + // Create scope for PR that will never be explicitly closed + scope := manager.GetOrCreatePRScope("owner/repo", 999, map[string]string{ + "pr_number": "999", + }) + Assert(t, scope != nil, "scope created") + + // Use it once + scope.Counter("operations").Inc(1) + + active := manager.GetStats() + Equals(t, 1, active) + + // Don't call MarkPRClosed - simulate abandoned PR + + // Cleanup immediately shouldn't remove (recently used) + cleaned := manager.CleanupStaleScopes() + Equals(t, 0, cleaned) + + active = manager.GetStats() + Equals(t, 1, active) + + // Wait for retention period without any activity + time.Sleep(60 * time.Millisecond) + + // Now cleanup should remove the inactive scope + cleaned = manager.CleanupStaleScopes() + Equals(t, 1, cleaned) + + active = manager.GetStats() + Equals(t, 0, active) + + t.Log("Successfully cleaned up inactive PR scope without explicit close") +} + +// TestPRScopeManager_ActivePRNotCleaned verifies that PRs with recent activity +// are NOT cleaned up, even if they've been open a long time +func TestPRScopeManager_ActivePRNotCleaned(t *testing.T) { + logger := logging.NewNoopLogger(t) + reporter := promreporter.NewReporter(promreporter.Options{}) + + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "atlantis_active", + Separator: "_", + OmitCardinalityMetrics: true, + }, + 100*time.Millisecond, + 100*time.Millisecond, // retention period + ) + + // Create scope + scope := manager.GetOrCreatePRScope("owner/repo", 123, map[string]string{ + "pr_number": "123", + }) + + // Simulate ongoing activity - use unique counter names to avoid Prometheus conflicts + for i := 0; i < 5; i++ { + time.Sleep(30 * time.Millisecond) + + // Access the scope (updates lastAccess) + scope2 := manager.GetOrCreatePRScope("owner/repo", 123, map[string]string{ + "pr_number": "123", + }) + Assert(t, scope == scope2, "should return same scope") + scope2.Counter(fmt.Sprintf("operations_%d", i)).Inc(1) + + // Try cleanup + cleaned := manager.CleanupStaleScopes() + Equals(t, 0, cleaned) // Should not clean up active PR + + active := manager.GetStats() + Equals(t, 1, active) + } + + t.Log("Active PR with recent activity was not cleaned up") +} + +// TestPRScopeManager_MixedClosedAndInactive verifies cleanup handles both +// explicitly closed PRs and inactive PRs correctly +func TestPRScopeManager_MixedClosedAndInactive(t *testing.T) { + logger := logging.NewNoopLogger(t) + reporter := promreporter.NewReporter(promreporter.Options{}) + + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "atlantis_mixed", + Separator: "_", + OmitCardinalityMetrics: true, + }, + 100*time.Millisecond, + 50*time.Millisecond, + ) + + // PR 1: Will be explicitly closed + scope1 := manager.GetOrCreatePRScope("owner/repo", 1, map[string]string{"pr_number": "1"}) + scope1.Counter("ops").Inc(1) + + // PR 2: Will be abandoned (inactive) + scope2 := manager.GetOrCreatePRScope("owner/repo", 2, map[string]string{"pr_number": "2"}) + scope2.Counter("ops").Inc(1) + + // PR 3: Will remain active + scope3 := manager.GetOrCreatePRScope("owner/repo", 3, map[string]string{"pr_number": "3"}) + scope3.Counter("ops").Inc(1) + + active := manager.GetStats() + Equals(t, 3, active) + + // Close PR 1 - closes immediately + manager.MarkPRClosed("owner/repo", 1) + + active = manager.GetStats() + Equals(t, 2, active) // PR 1 closed immediately, PRs 2 and 3 remain + + // Keep PR 3 active + time.Sleep(30 * time.Millisecond) + scope3 = manager.GetOrCreatePRScope("owner/repo", 3, map[string]string{"pr_number": "3"}) + scope3.Counter("ops2").Inc(1) + + // Wait for retention to expire for PR 2 (inactive) + time.Sleep(30 * time.Millisecond) + + // Cleanup should remove: + // - PR 2 (inactive + retention expired) + // But NOT PR 3 (active with recent access) + cleaned := manager.CleanupStaleScopes() + Equals(t, 1, cleaned) + + active = manager.GetStats() + Equals(t, 1, active) // Only PR 3 remains + + t.Log("Successfully cleaned up closed and inactive PRs while preserving active PR") +} diff --git a/server/metrics/pr_scope_manager_integration_test.go b/server/metrics/pr_scope_manager_integration_test.go new file mode 100644 index 0000000000..5bc1d69604 --- /dev/null +++ b/server/metrics/pr_scope_manager_integration_test.go @@ -0,0 +1,196 @@ +// Copyright 2025 The Atlantis Authors +// SPDX-License-Identifier: Apache-2.0 + +package metrics_test + +import ( + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/runatlantis/atlantis/server/logging" + "github.com/runatlantis/atlantis/server/metrics" + "github.com/runatlantis/atlantis/server/scheduled" + tally "github.com/uber-go/tally/v4" + tallyprom "github.com/uber-go/tally/v4/prometheus" + . "github.com/runatlantis/atlantis/testing" +) + +// TestPRScopeManager_ScheduledCleanupIntegration is an end-to-end test that verifies +// the complete integration: PRScopeManager creates root scopes, scheduled executor +// runs periodic cleanup, and inactive PR scopes are properly cleaned up. +func TestPRScopeManager_ScheduledCleanupIntegration(t *testing.T) { + logger := logging.NewNoopLogger(t) + + // Create a Prometheus reporter with unique registry for this test + reporter := tallyprom.NewReporter(tallyprom.Options{ + Registerer: prometheus.NewRegistry(), + }) + + // Create PRScopeManager with short retention for testing (100ms) + retentionPeriod := 100 * time.Millisecond + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "test_integration", + Separator: tallyprom.DefaultSeparator, + SanitizeOptions: &tallyprom.DefaultSanitizerOpts, + }, + time.Second, + retentionPeriod, + ) + + // Create scheduled executor service (same as production) + scheduledService := scheduled.NewExecutorService( + tally.NoopScope, + logger, + ) + + // Add PRScopeManager as a scheduled job with same period as retention + // (this is exactly how it's done in server.go) + scheduledService.AddJob(scheduled.JobDefinition{ + Job: manager, + Period: retentionPeriod, + }) + + // Start the scheduled executor in background + go scheduledService.Run() + defer func() { + // Cleanup would happen via signal, but for test we just return + }() + + // === Simulate production usage === + + // 1. Create metrics for PR #1 (will become inactive) + scope1 := manager.GetOrCreatePRScope("owner/repo", 1, map[string]string{ + "pr_number": "1", + }) + scope1.Counter("commands").Inc(1) + Equals(t, 1, manager.GetStats()) // 1 active PR + + // 2. Create metrics for PR #2 (will stay active) + scope2 := manager.GetOrCreatePRScope("owner/repo", 2, map[string]string{ + "pr_number": "2", + }) + scope2.Counter("commands").Inc(1) + Equals(t, 2, manager.GetStats()) // 2 active PRs + + // 3. PR #2 continues to be active (simulate ongoing commands) + keepAlive := time.NewTicker(50 * time.Millisecond) + defer keepAlive.Stop() + done := make(chan bool) + go func() { + for { + select { + case <-keepAlive.C: + // Keep PR #2 active by accessing its scope + s := manager.GetOrCreatePRScope("owner/repo", 2, map[string]string{ + "pr_number": "2", + }) + s.Counter("commands").Inc(1) + case <-done: + return + } + } + }() + + // 4. Wait for retention period + a bit more to ensure cleanup runs + // The scheduled job will run after retentionPeriod and should clean up PR #1 + time.Sleep(retentionPeriod + 50*time.Millisecond) + + // Stop keep-alive + done <- true + + // 5. Verify: PR #1 should be cleaned up (inactive), PR #2 should remain (active) + active := manager.GetStats() + Equals(t, 1, active) + + // 6. Verify PR #2 still works (wasn't cleaned) + scope2Again := manager.GetOrCreatePRScope("owner/repo", 2, map[string]string{ + "pr_number": "2", + }) + Assert(t, scope2Again != nil, "PR #2 scope should still exist") + + t.Log("✅ Integration test passed: scheduled cleanup properly removes inactive PRs while preserving active ones") +} + +// TestPRScopeManager_ExplicitCloseIntegration verifies that explicitly closing +// a PR immediately removes its scope (doesn't wait for retention period). +func TestPRScopeManager_ExplicitCloseIntegration(t *testing.T) { + logger := logging.NewNoopLogger(t) + // Use unique reporter for each test to avoid Prometheus registration conflicts + reporter := tallyprom.NewReporter(tallyprom.Options{ + Registerer: prometheus.NewRegistry(), + }) + + // Create PRScopeManager with long retention (we won't wait for it) + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "test_explicit", + Separator: tallyprom.DefaultSeparator, + SanitizeOptions: &tallyprom.DefaultSanitizerOpts, + }, + time.Second, + 1*time.Hour, // Long retention - we're testing explicit close + ) + + // Create metrics for a PR + scope := manager.GetOrCreatePRScope("owner/repo", 123, map[string]string{ + "pr_number": "123", + }) + scope.Counter("commands").Inc(1) + Equals(t, 1, manager.GetStats()) + + // Explicitly close the PR (simulates PR being merged/closed) + manager.MarkPRClosed("owner/repo", 123) + + // Verify immediate cleanup (no waiting for retention period) + Equals(t, 0, manager.GetStats()) + + t.Log("✅ Integration test passed: explicit close immediately removes PR scope") +} + +// TestPRScopeManager_DisabledCleanup verifies that when retention is 0, +// cleanup is disabled and scopes are never automatically cleaned. +func TestPRScopeManager_DisabledCleanup(t *testing.T) { + logger := logging.NewNoopLogger(t) + // Use unique reporter for each test to avoid Prometheus registration conflicts + reporter := tallyprom.NewReporter(tallyprom.Options{ + Registerer: prometheus.NewRegistry(), + }) + + // Create PRScopeManager with 0 retention (cleanup disabled) + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "test_disabled", + Separator: tallyprom.DefaultSeparator, + SanitizeOptions: &tallyprom.DefaultSanitizerOpts, + }, + time.Second, + 0, // Retention = 0 means cleanup disabled + ) + + // Create metrics for a PR + scope := manager.GetOrCreatePRScope("owner/repo", 999, map[string]string{ + "pr_number": "999", + }) + scope.Counter("commands").Inc(1) + Equals(t, 1, manager.GetStats()) + + // Wait a bit (would normally trigger cleanup if enabled) + time.Sleep(50 * time.Millisecond) + + // Run cleanup manually + cleaned := manager.CleanupStaleScopes() + Equals(t, 0, cleaned) // Nothing cleaned because retention = 0 + + // PR scope should still exist + Equals(t, 1, manager.GetStats()) + + t.Log("✅ Integration test passed: cleanup disabled when retention = 0") +} diff --git a/server/metrics/pr_scope_manager_prom_deletion_test.go b/server/metrics/pr_scope_manager_prom_deletion_test.go new file mode 100644 index 0000000000..65fceb33b2 --- /dev/null +++ b/server/metrics/pr_scope_manager_prom_deletion_test.go @@ -0,0 +1,391 @@ +// Copyright 2025 The Atlantis Authors +// SPDX-License-Identifier: Apache-2.0 + +package metrics_test + +import ( + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/runatlantis/atlantis/server/logging" + "github.com/runatlantis/atlantis/server/metrics" + tally "github.com/uber-go/tally/v4" + tallyprom "github.com/uber-go/tally/v4/prometheus" + . "github.com/runatlantis/atlantis/testing" +) + +// TestPRScopeManager_PrometheusMetricDeletion verifies that Prometheus metrics +// are actually deleted when a PR scope is closed. +func TestPRScopeManager_PrometheusMetricDeletion(t *testing.T) { + logger := logging.NewNoopLogger(t) + + // Create a custom Prometheus registry so we can inspect metrics + registry := prometheus.NewRegistry() + reporter := tallyprom.NewReporter(tallyprom.Options{ + Registerer: registry, + }) + + // Create PRScopeManager + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "test_deletion", + Separator: tallyprom.DefaultSeparator, + SanitizeOptions: &tallyprom.DefaultSanitizerOpts, + }, + 100*time.Millisecond, // Report interval + 1*time.Hour, // Long retention - we'll test explicit close + ) + + // Create a PR scope with tags + scope := manager.GetOrCreatePRScope("owner/repo", 42, map[string]string{ + "base_repo": "owner/repo", + "pr_number": "42", + }) + + // Create subscopes and metrics that match what instrumented_client.go creates + updateStatusScope := scope.SubScope("update_status") + updateStatusScope.Counter(metrics.ExecutionSuccessMetric).Inc(5) + updateStatusScope.Counter(metrics.ExecutionErrorMetric).Inc(2) + timer := updateStatusScope.Timer(metrics.ExecutionTimeMetric) + stopwatch := timer.Start() + time.Sleep(10 * time.Millisecond) + stopwatch.Stop() + + createCommentScope := scope.SubScope("create_comment") + createCommentScope.Counter(metrics.ExecutionSuccessMetric).Inc(3) + createCommentScope.Counter(metrics.ExecutionErrorMetric).Inc(1) + + // Let metrics flush + time.Sleep(150 * time.Millisecond) + + // Verify metrics exist in Prometheus before deletion + metricsFamilies, err := registry.Gather() + Ok(t, err) + + // Count metrics with pr_number="42" + beforeCount := countMetricsWithLabel(metricsFamilies, "pr_number", "42") + t.Logf("Found %d metric samples with pr_number=\"42\" before deletion", beforeCount) + Assert(t, beforeCount > 0, "should have metrics with pr_number=\"42\" before deletion") + + // Now close the PR scope (this should trigger Prometheus metric deletion) + manager.MarkPRClosed("owner/repo", 42) + + // Let the deletion complete + time.Sleep(50 * time.Millisecond) + + // Verify metrics are deleted from Prometheus + metricsFamilies, err = registry.Gather() + Ok(t, err) + + afterCount := countMetricsWithLabel(metricsFamilies, "pr_number", "42") + t.Logf("Found %d metric samples with pr_number=\"42\" after deletion", afterCount) + + // This is the critical assertion - metrics should be deleted + Equals(t, 0, afterCount) + + t.Log("✅ Test passed: Prometheus metrics with pr_number=\"42\" were successfully deleted") +} + +// TestPRScopeManager_MultiPRDeletion verifies that deleting one PR's metrics +// doesn't affect another PR's metrics. +func TestPRScopeManager_MultiPRDeletion(t *testing.T) { + logger := logging.NewNoopLogger(t) + + registry := prometheus.NewRegistry() + reporter := tallyprom.NewReporter(tallyprom.Options{ + Registerer: registry, + }) + + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "test_multi", + Separator: tallyprom.DefaultSeparator, + SanitizeOptions: &tallyprom.DefaultSanitizerOpts, + }, + 100*time.Millisecond, + 1*time.Hour, + ) + + // Create metrics for PR #1 + scope1 := manager.GetOrCreatePRScope("owner/repo", 1, map[string]string{ + "base_repo": "owner/repo", + "pr_number": "1", + }) + scope1.SubScope("update_status").Counter(metrics.ExecutionSuccessMetric).Inc(10) + + // Create metrics for PR #2 + scope2 := manager.GetOrCreatePRScope("owner/repo", 2, map[string]string{ + "base_repo": "owner/repo", + "pr_number": "2", + }) + scope2.SubScope("update_status").Counter(metrics.ExecutionSuccessMetric).Inc(20) + + // Let metrics flush + time.Sleep(150 * time.Millisecond) + + // Verify both PRs have metrics + metricsFamilies, err := registry.Gather() + Ok(t, err) + pr1Before := countMetricsWithLabel(metricsFamilies, "pr_number", "1") + pr2Before := countMetricsWithLabel(metricsFamilies, "pr_number", "2") + t.Logf("Before: PR #1 has %d metrics, PR #2 has %d metrics", pr1Before, pr2Before) + Assert(t, pr1Before > 0, "PR #1 should have metrics") + Assert(t, pr2Before > 0, "PR #2 should have metrics") + + // Close PR #1 + manager.MarkPRClosed("owner/repo", 1) + time.Sleep(50 * time.Millisecond) + + // Verify PR #1 metrics are gone but PR #2 metrics remain + metricsFamilies, err = registry.Gather() + Ok(t, err) + pr1After := countMetricsWithLabel(metricsFamilies, "pr_number", "1") + pr2After := countMetricsWithLabel(metricsFamilies, "pr_number", "2") + t.Logf("After: PR #1 has %d metrics, PR #2 has %d metrics", pr1After, pr2After) + + Equals(t, 0, pr1After) + Equals(t, pr2Before, pr2After) // PR #2 metrics unchanged + + t.Log("✅ Test passed: Deleting PR #1 metrics didn't affect PR #2 metrics") +} + +// countMetricsWithLabel counts how many metric samples have a specific label value. +func countMetricsWithLabel(families []*dto.MetricFamily, labelName, labelValue string) int { + count := 0 + for _, family := range families { + for _, metric := range family.GetMetric() { + for _, label := range metric.GetLabel() { + if label.GetName() == labelName && label.GetValue() == labelValue { + count++ + break + } + } + } + } + return count +} + +// gatherMetrics is a helper that gathers all metrics from a registry. +func gatherMetrics(t *testing.T, registry prometheus.Gatherer) []*dto.MetricFamily { + families, err := registry.Gather() + Ok(t, err) + return families +} + +// dumpMetrics is a helper for debugging - prints all metrics to test log. +func dumpMetrics(t *testing.T, families []*dto.MetricFamily, prefix string) { + t.Logf("%s: Dumping %d metric families", prefix, len(families)) + for _, family := range families { + t.Logf(" Family: %s (%s)", family.GetName(), family.GetType()) + for _, metric := range family.GetMetric() { + labels := make([]string, 0) + for _, label := range metric.GetLabel() { + labels = append(labels, label.GetName()+"="+label.GetValue()) + } + t.Logf(" Metric: %v", labels) + } + } +} + +// getMetricValue retrieves the value of a counter metric with specific labels. +func getMetricValue(families []*dto.MetricFamily, metricName string, labels map[string]string) (float64, bool) { + for _, family := range families { + if family.GetName() == metricName { + for _, metric := range family.GetMetric() { + if matchesLabels(metric.GetLabel(), labels) { + if metric.Counter != nil { + return metric.Counter.GetValue(), true + } + } + } + } + } + return 0, false +} + +// matchesLabels checks if a metric's labels match the expected labels. +func matchesLabels(metricLabels []*dto.LabelPair, expectedLabels map[string]string) bool { + if len(metricLabels) != len(expectedLabels) { + return false + } + for _, label := range metricLabels { + expectedValue, exists := expectedLabels[label.GetName()] + if !exists || expectedValue != label.GetValue() { + return false + } + } + return true +} + +// TestPRScopeManager_MetricValueVerification verifies the actual counter values +// before and after deletion. +func TestPRScopeManager_MetricValueVerification(t *testing.T) { + logger := logging.NewNoopLogger(t) + + registry := prometheus.NewRegistry() + reporter := tallyprom.NewReporter(tallyprom.Options{ + Registerer: registry, + }) + + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "test_values", + Separator: tallyprom.DefaultSeparator, + SanitizeOptions: &tallyprom.DefaultSanitizerOpts, + }, + 100*time.Millisecond, + 1*time.Hour, + ) + + // Create scope and increment counter + scope := manager.GetOrCreatePRScope("owner/repo", 99, map[string]string{ + "base_repo": "owner/repo", + "pr_number": "99", + }) + updateScope := scope.SubScope("update_status") + updateScope.Counter(metrics.ExecutionSuccessMetric).Inc(42) + + // Let metrics flush + time.Sleep(150 * time.Millisecond) + + // Verify the counter value before deletion + families := gatherMetrics(t, registry) + // Note: Prometheus sanitizes label values (owner/repo becomes owner_repo) + value, found := getMetricValue(families, "test_values_update_status_execution_success", map[string]string{ + "base_repo": "owner_repo", // Sanitized: / becomes _ + "pr_number": "99", + }) + Assert(t, found, "metric should exist before deletion") + Equals(t, 42.0, value) + t.Logf("Counter value before deletion: %.0f", value) + + // Close the PR + manager.MarkPRClosed("owner/repo", 99) + time.Sleep(50 * time.Millisecond) + + // Verify metric is deleted + families = gatherMetrics(t, registry) + _, found = getMetricValue(families, "test_values_update_status_execution_success", map[string]string{ + "base_repo": "owner_repo", // Sanitized + "pr_number": "99", + }) + Assert(t, !found, "metric should not exist after deletion") + + t.Log("✅ Test passed: Metric value was 42, now metric is deleted") +} + +// TestPRScopeManager_OnlyCreatedSubscopesDeleted verifies that only subscopes that were +// actually created during the PR's lifetime are targeted for deletion, not all possible subscopes. +func TestPRScopeManager_OnlyCreatedSubscopesDeleted(t *testing.T) { + logger := logging.NewNoopLogger(t) + + registry := prometheus.NewRegistry() + reporter := tallyprom.NewReporter(tallyprom.Options{ + Registerer: registry, + }) + + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "test_selective", + Separator: tallyprom.DefaultSeparator, + SanitizeOptions: &tallyprom.DefaultSanitizerOpts, + }, + 100*time.Millisecond, + 1*time.Hour, + ) + + // Create a PR scope but only use TWO subscopes (not all 8) + scope := manager.GetOrCreatePRScope("owner/repo", 77, map[string]string{ + "base_repo": "owner/repo", + "pr_number": "77", + }) + + // Only create metrics in update_status and create_comment subscopes + scope.SubScope("update_status").Counter(metrics.ExecutionSuccessMetric).Inc(10) + scope.SubScope("create_comment").Counter(metrics.ExecutionSuccessMetric).Inc(5) + + // Let metrics flush + time.Sleep(150 * time.Millisecond) + + // Verify we have metrics for pr_number="77" + metricsFamilies, err := registry.Gather() + Ok(t, err) + beforeCount := countMetricsWithLabel(metricsFamilies, "pr_number", "77") + t.Logf("Created %d metrics using only 2 subscopes", beforeCount) + Assert(t, beforeCount == 2, "should have exactly 2 metrics (one per subscope)") + + // Close the PR - should only attempt to delete metrics from the 2 subscopes we used + manager.MarkPRClosed("owner/repo", 77) + time.Sleep(50 * time.Millisecond) + + // Verify all metrics are deleted + metricsFamilies, err = registry.Gather() + Ok(t, err) + afterCount := countMetricsWithLabel(metricsFamilies, "pr_number", "77") + Equals(t, 0, afterCount) + + t.Log("✅ Test passed: Only the 2 subscopes that were created were targeted for cleanup") +} + +// TestPRScopeManager_NonPrometheusReporter verifies that when using a non-Prometheus +// reporter (like StatsD), the deletion code gracefully skips without errors. +func TestPRScopeManager_NonPrometheusReporter(t *testing.T) { + logger := logging.NewNoopLogger(t) + + // Use a TestReporter instead of Prometheus + reporter := &testStatsReporter{} + + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "test_statsd", + Separator: ".", + }, + 100*time.Millisecond, + 1*time.Hour, + ) + + // Create and close a scope - should not panic or error + scope := manager.GetOrCreatePRScope("owner/repo", 555, map[string]string{ + "pr_number": "555", + }) + scope.Counter("test").Inc(1) + + // This should not panic even though reporter is not Prometheus + manager.MarkPRClosed("owner/repo", 555) + + Equals(t, 0, manager.GetStats()) + t.Log("✅ Test passed: Non-Prometheus reporter gracefully handled") +} + +// testStatsReporter is a simple test reporter for verifying non-Prometheus behavior. +type testStatsReporter struct{} + +func (r *testStatsReporter) ReportCounter(name string, tags map[string]string, value int64) {} +func (r *testStatsReporter) ReportGauge(name string, tags map[string]string, value float64) {} +func (r *testStatsReporter) ReportTimer(name string, tags map[string]string, interval time.Duration) {} +func (r *testStatsReporter) ReportHistogramValueSamples(name string, tags map[string]string, buckets tally.Buckets, bucketLowerBound, bucketUpperBound float64, samples int64) { +} +func (r *testStatsReporter) ReportHistogramDurationSamples(name string, tags map[string]string, buckets tally.Buckets, bucketLowerBound, bucketUpperBound time.Duration, samples int64) { +} +func (r *testStatsReporter) Capabilities() tally.Capabilities { + return r +} +func (r *testStatsReporter) Reporting() bool { + return true +} +func (r *testStatsReporter) Tagging() bool { + return true +} +func (r *testStatsReporter) Flush() {} diff --git a/server/metrics/pr_scope_manager_test.go b/server/metrics/pr_scope_manager_test.go new file mode 100644 index 0000000000..86f7e0eac8 --- /dev/null +++ b/server/metrics/pr_scope_manager_test.go @@ -0,0 +1,209 @@ +// Copyright 2025 The Atlantis Authors +// SPDX-License-Identifier: Apache-2.0 + +package metrics_test + +import ( + "fmt" + "testing" + "time" + + "github.com/runatlantis/atlantis/server/logging" + "github.com/runatlantis/atlantis/server/metrics" + . "github.com/runatlantis/atlantis/testing" + tally "github.com/uber-go/tally/v4" + promreporter "github.com/uber-go/tally/v4/prometheus" +) + +func TestPRScopeManager_GetOrCreatePRScope(t *testing.T) { + logger := logging.NewNoopLogger(t) + reporter := promreporter.NewReporter(promreporter.Options{}) + + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "atlantis_get", + Separator: "_", + OmitCardinalityMetrics: true, + }, + 100*time.Millisecond, + 1*time.Hour, + ) + + // First call creates a new scope + scope1 := manager.GetOrCreatePRScope("owner/repo", 123, map[string]string{ + "pr_number": "123", + }) + Assert(t, scope1 != nil, "scope should be created") + + // Second call returns the same scope + scope2 := manager.GetOrCreatePRScope("owner/repo", 123, map[string]string{ + "pr_number": "123", + }) + Assert(t, scope1 == scope2, "should return same scope instance") + + // Different PR gets different scope + scope3 := manager.GetOrCreatePRScope("owner/repo", 456, map[string]string{ + "pr_number": "456", + }) + Assert(t, scope1 != scope3, "different PR should get different scope") + + active := manager.GetStats() + Equals(t, 2, active) +} + +func TestPRScopeManager_MarkPRClosedAndCleanup(t *testing.T) { + logger := logging.NewNoopLogger(t) + reporter := promreporter.NewReporter(promreporter.Options{}) + + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "atlantis_mark", + Separator: "_", + OmitCardinalityMetrics: true, + }, + 100*time.Millisecond, + 50*time.Millisecond, // Short retention for testing + ) + + // Create scope for PR + scope := manager.GetOrCreatePRScope("owner/repo", 123, map[string]string{ + "pr_number": "123", + }) + Assert(t, scope != nil, "scope created") + + // Use the scope + scope.Counter("test_ops").Inc(1) + + active := manager.GetStats() + Equals(t, 1, active) + + // Mark PR as closed - should close immediately + manager.MarkPRClosed("owner/repo", 123) + + // Scope should be closed and removed immediately + active = manager.GetStats() + Equals(t, 0, active) +} + +func TestPRScopeManager_MultipleRootScopes(t *testing.T) { + logger := logging.NewNoopLogger(t) + reporter := promreporter.NewReporter(promreporter.Options{}) + + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "atlantis_multi", + Separator: "_", + OmitCardinalityMetrics: true, + }, + 100*time.Millisecond, + 1*time.Hour, + ) + + // Create scopes for multiple PRs + scope1 := manager.GetOrCreatePRScope("owner/repo", 1, map[string]string{"pr_number": "1"}) + scope2 := manager.GetOrCreatePRScope("owner/repo", 2, map[string]string{"pr_number": "2"}) + scope3 := manager.GetOrCreatePRScope("owner/repo", 3, map[string]string{"pr_number": "3"}) + + // Each should be independent root scopes - use unique counter names to avoid Prometheus conflicts + scope1.Counter("operations_1").Inc(1) + scope2.Counter("operations_2").Inc(2) + scope3.Counter("operations_3").Inc(3) + + active := manager.GetStats() + Equals(t, 3, active) + + // Close one PR - should close immediately + manager.MarkPRClosed("owner/repo", 2) + + active = manager.GetStats() + Equals(t, 2, active) + + // Other PRs still work - continue using unique counter names + scope1.Counter("operations_1_again").Inc(1) + scope3.Counter("operations_3_again").Inc(1) + + t.Log("Successfully verified independent root scopes per PR") +} + +func TestPRScopeManager_LargeScale(t *testing.T) { + if testing.Short() { + t.Skip("Skipping large scale test in short mode") + } + + logger := logging.NewNoopLogger(t) + // Create ONE shared reporter that all root scopes will use + reporter := promreporter.NewReporter(promreporter.Options{}) + + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "atlantis_large", + Separator: "_", + OmitCardinalityMetrics: true, + }, + 100*time.Millisecond, + 50*time.Millisecond, + ) + + numPRs := 10 // Keep small to avoid rapid registration issues + t.Logf("Creating %d PR scopes (each is a separate root scope)...", numPRs) + + // Create PR scopes one by one + // Each root scope shares the same reporter, Prometheus differentiates via tags + for i := 0; i < numPRs; i++ { + scope := manager.GetOrCreatePRScope("test/repo", i, map[string]string{ + "pr_number": fmt.Sprintf("%d", i), + }) + // Use different counter names per PR to avoid conflicts during rapid creation + scope.Counter(fmt.Sprintf("operations_%d", i)).Inc(1) + + // Small delay to avoid overwhelming Prometheus registration + if i > 0 && i%5 == 0 { + time.Sleep(10 * time.Millisecond) + } + } + + active := manager.GetStats() + Equals(t, numPRs, active) + + // Close all PRs - should close immediately + t.Log("Closing all PRs...") + startTime := time.Now() + for i := 0; i < numPRs; i++ { + manager.MarkPRClosed("test/repo", i) + } + duration := time.Since(startTime) + + active = manager.GetStats() + Equals(t, 0, active) + t.Logf("Closed %d root scopes immediately in %v", numPRs, duration) +} + +func TestPRScopeManager_NonExistentPRClose(t *testing.T) { + logger := logging.NewNoopLogger(t) + reporter := promreporter.NewReporter(promreporter.Options{}) + + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "atlantis_nonexist", + OmitCardinalityMetrics: true, + }, + 100*time.Millisecond, + 1*time.Hour, + ) + + // Closing a PR that never had a scope should not panic + manager.MarkPRClosed("owner/repo", 999) + + active := manager.GetStats() + Equals(t, 0, active) +} diff --git a/server/server.go b/server/server.go index 2e409e7262..5e92a9be07 100644 --- a/server/server.go +++ b/server/server.go @@ -39,7 +39,7 @@ import ( "github.com/go-playground/validator/v10" "github.com/mitchellh/go-homedir" tally "github.com/uber-go/tally/v4" - prometheus "github.com/uber-go/tally/v4/prometheus" + tallyprom "github.com/uber-go/tally/v4/prometheus" "github.com/urfave/negroni/v3" "github.com/runatlantis/atlantis/server/core/boltdb" @@ -108,6 +108,7 @@ type Server struct { StatsScope tally.Scope StatsReporter tally.BaseStatsReporter StatsCloser io.Closer + PRScopeManager *metrics.PRScopeManager Locker locking.Locker ApplyLocker locking.ApplyLocker VCSEventsController *events_controllers.VCSEventsController @@ -230,6 +231,27 @@ func NewServer(userConfig UserConfig, config Config) (*Server, error) { return nil, fmt.Errorf("instantiating metrics scope: %w", err) } + // Create PR scope manager for per-PR metrics that can be individually closed + scopeOptions := tally.ScopeOptions{ + Prefix: userConfig.StatsNamespace, + SanitizeOptions: &tallyprom.DefaultSanitizerOpts, + Separator: tallyprom.DefaultSeparator, + } + + // Parse retention period from config + retentionPeriod, err := time.ParseDuration(userConfig.MetricsInactivePRRetention) + if err != nil { + return nil, fmt.Errorf("parsing metrics-inactive-pr-retention: %w", err) + } + + prScopeManager := metrics.NewPRScopeManager( + logger, + statsReporter, + scopeOptions, + time.Second, // report interval + retentionPeriod, // retention period for inactive PRs + ) + if userConfig.GithubUser != "" || userConfig.GithubAppID != 0 { if userConfig.GithubAllowMergeableBypassApply { githubConfig = github.Config{ @@ -273,7 +295,7 @@ func NewServer(userConfig UserConfig, config Config) (*Server, error) { return nil, err } - githubClient = github.NewInstrumentedGithubClient(rawGithubClient, statsScope, logger) + githubClient = github.NewInstrumentedGithubClient(rawGithubClient, statsScope, logger, prScopeManager) } if userConfig.GitlabUser != "" { supportedVCSHosts = append(supportedVCSHosts, models.Gitlab) @@ -533,6 +555,19 @@ func NewServer(userConfig UserConfig, config Config) (*Server, error) { logger, ) + // Add PR scope manager cleanup job to clean up inactive PR scopes + // Cleanup runs at the same frequency as the retention period (no point running more often) + // Only add if retention period is set (non-zero) + if retentionPeriod > 0 { + scheduledExecutorService.AddJob(scheduled.JobDefinition{ + Job: prScopeManager, + Period: retentionPeriod, + }) + logger.Info("Metrics cleanup enabled: retention period and cleanup interval = %v", retentionPeriod) + } else { + logger.Info("Metrics cleanup disabled (retention period is 0)") + } + // provide fresh tokens before clone from the GitHub Apps integration, proxy workingDir if githubAppEnabled { if !userConfig.WriteGitCreds { @@ -583,6 +618,7 @@ func NewServer(userConfig UserConfig, config Config) (*Server, error) { PullClosedTemplate: &events.PullClosedEventTemplate{}, LogStreamResourceCleaner: projectCmdOutputHandler, VCSClient: vcsClient, + ScopeCleaner: prScopeManager, }, ) @@ -760,6 +796,7 @@ func NewServer(userConfig UserConfig, config Config) (*Server, error) { instrumentedProjectCmdRunner := events.NewInstrumentedProjectCommandRunner( statsScope, projectOutputWrapper, + prScopeManager, ) policyCheckCommandRunner := events.NewPolicyCheckCommandRunner( @@ -1025,6 +1062,7 @@ func NewServer(userConfig UserConfig, config Config) (*Server, error) { StatsScope: statsScope, StatsReporter: statsReporter, StatsCloser: closer, + PRScopeManager: prScopeManager, Locker: lockingClient, ApplyLocker: applyLockingClient, VCSEventsController: eventsController, @@ -1080,7 +1118,7 @@ func (s *Server) Start() error { s.Router.HandleFunc("/jobs/{job-id}", s.JobsController.GetProjectJobs).Methods("GET").Name(ProjectJobsViewRouteName) s.Router.HandleFunc("/jobs/{job-id}/ws", s.JobsController.GetProjectJobsWS).Methods("GET") - r, ok := s.StatsReporter.(prometheus.Reporter) + r, ok := s.StatsReporter.(tallyprom.Reporter) if ok { s.Router.Handle(s.CommandRunner.GlobalCfg.Metrics.Prometheus.Endpoint, r.HTTPHandler()) } diff --git a/server/server_test.go b/server/server_test.go index 4c9fcadfd9..171579edd4 100644 --- a/server/server_test.go +++ b/server/server_test.go @@ -38,11 +38,12 @@ import ( ) const ( - testAtlantisVersion = "1.0.0" - testAtlantisUrl = "http://example.com" - testLockingDBType = cmd.DefaultLockingDBType - testGitHubHostName = cmd.DefaultGHHostname - testGitHubUser = "user" + testAtlantisVersion = "1.0.0" + testAtlantisUrl = "http://example.com" + testLockingDBType = cmd.DefaultLockingDBType + testGitHubHostName = cmd.DefaultGHHostname + testGitHubUser = "user" + testMetricsInactivePRRetention = "67h" ) func TestNewServer_GitHubUser(t *testing.T) { @@ -50,11 +51,12 @@ func TestNewServer_GitHubUser(t *testing.T) { tmpDir := t.TempDir() _, err := server.NewServer( server.UserConfig{ - DataDir: tmpDir, - AtlantisURL: testAtlantisUrl, - LockingDBType: testLockingDBType, - GithubHostname: testGitHubHostName, - GithubUser: testGitHubUser, + DataDir: tmpDir, + AtlantisURL: testAtlantisUrl, + LockingDBType: testLockingDBType, + GithubHostname: testGitHubHostName, + GithubUser: testGitHubUser, + MetricsInactivePRRetention: testMetricsInactivePRRetention, }, server.Config{ AtlantisVersion: testAtlantisVersion, }, @@ -67,14 +69,27 @@ func TestNewServer_GitHubUser(t *testing.T) { func TestNewServer_InvalidAtlantisURL(t *testing.T) { tmpDir := t.TempDir() _, err := server.NewServer(server.UserConfig{ - DataDir: tmpDir, - AtlantisURL: "example.com", + DataDir: tmpDir, + AtlantisURL: "example.com", + MetricsInactivePRRetention: testMetricsInactivePRRetention, }, server.Config{ AtlantisURLFlag: "atlantis-url", }) ErrEquals(t, "parsing --atlantis-url flag \"example.com\": http or https must be specified", err) } +func TestNewServer_InvalidMetricsInactivePRRetention(t *testing.T) { + tmpDir := t.TempDir() + _, err := server.NewServer(server.UserConfig{ + DataDir: tmpDir, + AtlantisURL: testAtlantisUrl, + MetricsInactivePRRetention: "invalid", + }, server.Config{ + AtlantisURLFlag: "atlantis-url", + }) + ErrEquals(t, "parsing metrics-inactive-pr-retention: time: invalid duration \"invalid\"", err) +} + func TestIndex_LockErr(t *testing.T) { t.Log("index should return a 503 if unable to list locks") RegisterMockTestingT(t) diff --git a/server/user_config.go b/server/user_config.go index fcf25a535e..560fdae259 100644 --- a/server/user_config.go +++ b/server/user_config.go @@ -91,6 +91,7 @@ type UserConfig struct { ParallelApply bool `mapstructure:"parallel-apply"` PendingApplyStatus bool `mapstructure:"pending-apply-status"` StatsNamespace string `mapstructure:"stats-namespace"` + MetricsInactivePRRetention string `mapstructure:"metrics-inactive-pr-retention"` PlanDrafts bool `mapstructure:"allow-draft-prs"` Port int `mapstructure:"port"` QuietPolicyChecks bool `mapstructure:"quiet-policy-checks"` From 82539ca37dfd845f23cbf5cc9f31c0783a8be8f8 Mon Sep 17 00:00:00 2001 From: pablosan Date: Wed, 17 Jun 2026 16:46:37 +0200 Subject: [PATCH 2/2] fix: delete execution_failure metrics when PR scope closes Ensure Prometheus cleanup removes all per-PR counter metrics emitted by InstrumentedProjectCommandRunner, not just success and error counters. --- .../instrumented_project_command_runner.go | 2 +- server/events/mock_workingdir_test.go | 5 +- server/metrics/common.go | 7 +++ server/metrics/pr_scope_manager.go | 5 +- .../pr_scope_manager_integration_test.go | 2 +- .../pr_scope_manager_prom_deletion_test.go | 54 +++++++++++++++++-- 6 files changed, 64 insertions(+), 11 deletions(-) diff --git a/server/events/instrumented_project_command_runner.go b/server/events/instrumented_project_command_runner.go index ed3a14f8d0..2e18ca9f2e 100644 --- a/server/events/instrumented_project_command_runner.go +++ b/server/events/instrumented_project_command_runner.go @@ -28,7 +28,7 @@ func NewInstrumentedProjectCommandRunner(scope tally.Scope, projectCommandRunner projectTags := command.ProjectScopeTags{} scope = scope.SubScope("project").Tagged(projectTags.Loadtags()) - for _, m := range []string{metrics.ExecutionSuccessMetric, metrics.ExecutionErrorMetric, metrics.ExecutionFailureMetric} { + for _, m := range metrics.ExecutionCounterMetrics { metrics.InitCounter(scope, m) } diff --git a/server/events/mock_workingdir_test.go b/server/events/mock_workingdir_test.go index a5750125b3..7723e15292 100644 --- a/server/events/mock_workingdir_test.go +++ b/server/events/mock_workingdir_test.go @@ -4,11 +4,12 @@ package events import ( + "reflect" + "time" + pegomock "github.com/petergtz/pegomock/v4" models "github.com/runatlantis/atlantis/server/events/models" logging "github.com/runatlantis/atlantis/server/logging" - "reflect" - "time" ) type MockWorkingDir struct { diff --git a/server/metrics/common.go b/server/metrics/common.go index bd2cb1ccbb..5f8a2cd167 100644 --- a/server/metrics/common.go +++ b/server/metrics/common.go @@ -9,3 +9,10 @@ const ( ExecutionErrorMetric = "execution_error" ExecutionFailureMetric = "execution_failure" ) + +// ExecutionCounterMetrics lists per-PR counter metrics that must be deleted when a PR scope closes. +var ExecutionCounterMetrics = []string{ + ExecutionSuccessMetric, + ExecutionErrorMetric, + ExecutionFailureMetric, +} diff --git a/server/metrics/pr_scope_manager.go b/server/metrics/pr_scope_manager.go index 061af887aa..3f6be1e354 100644 --- a/server/metrics/pr_scope_manager.go +++ b/server/metrics/pr_scope_manager.go @@ -15,7 +15,7 @@ import ( ) // PRScopeManager manages separate root scopes for each PR, allowing them to be -// individually closed when PRs are done, preventing exessive resource consumption over time. +// individually closed when PRs are done, preventing excessive resource consumption over time. type PRScopeManager struct { logger logging.SimpleLogging baseReporter tally.BaseStatsReporter @@ -292,8 +292,7 @@ func (m *PRScopeManager) deletePrometheusMetrics(prScopeEntry *prScopeEntry) { } for _, subscope := range subscopes { - // Delete counters: execution_success and execution_error - for _, metricName := range []string{ExecutionSuccessMetric, ExecutionErrorMetric} { + for _, metricName := range ExecutionCounterMetrics { fullName := prefix if fullName != "" { fullName += separator diff --git a/server/metrics/pr_scope_manager_integration_test.go b/server/metrics/pr_scope_manager_integration_test.go index 5bc1d69604..5a984ef24f 100644 --- a/server/metrics/pr_scope_manager_integration_test.go +++ b/server/metrics/pr_scope_manager_integration_test.go @@ -11,9 +11,9 @@ import ( "github.com/runatlantis/atlantis/server/logging" "github.com/runatlantis/atlantis/server/metrics" "github.com/runatlantis/atlantis/server/scheduled" + . "github.com/runatlantis/atlantis/testing" tally "github.com/uber-go/tally/v4" tallyprom "github.com/uber-go/tally/v4/prometheus" - . "github.com/runatlantis/atlantis/testing" ) // TestPRScopeManager_ScheduledCleanupIntegration is an end-to-end test that verifies diff --git a/server/metrics/pr_scope_manager_prom_deletion_test.go b/server/metrics/pr_scope_manager_prom_deletion_test.go index 65fceb33b2..3cf8ef6adb 100644 --- a/server/metrics/pr_scope_manager_prom_deletion_test.go +++ b/server/metrics/pr_scope_manager_prom_deletion_test.go @@ -11,9 +11,9 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/runatlantis/atlantis/server/logging" "github.com/runatlantis/atlantis/server/metrics" + . "github.com/runatlantis/atlantis/testing" tally "github.com/uber-go/tally/v4" tallyprom "github.com/uber-go/tally/v4/prometheus" - . "github.com/runatlantis/atlantis/testing" ) // TestPRScopeManager_PrometheusMetricDeletion verifies that Prometheus metrics @@ -50,6 +50,7 @@ func TestPRScopeManager_PrometheusMetricDeletion(t *testing.T) { updateStatusScope := scope.SubScope("update_status") updateStatusScope.Counter(metrics.ExecutionSuccessMetric).Inc(5) updateStatusScope.Counter(metrics.ExecutionErrorMetric).Inc(2) + updateStatusScope.Counter(metrics.ExecutionFailureMetric).Inc(1) timer := updateStatusScope.Timer(metrics.ExecutionTimeMetric) stopwatch := timer.Start() time.Sleep(10 * time.Millisecond) @@ -337,6 +338,50 @@ func TestPRScopeManager_OnlyCreatedSubscopesDeleted(t *testing.T) { t.Log("✅ Test passed: Only the 2 subscopes that were created were targeted for cleanup") } +// TestPRScopeManager_ExecutionFailureDeletion verifies execution_failure counters are +// removed on PR close (emitted by InstrumentedProjectCommandRunner on plan/apply failures). +func TestPRScopeManager_ExecutionFailureDeletion(t *testing.T) { + logger := logging.NewNoopLogger(t) + + registry := prometheus.NewRegistry() + reporter := tallyprom.NewReporter(tallyprom.Options{ + Registerer: registry, + }) + + manager := metrics.NewPRScopeManager( + logger, + reporter, + tally.ScopeOptions{ + Prefix: "test_failure", + Separator: tallyprom.DefaultSeparator, + SanitizeOptions: &tallyprom.DefaultSanitizerOpts, + }, + 100*time.Millisecond, + 1*time.Hour, + ) + + scope := manager.GetOrCreatePRScope("owner/repo", 88, map[string]string{ + "base_repo": "owner/repo", + "pr_number": "88", + }) + scope.SubScope("plan").Counter(metrics.ExecutionFailureMetric).Inc(3) + + time.Sleep(150 * time.Millisecond) + + metricsFamilies, err := registry.Gather() + Ok(t, err) + beforeCount := countMetricsWithLabel(metricsFamilies, "pr_number", "88") + Assert(t, beforeCount > 0, "execution_failure metric should exist before deletion") + + manager.MarkPRClosed("owner/repo", 88) + time.Sleep(50 * time.Millisecond) + + metricsFamilies, err = registry.Gather() + Ok(t, err) + afterCount := countMetricsWithLabel(metricsFamilies, "pr_number", "88") + Equals(t, 0, afterCount) +} + // TestPRScopeManager_NonPrometheusReporter verifies that when using a non-Prometheus // reporter (like StatsD), the deletion code gracefully skips without errors. func TestPRScopeManager_NonPrometheusReporter(t *testing.T) { @@ -372,9 +417,10 @@ func TestPRScopeManager_NonPrometheusReporter(t *testing.T) { // testStatsReporter is a simple test reporter for verifying non-Prometheus behavior. type testStatsReporter struct{} -func (r *testStatsReporter) ReportCounter(name string, tags map[string]string, value int64) {} -func (r *testStatsReporter) ReportGauge(name string, tags map[string]string, value float64) {} -func (r *testStatsReporter) ReportTimer(name string, tags map[string]string, interval time.Duration) {} +func (r *testStatsReporter) ReportCounter(name string, tags map[string]string, value int64) {} +func (r *testStatsReporter) ReportGauge(name string, tags map[string]string, value float64) {} +func (r *testStatsReporter) ReportTimer(name string, tags map[string]string, interval time.Duration) { +} func (r *testStatsReporter) ReportHistogramValueSamples(name string, tags map[string]string, buckets tally.Buckets, bucketLowerBound, bucketUpperBound float64, samples int64) { } func (r *testStatsReporter) ReportHistogramDurationSamples(name string, tags map[string]string, buckets tally.Buckets, bucketLowerBound, bucketUpperBound time.Duration, samples int64) {