Skip to content

Commit

Permalink
fix(RELEASE-1277): release controller crashes
Browse files Browse the repository at this point in the history
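This PR attempts to reduce the daily crashes of the release controller
by restricting the manager's cache to PipelineRuns labeled with the
release service name and by increasing the leader-election lease,
renew-deadline, and retry-period durations.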

Signed-off-by: Leandro Mendes <[email protected]>
  • Loading branch information
theflockers committed Jan 24, 2025
1 parent c49d620 commit 2b94c4a
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 0 deletions.
6 changes: 6 additions & 0 deletions config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ spec:
- /manager
args:
- --leader-elect
# The leader-election timings are Go duration flags: each value needs a unit
# suffix (e.g. "60s"), and container args must be strings, so bare integers
# such as 60 are rejected. Keep lease-duration > renew-deadline > retry-period.
- --lease-duration=60s
- --leader-renew-deadline=30s
- --leader-elector-retry-period=10s
image: controller:latest
name: manager
securityContext:
Expand Down
3 changes: 3 additions & 0 deletions controllers/release/adapter.go
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,7 @@ func (a *adapter) createFinalPipelineRun(releasePlan *v1alpha1.ReleasePlan, snap
WithLabels(map[string]string{
metadata.ApplicationNameLabel: releasePlan.Spec.Application,
metadata.PipelinesTypeLabel: metadata.FinalPipelineType,
metadata.ServiceNameLabel: metadata.ServiceName,
metadata.ReleaseNameLabel: a.release.Name,
metadata.ReleaseNamespaceLabel: a.release.Namespace,
metadata.ReleaseSnapshotLabel: a.release.Spec.Snapshot,
Expand Down Expand Up @@ -530,6 +531,7 @@ func (a *adapter) createManagedPipelineRun(resources *loader.ProcessingResources
WithLabels(map[string]string{
metadata.ApplicationNameLabel: resources.ReleasePlan.Spec.Application,
metadata.PipelinesTypeLabel: metadata.ManagedPipelineType,
metadata.ServiceNameLabel: metadata.ServiceName,
metadata.ReleaseNameLabel: a.release.Name,
metadata.ReleaseNamespaceLabel: a.release.Namespace,
metadata.ReleaseSnapshotLabel: a.release.Spec.Snapshot,
Expand Down Expand Up @@ -571,6 +573,7 @@ func (a *adapter) createTenantPipelineRun(releasePlan *v1alpha1.ReleasePlan, sna
WithLabels(map[string]string{
metadata.ApplicationNameLabel: releasePlan.Spec.Application,
metadata.PipelinesTypeLabel: metadata.TenantPipelineType,
metadata.ServiceNameLabel: metadata.ServiceName,
metadata.ReleaseNameLabel: a.release.Name,
metadata.ReleaseNamespaceLabel: a.release.Namespace,
metadata.ReleaseSnapshotLabel: a.release.Spec.Snapshot,
Expand Down
25 changes: 25 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ import (
"crypto/tls"
"flag"
"os"
"time"

"sigs.k8s.io/controller-runtime/pkg/metrics/server"
crwebhook "sigs.k8s.io/controller-runtime/pkg/webhook"

"github.com/konflux-ci/operator-toolkit/controller"
"github.com/konflux-ci/operator-toolkit/webhook"
"github.com/konflux-ci/release-service/api/v1alpha1/webhooks"
"github.com/konflux-ci/release-service/metadata"

"go.uber.org/zap/zapcore"

Expand All @@ -37,10 +39,13 @@ import (
ecapiv1alpha1 "github.com/enterprise-contract/enterprise-contract-controller/api/v1alpha1"
applicationapiv1alpha1 "github.com/redhat-appstudio/application-api/api/v1alpha1"

"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"

Expand Down Expand Up @@ -70,13 +75,23 @@ func main() {
var metricsAddr string
var enableHttp2 bool
var enableLeaderElection bool
var leaderRenewDeadline time.Duration
var leaseDuration time.Duration
var leaderElectorRetryPeriod time.Duration

Check warning on line 80 in main.go

View check run for this annotation

Codecov / codecov/patch

main.go#L78-L80

Added lines #L78 - L80 were not covered by tests
var probeAddr string
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableHttp2, "enable-http2", false, "Enable HTTP/2 for the metrics and webhook servers.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
"Enable leader election for controller manager. "+
"Enabling this will ensure there is only one active controller manager.")
flag.DurationVar(&leaderRenewDeadline, "leader-renew-deadline", 10*time.Second,
"Leader RenewDeadline is the duration that the acting controlplane "+
"will retry refreshing leadership before giving up.")
flag.DurationVar(&leaseDuration, "lease-duration", 15*time.Second,
"Lease Duration is the duration that non-leader candidates will wait to force acquire leadership.")
flag.DurationVar(&leaderElectorRetryPeriod, "leader-elector-retry-period", 2*time.Second, "RetryPeriod is the duration the "+
"LeaderElector clients should wait between tries of actions.")

Check warning on line 94 in main.go

View check run for this annotation

Codecov / codecov/patch

main.go#L88-L94

Added lines #L88 - L94 were not covered by tests
opts := zap.Options{
Development: true,
TimeEncoder: zapcore.ISO8601TimeEncoder,
Expand All @@ -87,9 +102,19 @@ func main() {
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Cache: cache.Options{
ByObject: map[client.Object]cache.ByObject{
&tektonv1.PipelineRun{}: cache.ByObject{
Label: labels.SelectorFromSet(labels.Set{metadata.ServiceNameLabel: metadata.ServiceName}),
},
},
},

Check warning on line 111 in main.go

View check run for this annotation

Codecov / codecov/patch

main.go#L105-L111

Added lines #L105 - L111 were not covered by tests
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "f3d4c01a.redhat.com",
RenewDeadline: &leaderRenewDeadline,
LeaseDuration: &leaseDuration,
RetryPeriod: &leaderElectorRetryPeriod,

Check warning on line 117 in main.go

View check run for this annotation

Codecov / codecov/patch

main.go#L115-L117

Added lines #L115 - L117 were not covered by tests
Metrics: server.Options{
BindAddress: metricsAddr,
},
Expand Down
6 changes: 6 additions & 0 deletions metadata/labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ const (

// MaxLabelLength is the maximum allowed characters in a label value
MaxLabelLength = 63

// ServiceName is the name of the release service
ServiceName = "release"
)

// Labels used by the release api package
Expand All @@ -41,6 +44,9 @@ var (
// AutomatedLabel is the label name for marking a Release as automated
AutomatedLabel = fmt.Sprintf("release.%s/automated", rhtapDomain)

// ServiceNameLabel is the label used to specify the service associated with an object
ServiceNameLabel = fmt.Sprintf("%s/%s", rhtapDomain, "service")

// ReleasePlanAdmissionLabel is the ReleasePlan label for the name of the ReleasePlanAdmission to use
ReleasePlanAdmissionLabel = fmt.Sprintf("release.%s/releasePlanAdmission", rhtapDomain)
)
Expand Down

0 comments on commit 2b94c4a

Please sign in to comment.