Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/retries on conflict error #74

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ Runtime Configuration:
- string key of an annotation on the workload containing a [RFC3339 Timestamp](https://datatracker.ietf.org/doc/html/rfc3339)
- when set grace-period will use the timestamp in the annotation instead of the creation time of the workload
- default: none (uses the workloads creation time)
- <span id="--max-retries-on-conflict">--max-retries-on-conflict</span>:
- integer
- sets the maximum number of retries for handling HTTP 409 conflict errors, which occur when another entity modifies a resource that the downscaler is currently processing
- default: 0

### Environment Variables

Expand Down
48 changes: 41 additions & 7 deletions cmd/kubedownscaler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"os"
"os/signal"
"regexp"
"strings"
"sync"
"syscall"
"time"
Expand All @@ -17,6 +18,7 @@ import (
"github.com/caas-team/gokubedownscaler/internal/pkg/scalable"
"github.com/caas-team/gokubedownscaler/internal/pkg/util"
"github.com/caas-team/gokubedownscaler/internal/pkg/values"
"k8s.io/apiserver/pkg/registry/generic/registry"
"k8s.io/client-go/tools/leaderelection"
)

Expand Down Expand Up @@ -185,19 +187,15 @@ func startScanning(
for _, workload := range workloads {
waitGroup.Add(1)

go func() {
slog.Debug("scanning workload", "workload", workload.GetName(), "namespace", workload.GetNamespace())

go func(workload scalable.Workload) {
defer waitGroup.Done()

err := scanWorkload(workload, client, ctx, layerCli, layerEnv, config)
err = attemptScan(client, ctx, layerCli, layerEnv, config, workload)
if err != nil {
slog.Error("failed to scan workload", "error", err, "workload", workload.GetName(), "namespace", workload.GetNamespace())
return
}

slog.Debug("successfully scanned workload", "workload", workload.GetName(), "namespace", workload.GetNamespace())
}()
}(workload)
}

waitGroup.Wait()
Expand All @@ -215,6 +213,42 @@ func startScanning(
return nil
}

// attemptScan scans the given workload, retrying when the scan fails because
// the workload was modified concurrently (optimistic-lock conflict).
//
// The scan runs at most config.MaxRetriesOnConflict+1 times and always at
// least once. Before each retry the workload is re-fetched through
// client.RegetWorkload so the next attempt operates on the current state.
//
// It returns nil as soon as one attempt succeeds. It returns a wrapped error
// when the scan fails for a reason other than a conflict, when re-fetching the
// workload fails, or when every attempt ended in a conflict. Logging of the
// returned error is left to the caller so it is reported exactly once.
//
// NOTE(review): conflicts are detected by matching the error text against
// registry.OptimisticLockErrorMsg. A typed check on the API status error
// (HTTP 409) would be more robust and would avoid depending on
// k8s.io/apiserver for a single string constant; confirm and switch if the
// error chain preserves the status error.
func attemptScan(
	client kubernetes.Client,
	ctx context.Context,
	layerCli, layerEnv *values.Layer,
	config *util.RuntimeConfiguration,
	workload scalable.Workload,
) error {
	slog.Debug("scanning workload", "workload", workload.GetName(), "namespace", workload.GetNamespace())

	// Guard against a negative retry count: the workload is always scanned once.
	maxAttempts := max(config.MaxRetriesOnConflict+1, 1)

	var lastErr error

	for attempt := range maxAttempts {
		err := scanWorkload(workload, client, ctx, layerCli, layerEnv, config)
		if err == nil {
			slog.Debug("successfully scanned workload", "workload", workload.GetName(), "namespace", workload.GetNamespace())

			return nil
		}

		// Only conflicts are worth retrying; anything else is reported immediately.
		if !strings.Contains(err.Error(), registry.OptimisticLockErrorMsg) {
			return fmt.Errorf("failed to scan workload: %w", err)
		}

		lastErr = err

		// No attempts left: skip the pointless re-fetch and report the conflict.
		if attempt+1 >= maxAttempts {
			break
		}

		slog.Warn("workload modified, retrying", "attempt", attempt+1, "workload", workload.GetName(), "namespace", workload.GetNamespace())

		if regetErr := client.RegetWorkload(workload, ctx); regetErr != nil {
			return fmt.Errorf("failed to fetch updated workload: %w", regetErr)
		}
	}

	// Every attempt hit a conflict; surface it instead of reporting success.
	return fmt.Errorf("failed to scan workload after %d attempts: %w", maxAttempts, lastErr)
}

// scanWorkload runs a scan on the workload, determining the scaling and scaling the workload.
func scanWorkload(
workload scalable.Workload,
Expand Down
2 changes: 1 addition & 1 deletion docs/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Causes:
Fixes:

- do not run multiple downscalers on the same resources
- it should just scale in the next scan cycle so there are probably no changes needed
- the `--max-retries-on-conflict` argument sets how many times the downscaler retries a resource after a conflict occurs. Without this optional argument the affected resource will likely still be scaled in the next scan cycle, but setting it is highly recommended when running with the `--once` argument

> [!Note]
> this is a pretty unavoidable issue due to there being no easy way to lock the resource from being edited while the downscaler is scaling it. The py-kube-downscaler solved this by just overwriting the changes made during scaling
34 changes: 33 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,43 @@ require (
github.com/zalando-incubator/stackset-controller v1.4.93
k8s.io/api v0.32.1
k8s.io/apimachinery v0.32.1
k8s.io/apiserver v0.32.0
k8s.io/client-go v0.32.1
)

require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/coreos/go-semver v0.3.1 // indirect
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/emicklei/go-restful/v3 v3.12.1 // indirect
github.com/evanphx/json-patch/v5 v5.9.0 // indirect
github.com/expr-lang/expr v1.16.9 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fsnotify/fsnotify v1.8.0 // indirect
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/btree v1.1.2 // indirect
github.com/google/gnostic-models v0.6.9 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.11 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/mailru/easyjson v0.9.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
Expand All @@ -46,11 +58,26 @@ require (
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.61.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/rogpeppe/go-internal v1.13.1 // indirect
github.com/spf13/cobra v1.8.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/stretchr/objx v0.5.2 // indirect
github.com/szuecs/routegroup-client v0.28.2 // indirect
github.com/x448/float16 v0.8.4 // indirect
go.etcd.io/etcd/api/v3 v3.5.17 // indirect
go.etcd.io/etcd/client/pkg/v3 v3.5.17 // indirect
go.etcd.io/etcd/client/v3 v3.5.17 // indirect
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.54.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.54.0 // indirect
go.opentelemetry.io/otel v1.33.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 // indirect
go.opentelemetry.io/otel/metric v1.33.0 // indirect
go.opentelemetry.io/otel/sdk v1.33.0 // indirect
go.opentelemetry.io/otel/trace v1.33.0 // indirect
go.opentelemetry.io/proto/otlp v1.4.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.0 // indirect
golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 // indirect
golang.org/x/net v0.34.0 // indirect
golang.org/x/oauth2 v0.25.0 // indirect
Expand All @@ -59,15 +86,20 @@ require (
golang.org/x/text v0.21.0 // indirect
golang.org/x/time v0.9.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20241209162323-e6fa225c2576 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20241209162323-e6fa225c2576 // indirect
google.golang.org/grpc v1.69.2 // indirect
google.golang.org/protobuf v1.36.2 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/apiextensions-apiserver v0.32.0 // indirect
k8s.io/component-base v0.32.0 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7 // indirect
k8s.io/utils v0.0.0-20241210054802-24370beab758 // indirect
knative.dev/pkg v0.0.0-20250110150618-accfe3649188 // indirect
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 // indirect
sigs.k8s.io/controller-runtime v0.19.4 // indirect
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.5.0 // indirect
Expand Down
Loading
Loading