Skip to content

Commit 36e752f

Browse files
committed
Fix the clusterstatus based on last operatorAction
1 parent e7f898b commit 36e752f

File tree

10 files changed

+58
-44
lines changed

10 files changed

+58
-44
lines changed

.bazelrc

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
build --workspace_status_command hack/build/print-workspace-status.sh
2-
test --test_output=errors --test_timeout=-1,-1,-1,2400
2+
test --test_output=all --test_timeout=-1,-1,-1,2400

config/rbac/role.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,9 @@ rules:
148148
resources:
149149
- persistentvolumeclaims
150150
verbs:
151+
- delete
151152
- list
152153
- update
153-
- delete
154154
- apiGroups:
155155
- ""
156156
resources:

e2e/e2e.go

+4
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,8 @@ const (
3434
NonExistentVersion = "cockroachdb/cockroach-non-existent:v21.1.999"
3535
SkipFeatureVersion = "cockroachdb/cockroach:v20.1.0"
3636
InvalidImage = "nginx:latest"
37+
DefaultCPULimit = "1"
38+
DefaultMemoryLimit = "4Gi"
39+
DefaultCPURequest = "500m"
40+
DefaultMemoryRequest = "1Gi"
3741
)

e2e/upgrades/BUILD.bazel

+2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ go_test(
1111
"//pkg/testutil/env:go_default_library",
1212
"@com_github_go_logr_zapr//:go_default_library",
1313
"@com_github_stretchr_testify//require:go_default_library",
14+
"@io_k8s_api//core/v1:go_default_library",
15+
"@io_k8s_apimachinery//pkg/api/resource:go_default_library",
1416
"@io_k8s_sigs_controller_runtime//pkg/client:go_default_library",
1517
"@org_uber_go_zap//zaptest:go_default_library",
1618
],

e2e/upgrades/upgrades_test.go

+25-7
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,21 @@ import (
2828
"github.com/go-logr/zapr"
2929
"github.com/stretchr/testify/require"
3030
"go.uber.org/zap/zaptest"
31+
corev1 "k8s.io/api/core/v1"
32+
"k8s.io/apimachinery/pkg/api/resource"
33+
)
34+
35+
var (
36+
resRequirements = corev1.ResourceRequirements{
37+
Limits: corev1.ResourceList{
38+
corev1.ResourceCPU: resource.MustParse(e2e.DefaultCPULimit),
39+
corev1.ResourceMemory: resource.MustParse(e2e.DefaultMemoryLimit),
40+
},
41+
Requests: corev1.ResourceList{
42+
corev1.ResourceCPU: resource.MustParse(e2e.DefaultCPURequest),
43+
corev1.ResourceMemory: resource.MustParse(e2e.DefaultMemoryRequest),
44+
},
45+
}
3146
)
3247

3348
// TestUpgradesMinorVersion tests a minor version bump
@@ -52,7 +67,7 @@ func TestUpgradesMinorVersion(t *testing.T) {
5267

5368
builder := testutil.NewBuilder("crdb").WithNodeCount(3).WithTLS().
5469
WithImage(e2e.MinorVersion1).
55-
WithPVDataStore("1Gi")
70+
WithPVDataStore("1Gi").WithResources(resRequirements)
5671

5772
steps := testutil.Steps{
5873
{
@@ -103,7 +118,7 @@ func TestUpgradesMajorVersion20to21(t *testing.T) {
103118

104119
builder := testutil.NewBuilder("crdb").WithNodeCount(3).WithTLS().
105120
WithImage(e2e.MinorVersion2).
106-
WithPVDataStore("1Gi")
121+
WithPVDataStore("1Gi").WithResources(resRequirements)
107122

108123
steps := testutil.Steps{
109124
{
@@ -152,7 +167,7 @@ func TestUpgradesMajorVersion20_1To20_2(t *testing.T) {
152167

153168
builder := testutil.NewBuilder("crdb").WithNodeCount(3).WithTLS().
154169
WithImage("cockroachdb/cockroach:v20.1.16").
155-
WithPVDataStore("1Gi")
170+
WithPVDataStore("1Gi").WithResources(resRequirements)
156171

157172
steps := testutil.Steps{
158173
{
@@ -209,7 +224,8 @@ func TestUpgradesMinorVersionThenRollback(t *testing.T) {
209224
WithNodeCount(3).
210225
WithTLS().
211226
WithImage(e2e.MinorVersion1).
212-
WithPVDataStore("1Gi")
227+
WithPVDataStore("1Gi").
228+
WithResources(resRequirements)
213229

214230
steps := testutil.Steps{
215231
{
@@ -274,7 +290,8 @@ func TestUpgradeWithInvalidVersion(t *testing.T) {
274290

275291
builder := testutil.NewBuilder("crdb").WithNodeCount(3).WithTLS().
276292
WithImage(e2e.MinorVersion1).
277-
WithPVDataStore("1Gi")
293+
WithPVDataStore("1Gi").
294+
WithResources(resRequirements)
278295

279296
steps := testutil.Steps{
280297
{
@@ -324,7 +341,8 @@ func TestUpgradeWithInvalidImage(t *testing.T) {
324341

325342
builder := testutil.NewBuilder("crdb").WithNodeCount(3).WithTLS().
326343
WithImage(e2e.MinorVersion1).
327-
WithPVDataStore("1Gi")
344+
WithPVDataStore("1Gi").
345+
WithResources(resRequirements)
328346

329347
steps := testutil.Steps{
330348
{
@@ -374,7 +392,7 @@ func TestUpgradeWithMajorVersionExcludingMajorFeature(t *testing.T) {
374392

375393
builder := testutil.NewBuilder("crdb").WithNodeCount(3).WithTLS().
376394
WithImage(e2e.SkipFeatureVersion).
377-
WithPVDataStore("1Gi")
395+
WithPVDataStore("1Gi").WithResources(resRequirements)
378396

379397
steps := testutil.Steps{
380398
{

install/operator.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -161,9 +161,9 @@ rules:
161161
resources:
162162
- persistentvolumeclaims
163163
verbs:
164+
- delete
164165
- list
165166
- update
166-
- delete
167167
- apiGroups:
168168
- ""
169169
resources:

pkg/actor/validate_version.go

+13-14
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
116116
cluster.SetCrdbContainerImage(containerImage)
117117
cluster.SetAnnotationContainerImage(containerImage)
118118
jobName := cluster.JobName()
119-
changed, err := (resource.Reconciler{
119+
_, err := (resource.Reconciler{
120120
ManagedResource: r,
121121
Builder: resource.JobBuilder{
122122
Cluster: cluster,
@@ -132,17 +132,10 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
132132
log.Error(err, "failed to reconcile job")
133133
return err
134134
}
135-
log.Error(err, "failed to reconcile job only err: ", err.Error())
135+
log.Error(err, "failed to reconcile job only error")
136136
return err
137137
}
138138

139-
if changed {
140-
log.V(int(zapcore.DebugLevel)).Info("created/updated job, stopping request processing")
141-
// Return a non error error here to prevent the controller from
142-
// clearing any previously set Status fields.
143-
return NotReadyErr{errors.New("job changed")}
144-
}
145-
146139
log.V(int(zapcore.DebugLevel)).Info("version checker", "job", jobName)
147140
key := kubetypes.NamespacedName{
148141
Namespace: cluster.Namespace(),
@@ -184,11 +177,15 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
184177
}
185178
return errors.Wrapf(err, "failed to check the version of the crdb")
186179
}
180+
181+
// Tail only last two lines as one line contains version info and one line has newline character. It is much faster.
182+
podLines := int64(2)
187183
podLogOpts := corev1.PodLogOptions{
188184
Container: resource.JobContainerName,
185+
TailLines: &podLines,
189186
}
190-
//get pod for the job we created
191187

188+
//get pod for the job we created
192189
pods, err := v.clientset.CoreV1().Pods(job.Namespace).List(ctx, metav1.ListOptions{
193190
LabelSelector: labels.Set(job.Spec.Selector.MatchLabels).AsSelector().String(),
194191
})
@@ -231,11 +228,12 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
231228
}
232229
output := buf.String()
233230

231+
log.Info("Logs of version checker pod", "log", output)
234232
// This is the value from Build Tag taken from the container
235233
calVersion = strings.Replace(output, "\n", "", -1)
236-
// if no image is retrieved we exit
234+
// if no logs are found we will retry later to fetch the logs
237235
if calVersion == "" {
238-
err := PermanentErr{Err: errors.New("failed to check the version of the cluster")}
236+
err := NotReadyErr{Err: errors.New("failed to check the version of the cluster")}
239237
log.Error(err, "crdb version not found")
240238
return err
241239
}
@@ -366,7 +364,8 @@ func IsJobPodRunning(
366364
}
367365
pod := pods.Items[0]
368366
if !kube.IsPodReady(&pod) {
369-
return LogError("job pod is not ready yet waiting longer", nil, l)
367+
pErr := fmt.Errorf("pod %s is not ready, and in %s phase", pod.Name, pod.Status.Phase)
368+
return LogError("job pod is not ready yet waiting longer", pErr, l)
370369
}
371370
l.V(int(zapcore.DebugLevel)).Info("job pod is ready")
372371
return nil
@@ -406,7 +405,7 @@ func WaitUntilJobPodIsRunning(ctx context.Context, clientset kubernetes.Interfac
406405
return IsJobPodRunning(ctx, clientset, job, l)
407406
}
408407
b := backoff.NewExponentialBackOff()
409-
b.MaxElapsedTime = 120 * time.Second
408+
b.MaxElapsedTime = 180 * time.Second
410409
b.MaxInterval = 10 * time.Second
411410
if err := backoff.Retry(f, b); err != nil {
412411
return errors.Wrapf(err, "pod is not running for job: %s", job.Name)

pkg/clusterstatus/clusterstatus.go

+9-15
Original file line numberDiff line numberDiff line change
@@ -31,25 +31,19 @@ func SetClusterStatusOnFirstReconcile(status *api.CrdbClusterStatus) {
3131
status.ClusterStatus = api.ActionStatus(api.Starting).String()
3232
}
3333

34-
// TODO: Do we need to check for all OperatorActions or for just the 0th element in SetClusterStatus func
35-
// nolint
34+
// SetClusterStatus will set the status of the cluster based on the status of the operator actions
35+
// operatorActions are not always appended, rather any particular action type is replaced if its status changes.
36+
// The status of the cluster is set to the status of the last operator action based on lastTransitionTime.
3637
func SetClusterStatus(status *api.CrdbClusterStatus) {
3738
InitOperatorActionsIfNeeded(status, metav1.Now())
38-
for _, a := range status.OperatorActions {
39-
if a.Status == api.ActionStatus(api.Failed).String() {
40-
status.ClusterStatus = api.ActionStatus(api.Failed).String()
41-
return
39+
for i := range status.OperatorActions {
40+
if i == 0 || status.OperatorActions[i-1].LastTransitionTime.Before(&status.OperatorActions[i].LastTransitionTime) {
41+
status.ClusterStatus = status.OperatorActions[i].Status
4242
}
43-
44-
if a.Status == api.ActionStatus(api.Unknown).String() {
45-
status.ClusterStatus = api.ActionStatus(api.Unknown).String()
46-
return
47-
}
48-
49-
status.ClusterStatus = api.ActionStatus(api.Finished).String()
50-
return
5143
}
52-
status.ClusterStatus = api.ActionStatus(api.Finished).String()
44+
if status.ClusterStatus == "" {
45+
status.ClusterStatus = api.ActionStatus(api.Starting).String()
46+
}
5347
}
5448

5549
func InitOperatorActionsIfNeeded(status *api.CrdbClusterStatus, now metav1.Time) {

pkg/controller/cluster_controller.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ type ClusterReconciler struct {
6969
// +kubebuilder:rbac:groups=core,resources=pods/exec,verbs=create
7070
// +kubebuilder:rbac:groups=core,resources=pods/log,verbs=get
7171
// +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list
72-
// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=list;update
72+
// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=list;update;delete
7373
// +kubebuilder:rbac:groups=core,resources=serviceaccounts,verbs=get;list;create;watch
7474
// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles,verbs=get;list;create;watch
7575
// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=rolebindings,verbs=get;list;create;watch

pkg/resource/job.go

+1-4
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ func (b JobBuilder) Build(obj client.Object) error {
6262
job.Annotations = b.Spec().AdditionalAnnotations
6363

6464
// we recreate spec from ground only if we do not find the container job
65-
if dbContainer, err := kube.FindContainer(JobContainerName, &job.Spec.Template.Spec); err != nil {
65+
if _, err := kube.FindContainer(JobContainerName, &job.Spec.Template.Spec); err != nil {
6666
backoffLimit := int32(2)
6767
job.Spec = kbatch.JobSpec{
6868
// This field is alpha-level and is only honored by servers that enable the TTLAfterFinished feature.
@@ -71,9 +71,6 @@ func (b JobBuilder) Build(obj client.Object) error {
7171
Template: b.buildPodTemplate(),
7272
BackoffLimit: &backoffLimit,
7373
}
74-
} else {
75-
//if job with the container already exists we update the image only
76-
dbContainer.Image = b.GetCockroachDBImageName()
7774
}
7875

7976
return nil

0 commit comments

Comments
 (0)