Skip to content

Commit 16e3d7b

Browse files
authoredJan 18, 2022
Merge pull request #24841 from chaodaiG/crier-gerrit-multiple-reporters
Crier gerrit reporter: worker can be more than 1
2 parents 56b8c7e + b7ebf19 commit 16e3d7b

14 files changed

+892
-201
lines changed
 

‎prow/cmd/crier/main.go

+1-9
Original file line numberDiff line numberDiff line change
@@ -78,14 +78,6 @@ type options struct {
7878
}
7979

8080
func (o *options) validate() error {
81-
82-
// TODO(krzyzacy): gerrit && github report are actually stateful..
83-
// Need a better design to re-enable parallel reporting
84-
if o.gerritWorkers > 1 {
85-
logrus.Warn("gerrit reporter only supports one worker")
86-
o.gerritWorkers = 1
87-
}
88-
8981
if o.gerritWorkers+o.pubsubWorkers+o.githubWorkers+o.slackWorkers+o.gcsWorkers+o.k8sGCSWorkers+o.blobStorageWorkers+o.k8sBlobStorageWorkers <= 0 {
9082
return errors.New("crier need to have at least one report worker to start")
9183
}
@@ -253,7 +245,7 @@ func main() {
253245
}
254246

255247
if o.gerritWorkers > 0 {
256-
gerritReporter, err := gerritreporter.NewReporter(cfg, o.cookiefilePath, o.gerritProjects, mgr.GetCache())
248+
gerritReporter, err := gerritreporter.NewReporter(cfg, o.cookiefilePath, o.gerritProjects, mgr.GetClient())
257249
if err != nil {
258250
logrus.WithError(err).Fatal("Error starting gerrit reporter")
259251
}

‎prow/cmd/crier/main_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,10 @@ func TestOptions(t *testing.T) {
5151
},
5252
//Gerrit Reporter
5353
{
54-
name: "gerrit only support one worker",
54+
name: "gerrit supports multiple workers",
5555
args: []string{"--gerrit-workers=99", "--gerrit-projects=foo=bar", "--cookiefile=foobar", "--config-path=foo"},
5656
expected: &options{
57-
gerritWorkers: 1,
57+
gerritWorkers: 99,
5858
cookiefilePath: "foobar",
5959
gerritProjects: map[string][]string{
6060
"foo": {"bar"},
@@ -74,7 +74,7 @@ func TestOptions(t *testing.T) {
7474
name: "gerrit missing --cookiefile",
7575
args: []string{"--gerrit-workers=5", "--gerrit-projects=foo=bar", "--config-path=foo"},
7676
expected: &options{
77-
gerritWorkers: 1,
77+
gerritWorkers: 5,
7878
gerritProjects: map[string][]string{
7979
"foo": {"bar"},
8080
},

‎prow/crier/BUILD.bazel

+2-3
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,10 @@ go_library(
1010
visibility = ["//visibility:public"],
1111
deps = [
1212
"//prow/apis/prowjobs/v1:go_default_library",
13+
"//prow/crier/reporters/criercommonlib:go_default_library",
1314
"@com_github_prometheus_client_golang//prometheus:go_default_library",
1415
"@com_github_sirupsen_logrus//:go_default_library",
1516
"@io_k8s_apimachinery//pkg/api/errors:go_default_library",
16-
"@io_k8s_apimachinery//pkg/types:go_default_library",
17-
"@io_k8s_apimachinery//pkg/util/wait:go_default_library",
18-
"@io_k8s_client_go//util/retry:go_default_library",
1917
"@io_k8s_sigs_controller_runtime//pkg/builder:go_default_library",
2018
"@io_k8s_sigs_controller_runtime//pkg/client:go_default_library",
2119
"@io_k8s_sigs_controller_runtime//pkg/controller:go_default_library",
@@ -35,6 +33,7 @@ filegroup(
3533
name = "all-srcs",
3634
srcs = [
3735
":package-srcs",
36+
"//prow/crier/reporters/criercommonlib:all-srcs",
3837
"//prow/crier/reporters/gcs:all-srcs",
3938
"//prow/crier/reporters/gerrit:all-srcs",
4039
"//prow/crier/reporters/github:all-srcs",

‎prow/crier/controller.go

+2-64
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,14 @@ import (
2424

2525
"github.com/sirupsen/logrus"
2626
"k8s.io/apimachinery/pkg/api/errors"
27-
"k8s.io/apimachinery/pkg/types"
28-
"k8s.io/apimachinery/pkg/util/wait"
29-
"k8s.io/client-go/util/retry"
3027
"sigs.k8s.io/controller-runtime/pkg/builder"
3128
ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
3229
"sigs.k8s.io/controller-runtime/pkg/controller"
3330
"sigs.k8s.io/controller-runtime/pkg/manager"
3431
"sigs.k8s.io/controller-runtime/pkg/reconcile"
3532

3633
prowv1 "k8s.io/test-infra/prow/apis/prowjobs/v1"
34+
"k8s.io/test-infra/prow/crier/reporters/criercommonlib"
3735
)
3836

3937
type ReportClient interface {
@@ -80,66 +78,6 @@ func New(
8078
return nil
8179
}
8280

83-
func (r *reconciler) updateReportState(ctx context.Context, pj *prowv1.ProwJob, log *logrus.Entry, reportedState prowv1.ProwJobState) error {
84-
// update pj report status
85-
newpj := pj.DeepCopy()
86-
// we set omitempty on PrevReportStates, so here we need to init it if is nil
87-
if newpj.Status.PrevReportStates == nil {
88-
newpj.Status.PrevReportStates = map[string]prowv1.ProwJobState{}
89-
}
90-
newpj.Status.PrevReportStates[r.reporter.GetName()] = reportedState
91-
92-
if err := r.pjclientset.Patch(ctx, newpj, ctrlruntimeclient.MergeFrom(pj)); err != nil {
93-
return fmt.Errorf("failed to patch: %w", err)
94-
}
95-
96-
// Block until the update is in the lister to make sure that events from another controller
97-
// that also does reporting dont trigger another report because our lister doesn't yet contain
98-
// the updated Status
99-
name := types.NamespacedName{Namespace: pj.Namespace, Name: pj.Name}
100-
if err := wait.Poll(100*time.Millisecond, 10*time.Second, func() (bool, error) {
101-
if err := r.pjclientset.Get(ctx, name, pj); err != nil {
102-
return false, err
103-
}
104-
if pj.Status.PrevReportStates != nil &&
105-
pj.Status.PrevReportStates[r.reporter.GetName()] == reportedState {
106-
return true, nil
107-
}
108-
return false, nil
109-
}); err != nil {
110-
return fmt.Errorf("failed to wait for updated report status to be in lister: %w", err)
111-
}
112-
return nil
113-
}
114-
115-
func (r *reconciler) updateReportStateWithRetries(ctx context.Context, pj *prowv1.ProwJob, log *logrus.Entry) error {
116-
reportState := pj.Status.State
117-
log = log.WithFields(logrus.Fields{
118-
"prowjob": pj.Name,
119-
"jobName": pj.Spec.Job,
120-
"jobStatus": reportState,
121-
})
122-
// We have to retry here, if we return we lose the information that we already reported this job.
123-
if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
124-
// Get it first, this is very cheap
125-
name := types.NamespacedName{Namespace: pj.Namespace, Name: pj.Name}
126-
if err := r.pjclientset.Get(ctx, name, pj); err != nil {
127-
return err
128-
}
129-
// Must not wrap until we have kube 1.19, otherwise the RetryOnConflict won't recognize conflicts
130-
// correctly
131-
return r.updateReportState(ctx, pj, log, reportState)
132-
}); err != nil {
133-
// Very subpar, we will report again. But even if we didn't do that now, we would do so
134-
// latest when crier gets restarted. In an ideal world, all reporters are idempotent and
135-
// reporting has no cost.
136-
return fmt.Errorf("failed to update report state on prowjob: %w", err)
137-
}
138-
139-
log.Info("Successfully updated report state on prowjob")
140-
return nil
141-
}
142-
14381
// Reconcile retrieves each queued item and takes the necessary handler action based off of if
14482
// the item was created or deleted.
14583
func (r *reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
@@ -207,7 +145,7 @@ func (r *reconciler) reconcile(ctx context.Context, log *logrus.Entry, req recon
207145
log.WithField("job-count", len(pjs)).Info("Reported job(s), now will update pj(s).")
208146
var lastErr error
209147
for _, pjob := range pjs {
210-
if err := r.updateReportStateWithRetries(ctx, pjob, log); err != nil {
148+
if err := criercommonlib.UpdateReportStateWithRetries(ctx, pjob, log, r.pjclientset, r.reporter.GetName()); err != nil {
211149
log.WithError(err).Error("Failed to update report state on prowjob")
212150
// The error above is alreay logged, so it would be duplicated
213151
// effort to combine all errors to return, only capture the last
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
2+
3+
go_library(
4+
name = "go_default_library",
5+
srcs = [
6+
"shardedlock.go",
7+
"updatereportstatus.go",
8+
],
9+
importpath = "k8s.io/test-infra/prow/crier/reporters/criercommonlib",
10+
visibility = ["//visibility:public"],
11+
deps = [
12+
"//prow/apis/prowjobs/v1:go_default_library",
13+
"@com_github_sirupsen_logrus//:go_default_library",
14+
"@io_k8s_apimachinery//pkg/types:go_default_library",
15+
"@io_k8s_apimachinery//pkg/util/wait:go_default_library",
16+
"@io_k8s_client_go//util/retry:go_default_library",
17+
"@io_k8s_sigs_controller_runtime//pkg/client:go_default_library",
18+
"@org_golang_x_sync//semaphore:go_default_library",
19+
],
20+
)
21+
22+
filegroup(
23+
name = "package-srcs",
24+
srcs = glob(["**"]),
25+
tags = ["automanaged"],
26+
visibility = ["//visibility:private"],
27+
)
28+
29+
filegroup(
30+
name = "all-srcs",
31+
srcs = [":package-srcs"],
32+
tags = ["automanaged"],
33+
visibility = ["//visibility:public"],
34+
)
35+
36+
go_test(
37+
name = "go_default_test",
38+
srcs = ["shardedlock_test.go"],
39+
embed = [":go_default_library"],
40+
tags = ["manual"],
41+
deps = ["@org_golang_x_sync//semaphore:go_default_library"],
42+
)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/*
2+
Copyright 2022 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
// Package criercommonlib contains shared lib used by reporters
18+
package criercommonlib
19+
20+
import (
21+
"context"
22+
"time"
23+
24+
"github.com/sirupsen/logrus"
25+
"golang.org/x/sync/semaphore"
26+
)
27+
28+
// SimplePull contains info for identifying a shard
29+
type SimplePull struct {
30+
org, repo string
31+
number int
32+
}
33+
34+
// NewSimplePull creates SimplePull
35+
func NewSimplePull(org, repo string, number int) *SimplePull {
36+
return &SimplePull{org: org, repo: repo, number: number}
37+
}
38+
39+
// ShardedLock contains sharding information based on PRs
40+
type ShardedLock struct {
41+
// semaphore is chosed over mutex, as Acquire from semaphore respects
42+
// context timeout while mutex doesn't
43+
mapLock *semaphore.Weighted
44+
locks map[SimplePull]*semaphore.Weighted
45+
}
46+
47+
// NewShardedLock creates ShardedLock
48+
func NewShardedLock() *ShardedLock {
49+
return &ShardedLock{
50+
mapLock: semaphore.NewWeighted(1),
51+
locks: map[SimplePull]*semaphore.Weighted{},
52+
}
53+
}
54+
55+
// GetLock aquires the lock for a PR
56+
func (s *ShardedLock) GetLock(ctx context.Context, key SimplePull) (*semaphore.Weighted, error) {
57+
if err := s.mapLock.Acquire(ctx, 1); err != nil {
58+
return nil, err
59+
}
60+
defer s.mapLock.Release(1)
61+
if _, exists := s.locks[key]; !exists {
62+
s.locks[key] = semaphore.NewWeighted(1)
63+
}
64+
return s.locks[key], nil
65+
}
66+
67+
// Cleanup deletes all locks by acquiring first
68+
// the mapLock and then each individual lock before
69+
// deleting it. The individual lock must be acquired
70+
// because otherwise it may be held, we delete it from
71+
// the map, it gets recreated and acquired and two
72+
// routines report in parallel for the same job.
73+
// Note that while this function is running, no new
74+
// presubmit reporting can happen, as we hold the mapLock.
75+
func (s *ShardedLock) Cleanup() {
76+
ctx := context.Background()
77+
s.mapLock.Acquire(ctx, 1)
78+
defer s.mapLock.Release(1)
79+
80+
for key, lock := range s.locks {
81+
// There is a very low chance of race condition, that two threads got
82+
// different locks from the same PR, which would end up with duplicated
83+
// report once. Since this is very complicated to fix and the impact is
84+
// really low, would just keep it as is.
85+
// For details see: https://github.com/kubernetes/test-infra/pull/20343
86+
lock.Acquire(ctx, 1)
87+
delete(s.locks, key)
88+
lock.Release(1)
89+
}
90+
}
91+
92+
// RunCleanup asynchronously runs the cleanup once per hour.
93+
func (s *ShardedLock) RunCleanup() {
94+
go func() {
95+
for range time.Tick(time.Hour) {
96+
logrus.Debug("Starting to clean up presubmit locks")
97+
startTime := time.Now()
98+
s.Cleanup()
99+
logrus.WithField("duration", time.Since(startTime).String()).Debug("Finished cleaning up presubmit locks")
100+
}
101+
}()
102+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
Copyright 2022 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
// Package criercommonlib contains shared lib used by reporters
18+
package criercommonlib
19+
20+
import (
21+
"testing"
22+
23+
"golang.org/x/sync/semaphore"
24+
)
25+
26+
func TestShardedLockCleanup(t *testing.T) {
27+
t.Parallel()
28+
sl := &ShardedLock{mapLock: semaphore.NewWeighted(1), locks: map[SimplePull]*semaphore.Weighted{}}
29+
key := SimplePull{"org", "repo", 1}
30+
sl.locks[key] = semaphore.NewWeighted(1)
31+
sl.Cleanup()
32+
if _, exists := sl.locks[key]; exists {
33+
t.Error("lock didn't get cleaned up")
34+
}
35+
36+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/*
2+
Copyright 2022 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
// Package criercommonlib contains shared lib used by reporters
18+
package criercommonlib
19+
20+
import (
21+
"context"
22+
"fmt"
23+
"time"
24+
25+
"github.com/sirupsen/logrus"
26+
"k8s.io/apimachinery/pkg/types"
27+
"k8s.io/apimachinery/pkg/util/wait"
28+
"k8s.io/client-go/util/retry"
29+
30+
ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
31+
32+
prowv1 "k8s.io/test-infra/prow/apis/prowjobs/v1"
33+
)
34+
35+
func updateReportState(ctx context.Context, pj *prowv1.ProwJob, log *logrus.Entry, reportedState prowv1.ProwJobState, pjclientset ctrlruntimeclient.Client, reporterName string) error {
36+
// update pj report status
37+
newpj := pj.DeepCopy()
38+
// we set omitempty on PrevReportStates, so here we need to init it if is nil
39+
if newpj.Status.PrevReportStates == nil {
40+
newpj.Status.PrevReportStates = map[string]prowv1.ProwJobState{}
41+
}
42+
newpj.Status.PrevReportStates[reporterName] = reportedState
43+
44+
if err := pjclientset.Patch(ctx, newpj, ctrlruntimeclient.MergeFrom(pj)); err != nil {
45+
return fmt.Errorf("failed to patch: %w", err)
46+
}
47+
48+
// Block until the update is in the lister to make sure that events from another controller
49+
// that also does reporting dont trigger another report because our lister doesn't yet contain
50+
// the updated Status
51+
name := types.NamespacedName{Namespace: pj.Namespace, Name: pj.Name}
52+
if err := wait.Poll(100*time.Millisecond, 10*time.Second, func() (bool, error) {
53+
if err := pjclientset.Get(ctx, name, pj); err != nil {
54+
return false, err
55+
}
56+
if pj.Status.PrevReportStates != nil &&
57+
pj.Status.PrevReportStates[reporterName] == reportedState {
58+
return true, nil
59+
}
60+
return false, nil
61+
}); err != nil {
62+
return fmt.Errorf("failed to wait for updated report status to be in lister: %w", err)
63+
}
64+
return nil
65+
}
66+
67+
func UpdateReportStateWithRetries(ctx context.Context, pj *prowv1.ProwJob, log *logrus.Entry, pjclientset ctrlruntimeclient.Client, reporterName string) error {
68+
reportState := pj.Status.State
69+
log = log.WithFields(logrus.Fields{
70+
"prowjob": pj.Name,
71+
"jobName": pj.Spec.Job,
72+
"jobStatus": reportState,
73+
})
74+
// We have to retry here, if we return we lose the information that we already reported this job.
75+
if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
76+
// Get it first, this is very cheap
77+
name := types.NamespacedName{Namespace: pj.Namespace, Name: pj.Name}
78+
if err := pjclientset.Get(ctx, name, pj); err != nil {
79+
return err
80+
}
81+
// Must not wrap until we have kube 1.19, otherwise the RetryOnConflict won't recognize conflicts
82+
// correctly
83+
return updateReportState(ctx, pj, log, reportState, pjclientset, reporterName)
84+
}); err != nil {
85+
// Very subpar, we will report again. But even if we didn't do that now, we would do so
86+
// latest when crier gets restarted. In an ideal world, all reporters are idempotent and
87+
// reporting has no cost.
88+
return fmt.Errorf("failed to update report state on prowjob: %w", err)
89+
}
90+
91+
log.Info("Successfully updated report state on prowjob")
92+
return nil
93+
}

‎prow/crier/reporters/gerrit/BUILD.bazel

+4
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@ go_library(
88
deps = [
99
"//prow/apis/prowjobs/v1:go_default_library",
1010
"//prow/config:go_default_library",
11+
"//prow/crier/reporters/criercommonlib:go_default_library",
1112
"//prow/gerrit/client:go_default_library",
1213
"//prow/kube:go_default_library",
1314
"@com_github_andygrunwald_go_gerrit//:go_default_library",
1415
"@com_github_sirupsen_logrus//:go_default_library",
16+
"@io_k8s_apimachinery//pkg/api/errors:go_default_library",
1517
"@io_k8s_sigs_controller_runtime//pkg/client:go_default_library",
1618
"@io_k8s_sigs_controller_runtime//pkg/reconcile:go_default_library",
1719
],
@@ -38,6 +40,7 @@ go_test(
3840
tags = ["manual"],
3941
deps = [
4042
"//prow/apis/prowjobs/v1:go_default_library",
43+
"//prow/crier/reporters/criercommonlib:go_default_library",
4144
"//prow/gerrit/client:go_default_library",
4245
"//prow/kube:go_default_library",
4346
"@com_github_andygrunwald_go_gerrit//:go_default_library",
@@ -47,5 +50,6 @@ go_test(
4750
"@io_k8s_apimachinery//pkg/runtime:go_default_library",
4851
"@io_k8s_apimachinery//pkg/util/diff:go_default_library",
4952
"@io_k8s_sigs_controller_runtime//pkg/client/fake:go_default_library",
53+
"@org_golang_x_sync//errgroup:go_default_library",
5054
],
5155
)

‎prow/crier/reporters/gerrit/reporter.go

+93-10
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,23 @@ package gerrit
1919

2020
import (
2121
"context"
22+
"errors"
2223
"fmt"
2324
"sort"
2425
"strconv"
2526
"strings"
2627
"time"
2728

29+
apierrors "k8s.io/apimachinery/pkg/api/errors"
30+
2831
"github.com/andygrunwald/go-gerrit"
2932
"github.com/sirupsen/logrus"
3033
ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
3134
"sigs.k8s.io/controller-runtime/pkg/reconcile"
3235

3336
v1 "k8s.io/test-infra/prow/apis/prowjobs/v1"
3437
"k8s.io/test-infra/prow/config"
38+
"k8s.io/test-infra/prow/crier/reporters/criercommonlib"
3539
"k8s.io/test-infra/prow/gerrit/client"
3640
"k8s.io/test-infra/prow/kube"
3741
)
@@ -79,8 +83,9 @@ type gerritClient interface {
7983

8084
// Client is a gerrit reporter client
8185
type Client struct {
82-
gc gerritClient
83-
lister ctrlruntimeclient.Reader
86+
gc gerritClient
87+
pjclientset ctrlruntimeclient.Client
88+
prLocks *criercommonlib.ShardedLock
8489
}
8590

8691
// Job is the view of a prowjob scoped for a report
@@ -101,7 +106,7 @@ type JobReport struct {
101106
}
102107

103108
// NewReporter returns a reporter client
104-
func NewReporter(cfg config.Getter, cookiefilePath string, projects map[string][]string, lister ctrlruntimeclient.Reader) (*Client, error) {
109+
func NewReporter(cfg config.Getter, cookiefilePath string, projects map[string][]string, pjclientset ctrlruntimeclient.Client) (*Client, error) {
105110
gc, err := client.NewClient(projects)
106111
if err != nil {
107112
return nil, err
@@ -118,10 +123,14 @@ func NewReporter(cfg config.Getter, cookiefilePath string, projects map[string][
118123
// line arg(which is going to be deprecated).
119124
gc.Authenticate(cookiefilePath, "")
120125

121-
return &Client{
122-
gc: gc,
123-
lister: lister,
124-
}, nil
126+
c := &Client{
127+
gc: gc,
128+
pjclientset: pjclientset,
129+
prLocks: criercommonlib.NewShardedLock(),
130+
}
131+
132+
c.prLocks.RunCleanup()
133+
return c, nil
125134
}
126135

127136
func applyGlobalConfig(cfg config.Getter, gerritClient *client.Client, cookiefilePath string) {
@@ -200,7 +209,7 @@ func (c *Client) ShouldReport(ctx context.Context, log *logrus.Entry, pj *v1.Pro
200209
}
201210

202211
var pjs v1.ProwJobList
203-
if err := c.lister.List(ctx, &pjs, ctrlruntimeclient.MatchingLabels(selector)); err != nil {
212+
if err := c.pjclientset.List(ctx, &pjs, ctrlruntimeclient.MatchingLabels(selector)); err != nil {
204213
log.WithError(err).Errorf("Cannot list prowjob with selector %v", selector)
205214
return false
206215
}
@@ -253,6 +262,50 @@ func (c *Client) ShouldReport(ctx context.Context, log *logrus.Entry, pj *v1.Pro
253262
// Report will send the current prowjob status as a gerrit review
254263
func (c *Client) Report(ctx context.Context, logger *logrus.Entry, pj *v1.ProwJob) ([]*v1.ProwJob, *reconcile.Result, error) {
255264
logger = logger.WithFields(logrus.Fields{"job": pj.Spec.Job, "name": pj.Name})
265+
266+
// Gerrit reporter hasn't learned how to deduplicate itself from report yet,
267+
// will need to block here. Unfortunately need to check after this section
268+
// to ensure that the job was not already marked reported by other threads
269+
// TODO(chaodaiG): postsubmit job technically doesn't know which PR it's
270+
// from, currently it's associated with a PR in gerrit in a weird way, which
271+
// needs to be fixed in
272+
// https://github.com/kubernetes/test-infra/issues/22653, remove the
273+
// PostsubmitJob check once it's fixed
274+
if pj.Spec.Type == v1.PresubmitJob || pj.Spec.Type == v1.PostsubmitJob {
275+
key, err := lockKeyForPJ(pj)
276+
if err != nil {
277+
return nil, nil, fmt.Errorf("failed to get lockkey for job: %w", err)
278+
}
279+
lock, err := c.prLocks.GetLock(ctx, *key)
280+
if err != nil {
281+
return nil, nil, err
282+
}
283+
if err := lock.Acquire(ctx, 1); err != nil {
284+
return nil, nil, err
285+
}
286+
defer lock.Release(1)
287+
288+
// In the case where several prow jobs from the same PR are finished one
289+
// after another, by the time the lock is acquired, this job might have
290+
// already been reported by another worker, refetch this pj to make sure
291+
// that no duplicate report is produced
292+
pjObjKey := ctrlruntimeclient.ObjectKeyFromObject(pj)
293+
if err := c.pjclientset.Get(ctx, pjObjKey, pj); err != nil {
294+
if apierrors.IsNotFound(err) {
295+
// Job could be GC'ed or deleted for other reasons, not to
296+
// report, this is not a prow error and should not be retried
297+
logger.Debug("object no longer exist")
298+
return nil, nil, nil
299+
}
300+
301+
return nil, nil, fmt.Errorf("failed to get prowjob %s: %w", pjObjKey.String(), err)
302+
}
303+
if pj.Status.PrevReportStates[c.GetName()] == pj.Status.State {
304+
logger.Info("Already reported by other threads.")
305+
return nil, nil, nil
306+
}
307+
}
308+
256309
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
257310
defer cancel()
258311

@@ -276,7 +329,7 @@ func (c *Client) Report(ctx context.Context, logger *logrus.Entry, pj *v1.ProwJo
276329
}
277330

278331
var pjsOnRevisionWithSameLabel v1.ProwJobList
279-
if err := c.lister.List(ctx, &pjsOnRevisionWithSameLabel, ctrlruntimeclient.MatchingLabels(selector)); err != nil {
332+
if err := c.pjclientset.List(ctx, &pjsOnRevisionWithSameLabel, ctrlruntimeclient.MatchingLabels(selector)); err != nil {
280333
logger.WithError(err).WithField("selector", selector).Errorf("Cannot list prowjob with selector")
281334
return nil, nil, err
282335
}
@@ -353,7 +406,22 @@ func (c *Client) Report(ctx context.Context, logger *logrus.Entry, pj *v1.ProwJo
353406
}
354407

355408
logger.Infof("Review Complete, reported jobs: %s", jobNames(toReportJobs))
356-
return toReportJobs, nil, nil
409+
410+
// If return here, the shardedLock will be released, and other threads that
411+
// are from the same PR will still not understand that it's already
412+
// reported, as the change of previous report state happens only after the
413+
// returning of current function from the caller.
414+
// Ideally the previous report state should be changed here.
415+
logger.WithField("job-count", len(toReportJobs)).Info("Reported job(s), now will update pj(s).")
416+
var err error
417+
for _, pjob := range toReportJobs {
418+
if err = criercommonlib.UpdateReportStateWithRetries(ctx, pjob, logger, c.pjclientset, c.GetName()); err != nil {
419+
logger.WithError(err).Error("Failed to update report state on prowjob")
420+
}
421+
}
422+
423+
// Let caller know that we are done with this job.
424+
return nil, nil, err
357425
}
358426

359427
func jobNames(jobs []*v1.ProwJob) []string {
@@ -559,3 +627,18 @@ func ParseReport(message string) *JobReport {
559627
func (r JobReport) String() string {
560628
return fmt.Sprintf("%s\n%s", r.Header, r.Message)
561629
}
630+
631+
func lockKeyForPJ(pj *v1.ProwJob) (*criercommonlib.SimplePull, error) {
632+
// TODO(chaodaiG): remove postsubmit once
633+
// https://github.com/kubernetes/test-infra/issues/22653 is fixed
634+
if pj.Spec.Type != v1.PresubmitJob && pj.Spec.Type != v1.PostsubmitJob {
635+
return nil, fmt.Errorf("can only get lock key for presubmit and postsubmit jobs, was %q", pj.Spec.Type)
636+
}
637+
if pj.Spec.Refs == nil {
638+
return nil, errors.New("pj.Spec.Refs is nil")
639+
}
640+
if n := len(pj.Spec.Refs.Pulls); n != 1 {
641+
return nil, fmt.Errorf("prowjob doesn't have one but %d pulls", n)
642+
}
643+
return criercommonlib.NewSimplePull(pj.Spec.Refs.Org, pj.Spec.Refs.Repo, pj.Spec.Refs.Pulls[0].Number), nil
644+
}

‎prow/crier/reporters/gerrit/reporter_test.go

+505-33
Large diffs are not rendered by default.

‎prow/crier/reporters/github/BUILD.bazel

+1-2
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,13 @@ go_library(
88
deps = [
99
"//prow/apis/prowjobs/v1:go_default_library",
1010
"//prow/config:go_default_library",
11+
"//prow/crier/reporters/criercommonlib:go_default_library",
1112
"//prow/gerrit/client:go_default_library",
1213
"//prow/github/report:go_default_library",
1314
"//prow/kube:go_default_library",
1415
"@com_github_sirupsen_logrus//:go_default_library",
1516
"@io_k8s_sigs_controller_runtime//pkg/client:go_default_library",
1617
"@io_k8s_sigs_controller_runtime//pkg/reconcile:go_default_library",
17-
"@org_golang_x_sync//semaphore:go_default_library",
1818
],
1919
)
2020

@@ -49,6 +49,5 @@ go_test(
4949
"@io_k8s_apimachinery//pkg/apis/meta/v1:go_default_library",
5050
"@io_k8s_apimachinery//pkg/runtime:go_default_library",
5151
"@io_k8s_sigs_controller_runtime//pkg/client/fake:go_default_library",
52-
"@org_golang_x_sync//semaphore:go_default_library",
5352
],
5453
)

‎prow/crier/reporters/github/reporter.go

+8-64
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,12 @@ import (
2626
"time"
2727

2828
"github.com/sirupsen/logrus"
29-
"golang.org/x/sync/semaphore"
3029
ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
3130
"sigs.k8s.io/controller-runtime/pkg/reconcile"
3231

3332
v1 "k8s.io/test-infra/prow/apis/prowjobs/v1"
3433
"k8s.io/test-infra/prow/config"
34+
"k8s.io/test-infra/prow/crier/reporters/criercommonlib"
3535
"k8s.io/test-infra/prow/gerrit/client"
3636
"k8s.io/test-infra/prow/github/report"
3737
"k8s.io/test-infra/prow/kube"
@@ -47,76 +47,20 @@ type Client struct {
4747
gc report.GitHubClient
4848
config config.Getter
4949
reportAgent v1.ProwJobAgent
50-
prLocks *shardedLock
50+
prLocks *criercommonlib.ShardedLock
5151
lister ctrlruntimeclient.Reader
5252
}
5353

54-
type simplePull struct {
55-
org, repo string
56-
number int
57-
}
58-
59-
type shardedLock struct {
60-
mapLock *semaphore.Weighted
61-
locks map[simplePull]*semaphore.Weighted
62-
}
63-
64-
func (s *shardedLock) getLock(ctx context.Context, key simplePull) (*semaphore.Weighted, error) {
65-
if err := s.mapLock.Acquire(ctx, 1); err != nil {
66-
return nil, err
67-
}
68-
defer s.mapLock.Release(1)
69-
if _, exists := s.locks[key]; !exists {
70-
s.locks[key] = semaphore.NewWeighted(1)
71-
}
72-
return s.locks[key], nil
73-
}
74-
75-
// cleanup deletes all locks by acquiring first
76-
// the mapLock and then each individual lock before
77-
// deleting it. The individual lock must be acquired
78-
// because otherwise it may be held, we delete it from
79-
// the map, it gets recreated and acquired and two
80-
// routines report in parallel for the same job.
81-
// Note that while this function is running, no new
82-
// presubmit reporting can happen, as we hold the mapLock.
83-
func (s *shardedLock) cleanup() {
84-
ctx := context.Background()
85-
s.mapLock.Acquire(ctx, 1)
86-
defer s.mapLock.Release(1)
87-
88-
for key, lock := range s.locks {
89-
lock.Acquire(ctx, 1)
90-
delete(s.locks, key)
91-
lock.Release(1)
92-
}
93-
}
94-
95-
// runCleanup asynchronously runs the cleanup once per hour.
96-
func (s *shardedLock) runCleanup() {
97-
go func() {
98-
for range time.Tick(time.Hour) {
99-
logrus.Debug("Starting to clean up presubmit locks")
100-
startTime := time.Now()
101-
s.cleanup()
102-
logrus.WithField("duration", time.Since(startTime).String()).Debug("Finished cleaning up presubmit locks")
103-
}
104-
}()
105-
}
106-
10754
// NewReporter returns a reporter client
10855
func NewReporter(gc report.GitHubClient, cfg config.Getter, reportAgent v1.ProwJobAgent, lister ctrlruntimeclient.Reader) *Client {
10956
c := &Client{
11057
gc: gc,
11158
config: cfg,
11259
reportAgent: reportAgent,
113-
prLocks: &shardedLock{
114-
mapLock: semaphore.NewWeighted(1),
115-
locks: map[simplePull]*semaphore.Weighted{},
116-
},
117-
lister: lister,
60+
prLocks: criercommonlib.NewShardedLock(),
61+
lister: lister,
11862
}
119-
c.prLocks.runCleanup()
63+
c.prLocks.RunCleanup()
12064
return c
12165
}
12266

@@ -172,7 +116,7 @@ func (c *Client) Report(ctx context.Context, log *logrus.Entry, pj *v1.ProwJob)
172116
if err != nil {
173117
return nil, nil, fmt.Errorf("failed to get lockkey for job: %w", err)
174118
}
175-
lock, err := c.prLocks.getLock(ctx, *key)
119+
lock, err := c.prLocks.GetLock(ctx, *key)
176120
if err != nil {
177121
return nil, nil, err
178122
}
@@ -248,7 +192,7 @@ func pjsToReport(ctx context.Context, log *logrus.Entry, lister ctrlruntimeclien
248192
return toReport, nil
249193
}
250194

251-
func lockKeyForPJ(pj *v1.ProwJob) (*simplePull, error) {
195+
func lockKeyForPJ(pj *v1.ProwJob) (*criercommonlib.SimplePull, error) {
252196
if pj.Spec.Type != v1.PresubmitJob {
253197
return nil, fmt.Errorf("can only get lock key for presubmit jobs, was %q", pj.Spec.Type)
254198
}
@@ -258,5 +202,5 @@ func lockKeyForPJ(pj *v1.ProwJob) (*simplePull, error) {
258202
if n := len(pj.Spec.Refs.Pulls); n != 1 {
259203
return nil, fmt.Errorf("prowjob doesn't have one but %d pulls", n)
260204
}
261-
return &simplePull{org: pj.Spec.Refs.Org, repo: pj.Spec.Refs.Repo, number: pj.Spec.Refs.Pulls[0].Number}, nil
205+
return criercommonlib.NewSimplePull(pj.Spec.Refs.Org, pj.Spec.Refs.Repo, pj.Spec.Refs.Pulls[0].Number), nil
262206
}

‎prow/crier/reporters/github/reporter_test.go

-13
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ import (
2626
"github.com/google/go-cmp/cmp"
2727
"github.com/google/go-cmp/cmp/cmpopts"
2828
"github.com/sirupsen/logrus"
29-
"golang.org/x/sync/semaphore"
3029
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3130
"k8s.io/apimachinery/pkg/runtime"
3231

@@ -171,18 +170,6 @@ func TestPresumitReportingLocks(t *testing.T) {
171170
wg.Wait()
172171
}
173172

174-
func TestShardedLockCleanup(t *testing.T) {
175-
t.Parallel()
176-
sl := &shardedLock{mapLock: semaphore.NewWeighted(1), locks: map[simplePull]*semaphore.Weighted{}}
177-
key := simplePull{"org", "repo", 1}
178-
sl.locks[key] = semaphore.NewWeighted(1)
179-
sl.cleanup()
180-
if _, exists := sl.locks[key]; exists {
181-
t.Error("lock didn't get cleaned up")
182-
}
183-
184-
}
185-
186173
func TestReport(t *testing.T) {
187174
t.Parallel()
188175
testCases := []struct {

0 commit comments

Comments
 (0)
Please sign in to comment.