
Commit f442616

Make restore progress available even when cluster is not (#4196)
* feat(schema): extend view with build status
  We need to store the view build status because we want to display it in sctool progress. Ref #4191

* refactor(restore): move view build status to view
  Since build status is part of the view definition in the SM DB schema, we should also reflect it in the code. This also allows us to get rid of the ViewProgress struct.

* feat(restore_test): test view progress after cluster is unavailable
  This test checks for #4191.

* fix(restore): store view build status in SM DB instead of querying it in progress
  This way there is no problem with getting sctool progress even when the cluster is no longer available. This makes TestRestoreTablesProgressIntegration pass.

Fixes #4191
1 parent 605d9c9 commit f442616
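
Before the file-by-file diff, a minimal sketch of the idea, using hypothetical stand-in names (runStore, waitAndPersist, progress are not Scylla Manager APIs): the build status is captured once while the cluster is still reachable and persisted with the run, and progress later reads only that stored copy.

package main

import (
	"context"
	"fmt"
)

// runStore is a toy stand-in for Scylla Manager's own DB ("SM DB");
// the real code persists the run with gocqlx.
type runStore struct{ status map[string]string }

func (s *runStore) saveViewStatus(view, status string) { s.status[view] = status }
func (s *runStore) viewStatus(view string) string      { return s.status[view] }

// During restore: the build status is read from the live cluster once,
// then written into the run record.
func waitAndPersist(ctx context.Context, store *runStore, view string) {
	status := "SUCCESS" // pretend this came from client.ViewBuildStatus(ctx, ...)
	store.saveViewStatus(view, status)
}

// During sctool progress: only the stored run is consulted, so this keeps
// working after the cluster becomes unreachable.
func progress(store *runStore, view string) string {
	return store.viewStatus(view)
}

func main() {
	store := &runStore{status: map[string]string{}}
	waitAndPersist(context.Background(), store, "ks.mv")
	fmt.Println(progress(store, "ks.mv")) // SUCCESS
}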

File tree

9 files changed: +111 −57 lines


pkg/restapi/services.go

Lines changed: 4 additions & 0 deletions

@@ -46,6 +46,7 @@ type HealthCheckService interface {
 // RepairService service interface for the REST API handlers.
 type RepairService interface {
 	GetRun(ctx context.Context, clusterID, taskID, runID uuid.UUID) (*repair.Run, error)
+	// GetProgress must work even when the cluster is no longer available.
 	GetProgress(ctx context.Context, clusterID, taskID, runID uuid.UUID) (repair.Progress, error)
 	GetTarget(ctx context.Context, clusterID uuid.UUID, properties json.RawMessage) (repair.Target, error)
 	SetIntensity(ctx context.Context, runID uuid.UUID, intensity float64) error
@@ -59,15 +60,18 @@ type BackupService interface {
 	ExtractLocations(ctx context.Context, properties []json.RawMessage) []backupspec.Location
 	List(ctx context.Context, clusterID uuid.UUID, locations []backupspec.Location, filter backup.ListFilter) ([]backup.ListItem, error)
 	ListFiles(ctx context.Context, clusterID uuid.UUID, locations []backupspec.Location, filter backup.ListFilter) ([]backupspec.FilesInfo, error)
+	// GetProgress must work even when the cluster is no longer available.
 	GetProgress(ctx context.Context, clusterID, taskID, runID uuid.UUID) (backup.Progress, error)
 	DeleteSnapshot(ctx context.Context, clusterID uuid.UUID, locations []backupspec.Location, snapshotTags []string) error
 	GetValidationTarget(_ context.Context, clusterID uuid.UUID, properties json.RawMessage) (backup.ValidationTarget, error)
+	// GetValidationProgress must work even when the cluster is no longer available.
 	GetValidationProgress(ctx context.Context, clusterID, taskID, runID uuid.UUID) ([]backup.ValidationHostProgress, error)
 }
 
 // RestoreService service interface for the REST API handlers.
 type RestoreService interface {
 	GetTargetUnitsViews(ctx context.Context, clusterID uuid.UUID, properties json.RawMessage) (restore.Target, []restore.Unit, []restore.View, error)
+	// GetProgress must work even when the cluster is no longer available.
 	GetProgress(ctx context.Context, clusterID, taskID, runID uuid.UUID) (restore.Progress, error)
 }

pkg/service/restore/model.go

Lines changed: 7 additions & 13 deletions

@@ -160,11 +160,12 @@ const (
 
 // View represents statement used for recreating restored (dropped) views.
 type View struct {
-	Keyspace   string   `json:"keyspace" db:"keyspace_name"`
-	View       string   `json:"view" db:"view_name"`
-	Type       ViewType `json:"type" db:"view_type"`
-	BaseTable  string   `json:"base_table"`
-	CreateStmt string   `json:"create_stmt"`
+	Keyspace    string                       `json:"keyspace" db:"keyspace_name"`
+	View        string                       `json:"view" db:"view_name"`
+	Type        ViewType                     `json:"type" db:"view_type"`
+	BaseTable   string                       `json:"base_table"`
+	CreateStmt  string                       `json:"create_stmt"`
+	BuildStatus scyllaclient.ViewBuildStatus `json:"status"`
 }
 
 func (t View) MarshalUDT(name string, info gocql.TypeInfo) ([]byte, error) {
@@ -235,7 +236,7 @@ type Progress struct {
 	SnapshotTag string             `json:"snapshot_tag"`
 	Keyspaces   []KeyspaceProgress `json:"keyspaces,omitempty"`
 	Hosts       []HostProgress     `json:"hosts,omitempty"`
-	Views       []ViewProgress     `json:"views,omitempty"`
+	Views       []View             `json:"views,omitempty"`
 	Stage       Stage              `json:"stage"`
 }
 
@@ -266,13 +267,6 @@ type TableProgress struct {
 	Error string `json:"error,omitempty"`
 }
 
-// ViewProgress defines restore progress for the view.
-type ViewProgress struct {
-	View
-
-	Status scyllaclient.ViewBuildStatus `json:"status"`
-}
-
 // TableName represents full table name.
 type TableName struct {
 	Keyspace string
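
With BuildStatus moved onto View, each entry of Progress.Views carries its status inline instead of being wrapped in ViewProgress. A minimal standalone sketch of the resulting JSON shape (a local stand-in type and illustrative values; the real type uses scyllaclient.ViewBuildStatus rather than a plain string):

package main

import (
	"encoding/json"
	"fmt"
)

// view mirrors the JSON tags of restore.View after this change.
type view struct {
	Keyspace    string `json:"keyspace"`
	View        string `json:"view"`
	Type        string `json:"type"`
	BaseTable   string `json:"base_table"`
	CreateStmt  string `json:"create_stmt"`
	BuildStatus string `json:"status"`
}

func main() {
	v := view{
		Keyspace:    "ks",
		View:        "mv",
		Type:        "MaterializedView",
		BaseTable:   "tab",
		CreateStmt:  "CREATE MATERIALIZED VIEW ...",
		BuildStatus: "SUCCESS",
	}
	out, _ := json.MarshalIndent(v, "", "  ")
	// "status" now sits on the view entry itself, so no separate
	// ViewProgress wrapper is needed in Progress.Views.
	fmt.Println(string(out))
}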

pkg/service/restore/progress.go

Lines changed: 3 additions & 24 deletions

@@ -3,14 +3,13 @@
 package restore
 
 import (
-	"context"
+	"slices"
 	"time"
 
 	"github.com/pkg/errors"
 	"github.com/scylladb/gocqlx/v2"
 	"github.com/scylladb/gocqlx/v2/qb"
 	"github.com/scylladb/scylla-manager/v3/pkg/schema/table"
-	"github.com/scylladb/scylla-manager/v3/pkg/scyllaclient"
 	"github.com/scylladb/scylla-manager/v3/pkg/util/timeutc"
 	"github.com/scylladb/scylla-manager/v3/pkg/util/uuid"
 )
@@ -26,7 +25,7 @@ type tableKey struct {
 }
 
 // aggregateProgress returns restore progress information classified by keyspace and tables.
-func (w *worker) aggregateProgress(ctx context.Context) (Progress, error) {
+func (w *worker) aggregateProgress() (Progress, error) {
 	var (
 		p = Progress{
 			SnapshotTag: w.run.SnapshotTag,
@@ -99,27 +98,7 @@ func (w *worker) aggregateProgress(ctx context.Context) (Progress, error) {
 
 	p.extremeToNil()
 
-	for _, v := range w.run.Views {
-		viewTableName := v.View
-		if v.Type == SecondaryIndex {
-			viewTableName += "_index"
-		}
-
-		status, err := w.client.ViewBuildStatus(ctx, v.Keyspace, viewTableName)
-		if err != nil {
-			w.logger.Error(ctx, "Couldn't get view build status",
-				"keyspace", v.Keyspace,
-				"view", v.View,
-				"error", err,
-			)
-			status = scyllaclient.StatusUnknown
-		}
-		p.Views = append(p.Views, ViewProgress{
-			View:   v,
-			Status: status,
-		})
-	}
-
+	p.Views = slices.Clone(w.run.Views)
 	for _, hp := range hostProgress {
 		p.Hosts = append(p.Hosts, hp)
 	}
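
An aside on the slices.Clone call, as a sketch rather than anything stated in the commit: cloning gives the returned Progress its own backing array, so later writes to w.run.Views cannot alias the progress snapshot the caller already holds.

package main

import (
	"fmt"
	"slices"
)

func main() {
	run := []string{"UNKNOWN"}

	shared := run               // same backing array as run
	cloned := slices.Clone(run) // independent copy

	run[0] = "SUCCESS" // e.g. a later status update on the run

	fmt.Println(shared[0]) // SUCCESS - aliases the run's slice
	fmt.Println(cloned[0]) // UNKNOWN - unaffected snapshot
}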

pkg/service/restore/restore_integration_test.go

Lines changed: 82 additions & 0 deletions

@@ -989,3 +989,85 @@ func TestRestoreTablesMultiLocationIntegration(t *testing.T) {
 		t.Fatalf("tables have different contents\nsrc:\n%v\ndst:\n%v", srcM, dstM)
 	}
 }
+
+func TestRestoreTablesProgressIntegration(t *testing.T) {
+	// It verifies that:
+	// - view status progress is correct
+	// - progress is available even when cluster is not
+
+	if IsIPV6Network() {
+		t.Skip("nodes don't have ip6tables and related modules to properly simulate unavailable cluster")
+	}
+
+	h := newTestHelper(t, ManagedSecondClusterHosts(), ManagedClusterHosts())
+
+	Print("Keyspace setup")
+	ks := randomizedName("progress_")
+	ksStmt := "CREATE KEYSPACE %q WITH replication = {'class': 'NetworkTopologyStrategy', 'dc1': %d}"
+	ExecStmt(t, h.srcCluster.rootSession, fmt.Sprintf(ksStmt, ks, 1))
+	ExecStmt(t, h.dstCluster.rootSession, fmt.Sprintf(ksStmt, ks, 1))
+
+	Print("Table setup")
+	tab := randomizedName("tab_")
+	tabStmt := "CREATE TABLE %q.%q (id int PRIMARY KEY, data int)"
+	ExecStmt(t, h.srcCluster.rootSession, fmt.Sprintf(tabStmt, ks, tab))
+	ExecStmt(t, h.dstCluster.rootSession, fmt.Sprintf(tabStmt, ks, tab))
+
+	Print("View setup")
+	mv := randomizedName("mv_")
+	CreateMaterializedView(t, h.srcCluster.rootSession, ks, tab, mv)
+	CreateMaterializedView(t, h.dstCluster.rootSession, ks, tab, mv)
+
+	Print("Fill setup")
+	fillTable(t, h.srcCluster.rootSession, 1, ks, tab)
+
+	Print("Run backup")
+	loc := []Location{testLocation("progress", "")}
+	S3InitBucket(t, loc[0].Path)
+	tag := h.runBackup(t, map[string]any{
+		"location": loc,
+	})
+
+	Print("Run restore")
+	grantRestoreTablesPermissions(t, h.dstCluster.rootSession, nil, h.dstUser)
+	h.runRestore(t, map[string]any{
+		"location":       loc,
+		"snapshot_tag":   tag,
+		"restore_tables": true,
+	})
+
+	Print("Validate success")
+	validateTableContent[int, int](t, h.srcCluster.rootSession, h.dstCluster.rootSession, ks, tab, "id", "data")
+	validateTableContent[int, int](t, h.srcCluster.rootSession, h.dstCluster.rootSession, ks, mv, "id", "data")
+
+	Print("Validate view progress")
+	pr, err := h.dstRestoreSvc.GetProgress(context.Background(), h.dstCluster.ClusterID, h.dstCluster.TaskID, h.dstCluster.RunID)
+	if err != nil {
+		t.Fatal(errors.Wrap(err, "get progress"))
+	}
+	for _, v := range pr.Views {
+		if v.BuildStatus != scyllaclient.StatusSuccess {
+			t.Fatalf("Expected status: %s, got: %s", scyllaclient.StatusSuccess, v.BuildStatus)
+		}
+	}
+
+	BlockREST(t, ManagedClusterHosts()...)
+	defer func() {
+		TryUnblockREST(t, ManagedClusterHosts())
+		if err := EnsureNodesAreUP(t, ManagedClusterHosts(), time.Minute); err != nil {
+			t.Fatal(err)
+		}
+	}()
+	time.Sleep(100 * time.Millisecond)
+
+	Print("Validate view progress when cluster is unavailable")
+	pr, err = h.dstRestoreSvc.GetProgress(context.Background(), h.dstCluster.ClusterID, h.dstCluster.TaskID, h.dstCluster.RunID)
+	if err != nil {
+		t.Fatal(errors.Wrap(err, "get progress"))
+	}
+	for _, v := range pr.Views {
+		if v.BuildStatus != scyllaclient.StatusSuccess {
+			t.Fatalf("Expected status: %s, got: %s", scyllaclient.StatusSuccess, v.BuildStatus)
+		}
+	}
+}

pkg/service/restore/service.go

Lines changed: 4 additions & 14 deletions

@@ -158,12 +158,8 @@ func (s *Service) GetProgress(ctx context.Context, clusterID, taskID, runID uuid
 		return Progress{}, errors.Wrap(err, "get run")
 	}
 
-	w, err := s.newProgressWorker(ctx, run)
-	if err != nil {
-		return Progress{}, errors.Wrap(err, "create progress worker")
-	}
-
-	pr, err := w.aggregateProgress(ctx)
+	w := s.newProgressWorker(run)
+	pr, err := w.aggregateProgress()
 	if err != nil {
 		return Progress{}, err
 	}
@@ -221,20 +217,14 @@ func (w *worker) setRunInfo(taskID, runID uuid.UUID) {
 	w.run.ID = runID
 }
 
-func (s *Service) newProgressWorker(ctx context.Context, run *Run) (worker, error) {
-	client, err := s.scyllaClient(ctx, run.ClusterID)
-	if err != nil {
-		return worker{}, errors.Wrap(err, "get client")
-	}
-
+func (s *Service) newProgressWorker(run *Run) worker {
 	return worker{
 		run:     run,
 		config:  s.config,
 		logger:  s.logger,
 		metrics: s.metrics,
-		client:  client,
 		session: s.session,
-	}, nil
+	}
 }
 
 // GetRun returns run with specified cluster, task and run ID.

pkg/service/restore/tables_worker.go

Lines changed: 2 additions & 2 deletions

@@ -146,11 +146,11 @@ func (w *tablesWorker) restore(ctx context.Context) error {
 			return nil
 		},
 		StageRecreateViews: func() error {
-			for _, v := range w.run.Views {
+			for i, v := range w.run.Views {
 				if err := w.CreateView(ctx, v); err != nil {
 					return errors.Wrapf(err, "recreate %s.%s with statement %s", v.Keyspace, v.View, v.CreateStmt)
 				}
-				if err := w.WaitForViewBuilding(ctx, v); err != nil {
+				if err := w.WaitForViewBuilding(ctx, &w.run.Views[i]); err != nil {
 					return errors.Wrapf(err, "wait for %s.%s", v.Keyspace, v.View)
 				}
 			}
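
The switch to index-based access matters because a Go range loop variable is a copy of the slice element: writes through it never reach the slice that is later persisted with the run. A minimal standalone sketch with hypothetical names (not the commit's code):

package main

import "fmt"

type view struct{ BuildStatus string }

func main() {
	views := []view{{}, {}}

	// Mutating the loop variable only changes a copy.
	for _, v := range views {
		v.BuildStatus = "SUCCESS"
	}
	fmt.Println(views) // [{} {}] - statuses are still empty

	// Mutating through the index reaches the slice itself, which is what
	// passing &w.run.Views[i] to WaitForViewBuilding relies on.
	for i := range views {
		views[i].BuildStatus = "SUCCESS"
	}
	fmt.Println(views) // [{SUCCESS} {SUCCESS}]
}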

pkg/service/restore/worker_views.go

Lines changed: 3 additions & 1 deletion

@@ -71,7 +71,7 @@ func (w *worker) CreateView(ctx context.Context, view View) error {
 	return alterSchemaRetryWrapper(ctx, op, notify)
 }
 
-func (w *worker) WaitForViewBuilding(ctx context.Context, view View) error {
+func (w *worker) WaitForViewBuilding(ctx context.Context, view *View) error {
 	labels := metrics.RestoreViewBuildStatusLabels{
 		ClusterID: w.run.ClusterID.String(),
 		Keyspace:  view.Keyspace,
@@ -90,6 +90,8 @@ func (w *worker) WaitForViewBuilding(ctx context.Context, view View) error {
 		return retry.Permanent(err)
 	}
 
+	view.BuildStatus = status
+	w.insertRun(ctx)
 	switch status {
 	case scyllaclient.StatusUnknown:
 		w.metrics.SetViewBuildStatus(labels, metrics.BuildStatusUnknown)

pkg/testutils/exec.go

Lines changed: 5 additions & 3 deletions

@@ -159,10 +159,12 @@ func WaitForNodeUPOrTimeout(h string, timeout time.Duration) error {
 }
 
 // BlockREST blocks the Scylla API ports on h machine by dropping TCP packets.
-func BlockREST(t *testing.T, h string) {
+func BlockREST(t *testing.T, hosts ...string) {
 	t.Helper()
-	if err := RunIptablesCommand(t, h, CmdBlockScyllaREST); err != nil {
-		t.Error(err)
+	for _, host := range hosts {
+		if err := RunIptablesCommand(t, host, CmdBlockScyllaREST); err != nil {
+			t.Error(err)
+		}
 	}
 }

schema/v3.5.0.cql

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+ALTER TYPE restore_view ADD build_status text;
