Skip to content

Commit

Permalink
Merge #141443
Browse files Browse the repository at this point in the history
141443: sqlstats: scaffold a new sql activity flush job r=kyle-a-wong a=kyle-a-wong

This commit creates the scaffolding for a new sql activity flush job, whose purpose is to do a cluster-wide sql stats flush. This job will run in place of the existing node-local sql stats flush loop.

Note: Moving forward, "sql activity" will be used in place of "sql stats" to help disambiguate the "sql stats" subsystem from the original usage of sql stats, which was used for table statistics and optimizer-related code.

Epic: CRDB-45771
Release note: None

Co-authored-by: Kyle Wong <[email protected]>
  • Loading branch information
craig[bot] and kyle-a-wong committed Feb 24, 2025
2 parents 0f2a937 + a6a2148 commit 4bcc002
Show file tree
Hide file tree
Showing 27 changed files with 249 additions and 19 deletions.
12 changes: 12 additions & 0 deletions docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -1508,6 +1508,18 @@
<tr><td>APPLICATION</td><td>jobs.schema_change_gc.resume_completed</td><td>Number of schema_change_gc jobs which successfully resumed to completion</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.schema_change_gc.resume_failed</td><td>Number of schema_change_gc jobs which failed with a non-retriable error</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.schema_change_gc.resume_retry_error</td><td>Number of schema_change_gc jobs which failed with a retriable error</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.sql_activity_flush.currently_idle</td><td>Number of sql_activity_flush jobs currently considered Idle and can be freely shut down</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.sql_activity_flush.currently_paused</td><td>Number of sql_activity_flush jobs currently considered Paused</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.sql_activity_flush.currently_running</td><td>Number of sql_activity_flush jobs currently running in Resume or OnFailOrCancel state</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.sql_activity_flush.expired_pts_records</td><td>Number of expired protected timestamp records owned by sql_activity_flush jobs</td><td>records</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.sql_activity_flush.fail_or_cancel_completed</td><td>Number of sql_activity_flush jobs which successfully completed their failure or cancelation process</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.sql_activity_flush.fail_or_cancel_failed</td><td>Number of sql_activity_flush jobs which failed with a non-retriable error on their failure or cancelation process</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.sql_activity_flush.fail_or_cancel_retry_error</td><td>Number of sql_activity_flush jobs which failed with a retriable error on their failure or cancelation process</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.sql_activity_flush.protected_age_sec</td><td>The age of the oldest PTS record protected by sql_activity_flush jobs</td><td>seconds</td><td>GAUGE</td><td>SECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.sql_activity_flush.protected_record_count</td><td>Number of protected timestamp records held by sql_activity_flush jobs</td><td>records</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.sql_activity_flush.resume_completed</td><td>Number of sql_activity_flush jobs which successfully resumed to completion</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.sql_activity_flush.resume_failed</td><td>Number of sql_activity_flush jobs which failed with a non-retriable error</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.sql_activity_flush.resume_retry_error</td><td>Number of sql_activity_flush jobs which failed with a retriable error</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.standby_read_ts_poller.currently_idle</td><td>Number of standby_read_ts_poller jobs currently considered Idle and can be freely shut down</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.standby_read_ts_poller.currently_paused</td><td>Number of standby_read_ts_poller jobs currently considered Paused</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.standby_read_ts_poller.currently_running</td><td>Number of standby_read_ts_poller jobs currently running in Resume or OnFailOrCancel state</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
Expand Down
2 changes: 1 addition & 1 deletion docs/generated/settings/settings-for-tenants.txt
Original file line number Diff line number Diff line change
Expand Up @@ -405,4 +405,4 @@ trace.span_registry.enabled boolean false if set, ongoing traces can be seen at
trace.zipkin.collector string the address of a Zipkin instance to receive traces, as <host>:<port>. If no port is specified, 9411 will be used. application
ui.database_locality_metadata.enabled boolean true if enabled shows extended locality data about databases and tables in DB Console which can be expensive to compute application
ui.display_timezone enumeration etc/utc the timezone used to format timestamps in the ui [etc/utc = 0, america/new_york = 1] application
version version 1000025.1-upgrading-to-1000025.2-step-002 set the active cluster version in the format '<major>.<minor>' application
version version 1000025.1-upgrading-to-1000025.2-step-004 set the active cluster version in the format '<major>.<minor>' application
2 changes: 1 addition & 1 deletion docs/generated/settings/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,6 @@
<tr><td><div id="setting-trace-zipkin-collector" class="anchored"><code>trace.zipkin.collector</code></div></td><td>string</td><td><code></code></td><td>the address of a Zipkin instance to receive traces, as &lt;host&gt;:&lt;port&gt;. If no port is specified, 9411 will be used.</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-ui-database-locality-metadata-enabled" class="anchored"><code>ui.database_locality_metadata.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if enabled shows extended locality data about databases and tables in DB Console which can be expensive to compute</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-ui-display-timezone" class="anchored"><code>ui.display_timezone</code></div></td><td>enumeration</td><td><code>etc/utc</code></td><td>the timezone used to format timestamps in the ui [etc/utc = 0, america/new_york = 1]</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-version" class="anchored"><code>version</code></div></td><td>version</td><td><code>1000025.1-upgrading-to-1000025.2-step-002</code></td><td>set the active cluster version in the format &#39;&lt;major&gt;.&lt;minor&gt;&#39;</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-version" class="anchored"><code>version</code></div></td><td>version</td><td><code>1000025.1-upgrading-to-1000025.2-step-004</code></td><td>set the active cluster version in the format &#39;&lt;major&gt;.&lt;minor&gt;&#39;</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
</tbody>
</table>
1 change: 1 addition & 0 deletions pkg/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -2305,6 +2305,7 @@ GO_TARGETS = [
"//pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil:sqlstatsutil_test",
"//pkg/sql/sqlstats/persistedsqlstats:persistedsqlstats",
"//pkg/sql/sqlstats/persistedsqlstats:persistedsqlstats_test",
"//pkg/sql/sqlstats/sqlactivityjob:sqlactivityjob",
"//pkg/sql/sqlstats/sslocal:sslocal",
"//pkg/sql/sqlstats/sslocal:sslocal_test",
"//pkg/sql/sqlstats/ssmemstorage:ssmemstorage",
Expand Down
2 changes: 1 addition & 1 deletion pkg/cli/testdata/doctor/test_examine_cluster
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ debug doctor examine cluster
debug doctor examine cluster
Examining 69 descriptors and 68 namespace entries...
ParentID 100, ParentSchemaID 101: relation "foo" (105): expected matching namespace entry, found none
Examining 11 jobs...
Examining 12 jobs...
ERROR: validation failed
2 changes: 1 addition & 1 deletion pkg/cli/testdata/doctor/test_examine_cluster_dropped
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ debug doctor examine cluster
----
debug doctor examine cluster
Examining 68 descriptors and 68 namespace entries...
Examining 9 jobs...
Examining 10 jobs...
No problems found!
2 changes: 1 addition & 1 deletion pkg/cli/testdata/doctor/test_examine_cluster_jobs
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ Examining 68 descriptors and 69 namespace entries...
ParentID 183, ParentSchemaID 381: relation "foo" (104): expected matching namespace entry, found none
ParentID 183, ParentSchemaID 381: relation "foo" (104): mutation job 962952277419655169: job 962952277419655169 not found
ParentID 100, ParentSchemaID 101: namespace entry "foo" (104): mismatched name "foo" in relation descriptor
Examining 9 jobs...
Examining 10 jobs...
ERROR: validation failed
5 changes: 4 additions & 1 deletion pkg/clusterversion/cockroach_versions.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,8 @@ const (

V25_2_Start

V25_2_AddSqlActivityFlushJob

// *************************************************
// Step (1) Add new versions above this comment.
// Do not add new versions to a patch release.
Expand Down Expand Up @@ -276,7 +278,8 @@ var versionTable = [numKeys]roachpb.Version{
V25_1: {Major: 25, Minor: 1, Internal: 0},

// v25.2 versions. Internal versions must be even.
V25_2_Start: {Major: 25, Minor: 1, Internal: 2},
V25_2_Start: {Major: 25, Minor: 1, Internal: 2},
V25_2_AddSqlActivityFlushJob: {Major: 25, Minor: 1, Internal: 4},

// *************************************************
// Step (2): Add new versions above this comment.
Expand Down
1 change: 1 addition & 0 deletions pkg/jobs/jobs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ func (rts *registryTestSuite) setUp(t *testing.T) func() {
SkipUpdateSQLActivityJobBootstrap: true,
SkipMVCCStatisticsJobBootstrap: true,
SkipUpdateTableMetadataCacheBootstrap: true,
SkipSqlActivityFlushJobBootstrap: true,
}
args.Knobs.KeyVisualizer = &keyvisualizer.TestingKnobs{SkipJobBootstrap: true}

Expand Down
11 changes: 11 additions & 0 deletions pkg/jobs/jobspb/jobs.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1466,6 +1466,14 @@ message ImportRollbackDetails {
];
}

// SqlActivityFlushDetails is the payload details message for the sql activity
// flush job (referenced from Payload as sql_activity_flush_details). It
// currently declares no fields.
message SqlActivityFlushDetails {

}

// SqlActivityFlushProgress is the progress details message for the sql
// activity flush job (referenced from Progress as sql_activity_flush). It
// currently declares no fields.
message SqlActivityFlushProgress {

}

message ImportRollbackProgress {}

message Payload {
Expand Down Expand Up @@ -1536,6 +1544,7 @@ message Payload {
LogicalReplicationDetails logical_replication_details = 48;
UpdateTableMetadataCacheDetails update_table_metadata_cache_details = 49;
StandbyReadTSPollerDetails standby_read_ts_poller_details = 50;
SqlActivityFlushDetails sql_activity_flush_details = 51;
}
reserved 26;
// PauseReason is used to describe the reason that the job is currently paused
Expand Down Expand Up @@ -1616,6 +1625,7 @@ message Progress {
LogicalReplicationProgress LogicalReplication = 36;
UpdateTableMetadataCacheProgress table_metadata_cache = 37;
StandbyReadTSPollerProgress standby_read_ts_poller = 38;
SqlActivityFlushProgress sql_activity_flush = 39;
}

uint64 trace_id = 21 [(gogoproto.nullable) = false, (gogoproto.customname) = "TraceID", (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb.TraceID"];
Expand Down Expand Up @@ -1658,6 +1668,7 @@ enum Type {
AUTO_CREATE_PARTIAL_STATS = 28 [(gogoproto.enumvalue_customname) = "TypeAutoCreatePartialStats"];
UPDATE_TABLE_METADATA_CACHE = 29 [(gogoproto.enumvalue_customname) = "TypeUpdateTableMetadataCache"];
STANDBY_READ_TS_POLLER = 30 [(gogoproto.enumvalue_customname) = "TypeStandbyReadTSPoller"];
SQL_ACTIVITY_FLUSH = 31 [(gogoproto.enumvalue_customname) = "TypeSQLActivityFlush"];
}

message Job {
Expand Down
16 changes: 15 additions & 1 deletion pkg/jobs/jobspb/wrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ var (
_ Details = LogicalReplicationDetails{}
_ Details = UpdateTableMetadataCacheDetails{}
_ Details = StandbyReadTSPollerDetails{}
_ Details = SqlActivityFlushDetails{}
)

// ProgressDetails is a marker interface for job progress details proto structs.
Expand Down Expand Up @@ -76,6 +77,7 @@ var (
_ ProgressDetails = LogicalReplicationProgress{}
_ ProgressDetails = UpdateTableMetadataCacheProgress{}
_ ProgressDetails = StandbyReadTSPollerProgress{}
_ ProgressDetails = SqlActivityFlushProgress{}
)

// Type returns the payload's job type and panics if the type is invalid.
Expand Down Expand Up @@ -172,6 +174,7 @@ var AutomaticJobTypes = [...]Type{
TypeAutoUpdateSQLActivity,
TypeMVCCStatisticsUpdate,
TypeUpdateTableMetadataCache,
TypeSQLActivityFlush,
}

// DetailsType returns the type for a payload detail.
Expand Down Expand Up @@ -239,6 +242,8 @@ func DetailsType(d isPayload_Details) (Type, error) {
return TypeUpdateTableMetadataCache, nil
case *Payload_StandbyReadTsPollerDetails:
return TypeStandbyReadTSPoller, nil
case *Payload_SqlActivityFlushDetails:
return TypeSQLActivityFlush, nil
default:
return TypeUnspecified, errors.Newf("Payload.Type called on a payload with an unknown details type: %T", d)
}
Expand Down Expand Up @@ -292,6 +297,7 @@ var JobDetailsForEveryJobType = map[Type]Details{
TypeLogicalReplication: LogicalReplicationDetails{},
TypeUpdateTableMetadataCache: UpdateTableMetadataCacheDetails{},
TypeStandbyReadTSPoller: StandbyReadTSPollerDetails{},
TypeSQLActivityFlush: SqlActivityFlushDetails{},
}

// WrapProgressDetails wraps a ProgressDetails object in the protobuf wrapper
Expand Down Expand Up @@ -359,6 +365,8 @@ func WrapProgressDetails(details ProgressDetails) interface {
return &Progress_TableMetadataCache{TableMetadataCache: &d}
case StandbyReadTSPollerProgress:
return &Progress_StandbyReadTsPoller{StandbyReadTsPoller: &d}
case SqlActivityFlushProgress:
return &Progress_SqlActivityFlush{SqlActivityFlush: &d}
default:
panic(errors.AssertionFailedf("WrapProgressDetails: unknown progress type %T", d))
}
Expand Down Expand Up @@ -424,6 +432,8 @@ func (p *Payload) UnwrapDetails() Details {
return *d.UpdateTableMetadataCacheDetails
case *Payload_StandbyReadTsPollerDetails:
return *d.StandbyReadTsPollerDetails
case *Payload_SqlActivityFlushDetails:
return *d.SqlActivityFlushDetails
default:
return nil
}
Expand Down Expand Up @@ -489,6 +499,8 @@ func (p *Progress) UnwrapDetails() ProgressDetails {
return *d.TableMetadataCache
case *Progress_StandbyReadTsPoller:
return *d.StandbyReadTsPoller
case *Progress_SqlActivityFlush:
return *d.SqlActivityFlush
default:
return nil
}
Expand Down Expand Up @@ -578,6 +590,8 @@ func WrapPayloadDetails(details Details) interface {
return &Payload_UpdateTableMetadataCacheDetails{UpdateTableMetadataCacheDetails: &d}
case StandbyReadTSPollerDetails:
return &Payload_StandbyReadTsPollerDetails{StandbyReadTsPollerDetails: &d}
case SqlActivityFlushDetails:
return &Payload_SqlActivityFlushDetails{SqlActivityFlushDetails: &d}
default:
panic(errors.AssertionFailedf("jobs.WrapPayloadDetails: unknown details type %T", d))
}
Expand Down Expand Up @@ -613,7 +627,7 @@ const (
func (Type) SafeValue() {}

// NumJobTypes is the number of job types.
const NumJobTypes = 31
const NumJobTypes = 32

// ChangefeedDetailsMarshaler allows for dependency injection of
// cloud.SanitizeExternalStorageURI to avoid the dependency from this
Expand Down
2 changes: 2 additions & 0 deletions pkg/jobs/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,8 @@ const (
MVCCStatisticsJobID = jobspb.JobID(104)

UpdateTableMetadataCacheJobID = jobspb.JobID(105)

SqlActivityFlushJobID = jobspb.JobID(106)
)

// MakeJobID generates a new job ID.
Expand Down
2 changes: 2 additions & 0 deletions pkg/jobs/registry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ func TestRegistryGC(t *testing.T) {
SkipJobMetricsPollingJobBootstrap: true,
SkipMVCCStatisticsJobBootstrap: true,
SkipUpdateTableMetadataCacheBootstrap: true,
SkipSqlActivityFlushJobBootstrap: true,
},
KeyVisualizer: &keyvisualizer.TestingKnobs{
SkipJobBootstrap: true,
Expand Down Expand Up @@ -282,6 +283,7 @@ func TestRegistryGCPagination(t *testing.T) {
SkipUpdateSQLActivityJobBootstrap: true,
SkipMVCCStatisticsJobBootstrap: true,
SkipUpdateTableMetadataCacheBootstrap: true,
SkipSqlActivityFlushJobBootstrap: true,
},
KeyVisualizer: &keyvisualizer.TestingKnobs{
SkipJobBootstrap: true,
Expand Down
Loading

0 comments on commit 4bcc002

Please sign in to comment.