Skip to content

Commit 506cfd9

Browse files
authored
feat(metrics): add progress metrics via collector (#17359)
* add query scan rows metrics * fix build * register the metrics * track scan & write progress * track spill progress * refactor * attach session manager to it * update the finished query * finish the metrics * fix clippy * fix header * fix cargo fmt * fix tablo
1 parent 31d0fac commit 506cfd9

File tree

11 files changed

+241
-3
lines changed

11 files changed

+241
-3
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/common/base/src/base/progress.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,15 @@ pub struct ProgressValues {
2424
pub bytes: usize,
2525
}
2626

27+
impl ProgressValues {
28+
pub fn add(&self, other: &ProgressValues) -> ProgressValues {
29+
ProgressValues {
30+
rows: self.rows + other.rows,
31+
bytes: self.bytes + other.bytes,
32+
}
33+
}
34+
}
35+
2736
#[derive(Debug)]
2837
pub struct Progress {
2938
rows: AtomicUsize,

src/common/base/src/runtime/metrics/registry.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,10 @@ impl GlobalRegistry {
110110
metric
111111
}
112112

113+
/// Registers `collector` with the underlying registry.
///
/// The registry lock is held only for the duration of the registration.
pub fn register_collector(&self, collector: Box<dyn prometheus_client::collector::Collector>) {
    let mut guard = self.inner.lock();
    guard.registry.register_collector(collector);
}
116+
113117
pub(crate) fn new_scoped_metric(&self, index: usize) -> impl Iterator<Item = ScopedMetric> {
114118
let global_registry = self.inner.lock();
115119
let mut scoped_metrics = Vec::with_capacity(global_registry.metrics.len() - index);

src/common/metrics/src/metrics/interpreter.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,13 @@ const METRIC_QUERY_TOTAL_PARTITIONS: &str = "query_total_partitions";
3939
const METRIC_QUERY_RESULT_ROWS: &str = "query_result_rows";
4040
const METRIC_QUERY_RESULT_BYTES: &str = "query_result_bytes";
4141

42+
/// Metric name for the total number of rows scanned by queries
/// (exported by the session manager's metrics collector).
pub const METRIC_QUERY_SCAN_PROGRESS_ROWS: &str = "query_scan_progress_rows";
43+
/// Metric name for the total number of bytes scanned by queries.
pub const METRIC_QUERY_SCAN_PROGRESS_BYTES: &str = "query_scan_progress_bytes";
44+
/// Metric name for the total number of rows written by queries.
pub const METRIC_QUERY_WRITE_PROGRESS_ROWS: &str = "query_write_progress_rows";
45+
/// Metric name for the total number of bytes written by queries.
pub const METRIC_QUERY_WRITE_PROGRESS_BYTES: &str = "query_write_progress_bytes";
46+
/// Metric name for the total number of rows spilled by queries.
pub const METRIC_QUERY_SPILL_PROGRESS_ROWS: &str = "query_spill_progress_rows";
47+
/// Metric name for the total number of bytes spilled by queries.
pub const METRIC_QUERY_SPILL_PROGRESS_BYTES: &str = "query_spill_progress_bytes";
48+
4249
pub static QUERY_START: LazyLock<FamilyCounter<VecLabels>> =
4350
LazyLock::new(|| register_counter_family(METRIC_QUERY_START));
4451
pub static QUERY_SUCCESS: LazyLock<FamilyCounter<VecLabels>> =

src/query/catalog/src/table_context.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ pub struct ProcessInfo {
9292
/// storage metrics for persisted data reading.
9393
pub data_metrics: Option<StorageMetrics>,
9494
pub scan_progress_value: Option<ProgressValues>,
95+
pub write_progress_value: Option<ProgressValues>,
96+
pub spill_progress_value: Option<ProgressValues>,
9597
pub mysql_connection_id: Option<u32>,
9698
pub created_time: SystemTime,
9799
pub status_info: Option<String>,

src/query/service/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ paste = { workspace = true }
150150
petgraph = { workspace = true }
151151
pin-project-lite = { workspace = true }
152152
poem = { workspace = true }
153+
prometheus-client = { workspace = true }
153154
prost = { workspace = true }
154155
rand = { workspace = true }
155156
recursive = { workspace = true }

src/query/service/src/interpreters/interpreter.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,16 @@ fn log_query_finished(ctx: &QueryContext, error: Option<ErrorCode>, has_profiles
187187
let typ = session.get_type();
188188
if typ.is_user_session() {
189189
SessionManager::instance().status.write().query_finish(now);
190+
SessionManager::instance()
191+
.metrics_collector
192+
.track_finished_query(
193+
ctx.get_scan_progress_value(),
194+
ctx.get_write_progress_value(),
195+
ctx.get_join_spill_progress_value(),
196+
ctx.get_aggregate_spill_progress_value(),
197+
ctx.get_group_by_spill_progress_value(),
198+
ctx.get_window_partition_spill_progress_value(),
199+
);
190200
}
191201

192202
if let Err(error) = InterpreterQueryLog::log_finish(ctx, now, error, has_profiles) {

src/query/service/src/sessions/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ mod session;
2020
mod session_ctx;
2121
mod session_info;
2222
mod session_mgr;
23+
mod session_mgr_metrics;
2324
mod session_mgr_status;
2425
mod session_privilege_mgr;
2526
mod session_status;

src/query/service/src/sessions/session_info.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ impl Session {
5454
memory_usage,
5555
data_metrics: Self::query_data_metrics(session_ctx),
5656
scan_progress_value: Self::query_scan_progress_value(session_ctx),
57+
write_progress_value: Self::query_write_progress_value(session_ctx),
58+
spill_progress_value: Self::query_spill_progress_value(session_ctx),
5759
mysql_connection_id: self.mysql_connection_id,
5860
created_time: Self::query_created_time(session_ctx),
5961
status_info: shared_query_context
@@ -105,6 +107,27 @@ impl Session {
105107
.map(|context_shared| context_shared.scan_progress.get_values())
106108
}
107109

110+
/// Returns the current write progress of the session's query, or `None`
/// when the session has no shared query context.
fn query_write_progress_value(status: &SessionContext) -> Option<ProgressValues> {
    let shared = status.get_query_context_shared();
    let ctx = shared.as_ref()?;
    Some(ctx.write_progress.get_values())
}
116+
117+
/// Returns the combined spill progress of the session's query, or `None`
/// when the session has no shared query context.
///
/// The result is the sum of the aggregate, join, window-partition and
/// group-by spill progress values.
fn query_spill_progress_value(status: &SessionContext) -> Option<ProgressValues> {
    let shared = status.get_query_context_shared();
    let ctx = shared.as_ref()?;

    let aggregate = ctx.agg_spill_progress.get_values();
    let join = ctx.join_spill_progress.get_values();
    let window_partition = ctx.window_partition_spill_progress.get_values();
    let group_by = ctx.group_by_spill_progress.get_values();

    Some(aggregate.add(&join).add(&window_partition).add(&group_by))
}
130+
108131
fn query_created_time(status: &SessionContext) -> SystemTime {
109132
match status.get_query_context_shared() {
110133
None => SystemTime::now(),

src/query/service/src/sessions/session_mgr.rs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use std::time::Duration;
2424
use databend_common_base::base::tokio;
2525
use databend_common_base::base::GlobalInstance;
2626
use databend_common_base::base::SignalStream;
27+
use databend_common_base::runtime::metrics::GLOBAL_METRICS_REGISTRY;
2728
use databend_common_catalog::table_context::ProcessInfoState;
2829
use databend_common_config::GlobalConfig;
2930
use databend_common_config::InnerConfig;
@@ -38,6 +39,7 @@ use log::info;
3839
use parking_lot::RwLock;
3940

4041
use crate::sessions::session::Session;
42+
use crate::sessions::session_mgr_metrics::SessionManagerMetricsCollector;
4143
use crate::sessions::ProcessInfo;
4244
use crate::sessions::SessionContext;
4345
use crate::sessions::SessionManagerStatus;
@@ -47,6 +49,7 @@ pub struct SessionManager {
4749
pub(in crate::sessions) max_sessions: usize,
4850
pub(in crate::sessions) active_sessions: Arc<RwLock<HashMap<String, Weak<Session>>>>,
4951
pub status: Arc<RwLock<SessionManagerStatus>>,
52+
pub metrics_collector: SessionManagerMetricsCollector,
5053

5154
// When typ is MySQL, insert into this map, key is id, val is MySQL connection id.
5255
pub(crate) mysql_conn_map: Arc<RwLock<HashMap<Option<u32>, String>>>,
@@ -55,20 +58,26 @@ pub struct SessionManager {
5558

5659
impl SessionManager {
5760
pub fn init(conf: &InnerConfig) -> Result<()> {
58-
GlobalInstance::set(Self::create(conf));
61+
let global_instance = Self::create(conf);
62+
GlobalInstance::set(global_instance.clone());
63+
GLOBAL_METRICS_REGISTRY
64+
.register_collector(Box::new(global_instance.metrics_collector.clone()));
5965

6066
Ok(())
6167
}
6268

6369
pub fn create(conf: &InnerConfig) -> Arc<SessionManager> {
6470
let max_sessions = conf.query.max_active_sessions as usize;
65-
Arc::new(SessionManager {
71+
let mgr = Arc::new(SessionManager {
6672
max_sessions,
6773
mysql_basic_conn_id: AtomicU32::new(9_u32.to_le()),
6874
status: Arc::new(RwLock::new(SessionManagerStatus::default())),
6975
mysql_conn_map: Arc::new(RwLock::new(HashMap::with_capacity(max_sessions))),
7076
active_sessions: Arc::new(RwLock::new(HashMap::with_capacity(max_sessions))),
71-
})
77+
metrics_collector: SessionManagerMetricsCollector::new(),
78+
});
79+
mgr.metrics_collector.attach_session_manager(mgr.clone());
80+
mgr
7281
}
7382

7483
pub fn instance() -> Arc<SessionManager> {
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
// Copyright 2021 Datafuse Labs
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
use std::sync::Arc;
16+
17+
use databend_common_base::base::ProgressValues;
18+
use databend_common_metrics::interpreter::METRIC_QUERY_SCAN_PROGRESS_BYTES;
19+
use databend_common_metrics::interpreter::METRIC_QUERY_SCAN_PROGRESS_ROWS;
20+
use databend_common_metrics::interpreter::METRIC_QUERY_SPILL_PROGRESS_BYTES;
21+
use databend_common_metrics::interpreter::METRIC_QUERY_SPILL_PROGRESS_ROWS;
22+
use databend_common_metrics::interpreter::METRIC_QUERY_WRITE_PROGRESS_BYTES;
23+
use databend_common_metrics::interpreter::METRIC_QUERY_WRITE_PROGRESS_ROWS;
24+
use parking_lot::Mutex;
25+
use prometheus_client::collector::Collector;
26+
use prometheus_client::encoding::EncodeMetric;
27+
use prometheus_client::metrics::counter::ConstCounter;
28+
29+
use crate::sessions::SessionManager;
30+
31+
/// [`SessionManagerMetricsCollector`] exports the scan/write/spill progress of the
32+
/// [`SessionManager`]'s running queries to Prometheus. So that the exported totals do not
33+
/// drop when a query finishes and leaves the set of running queries, the progress values of
34+
/// finished queries are accumulated separately and added to every export.
35+
#[derive(Clone)]
36+
pub struct SessionManagerMetricsCollector {
37+
/// Mutex-protected state behind an `Arc`; clones of the collector share the same state.
inner: Arc<Mutex<SessionManagerMetricsCollectorInner>>,
38+
}
39+
40+
/// State of a [`SessionManagerMetricsCollector`], guarded by the collector's mutex.
pub(crate) struct SessionManagerMetricsCollectorInner {
41+
/// Session manager whose running queries are reported; `None` until one is attached,
/// in which case the collector encodes nothing.
session_mgr: Option<Arc<SessionManager>>,
42+
/// Scan progress accumulated from queries reported as finished.
finished_scan_total: ProgressValues,
43+
/// Write progress accumulated from queries reported as finished.
finished_write_total: ProgressValues,
44+
/// Spill progress (join, aggregate, group-by and window-partition combined)
/// accumulated from queries reported as finished.
finished_spill_total: ProgressValues,
45+
}
46+
47+
impl SessionManagerMetricsCollector {
48+
pub fn new() -> Self {
49+
Self {
50+
inner: Arc::new(Mutex::new(SessionManagerMetricsCollectorInner {
51+
session_mgr: None,
52+
finished_scan_total: ProgressValues::default(),
53+
finished_write_total: ProgressValues::default(),
54+
finished_spill_total: ProgressValues::default(),
55+
})),
56+
}
57+
}
58+
59+
pub fn attach_session_manager(&self, session_mgr: Arc<SessionManager>) {
60+
let mut guard = self.inner.lock();
61+
guard.session_mgr.replace(session_mgr);
62+
}
63+
64+
pub fn track_finished_query(
65+
&self,
66+
scan: ProgressValues,
67+
write: ProgressValues,
68+
join_spill: ProgressValues,
69+
aggregate_spill: ProgressValues,
70+
group_by_spill: ProgressValues,
71+
window_partition_spill: ProgressValues,
72+
) {
73+
let mut guard = self.inner.lock();
74+
guard.finished_scan_total = guard.finished_scan_total.add(&scan);
75+
guard.finished_write_total = guard.finished_write_total.add(&write);
76+
guard.finished_spill_total = guard
77+
.finished_spill_total
78+
.add(&join_spill)
79+
.add(&aggregate_spill)
80+
.add(&group_by_spill)
81+
.add(&window_partition_spill);
82+
}
83+
}
84+
85+
impl Default for SessionManagerMetricsCollector {
86+
fn default() -> Self {
87+
Self::new()
88+
}
89+
}
90+
91+
impl std::fmt::Debug for SessionManagerMetricsCollector {
92+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
93+
write!(f, "SessionMetricsCollector")
94+
}
95+
}
96+
97+
impl Collector for SessionManagerMetricsCollector {
98+
fn encode(
99+
&self,
100+
mut encoder: prometheus_client::encoding::DescriptorEncoder,
101+
) -> Result<(), std::fmt::Error> {
102+
let processes = {
103+
match self.inner.lock().session_mgr.as_ref() {
104+
Some(mgr) => mgr.processes_info(),
105+
None => return Ok(()),
106+
}
107+
};
108+
109+
let (mut scan_progress, mut write_progress, mut spill_progress) = {
110+
let guard = self.inner.lock();
111+
(
112+
guard.finished_scan_total.clone(),
113+
guard.finished_write_total.clone(),
114+
guard.finished_spill_total.clone(),
115+
)
116+
};
117+
for process in processes {
118+
if let Some(scan) = &process.scan_progress_value {
119+
scan_progress = scan_progress.add(scan);
120+
}
121+
if let Some(write) = &process.write_progress_value {
122+
write_progress = write_progress.add(write);
123+
}
124+
if let Some(spill) = &process.spill_progress_value {
125+
spill_progress = spill_progress.add(spill);
126+
}
127+
}
128+
129+
let metrics = vec![
130+
(
131+
METRIC_QUERY_SCAN_PROGRESS_ROWS,
132+
scan_progress.rows as f64,
133+
"Total scan rows in progress.",
134+
),
135+
(
136+
METRIC_QUERY_SCAN_PROGRESS_BYTES,
137+
scan_progress.bytes as f64,
138+
"Total scan bytes in progress.",
139+
),
140+
(
141+
METRIC_QUERY_WRITE_PROGRESS_ROWS,
142+
write_progress.rows as f64,
143+
"Total write rows in progress.",
144+
),
145+
(
146+
METRIC_QUERY_WRITE_PROGRESS_BYTES,
147+
write_progress.bytes as f64,
148+
"Total write bytes in progress.",
149+
),
150+
(
151+
METRIC_QUERY_SPILL_PROGRESS_ROWS,
152+
spill_progress.rows as f64,
153+
"Total spill rows in progress.",
154+
),
155+
(
156+
METRIC_QUERY_SPILL_PROGRESS_BYTES,
157+
spill_progress.bytes as f64,
158+
"Total spill bytes in progress.",
159+
),
160+
];
161+
162+
for (name, value, help) in metrics {
163+
let counter = ConstCounter::new(value);
164+
let counter_encoder =
165+
encoder.encode_descriptor(name, help, None, counter.metric_type())?;
166+
counter.encode(counter_encoder)?;
167+
}
168+
169+
Ok(())
170+
}
171+
}

0 commit comments

Comments
 (0)