Skip to content

Commit 7323282

Browse files
authored
Switch tracking unit for dbsync from total query time to total connection time (#35)
* feat: updated metrics collector to use connection time instead of query time * chore: adjusted docker image to build with essential packages * chore: adjusted query to get only dmtr users * fix: removed dmtr header
1 parent 5cce992 commit 7323282

File tree

8 files changed

+613
-175
lines changed

8 files changed

+613
-175
lines changed

operator/Cargo.lock

+486-46
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

operator/Cargo.toml

+2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ bech32 = "0.9.1"
2626
sha3 = "0.10.8"
2727
lazy_static = "1.4.0"
2828
deadpool-postgres = "0.12.1"
29+
chrono = "0.4.38"
30+
reqwest = { version = "0.12.4", features = ["json"] }
2931

3032
[[bin]]
3133
name = "controller"

operator/Dockerfile

+4-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ FROM rust:1.74-slim-buster as build
22

33
WORKDIR /app
44

5+
RUN apt update
6+
RUN apt install -y build-essential pkg-config libssl-dev cmake
7+
58
COPY ./Cargo.lock ./Cargo.lock
69
COPY ./Cargo.toml ./Cargo.toml
710
COPY ./src ./src
@@ -12,4 +15,4 @@ FROM rust:1.74-slim-buster
1215

1316
COPY --from=build /app/target/release/controller .
1417

15-
CMD ["./controller"]
18+
CMD ["./controller"]

operator/src/config.rs

+5
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ pub struct Config {
1717
pub dcu_per_second: HashMap<String, f64>,
1818

1919
pub metrics_delay: Duration,
20+
pub prometheus_url: String,
2021
pub statement_timeout: u64,
2122
}
2223

@@ -64,6 +65,8 @@ impl Config {
6465
.expect("METRICS_DELAY must be a number"),
6566
);
6667

68+
let prometheus_url = env::var("PROMETHEUS_URL").expect("PROMETHEUS_URL must be set");
69+
6770
let statement_timeout = env::var("STATEMENT_TIMEOUT")
6871
.unwrap_or("120000".to_string())
6972
.parse::<u64>()
@@ -75,6 +78,7 @@ impl Config {
7578
db_max_connections,
7679
dcu_per_second,
7780
metrics_delay,
81+
prometheus_url,
7882
statement_timeout,
7983
}
8084
}
@@ -93,6 +97,7 @@ mod tests {
9397
);
9498
env::set_var("DCU_PER_SECOND", "preview=5,preprod=5,mainnet=5");
9599
env::set_var("METRICS_DELAY", "100");
100+
env::set_var("PROMETHEUS_URL", "localhost");
96101
env::set_var("STATEMENT_TIMEOUT", "100");
97102

98103
let config = Config::from_env();

operator/src/lib.rs

+8
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ pub enum Error {
3131

3232
#[error("Config Error: {0}")]
3333
ConfigError(String),
34+
35+
#[error("Http Request error: {0}")]
36+
HttpError(String),
3437
}
3538

3639
impl Error {
@@ -75,6 +78,11 @@ impl From<bech32::Error> for Error {
7578
Error::Bech32Error(value)
7679
}
7780
}
81+
impl From<reqwest::Error> for Error {
82+
fn from(value: reqwest::Error) -> Self {
83+
Error::HttpError(value.to_string())
84+
}
85+
}
7886

7987
#[derive(Clone)]
8088
pub struct State {

operator/src/metrics.rs

+105-89
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,11 @@
1-
use futures::future;
1+
use chrono::Utc;
22
use kube::{api::ListParams, Api, Client, Resource, ResourceExt};
33
use prometheus::{opts, IntCounterVec, Registry};
4-
use std::{collections::HashMap, sync::Arc};
5-
use tracing::{error, info, instrument};
4+
use serde::{Deserialize, Deserializer};
5+
use std::sync::Arc;
6+
use tracing::{error, info, instrument, warn};
67

7-
use crate::{
8-
get_config,
9-
postgres::{Postgres, UserStatements},
10-
DbSyncPort, Error, State,
11-
};
8+
use crate::{get_config, Config, DbSyncPort, Error, State};
129

1310
#[derive(Clone)]
1411
pub struct Metrics {
@@ -135,13 +132,14 @@ pub async fn run_metrics_collector(state: Arc<State>) {
135132
.await
136133
.expect("failed to create kube client");
137134

138-
let config = get_config();
139-
140-
let mut metrics_state: HashMap<String, HashMap<String, UserStatements>> = HashMap::new();
141-
142135
let crds_api = Api::<DbSyncPort>::all(client.clone());
143136

137+
let config = get_config();
138+
let mut last_execution = Utc::now();
139+
144140
loop {
141+
tokio::time::sleep(config.metrics_delay).await;
142+
145143
let crds_result = crds_api.list(&ListParams::default()).await;
146144
if let Err(error) = crds_result {
147145
error!(error = error.to_string(), "error to get k8s resources");
@@ -150,97 +148,115 @@ pub async fn run_metrics_collector(state: Arc<State>) {
150148
}
151149
let crds = crds_result.unwrap();
152150

153-
for crd in crds.items.iter().filter(|i| i.status.is_some()) {
154-
let status = crd.status.as_ref().unwrap();
151+
let end = Utc::now();
152+
let interval = (end - last_execution).num_seconds();
155153

156-
let pg_connections_result = state.get_pg_by_network(&crd.spec.network);
157-
if let Err(error) = pg_connections_result {
158-
error!(error = error.to_string());
159-
state.metrics.metrics_failure(&error);
154+
last_execution = end;
155+
156+
let query = format!(
157+
"sum by (user) (avg_over_time(pgbouncer_pools_client_active_connections{{user=~\"dmtr_.*\"}}[{interval}s] @ {})) > 0",
158+
end.timestamp_millis() / 1000
159+
);
160+
dbg!(&query);
161+
162+
let response = collect_prometheus_metrics(config, query).await;
163+
if let Err(err) = response {
164+
error!(error = err.to_string(), "error to make prometheus request");
165+
state.metrics.metrics_failure(&err);
166+
continue;
167+
}
168+
let response = response.unwrap();
169+
170+
for result in response.data.result {
171+
let crd = crds
172+
.iter()
173+
.filter(|c| c.status.is_some())
174+
.find(|c| c.status.as_ref().unwrap().username.eq(&result.metric.user));
175+
176+
if crd.is_none() {
177+
warn!(user = result.metric.user, "username doesnt have a crd");
160178
continue;
161179
}
162180

163-
let user_statements_result =
164-
get_user_statements(&status.username, pg_connections_result.unwrap()).await;
165-
if let Err(error) = user_statements_result {
166-
error!(error = error.to_string(), "error get user statements");
181+
let crd = crd.unwrap();
182+
183+
let dcu_per_second = config.dcu_per_second.get(&crd.spec.network);
184+
if dcu_per_second.is_none() {
185+
let error = Error::ConfigError(format!(
186+
"dcu_per_second not configured to {} network",
187+
&crd.spec.network
188+
));
189+
error!(error = error.to_string());
167190
state.metrics.metrics_failure(&error);
168191
continue;
169192
}
170193

171-
let user_statements = user_statements_result.unwrap();
172-
173-
let latest_user_statement = metrics_state
174-
.entry(crd.spec.network.clone())
175-
.or_default()
176-
.get(&user_statements.usename);
177-
178-
if let Some(latest_user_statement) = latest_user_statement {
179-
let total_exec_time =
180-
user_statements.total_exec_time - latest_user_statement.total_exec_time;
181-
182-
if total_exec_time == 0.0 {
183-
continue;
184-
}
185-
186-
let dcu_per_second = config.dcu_per_second.get(&crd.spec.network);
187-
if dcu_per_second.is_none() {
188-
let error = Error::ConfigError(format!(
189-
"dcu_per_second not configured to {} network",
190-
&crd.spec.network
191-
));
192-
error!(error = error.to_string());
193-
state.metrics.metrics_failure(&error);
194-
continue;
195-
}
196-
197-
let dcu_per_second = dcu_per_second.unwrap();
198-
199-
let dcu = (total_exec_time / 1000.) * dcu_per_second;
200-
state.metrics.count_dcu_consumed(
201-
&crd.namespace().unwrap(),
202-
&crd.spec.network,
203-
dcu,
204-
);
205-
}
194+
let dcu_per_second = dcu_per_second.unwrap();
195+
let total_exec_time = result.value * (interval as f64);
206196

207-
metrics_state
208-
.entry(crd.spec.network.clone())
209-
.and_modify(|statements| {
210-
statements.insert(user_statements.usename.clone(), user_statements);
211-
});
197+
let dcu = total_exec_time * dcu_per_second;
198+
state
199+
.metrics
200+
.count_dcu_consumed(&crd.namespace().unwrap(), &crd.spec.network, dcu);
212201
}
213-
214-
tokio::time::sleep(config.metrics_delay).await;
215202
}
216203
});
217204
}
218205

219-
async fn get_user_statements(
220-
username: &str,
221-
pg_connections: &[Postgres],
222-
) -> Result<UserStatements, Error> {
223-
let tasks = future::join_all(
224-
pg_connections
225-
.iter()
226-
.map(|pg| pg.find_metrics_by_user(username)),
227-
)
228-
.await;
229-
230-
let mut user_statements_all_host = UserStatements {
231-
usename: username.into(),
232-
total_exec_time: 0.,
233-
};
234-
235-
for user_statements_by_host_result in tasks.into_iter() {
236-
let user_statements_by_host = user_statements_by_host_result?;
237-
if user_statements_by_host.is_none() {
238-
continue;
239-
}
240-
241-
let user_statements_by_host = user_statements_by_host.unwrap();
242-
user_statements_all_host.total_exec_time += user_statements_by_host.total_exec_time;
206+
async fn collect_prometheus_metrics(
207+
config: &Config,
208+
query: String,
209+
) -> Result<PrometheusResponse, Error> {
210+
let client = reqwest::Client::builder().build().unwrap();
211+
212+
let response = client
213+
.get(format!("{}/query?query={query}", config.prometheus_url))
214+
.send()
215+
.await?;
216+
217+
let status = response.status();
218+
if status.is_client_error() || status.is_server_error() {
219+
error!(status = status.to_string(), "request status code fail");
220+
return Err(Error::HttpError(format!(
221+
"Prometheus request error. Status: {} Query: {}",
222+
status, query
223+
)));
243224
}
244225

245-
Ok(user_statements_all_host)
226+
Ok(response.json().await.unwrap())
227+
}
228+
229+
#[derive(Debug, Deserialize)]
230+
struct PrometheusDataResultMetric {
231+
user: String,
232+
}
233+
234+
#[derive(Debug, Deserialize)]
235+
struct PrometheusDataResult {
236+
metric: PrometheusDataResultMetric,
237+
#[serde(deserialize_with = "deserialize_value")]
238+
value: f64,
239+
}
240+
241+
#[derive(Debug, Deserialize)]
242+
#[serde(rename_all = "camelCase")]
243+
struct PrometheusData {
244+
result: Vec<PrometheusDataResult>,
245+
}
246+
247+
#[derive(Debug, Deserialize)]
248+
struct PrometheusResponse {
249+
data: PrometheusData,
250+
}
251+
252+
fn deserialize_value<'de, D>(deserializer: D) -> Result<f64, D::Error>
253+
where
254+
D: Deserializer<'de>,
255+
{
256+
let value: Vec<serde_json::Value> = Deserialize::deserialize(deserializer)?;
257+
Ok(value.into_iter().as_slice()[1]
258+
.as_str()
259+
.unwrap()
260+
.parse::<f64>()
261+
.unwrap())
246262
}

operator/src/postgres.rs

+1-39
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use std::str::FromStr;
22

33
use deadpool_postgres::{Manager, ManagerConfig, Pool, RecyclingMethod};
4-
use tokio_postgres::{NoTls, Row};
4+
use tokio_postgres::NoTls;
55

66
use crate::{get_config, Error};
77

@@ -111,42 +111,4 @@ impl Postgres {
111111

112112
Ok(result.is_some())
113113
}
114-
115-
pub async fn find_metrics_by_user(
116-
&self,
117-
username: &str,
118-
) -> Result<Option<UserStatements>, Error> {
119-
let query_metrics = "select
120-
usename,
121-
sum(total_exec_time) as total_exec_time
122-
from
123-
pg_stat_statements
124-
inner join
125-
pg_catalog.pg_user on pg_catalog.pg_user.usesysid = userid
126-
where
127-
pg_catalog.pg_user.usename = $1
128-
group by
129-
usename;";
130-
131-
let client = self.pool.get().await?;
132-
133-
let stmt = client.prepare(query_metrics).await?;
134-
let result = client.query_opt(&stmt, &[&username]).await?;
135-
136-
Ok(result.as_ref().map(|row| row.into()))
137-
}
138-
}
139-
140-
#[derive(Debug, Clone)]
141-
pub struct UserStatements {
142-
pub usename: String,
143-
pub total_exec_time: f64,
144-
}
145-
impl From<&Row> for UserStatements {
146-
fn from(row: &Row) -> Self {
147-
Self {
148-
usename: row.get("usename"),
149-
total_exec_time: row.get("total_exec_time"),
150-
}
151-
}
152114
}

operator/test/manifest.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,8 @@ spec:
171171
value: "preview=5"
172172
- name: METRICS_DELAY
173173
value: "30"
174+
- name: PROMETHEUS_URL
175+
value: localhost
174176
- name: RUST_LOG
175177
value: info,kube=debug,controller=debug
176178
---

0 commit comments

Comments
 (0)