Skip to content

Commit df17b64

Browse files
leftwoAlan Hanson
andauthored
Fix missing write stats in Oximeter. (#1617)
Write stats were not making it to Oximeter any longer. Something we changed recently prevented the stat update being called for completed writes. The fix is to update the write fast-ack path to update the metrics that Oximeter needed. To do this, I made a common "stat update" function and added a call to the stat update function when a write "would have" been acked had we not fast-acked it already. I also put back a call so the gw__write__done probe point is fired. While I was here, I also updated some paths for dtrace scripts and updated column width for upstairs_count.d script. Fixes #1615 --------- Co-authored-by: Alan Hanson <[email protected]>
1 parent b63f7b7 commit df17b64

File tree

6 files changed

+65
-35
lines changed

6 files changed

+65
-35
lines changed

tools/dtrace/get-ds-state.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ filename='/tmp/get-ds-state.out'
77
# Clear out any previous state
88
echo "" > "$filename"
99
# Gather state on all running propolis servers, record summary to a file
10-
dtrace -s /opt/oxide/dtrace/crucible/get-ds-state.d | sort -n | uniq | awk 'NF' > "$filename"
10+
dtrace -s /opt/oxide/crucible_dtrace/get-ds-state.d | sort -n | uniq | awk 'NF' > "$filename"
1111
# Walk the lines in the file, append the zone name to each line.
1212
while read -r p; do
1313
# For each line in the file, pull out the PID we are looking at and

tools/dtrace/get-lr-state.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ filename='/tmp/get-lr-state.out'
77
# Clear out any previous state
88
echo "" > "$filename"
99
# Gather state on all running propolis servers, record summary to a file
10-
dtrace -s /opt/oxide/dtrace/crucible/get-lr-state.d | sort -n | uniq | awk 'NF' > "$filename"
10+
dtrace -s /opt/oxide/crucible_dtrace/get-lr-state.d | sort -n | uniq | awk 'NF' > "$filename"
1111
# Walk the lines in the file, append the zone name to each line.
1212
while read -r p; do
1313
# For each line in the file, pull out the PID we are looking at and

tools/dtrace/get-up-state.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ final='/tmp/get-up-state.final'
77
rm -f $final
88

99
# Gather our output first.
10-
dtrace -s /opt/oxide/dtrace/crucible/get-up-state.d | awk 'NF' > "$filename"
10+
dtrace -s /opt/oxide/crucible_dtrace/get-up-state.d | awk 'NF' > "$filename"
1111
if [[ $? -ne 0 ]]; then
1212
exit 1
1313
fi

tools/dtrace/upstairs_count.d

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,15 +78,15 @@ crucible_upstairs*:::gw-barrier-done
7878
tick-1s
7979
/show > 20/
8080
{
81-
printf("%4s %4s %4s %4s %5s %5s %4s %4s %4s %4s",
81+
printf("%5s %5s %5s %5s %5s %5s %5s %5s %5s %5s",
8282
"F>", "F<", "W>", "W<", "R>", "R<", "WU>", "WU<", "B>", "B<");
8383
printf("\n");
8484
show = 0;
8585
}
8686

8787
tick-1s
8888
{
89-
printa("%@4u %@4u %@4u %@4u %@5u %@5u %@4u %@4u %@4u %@4u",
89+
printa("%@5u %@5u %@5u %@5u %@5u %@5u %@5u %@5u %@5u %@5u",
9090
@flush_start, @flush_done, @write_start, @write_done,
9191
@read_start, @read_done, @write_unwritten_start, @write_unwritten_done,
9292
@barrier_start, @barrier_done

upstairs/src/downstairs.rs

Lines changed: 55 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,14 @@ impl Downstairs {
537537
_ => (),
538538
}
539539
self.ack_job(ds_id);
540+
} else if ack_ready && job.work.is_write() {
541+
// We already acked this job, but, we should update dtrace probes
542+
Self::update_io_done_stats(
543+
&self.stats,
544+
job.work.clone(),
545+
ds_id,
546+
job.io_size(),
547+
);
540548
}
541549

542550
if complete {
@@ -558,14 +566,52 @@ impl Downstairs {
558566

559567
// Fire DTrace probes and update stats
560568
let io_size = done.io_size();
561-
match &done.work {
569+
let work = done.work.clone();
570+
Self::update_io_done_stats(&self.stats, work, ds_id, io_size);
571+
572+
debug!(self.log, "[A] ack job {}", ds_id);
573+
574+
if let Some(r) = &mut self.repair {
575+
r.on_job_complete(ds_id, done);
576+
}
577+
578+
// Copy (if present) read data back to the guest buffer they
579+
// provided to us, and notify any waiters.
580+
if let Some(res) = done.res.take() {
581+
let data = done
582+
.data
583+
.as_mut()
584+
.map(|v| (v.blocks.as_slice(), &mut v.data));
585+
res.transfer_and_notify(data, r);
586+
}
587+
588+
if self.gw_active.remove(&ds_id) {
589+
self.acked_ids.push(ds_id);
590+
} else {
591+
panic!("job {ds_id} not on gw_active list");
592+
}
593+
}
594+
595+
/// Update oximeter stats for a write operation.
596+
pub fn update_write_done_metrics(&mut self, size: usize) {
597+
self.stats.add_write(size as i64);
598+
}
599+
600+
/// Update dtrace and oximeter metrics for a completed IO
601+
pub fn update_io_done_stats(
602+
stats: &DownstairsStatOuter,
603+
work: IOop,
604+
ds_id: JobId,
605+
io_size: usize,
606+
) {
607+
match work {
562608
IOop::Read { .. } => {
563609
cdt::gw__read__done!(|| (ds_id.0));
564-
self.stats.add_read(io_size as i64);
610+
stats.add_read(io_size as i64);
565611
}
566612
IOop::Write { .. } => {
567613
cdt::gw__write__done!(|| (ds_id.0));
568-
self.stats.add_write(io_size as i64);
614+
// We already updated metrics right after the fast ack.
569615
}
570616
IOop::WriteUnwritten { .. } => {
571617
cdt::gw__write__unwritten__done!(|| (ds_id.0));
@@ -574,50 +620,29 @@ impl Downstairs {
574620
}
575621
IOop::Flush { .. } => {
576622
cdt::gw__flush__done!(|| (ds_id.0));
577-
self.stats.add_flush();
623+
stats.add_flush();
578624
}
579625
IOop::Barrier { .. } => {
580626
cdt::gw__barrier__done!(|| (ds_id.0));
581-
self.stats.add_barrier();
627+
stats.add_barrier();
582628
}
583629
IOop::ExtentFlushClose { extent, .. } => {
584630
cdt::gw__close__done!(|| (ds_id.0, extent.0));
585-
self.stats.add_flush_close();
631+
stats.add_flush_close();
586632
}
587633
IOop::ExtentLiveRepair { extent, .. } => {
588634
cdt::gw__repair__done!(|| (ds_id.0, extent.0));
589-
self.stats.add_extent_repair();
635+
stats.add_extent_repair();
590636
}
591637
IOop::ExtentLiveNoOp { .. } => {
592638
cdt::gw__noop__done!(|| (ds_id.0));
593-
self.stats.add_extent_noop();
639+
stats.add_extent_noop();
594640
}
595641
IOop::ExtentLiveReopen { extent, .. } => {
596642
cdt::gw__reopen__done!(|| (ds_id.0, extent.0));
597-
self.stats.add_extent_reopen();
643+
stats.add_extent_reopen();
598644
}
599645
}
600-
debug!(self.log, "[A] ack job {}", ds_id);
601-
602-
if let Some(r) = &mut self.repair {
603-
r.on_job_complete(ds_id, done);
604-
}
605-
606-
// Copy (if present) read data back to the guest buffer they
607-
// provided to us, and notify any waiters.
608-
if let Some(res) = done.res.take() {
609-
let data = done
610-
.data
611-
.as_mut()
612-
.map(|v| (v.blocks.as_slice(), &mut v.data));
613-
res.transfer_and_notify(data, r);
614-
}
615-
616-
if self.gw_active.remove(&ds_id) {
617-
self.acked_ids.push(ds_id);
618-
} else {
619-
panic!("job {ds_id} not on gw_active list");
620-
}
621646
}
622647

623648
/// Helper function to calculate pruned deps for a given job

upstairs/src/upstairs.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1487,6 +1487,11 @@ impl Upstairs {
14871487
// Fast-ack, pretending to be done immediately operations
14881488
res.send_ok(());
14891489

1490+
// Update Oximeter stats for this write.
1491+
if !is_write_unwritten {
1492+
self.downstairs.update_write_done_metrics(data.len());
1493+
}
1494+
14901495
Some(DeferredWrite {
14911496
ddef,
14921497
impacted_blocks,

0 commit comments

Comments
 (0)