Skip to content

Commit

Permalink
Fix missing write stats in Oximeter. (#1617)
Browse files Browse the repository at this point in the history
Write stats were no longer making it to Oximeter.  Something we changed
recently prevented the stat update from being called for completed writes.
The fix is to have the write fast-ack path update the metrics that Oximeter
needs.

To do this, I made a common "stat update" function and added a call to
the stat update function when a write "would have" been acked had we not
fast-acked it already.  I also put back a call so the gw__write__done probe
point is fired.

While I was here, I also updated the paths for some dtrace scripts and
widened the columns in the upstairs_count.d script.

Fixes #1615

---------

Co-authored-by: Alan Hanson <[email protected]>
  • Loading branch information
leftwo and Alan Hanson authored Jan 28, 2025
1 parent b63f7b7 commit df17b64
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 35 deletions.
2 changes: 1 addition & 1 deletion tools/dtrace/get-ds-state.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ filename='/tmp/get-ds-state.out'
# Clear out any previous state
echo "" > "$filename"
# Gather state on all running propolis servers, record summary to a file
dtrace -s /opt/oxide/dtrace/crucible/get-ds-state.d | sort -n | uniq | awk 'NF' > "$filename"
dtrace -s /opt/oxide/crucible_dtrace/get-ds-state.d | sort -n | uniq | awk 'NF' > "$filename"
# Walk the lines in the file, append the zone name to each line.
while read -r p; do
# For each line in the file, pull out the PID we are looking at and
Expand Down
2 changes: 1 addition & 1 deletion tools/dtrace/get-lr-state.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ filename='/tmp/get-lr-state.out'
# Clear out any previous state
echo "" > "$filename"
# Gather state on all running propolis servers, record summary to a file
dtrace -s /opt/oxide/dtrace/crucible/get-lr-state.d | sort -n | uniq | awk 'NF' > "$filename"
dtrace -s /opt/oxide/crucible_dtrace/get-lr-state.d | sort -n | uniq | awk 'NF' > "$filename"
# Walk the lines in the file, append the zone name to each line.
while read -r p; do
# For each line in the file, pull out the PID we are looking at and
Expand Down
2 changes: 1 addition & 1 deletion tools/dtrace/get-up-state.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ final='/tmp/get-up-state.final'
rm -f $final

# Gather our output first.
dtrace -s /opt/oxide/dtrace/crucible/get-up-state.d | awk 'NF' > "$filename"
dtrace -s /opt/oxide/crucible_dtrace/get-up-state.d | awk 'NF' > "$filename"
if [[ $? -ne 0 ]]; then
exit 1
fi
Expand Down
4 changes: 2 additions & 2 deletions tools/dtrace/upstairs_count.d
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,15 @@ crucible_upstairs*:::gw-barrier-done
tick-1s
/show > 20/
{
printf("%4s %4s %4s %4s %5s %5s %4s %4s %4s %4s",
printf("%5s %5s %5s %5s %5s %5s %5s %5s %5s %5s",
"F>", "F<", "W>", "W<", "R>", "R<", "WU>", "WU<", "B>", "B<");
printf("\n");
show = 0;
}

tick-1s
{
printa("%@4u %@4u %@4u %@4u %@5u %@5u %@4u %@4u %@4u %@4u",
printa("%@5u %@5u %@5u %@5u %@5u %@5u %@5u %@5u %@5u %@5u",
@flush_start, @flush_done, @write_start, @write_done,
@read_start, @read_done, @write_unwritten_start, @write_unwritten_done,
@barrier_start, @barrier_done
Expand Down
85 changes: 55 additions & 30 deletions upstairs/src/downstairs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,14 @@ impl Downstairs {
_ => (),
}
self.ack_job(ds_id);
} else if ack_ready && job.work.is_write() {
// We already acked this job, but, we should update dtrace probes
Self::update_io_done_stats(
&self.stats,
job.work.clone(),
ds_id,
job.io_size(),
);
}

if complete {
Expand All @@ -558,14 +566,52 @@ impl Downstairs {

// Fire DTrace probes and update stats
let io_size = done.io_size();
match &done.work {
let work = done.work.clone();
Self::update_io_done_stats(&self.stats, work, ds_id, io_size);

debug!(self.log, "[A] ack job {}", ds_id);

if let Some(r) = &mut self.repair {
r.on_job_complete(ds_id, done);
}

// Copy (if present) read data back to the guest buffer they
// provided to us, and notify any waiters.
if let Some(res) = done.res.take() {
let data = done
.data
.as_mut()
.map(|v| (v.blocks.as_slice(), &mut v.data));
res.transfer_and_notify(data, r);
}

if self.gw_active.remove(&ds_id) {
self.acked_ids.push(ds_id);
} else {
panic!("job {ds_id} not on gw_active list");
}
}

/// Record a completed write of `size` bytes in the oximeter stats.
pub fn update_write_done_metrics(&mut self, size: usize) {
    // `add_write` is called with an `i64`, so convert the byte count first.
    let bytes_written = size as i64;
    self.stats.add_write(bytes_written);
}

/// Update dtrace and oximeter metrics for a completed IO
pub fn update_io_done_stats(
stats: &DownstairsStatOuter,
work: IOop,
ds_id: JobId,
io_size: usize,
) {
match work {
IOop::Read { .. } => {
cdt::gw__read__done!(|| (ds_id.0));
self.stats.add_read(io_size as i64);
stats.add_read(io_size as i64);
}
IOop::Write { .. } => {
cdt::gw__write__done!(|| (ds_id.0));
self.stats.add_write(io_size as i64);
// We already updated metrics right after the fast ack.
}
IOop::WriteUnwritten { .. } => {
cdt::gw__write__unwritten__done!(|| (ds_id.0));
Expand All @@ -574,50 +620,29 @@ impl Downstairs {
}
IOop::Flush { .. } => {
cdt::gw__flush__done!(|| (ds_id.0));
self.stats.add_flush();
stats.add_flush();
}
IOop::Barrier { .. } => {
cdt::gw__barrier__done!(|| (ds_id.0));
self.stats.add_barrier();
stats.add_barrier();
}
IOop::ExtentFlushClose { extent, .. } => {
cdt::gw__close__done!(|| (ds_id.0, extent.0));
self.stats.add_flush_close();
stats.add_flush_close();
}
IOop::ExtentLiveRepair { extent, .. } => {
cdt::gw__repair__done!(|| (ds_id.0, extent.0));
self.stats.add_extent_repair();
stats.add_extent_repair();
}
IOop::ExtentLiveNoOp { .. } => {
cdt::gw__noop__done!(|| (ds_id.0));
self.stats.add_extent_noop();
stats.add_extent_noop();
}
IOop::ExtentLiveReopen { extent, .. } => {
cdt::gw__reopen__done!(|| (ds_id.0, extent.0));
self.stats.add_extent_reopen();
stats.add_extent_reopen();
}
}
debug!(self.log, "[A] ack job {}", ds_id);

if let Some(r) = &mut self.repair {
r.on_job_complete(ds_id, done);
}

// Copy (if present) read data back to the guest buffer they
// provided to us, and notify any waiters.
if let Some(res) = done.res.take() {
let data = done
.data
.as_mut()
.map(|v| (v.blocks.as_slice(), &mut v.data));
res.transfer_and_notify(data, r);
}

if self.gw_active.remove(&ds_id) {
self.acked_ids.push(ds_id);
} else {
panic!("job {ds_id} not on gw_active list");
}
}

/// Helper function to calculate pruned deps for a given job
Expand Down
5 changes: 5 additions & 0 deletions upstairs/src/upstairs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1487,6 +1487,11 @@ impl Upstairs {
// Fast-ack, pretending to be done immediately operations
res.send_ok(());

// Update Oximeter stats for this write.
if !is_write_unwritten {
self.downstairs.update_write_done_metrics(data.len());
}

Some(DeferredWrite {
ddef,
impacted_blocks,
Expand Down

0 comments on commit df17b64

Please sign in to comment.