Skip to content

Commit 29d12fc

Browse files
authored
fix(bmc-explorer): ignore BlueField BIOS 500 in NIC mode (#1201)
## Description

BlueField DPU BMCs can return HTTP 500 when fetching the BIOS resource while the DPU is in NIC mode. This currently makes endpoint exploration fail even though the rest of the Redfish tree is still usable and BIOS-dependent checks can be skipped safely.

Add an explicit `ignore_500_on_bios_fetch` exploration workaround flag and enable it for BlueField systems. When the flag is set, BIOS fetch HTTP 500 is logged and treated as missing BIOS instead of aborting exploration.

## Type of Change

- [ ] **Add** - New feature or capability
- [ ] **Change** - Changes in existing functionality
- [x] **Fix** - Bug fixes
- [ ] **Remove** - Removed features or deprecated functionality
- [ ] **Internal** - Internal changes (refactoring, tests, docs, etc.)

## Related Issues (Optional)

## Breaking Changes

- [ ] This PR contains breaking changes

## Testing

- [x] Unit tests added/updated
- [ ] Integration tests added/updated
- [ ] Manual testing performed
- [ ] No testing required (docs, internal refactor, etc.)

## Additional Notes

Signed-off-by: Dmitry Porokh <dporokh@nvidia.com>
1 parent d36dc00 commit 29d12fc

5 files changed

Lines changed: 74 additions & 14 deletions

File tree

crates/bmc-explorer/src/computer_system.rs

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ lazy_static::lazy_static! {
4949
pub struct Config<'a, B: Bmc> {
5050
pub need_oem_nvidia_bluefield: bool,
5151
// Temporary workaround for BlueField DPU BMCs that intermittently return
52+
// HTTP 500 for the BIOS resource while the DPU is in NIC mode.
53+
pub ignore_500_on_bios_fetch: bool,
54+
// Temporary workaround for BlueField DPU BMCs that intermittently return
5255
// HTTP 404 for the OOB interface or the full EthernetInterfaces collection.
5356
// This is expected to be fixed in BMC firmware 24.10-39, which adds
5457
// internal retries.
@@ -83,7 +86,7 @@ impl<B: Bmc> ExploredComputerSystem<B> {
8386
vec![]
8487
};
8588

86-
let bios = system.bios().await.map_err(Error::nv_redfish("bios"))?;
89+
let bios = Self::fetch_bios(&system, config).await?;
8790

8891
let ethernet_interfaces = Self::fetch_eth_interfaces(&system, config)
8992
.await
@@ -113,6 +116,28 @@ impl<B: Bmc> ExploredComputerSystem<B> {
113116
})
114117
}
115118

119+
/// Fetches the BIOS resource of `system`, optionally tolerating an HTTP 500.
///
/// On success the value returned by `system.bios()` is passed through
/// unchanged. On failure, when `config.ignore_500_on_bios_fetch` is set and
/// `config.explore.error_classifier` classifies the underlying BMC error as
/// `ErrorClass::InternalServerError`, a warning is logged and `Ok(None)` is
/// returned so the caller sees the BIOS as absent.
///
/// # Errors
///
/// Any other failure is wrapped with `Error::nv_redfish("bios")` and returned.
async fn fetch_bios(
120+
system: &ComputerSystem<B>,
121+
config: &Config<'_, B>,
122+
) -> Result<Option<Bios<B>>, Error<B>> {
123+
match system.bios().await {
124+
Ok(bios) => Ok(bios),
125+
// The workaround arm is only considered when the flag is enabled.
Err(err) if config.ignore_500_on_bios_fetch => {
126+
// Only errors of the `Bmc` variant are handed to the classifier;
// anything else falls through to the `else` branch below.
if let nv_redfish::Error::Bmc(bmc_error) = &err
127+
&& (config.explore.error_classifier)(bmc_error)
128+
== Some(ErrorClass::InternalServerError)
129+
{
130+
// Ignore BlueField DPU BIOS HTTP 500 because it may fail in NIC mode.
131+
tracing::warn!("ignoring HTTP 500 while fetching BlueField DPU BIOS");
132+
Ok(None)
133+
} else {
134+
// Flag is set but the error is not a classified 500: report it.
Err(Error::nv_redfish("bios")(err))
135+
}
136+
}
137+
// Flag is not set: every BIOS fetch error is reported.
Err(err) => Err(Error::nv_redfish("bios")(err)),
138+
}
139+
}
140+
116141
async fn fetch_eth_interfaces(
117142
system: &ComputerSystem<B>,
118143
config: &Config<'_, B>,
@@ -130,7 +155,7 @@ impl<B: Bmc> ExploredComputerSystem<B> {
130155
Err(err) if config.retry_404_on_eth_interfaces && retries_remaining != 0 => {
131156
if let nv_redfish::Error::Bmc(bmc_error) = &err
132157
&& (config.explore.error_classifier)(bmc_error)
133-
== Some(ErrorClass::HttpNotFound)
158+
== Some(ErrorClass::NotFound)
134159
{
135160
tracing::warn!(
136161
"received 404 on system's ethernet collection fetch. Retrying. {retries_remaining} tries left"
@@ -145,7 +170,6 @@ impl<B: Bmc> ExploredComputerSystem<B> {
145170
}
146171
}
147172
}
148-
149173
pub fn to_model(
150174
&self,
151175
hw_type: Option<hw::HwType>,

crates/bmc-explorer/src/lib.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@ use nv_redfish::{Bmc, Resource, ServiceRoot};
5454

5555
#[derive(PartialEq, Eq)]
5656
pub enum ErrorClass {
57-
HttpNotFound,
57+
NotFound,
58+
InternalServerError,
5859
}
5960

6061
pub type ErrorClassifier<'a, B> = &'a (dyn Fn(&<B as Bmc>::Error) -> Option<ErrorClass> + Sync);
@@ -122,9 +123,11 @@ pub async fn nv_generate_exploration_report<B: Bmc>(
122123
.next()
123124
.ok_or_else(Error::bmc_not_provided("at least one manager"))?;
124125

126+
let is_bluefield_system = system.id().into_inner() == "Bluefield";
125127
let system_explore_config = computer_system::Config {
126-
need_oem_nvidia_bluefield: system.id().into_inner() == "Bluefield",
127-
retry_404_on_eth_interfaces: system.id().into_inner() == "Bluefield",
128+
need_oem_nvidia_bluefield: is_bluefield_system,
129+
ignore_500_on_bios_fetch: is_bluefield_system,
130+
retry_404_on_eth_interfaces: is_bluefield_system,
128131
explore: config,
129132
};
130133
let explored_system = ExploredComputerSystem::explore(system, &system_explore_config).await?;

crates/bmc-explorer/tests/bluefield3_explore.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,3 +199,32 @@ async fn explore_bluefield3_succeeds_when_erot_returns_error() {
199199
);
200200
assert_eq!(report.chassis.len(), 3);
201201
}
202+
203+
// Verifies that exploration still produces a report when the mock BMC is
// configured to answer GET requests for the BlueField BIOS resource with
// HTTP 500.
#[test]
204+
async fn explore_bluefield3_ignores_500_on_bios_fetch() {
205+
let h = test_support::dell_poweredge_r750_bluefield3_bmc(DpuSettings::default()).await;
206+
207+
// Inject an HTTP 500 rule for the BIOS path.
// NOTE(review): `remaining: 100` presumably bounds how many matching
// requests receive the injected error — confirm against bmc_mock.
h.state.injected_bugs.update_args(bmc_mock::bug::Args {
208+
http_error: Some(bmc_mock::bug::HttpErrorRule {
209+
method: Some("GET".into()),
210+
path: "/redfish/v1/Systems/Bluefield/Bios".to_string(),
211+
status: 500,
212+
remaining: 100,
213+
}),
214+
..Default::default()
215+
});
216+
217+
// Exploration must return a report rather than an error.
let report = nv_generate_exploration_report(h.service_root, &common::explorer_config())
218+
.await
219+
.expect("exploration must succeed when BlueField BIOS fetch returns 500");
220+
221+
assert_eq!(report.endpoint_type, EndpointType::Bmc);
222+
assert_eq!(report.vendor, Some(bmc_vendor::BMCVendor::Nvidia));
223+
// The setup status must be present, and must either list at least one
// diff or be marked done.
assert!(
224+
report
225+
.machine_setup_status
226+
.as_ref()
227+
.is_some_and(|status| !status.diffs.is_empty() || status.is_done),
228+
"machine setup status must be present and structurally valid"
229+
);
230+
}

crates/bmc-explorer/tests/common.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,11 @@ use bmc_mock::test_support::axum_http_client::Error as TestBmcError;
2424

2525
pub fn error_classifier(err: &<TestBmc as nv_redfish::Bmc>::Error) -> Option<ErrorClass> {
2626
match err {
27-
TestBmcError::InvalidResponse {
28-
status: StatusCode::NOT_FOUND,
29-
..
30-
} => Some(ErrorClass::HttpNotFound),
27+
TestBmcError::InvalidResponse { status, .. } => match *status {
28+
StatusCode::NOT_FOUND => Some(ErrorClass::NotFound),
29+
StatusCode::INTERNAL_SERVER_ERROR => Some(ErrorClass::InternalServerError),
30+
_ => None,
31+
},
3132
_ => None,
3233
}
3334
}

crates/site-explorer/src/redfish.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1275,10 +1275,13 @@ fn nv_error_classifier(
12751275
) -> Option<bmc_explorer::ErrorClass> {
12761276
type BmcError = carbide_redfish::nv_redfish::BmcError;
12771277
match err {
1278-
BmcError::InvalidResponse {
1279-
status: http::StatusCode::NOT_FOUND,
1280-
..
1281-
} => Some(bmc_explorer::ErrorClass::HttpNotFound),
1278+
BmcError::InvalidResponse { status, .. } => match *status {
1279+
http::StatusCode::NOT_FOUND => Some(bmc_explorer::ErrorClass::NotFound),
1280+
http::StatusCode::INTERNAL_SERVER_ERROR => {
1281+
Some(bmc_explorer::ErrorClass::InternalServerError)
1282+
}
1283+
_ => None,
1284+
},
12821285
_ => None,
12831286
}
12841287
}

0 commit comments

Comments
 (0)