diff --git a/Cargo.lock b/Cargo.lock index b5f340d18..b38eb13a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3131,7 +3131,7 @@ dependencies = [ [[package]] name = "gateway-messages" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/management-gateway-service#177c9c719e12896c566a1b6b5416c9bc686531d3" +source = "git+https://github.com/oxidecomputer/management-gateway-service?branch=james%2Fhost-fails#7b4d6b5a8f394d26ccf90a17911ab76e3807113c" dependencies = [ "bitflags 2.9.4", "hubpack", @@ -5893,12 +5893,14 @@ dependencies = [ "drv-stm32h7-usart", "drv-stm32xx-sys-api", "enum-map", + "ereports", "heapless", "host-sp-messages", "hubpack", "idol", "idol-runtime", "ksz8463", + "microcbor", "multitimer", "num-traits", "oxide-barcode", diff --git a/Cargo.toml b/Cargo.toml index 5c5d47e35..bfb760aa5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -152,7 +152,7 @@ apob = { git = "https://github.com/oxidecomputer/apob", default-features = false # for the migration. attest-data = { git = "https://github.com/oxidecomputer/dice-util", default-features = false, version = "0.4.0", rev = "a0811d06c75c757a6e12c91ed6ea81fde137ba43" } dice-mfg-msgs = { git = "https://github.com/oxidecomputer/dice-util", default-features = false, version = "0.2.1", rev = "a0811d06c75c757a6e12c91ed6ea81fde137ba43" } -gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", default-features = false, features = ["smoltcp"] } +gateway-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", default-features = false, features = ["smoltcp"], branch = "james/host-fails" } gateway-ereport-messages = { git = "https://github.com/oxidecomputer/management-gateway-service", default-features = false } gimlet-inspector-protocol = { git = "https://github.com/oxidecomputer/gimlet-inspector-protocol", version = "0.1.0" } hif = { git = "https://github.com/oxidecomputer/hif", default-features = false } diff --git a/app/cosmo/base.toml 
b/app/cosmo/base.toml index 4fbb3beaf..ffa827fcf 100644 --- a/app/cosmo/base.toml +++ b/app/cosmo/base.toml @@ -132,7 +132,7 @@ notifications = ["i2c1-irq", "i2c2-irq", "i2c3-irq", "i2c4-irq"] [tasks.packrat] name = "task-packrat" priority = 1 -stacksize = 1400 +stacksize = 1600 start = true task-slots = ["jefe"] features = ["cosmo", "ereport"] @@ -258,7 +258,7 @@ features = ["stm32h753", "usart6", "baud_rate_3M", "hardware_flow_control", "vla uses = ["usart6", "dbgmcu"] interrupts = {"usart6.irq" = "usart-irq"} priority = 9 -max-sizes = {flash = 70000, ram = 65536} +max-sizes = {flash = 74000, ram = 65536} stacksize = 5400 start = true task-slots = ["sys", { cpu_seq = "cosmo_seq" }, "hf", "control_plane_agent", "net", "packrat", "i2c_driver", { spi_driver = "spi2_driver" }, "sprot", "auxflash"] diff --git a/app/gimlet/base.toml b/app/gimlet/base.toml index 05b740397..3e6fb364d 100644 --- a/app/gimlet/base.toml +++ b/app/gimlet/base.toml @@ -243,7 +243,7 @@ features = ["stm32h753", "uart7", "baud_rate_3M", "hardware_flow_control", "vlan uses = ["uart7", "dbgmcu"] interrupts = {"uart7.irq" = "usart-irq"} priority = 8 -max-sizes = {flash = 70000, ram = 65536} +max-sizes = {flash = 74000, ram = 65536} stacksize = 5376 start = true task-slots = ["sys", { cpu_seq = "gimlet_seq" }, "hf", "control_plane_agent", "net", "packrat", "i2c_driver", { spi_driver = "spi2_driver" }, "sprot"] diff --git a/idl/packrat.idol b/idl/packrat.idol index c8bd47691..a262c25a9 100644 --- a/idl/packrat.idol +++ b/idl/packrat.idol @@ -144,5 +144,84 @@ Interface( ), idempotent: true, ), + "write_host_bootfail": ( + doc: "Write a host's boot failure message and return the index of this failure", + args: { + "reason": "u8", + }, + leases: { + "data": (type: "[u8]", read: true), + }, + reply: Result( + ok: "HostInfoWriteOutput", + err: ServerDeath, + ), + ), + "read_first_host_bootfail_fragment": ( + doc: "Read a portion of the host's boot failure message", + args: { + }, + leases: { + "data": 
(type: "[u8]", write: true), + }, + reply: Result( + ok: "HostBootfailReadOutput", + err: CLike("HostInfoReadError"), + ), + idempotent: true, + ), + "read_host_bootfail_fragment": ( + doc: "Read a portion of the host's boot failure message", + args: { + "request": "HostInfoRequest", + }, + leases: { + "data": (type: "[u8]", write: true), + }, + reply: Result( + ok: "HostBootfailReadOutput", + err: CLike("HostInfoReadError"), + ), + idempotent: true, + ), + "write_host_panic": ( + doc: "Write a host's panic message and return the index of this panic", + args: { + }, + leases: { + "data": (type: "[u8]", read: true), + }, + reply: Result( + ok: "HostInfoWriteOutput", + err: ServerDeath, + ), + ), + "read_first_host_panic_fragment": ( + doc: "Read a portion of the host's panic message", + args: { + }, + leases: { + "data": (type: "[u8]", write: true), + }, + reply: Result( + ok: "HostPanicReadOutput", + err: CLike("HostInfoReadError"), + ), + idempotent: true, + ), + "read_host_panic_fragment": ( + doc: "Read a portion of the host's panic message", + args: { + "request": "HostInfoRequest", + }, + leases: { + "data": (type: "[u8]", write: true), + }, + reply: Result( + ok: "HostPanicReadOutput", + err: CLike("HostInfoReadError"), + ), + idempotent: true, + ), }, ) diff --git a/task/control-plane-agent/src/mgs_compute_sled.rs b/task/control-plane-agent/src/mgs_compute_sled.rs index 503402e01..1876f20bd 100644 --- a/task/control-plane-agent/src/mgs_compute_sled.rs +++ b/task/control-plane-agent/src/mgs_compute_sled.rs @@ -17,7 +17,8 @@ use gateway_messages::sp_impl::{ use gateway_messages::{ ApobComponentAction, ComponentAction, ComponentActionResponse, ComponentDetails, ComponentUpdatePrepare, DiscoverResponse, DumpSegment, - DumpTask, GpioToggleCount, Header, IgnitionCommand, IgnitionState, + DumpTask, GpioToggleCount, Header, HostBootfailPayloadData, + HostInfoRequest, HostPanicPayloadData, IgnitionCommand, IgnitionState, LastPostCode, Message, MessageKind, MgsError, 
MgsRequest, MgsResponse, PostCode, PowerState, PowerStateTransition, RotBootInfo, RotRequest, RotResponse, SERIAL_CONSOLE_IDLE_TIMEOUT, SensorRequest, SensorResponse, @@ -1259,6 +1260,101 @@ impl SpHandler for MgsHandler { })); self.host_flash_update.get_hash(slot) } + + fn get_host_panic_payload( + &mut self, + request: Option, + len: u32, + trailing_tx_buf: &mut [u8], + ) -> Result { + let max_len_usize = len as usize; + let max_len_usize = max_len_usize.min(trailing_tx_buf.len()); + let dest = &mut trailing_tx_buf[..max_len_usize]; + + let res = if let Some(req) = request { + self.common.packrat().read_host_panic_fragment( + task_packrat_api::HostInfoRequest { + offset: req.offset, + index: req.index, + }, + dest, + ) + } else { + self.common.packrat().read_first_host_panic_fragment(dest) + }; + + let info = res.map_err(|e| { + SpError::HostPanic(match e { + task_packrat_api::HostInfoReadError::NoHostInfo => { + gateway_messages::HostPanicError::NoHostInfo + } + task_packrat_api::HostInfoReadError::InvalidOffset => { + gateway_messages::HostPanicError::InvalidOffset + } + task_packrat_api::HostInfoReadError::InvalidIndex => { + gateway_messages::HostPanicError::InvalidIndex + } + task_packrat_api::HostInfoReadError::ServerRestarted => { + gateway_messages::HostPanicError::ServerRestarted + } + }) + })?; + + Ok(HostPanicPayloadData { + index: info.index, + len: info.read, + total_len: info.total_len as u32, + }) + } + + fn get_host_bootfail_payload( + &mut self, + request: Option, + len: u32, + trailing_tx_buf: &mut [u8], + ) -> Result { + let max_len_usize = len as usize; + let max_len_usize = max_len_usize.min(trailing_tx_buf.len()); + let dest = &mut trailing_tx_buf[..max_len_usize]; + + let res = if let Some(req) = request { + self.common.packrat().read_host_bootfail_fragment( + task_packrat_api::HostInfoRequest { + offset: req.offset, + index: req.index, + }, + dest, + ) + } else { + self.common + .packrat() + .read_first_host_bootfail_fragment(dest) + }; + 
+ let info = res.map_err(|e| { + SpError::HostBootfail(match e { + task_packrat_api::HostInfoReadError::NoHostInfo => { + gateway_messages::HostBootfailError::NoHostInfo + } + task_packrat_api::HostInfoReadError::InvalidOffset => { + gateway_messages::HostBootfailError::InvalidOffset + } + task_packrat_api::HostInfoReadError::InvalidIndex => { + gateway_messages::HostBootfailError::InvalidIndex + } + task_packrat_api::HostInfoReadError::ServerRestarted => { + gateway_messages::HostBootfailError::ServerRestarted + } + }) + })?; + + Ok(HostBootfailPayloadData { + index: info.index, + len: info.read, + total_len: info.total_len as u32, + reason: info.reason, + }) + } } struct UsartHandler { diff --git a/task/host-sp-comms/Cargo.toml b/task/host-sp-comms/Cargo.toml index c5cc83dbf..fd5fdb76a 100644 --- a/task/host-sp-comms/Cargo.toml +++ b/task/host-sp-comms/Cargo.toml @@ -48,6 +48,10 @@ task-sensor-api = { path = "../../task/sensor-api", optional = true, features = ksz8463 = { path = "../../drv/ksz8463", optional = true } drv-sprot-api = { path = "../../drv/sprot-api"} +# ereports deps +ereports = { path = "../../lib/ereports", features = ["ereporter-macro"] } +microcbor = { path = "../../lib/microcbor" } + [build-dependencies] build-util.path = "../../build/util" build-i2c = { path = "../../build/i2c", optional = true } diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index 61a0b28df..451ed8daf 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -28,6 +28,7 @@ use host_sp_messages::{ }; use hubpack::SerializedSize; use idol_runtime::{NotificationHandler, RequestError}; +use microcbor::Encode; use multitimer::{Multitimer, Repeat}; use ringbuf::{counted_ringbuf, ringbuf_entry}; use static_assertions::const_assert; @@ -90,9 +91,6 @@ const A2_REBOOT_DELAY: u64 = 5_000; // response to send, and we haven't yet started to receive a request). 
const UART_ZERO_DELAY: u64 = 200; -// How long of a host panic / boot fail message are we willing to keep? -const MAX_HOST_FAIL_MESSAGE_LEN: usize = 4096; - // How many MAC addresses should we report to the host? Per RFD 320, a gimlet // currently needs 5 total: // @@ -233,9 +231,6 @@ const MAX_DTRACE_CONF_LEN: usize = 4096; // data for later read back (either by the host itself or by the control plane // via MGS). struct HostKeyValueStorage { - last_boot_fail_reason: u8, - last_boot_fail: &'static mut [u8; MAX_HOST_FAIL_MESSAGE_LEN], - last_panic: &'static mut [u8; MAX_HOST_FAIL_MESSAGE_LEN], etc_system: &'static mut [u8; MAX_ETC_SYSTEM_LEN], etc_system_len: usize, dtrace_conf: &'static mut [u8; MAX_DTRACE_CONF_LEN], @@ -292,6 +287,8 @@ struct ServerImpl { host_kv_storage: HostKeyValueStorage, hf_mux_state: Option, + ereporter: Ereporter, + /// Temporary space for inventory data, which is a large `enum` scratch: &'static mut host_sp_messages::InventoryData, @@ -331,8 +328,6 @@ impl ServerImpl { struct Bufs { tx_buf: tx_buf::StaticBufs, rx_buf: Vec, - last_boot_fail: [u8; MAX_HOST_FAIL_MESSAGE_LEN], - last_panic: [u8; MAX_HOST_FAIL_MESSAGE_LEN], etc_system: [u8; MAX_ETC_SYSTEM_LEN], dtrace_conf: [u8; MAX_DTRACE_CONF_LEN], scratch: host_sp_messages::InventoryData, @@ -346,8 +341,6 @@ impl ServerImpl { let &mut Bufs { ref mut tx_buf, ref mut rx_buf, - ref mut last_boot_fail, - ref mut last_panic, ref mut etc_system, ref mut dtrace_conf, ref mut scratch, @@ -361,8 +354,6 @@ impl ServerImpl { static BUFS: ClaimOnceCell = ClaimOnceCell::new(Bufs { tx_buf: tx_buf::StaticBufs::new(), rx_buf: Vec::new(), - last_boot_fail: [0; MAX_HOST_FAIL_MESSAGE_LEN], - last_panic: [0; MAX_HOST_FAIL_MESSAGE_LEN], etc_system: [0; MAX_ETC_SYSTEM_LEN], dtrace_conf: [0; MAX_DTRACE_CONF_LEN], #[cfg(not(any( @@ -380,6 +371,9 @@ impl ServerImpl { }); BUFS.claim() }; + + let packrat = Packrat::from(PACKRAT.get_task_id()); + Self { uart, sys, @@ -399,13 +393,10 @@ impl ServerImpl { cp_agent: 
ControlPlaneAgent::from( CONTROL_PLANE_AGENT.get_task_id(), ), - packrat: Packrat::from(PACKRAT.get_task_id()), + packrat: packrat.clone(), sprot: SpRot::from(SPROT.get_task_id()), reboot_state: None, host_kv_storage: HostKeyValueStorage { - last_boot_fail_reason: 0, - last_boot_fail, - last_panic, etc_system, etc_system_len: 0, dtrace_conf, @@ -414,6 +405,7 @@ impl ServerImpl { hf_mux_state: None, last_power_off: None, scratch, + ereporter: Ereporter::claim_static_resources(packrat), } } @@ -978,20 +970,32 @@ impl ServerImpl { // Indicate that the host boot failed, so that we can then tell // sequencer why we are asking it to power off the system. self.last_power_off = Some(StateChangeReason::HostBootFailure); - // TODO forward to MGS - // - // For now, copy it into a static var we can pull out via - // `humility host boot-fail`. - let n = usize::min( - data.len(), - self.host_kv_storage.last_boot_fail.len(), - ); - self.host_kv_storage.last_boot_fail[..n] - .copy_from_slice(&data[..n]); - for b in &mut self.host_kv_storage.last_boot_fail[n..] { - *b = 0; - } - self.host_kv_storage.last_boot_fail_reason = reason; + + // Store the bootfail message in packrat so it can be accessed by MGS in the future + // TODO: What to do if this call fails? Without it, we don't have a proper index, but + // this would only happen if packrat crashes. We could store some fake info here to + // continue preparing an ereport, but THAT is going to be a problem anyway because + // we send ereports to, you guessed it: packrat, which just crashed. + let response = self + .packrat + .write_host_bootfail(reason, data) + .unwrap_lite(); + + // TODO: Do we want to give this to packrat too? + // TODO: Update `humility host boot-fail` to use packrat API! + let flashidx = match self.hf.get_dev() { + Ok(HfDevSelect::Flash0) => 0, + Ok(HfDevSelect::Flash1) => 1, + Err(_) => 0xFF, + }; + + // ereport! 
// Declares the `Ereporter` type held by `ServerImpl`, which delivers the two
// ereport payloads defined below (host panic and host boot failure).
ereports::declare_ereporter! {
    struct Ereporter {
        HostPanic(HostPanic),
        BootPanic(HostBootFail),
    }
}

/// An ereport representing a panic reported by the host.
#[derive(Encode)]
#[ereport(class = "host.panic", version = 0)]
struct HostPanic {
    /// The index packrat assigned to this panic when the message was stored
    /// (the `index` returned by `write_host_panic`).
    ///
    /// This count will wrap, but is guaranteed to never be zero.
    ///
    /// NOTE(review): the counter is kept by packrat rather than by
    /// host-sp-comms, so it presumably restarts when packrat does — confirm.
    n: u32,
    /// The length, in bytes, of the stored panic message.
    ///
    /// This quantity may be less than the amount received from the host, as
    /// it is capped by the size of the buffer packrat keeps the message in.
    msglen: u32,
    /// The flash boot index, directly correlated to which boot slot we are
    /// operating from. Currently 0 (BSU: A), 1 (BSU: B), or 0xFF (unknown).
    flashidx: u8,
}

/// An ereport representing a boot failure reported by the host.
#[derive(Encode)]
#[ereport(class = "host.btfail", version = 0)]
struct HostBootFail {
    /// The index packrat assigned to this boot failure when the message was
    /// stored (the `index` returned by `write_host_bootfail`).
    ///
    /// This count will wrap, but is guaranteed to never be zero.
    ///
    /// NOTE(review): the counter is kept by packrat rather than by
    /// host-sp-comms, so it presumably restarts when packrat does — confirm.
    n: u32,
    /// The length, in bytes, of the stored boot failure message.
    ///
    /// This quantity may be less than the amount received from the host, as
    /// it is capped by the size of the buffer packrat keeps the message in.
    msglen: u32,
    /// The reported reason code for the host boot failure
    reason: u8,
    /// The flash boot index, directly correlated to which boot slot we are
    /// operating from. Currently 0 (BSU: A), 1 (BSU: B), or 0xFF (unknown).
    flashidx: u8,
}
/// Capacity, in bytes, of each buffer used to retain a host-reported
/// message. Longer messages are truncated to this length when stored.
const HOST_INFO_PAYLOAD_LEN: usize = 4096;

/// Metadata about the most recently stored host panic message.
struct HostPanicMetadata {
    /// Length in bytes of the currently stored panic message
    total_length: usize,
    /// Counter of panic messages stored so far; also serves as the index of
    /// the currently stored message. Wraps, skipping zero.
    total_count: u32,
}

/// Metadata about the most recently stored host boot failure message.
struct HostBootFailMetadata {
    /// Length in bytes of the currently stored bootfail message
    total_length: usize,
    /// Counter of bootfail messages stored so far; also serves as the index
    /// of the currently stored message. Wraps, skipping zero.
    total_count: u32,
    /// Reason code the host supplied with the boot failure
    reason: u8,
}

/// Storage for the last panic and the last boot failure reported by the
/// host.
///
/// Each `*_state` field is `None` until a message of that kind has been
/// stored; the matching `*_payload` buffer is only meaningful for the first
/// `total_length` bytes recorded in that state.
pub struct HostInfo {
    host_panic_payload: [u8; HOST_INFO_PAYLOAD_LEN],
    host_bootfail_payload: [u8; HOST_INFO_PAYLOAD_LEN],
    host_panic_state: Option<HostPanicMetadata>,
    host_bootfail_state: Option<HostBootFailMetadata>,
}

impl HostInfo {
    /// Creates empty storage: zeroed buffers and no recorded messages.
    ///
    /// `const` so it can be used in a static initializer.
    const fn new() -> Self {
        Self {
            host_panic_payload: [0u8; HOST_INFO_PAYLOAD_LEN],
            host_bootfail_payload: [0u8; HOST_INFO_PAYLOAD_LEN],
            host_panic_state: None,
            host_bootfail_state: None,
        }
    }
}
{ struct StaticBufs { mac_address_block: Option, identity: Option, + #[cfg(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + ))] + host_info: HostInfo, #[cfg(feature = "gimlet")] gimlet_bufs: gimlet::StaticBufs, #[cfg(feature = "cosmo")] @@ -170,6 +216,12 @@ fn main() -> ! { let &mut StaticBufs { ref mut mac_address_block, ref mut identity, + #[cfg(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + ))] + ref mut host_info, #[cfg(feature = "gimlet")] ref mut gimlet_bufs, #[cfg(feature = "cosmo")] @@ -181,6 +233,12 @@ fn main() -> ! { ClaimOnceCell::new(StaticBufs { mac_address_block: None, identity: None, + #[cfg(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + ))] + host_info: HostInfo::new(), #[cfg(feature = "gimlet")] gimlet_bufs: gimlet::StaticBufs::new(), #[cfg(feature = "cosmo")] @@ -194,6 +252,12 @@ fn main() -> ! { let mut server = ServerImpl { mac_address_block, identity, + #[cfg(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + ))] + host_info, #[cfg(feature = "gimlet")] gimlet_data: gimlet::GimletData::new(gimlet_bufs), #[cfg(feature = "grapefruit")] @@ -213,6 +277,8 @@ fn main() -> ! 
{ struct ServerImpl { mac_address_block: &'static mut Option, identity: &'static mut Option, + #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] + host_info: &'static mut HostInfo, #[cfg(feature = "gimlet")] gimlet_data: gimlet::GimletData, #[cfg(feature = "grapefruit")] @@ -569,6 +635,352 @@ impl idl::InOrderPackratImpl for ServerImpl { self.identity.as_ref(), ) } + + /// We're not a system that is expected to have a host, so we shouldn't have + /// anyone writing host bootfail messages to us + #[cfg(not(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + )))] + fn write_host_bootfail( + &mut self, + _msg: &userlib::RecvMessage, + _reason: u8, + _data: Leased, + ) -> Result< + HostInfoWriteOutput, + idol_runtime::RequestError, + > { + Err(idol_runtime::ClientError::UnknownOperation.fail()) + } + + #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] + fn write_host_bootfail( + &mut self, + _msg: &userlib::RecvMessage, + reason: u8, + data: Leased, + ) -> Result< + HostInfoWriteOutput, + idol_runtime::RequestError, + > { + // First, attempt to copy-in the new data, to ensure that we don't rev the + // metadata if the lease access fails. + let to_copy = + self.host_info.host_bootfail_payload.len().min(data.len()); + data.read_range( + 0..to_copy, + &mut self.host_info.host_bootfail_payload[..to_copy], + ) + .map_err(|_| ClientError::WentAway.fail())?; + + // Okay! We've written the requested data. Let's update the metadata. + // + // Take the old count, if any, and add one to it. If that count wrapped, + // or if we didn't have an old count, set it to 1, so we never return + // a count of zero if we've ever observed a boot failure. 
+ let new_ct = self + .host_info + .host_bootfail_state + .take() + .map(|s| s.total_count.wrapping_add(1)) + .unwrap_or(0) + .max(1); + self.host_info.host_bootfail_state = Some(HostBootFailMetadata { + total_length: to_copy, + total_count: new_ct, + reason, + }); + + // Give the writer the current index and the number of bytes actually written + Ok(HostInfoWriteOutput { + index: new_ct, + written: to_copy, + }) + } + + /// We're not a system that is expected to have a host, therefore we can always return + /// "no host info", since we won't ever have any. + #[cfg(not(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + )))] + fn read_host_bootfail_fragment( + &mut self, + _msg: &userlib::RecvMessage, + _request: HostInfoRequest, + _data: Leased, + ) -> Result< + HostBootfailReadOutput, + idol_runtime::RequestError, + > { + Err(HostInfoReadError::NoHostInfo.into()) + } + + #[cfg(not(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + )))] + fn read_first_host_bootfail_fragment( + &mut self, + _msg: &userlib::RecvMessage, + data: Leased, + ) -> Result< + HostBootfailReadOutput, + idol_runtime::RequestError, + > { + Err(HostInfoReadError::NoHostInfo.into()) + } + + #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] + fn read_first_host_bootfail_fragment( + &mut self, + _msg: &userlib::RecvMessage, + data: Leased, + ) -> Result< + HostBootfailReadOutput, + idol_runtime::RequestError, + > { + self.host_bootfail_helper(None, data) + } + + /// Attempt to obtain the requested host info. 
+ #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] + fn read_host_bootfail_fragment( + &mut self, + _msg: &userlib::RecvMessage, + request: HostInfoRequest, + data: Leased, + ) -> Result< + HostBootfailReadOutput, + idol_runtime::RequestError, + > { + self.host_bootfail_helper(Some(&request), data) + } + + /// We're not a system that is expected to have a host, therefore we can always return + /// "no host info", since we won't ever have any. + #[cfg(not(any( + feature = "gimlet", + feature = "grapefruit", + feature = "cosmo" + )))] + fn write_host_panic( + &mut self, + _msg: &userlib::RecvMessage, + _data: Leased, + ) -> Result< + HostInfoWriteOutput, + idol_runtime::RequestError, + > { + Err(idol_runtime::ClientError::UnknownOperation.fail()) + } + + #[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))] + fn write_host_panic( + &mut self, + _msg: &userlib::RecvMessage, + data: Leased, + ) -> Result< + HostInfoWriteOutput, + idol_runtime::RequestError, + > { + // First, attempt to copy-in the new data, to ensure that we don't rev the + // metadata if the lease access fails. + let to_copy = self.host_info.host_panic_payload.len().min(data.len()); + data.read_range( + 0..to_copy, + &mut self.host_info.host_panic_payload[..to_copy], + ) + .map_err(|_| ClientError::WentAway.fail())?; + + // Okay! We've written the requested data. Let's update the metadata. + // + // Take the old count, if any, and add one to it. If that count wrapped, + // or if we didn't have an old count, set it to 1, so we never return + // a count of zero if we've ever observed a panic. 
/// Read-side helpers shared by the `read_*_host_panic_fragment` and
/// `read_*_host_bootfail_fragment` operations.
///
/// Gated on the same features as the `host_info` field these helpers read,
/// so builds for boards without a host do not reference a missing field.
#[cfg(any(feature = "gimlet", feature = "grapefruit", feature = "cosmo"))]
impl ServerImpl {
    /// Validates `req` against a stored message and selects the bytes that
    /// may be returned for it.
    ///
    /// `stored_len` is clamped to `payload.len()`. With `req == None` the
    /// selection starts at offset 0; otherwise `req.index` must equal
    /// `stored_index` (else `InvalidIndex`) and `req.offset` must lie inside
    /// the stored message (else `InvalidOffset`).
    ///
    /// Returns `(offset, total_len, remaining)`, where `remaining` runs from
    /// `offset` to the end of the *stored message*, not to the end of the
    /// backing buffer, so bytes past the stored length are never exposed.
    fn select_fragment<'a>(
        payload: &'a [u8],
        stored_len: usize,
        stored_index: u32,
        req: Option<&HostInfoRequest>,
    ) -> Result<(usize, usize, &'a [u8]), HostInfoReadError> {
        let total_len = stored_len.min(payload.len());
        let offset = match req {
            None => 0,
            Some(req) => {
                // Do we have the specific message being requested?
                if stored_index != req.index {
                    return Err(HostInfoReadError::InvalidIndex);
                }
                // Is the requested offset inside the stored message?
                let offset = req.offset as usize;
                if offset >= total_len {
                    return Err(HostInfoReadError::InvalidOffset);
                }
                offset
            }
        };
        // `offset <= total_len <= payload.len()` holds on both paths above,
        // so this slice cannot panic.
        Ok((offset, total_len, &payload[offset..total_len]))
    }

    /// Copies a fragment of the stored host panic message into `data`.
    ///
    /// Copies as many bytes as fit in the lease, starting at the requested
    /// offset (or 0 when `req` is `None`), and reports how many were copied
    /// along with the message's index and total length.
    ///
    /// # Errors
    ///
    /// `NoHostInfo` if no panic has been stored, `InvalidIndex` /
    /// `InvalidOffset` if `req` does not match the stored message, and
    /// `ClientError::WentAway` if the lease cannot be written.
    fn host_panic_helper(
        &self,
        req: Option<&HostInfoRequest>,
        data: Leased<idol_runtime::W, [u8]>,
    ) -> Result<
        HostPanicReadOutput,
        idol_runtime::RequestError<HostInfoReadError>,
    > {
        // Do we *have* a panic to report?
        let Some(state) = self.host_info.host_panic_state.as_ref() else {
            return Err(HostInfoReadError::NoHostInfo.into());
        };

        let (offset, total_len, remaining) = Self::select_fragment(
            &self.host_info.host_panic_payload,
            state.total_length,
            state.total_count,
            req,
        )?;

        // Copy as much of the remaining message as the lease can take.
        let to_copy = data.len().min(remaining.len());
        data.write_range(0..to_copy, &remaining[..to_copy])
            .map_err(|_| ClientError::WentAway.fail())?;

        Ok(HostPanicReadOutput {
            read: to_copy,
            offset,
            total_len,
            index: state.total_count,
        })
    }

    /// Copies a fragment of the stored host boot failure message into
    /// `data`.
    ///
    /// Copies as many bytes as fit in the lease, starting at the requested
    /// offset (or 0 when `request` is `None`), and reports how many were
    /// copied along with the message's index, total length and reason code.
    ///
    /// # Errors
    ///
    /// `NoHostInfo` if no boot failure has been stored, `InvalidIndex` /
    /// `InvalidOffset` if `request` does not match the stored message, and
    /// `ClientError::WentAway` if the lease cannot be written.
    fn host_bootfail_helper(
        &self,
        request: Option<&HostInfoRequest>,
        data: Leased<idol_runtime::W, [u8]>,
    ) -> Result<
        HostBootfailReadOutput,
        idol_runtime::RequestError<HostInfoReadError>,
    > {
        // Do we *have* a bootfail to report?
        let Some(state) = self.host_info.host_bootfail_state.as_ref() else {
            return Err(HostInfoReadError::NoHostInfo.into());
        };

        let (offset, total_len, remaining) = Self::select_fragment(
            &self.host_info.host_bootfail_payload,
            state.total_length,
            state.total_count,
            request,
        )?;

        // Copy as much of the remaining message as the lease can take.
        let to_copy = data.len().min(remaining.len());
        data.write_range(0..to_copy, &remaining[..to_copy])
            .map_err(|_| ClientError::WentAway.fail())?;

        Ok(HostBootfailReadOutput {
            read: to_copy,
            offset,
            total_len,
            index: state.total_count,
            reason: state.reason,
            _pad: [0u8; 3],
        })
    }
}