From 1145c393818d70f9c73670540a7e90c556ef41ae Mon Sep 17 00:00:00 2001 From: Aaron Hartwig Date: Mon, 25 Mar 2024 11:15:41 -0500 Subject: [PATCH] Decouple `transceivers` server from Front IO readiness (#1588) Currently the `transceivers` server will loop and await the Front IO board to be ready before entering is service loop. This means that in the case where the Front IO board is not present (or ready), `transceivers` server cannot handle any incoming requests, making the host software that is sending those requests unhappy. This change addresses that. Additionally, this commit pulls in the richer HwError variants from the transceiver-messages crate. By interrogating the FPGAs about what precisely went wrong, the SP can now pass along much more detailed fault information to the host OS via these variants. Fixes #1562, closes #1572 --- Cargo.lock | 2 +- Cargo.toml | 2 +- drv/sidecar-front-io/src/controller.rs | 3 +- drv/sidecar-front-io/src/transceivers.rs | 161 +++++++++-- drv/transceivers-api/src/lib.rs | 1 + drv/transceivers-server/src/main.rs | 84 +++--- drv/transceivers-server/src/udp.rs | 331 +++++++++++++++++------ 7 files changed, 445 insertions(+), 139 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c2ce36765..8b1d50ef6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5329,7 +5329,7 @@ dependencies = [ [[package]] name = "transceiver-messages" version = "0.1.1" -source = "git+https://github.com/oxidecomputer/transceiver-control/#84e28d1263d9d07c5410fb0644469c8eb7b5fb5f" +source = "git+https://github.com/oxidecomputer/transceiver-control/#0a18f231372069b74b72f1756e2337186e888dc0" dependencies = [ "bitflags 2.4.2", "hubpack", diff --git a/Cargo.toml b/Cargo.toml index 980abda7b..187428781 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -135,5 +135,5 @@ sprockets-common = { git = "https://github.com/oxidecomputer/sprockets.git", def sprockets-rot = { git = "https://github.com/oxidecomputer/sprockets.git", default-features = false } tlvc = { git = "https://github.com/oxidecomputer/tlvc", default-features = false, version = "0.3.1" } tlvc-text = { git = "https://github.com/oxidecomputer/tlvc", default-features = false, version = "0.3.0" } -transceiver-messages = { git = "https://github.com/oxidecomputer/transceiver-control/", default-features = false } +transceiver-messages = { git = "https://github.com/oxidecomputer/transceiver-control/", default-features = false} vsc7448-pac = { git = "https://github.com/oxidecomputer/vsc7448", default-features = false } diff --git a/drv/sidecar-front-io/src/controller.rs b/drv/sidecar-front-io/src/controller.rs index 0a8284fcb..ef7a20556 100644 --- a/drv/sidecar-front-io/src/controller.rs +++ b/drv/sidecar-front-io/src/controller.rs @@ -4,6 +4,7 @@ use crate::{Addr, SIDECAR_IO_BITSTREAM_CHECKSUM}; use drv_fpga_api::*; +use userlib::UnwrapLite; pub struct FrontIOController { fpga: Fpga, @@ -90,6 +91,6 @@ impl FrontIOController { /// Returns the expected (short) checksum, which simply a prefix of the full /// SHA3-256 hash of the bitstream. pub fn short_checksum() -> [u8; 4] { - SIDECAR_IO_BITSTREAM_CHECKSUM[..4].try_into().unwrap() + SIDECAR_IO_BITSTREAM_CHECKSUM[..4].try_into().unwrap_lite() } } diff --git a/drv/sidecar-front-io/src/transceivers.rs b/drv/sidecar-front-io/src/transceivers.rs index 48044d154..692545ac7 100644 --- a/drv/sidecar-front-io/src/transceivers.rs +++ b/drv/sidecar-front-io/src/transceivers.rs @@ -6,6 +6,7 @@ use crate::{Addr, Reg}; use drv_fpga_api::{FpgaError, FpgaUserDesign, WriteOp}; use drv_transceivers_api::{ModuleStatus, TransceiversError, NUM_PORTS}; use transceiver_messages::ModuleId; +use userlib::{FromPrimitive, UnwrapLite}; use zerocopy::{byteorder, AsBytes, FromBytes, Unaligned, U16}; // The transceiver modules are split across two FPGAs on the QSFP Front IO @@ -24,7 +25,7 @@ pub enum FpgaController { } /// Physical port location -#[derive(Copy, Clone)] +#[derive(Copy, Clone, PartialEq)] pub struct PortLocation { pub controller: FpgaController, pub port: PhysicalPort, @@ -37,7 +38,7 @@ impl From for PortLocation { } /// Physical port location within a particular FPGA, as a 0-15 index -#[derive(Copy, Clone)] +#[derive(Copy, Clone, PartialEq)] pub struct PhysicalPort(pub u8); impl PhysicalPort { pub fn as_mask(&self) -> PhysicalPortMask { @@ -47,6 +48,20 @@ impl PhysicalPort { pub fn get(&self) -> u8 { self.0 } + + pub fn to_logical_port( + &self, + fpga: FpgaController, + ) -> Result { + let loc = PortLocation { + controller: fpga, + port: *self, + }; + match PORT_MAP.into_iter().position(|&l| l == loc) { + Some(p) => Ok(LogicalPort(p as u8)), + None => Err(TransceiversError::InvalidPhysicalToLogicalMap), + } + } } /// Physical port mask within a particular FPGA, as a 16-bit bitfield @@ -479,11 +494,26 @@ pub struct ModuleResult { success: LogicalPortMask, failure: LogicalPortMask, error: LogicalPortMask, + failure_types: LogicalPortFailureTypes, +} + +/// This is a slimmed down version of ModuleResult for use in the ringbuf +#[derive(Copy, Clone, PartialEq)] +pub struct ModuleResultSlim { + success: LogicalPortMask, + failure: LogicalPortMask, + error: LogicalPortMask, } impl From for ModuleResult { fn from(r: ModuleResultNoFailure) -> Self { - ModuleResult::new(r.success(), LogicalPortMask(0), r.error()).unwrap() + ModuleResult::new( + r.success(), + LogicalPortMask(0), + r.error(), + LogicalPortFailureTypes::default(), + ) + .unwrap_lite() } } @@ -493,6 +523,7 @@ impl ModuleResult { success: LogicalPortMask, failure: LogicalPortMask, error: LogicalPortMask, + failure_types: LogicalPortFailureTypes, ) -> Result { if !(success & failure).is_empty() || !(success & error).is_empty() @@ -504,6 +535,7 @@ impl ModuleResult { success, failure, error, + failure_types, }) } @@ -519,6 +551,10 @@ impl ModuleResult { self.error } + pub fn failure_types(&self) -> LogicalPortFailureTypes { + self.failure_types + } + /// Combines two `ModuleResults` /// /// Intended to be used for a sequence of `ModuleResult` yielding function @@ -555,7 +591,29 @@ impl ModuleResult { // has subsequently occurred. let failure = (self.failure() | next.failure()) & !self.error(); - Self::new(success, failure, error).unwrap() + // merge the failure types observed, prefering newer failure types + // should both results have a failure at the same port. + let mut combined_failures = LogicalPortFailureTypes::default(); + for p in failure.to_indices() { + if next.failure().is_set(p) { + combined_failures.0[p.0 as usize] = + next.failure_types().0[p.0 as usize]; + } else if self.failure().is_set(p) { + combined_failures.0[p.0 as usize] = + self.failure_types().0[p.0 as usize]; + } + } + + Self::new(success, failure, error, combined_failures).unwrap_lite() + } + + /// Helper to provide a nice way to get a ModuleResultSlim from this result + pub fn to_slim(&self) -> ModuleResultSlim { + ModuleResultSlim { + success: self.success(), + failure: self.failure(), + error: self.error(), + } } } @@ -592,6 +650,49 @@ impl ModuleResultNoFailure { } } +/// A type to provide more ergonomic access to the FPGA generated type +pub type FpgaI2CFailure = Reg::QSFP::PORT0_STATUS::Encoded; + +/// A type to consolidate per-module failure types. +/// +/// Currently the only types of operations that can be considered failures are +/// those that involve the FPGA doing I2C. Thus, that is the only supported type +/// right now. +#[derive(Copy, Clone, PartialEq)] +pub struct LogicalPortFailureTypes(pub [FpgaI2CFailure; NUM_PORTS as usize]); + +impl Default for LogicalPortFailureTypes { + fn default() -> Self { + Self([FpgaI2CFailure::NoError; NUM_PORTS as usize]) + } +} + +impl core::ops::Index for LogicalPortFailureTypes { + type Output = FpgaI2CFailure; + + fn index(&self, i: LogicalPort) -> &Self::Output { + &self.0[i.0 as usize] + } +} + +#[derive(Copy, Clone)] +pub struct PortI2CStatus { + pub done: bool, + pub error: FpgaI2CFailure, +} + +impl PortI2CStatus { + pub fn new(status: u8) -> Self { + Self { + done: (status & Reg::QSFP::PORT0_STATUS::BUSY) == 0, + error: FpgaI2CFailure::from_u8( + status & Reg::QSFP::PORT0_STATUS::ERROR, + ) + .unwrap_lite(), + } + } +} + impl Transceivers { pub fn new(fpga_task: userlib::TaskId) -> Self { Self { @@ -641,7 +742,7 @@ impl Transceivers { // only have an error where there was a requested module in mask error &= mask; - ModuleResultNoFailure::new(success, error).unwrap() + ModuleResultNoFailure::new(success, error).unwrap_lite() } /// Set power enable bits per the specified `mask`. @@ -769,8 +870,8 @@ impl Transceivers { let error = !success; ( - ModuleStatus::read_from(status_masks.as_bytes()).unwrap(), - ModuleResultNoFailure::new(success, error).unwrap(), + ModuleStatus::read_from(status_masks.as_bytes()).unwrap_lite(), + ModuleResultNoFailure::new(success, error).unwrap_lite(), ) } @@ -816,7 +917,7 @@ impl Transceivers { // only have an error where there was a requested module in mask error &= mask; - ModuleResultNoFailure::new(success, error).unwrap() + ModuleResultNoFailure::new(success, error).unwrap_lite() } /// Initiate an I2C random read on all ports per the specified `mask`. @@ -912,7 +1013,7 @@ impl Transceivers { success &= mask; let error = mask & !success; - ModuleResultNoFailure::new(success, error).unwrap() + ModuleResultNoFailure::new(success, error).unwrap_lite() } /// Read the value of the QSFP_PORTx_STATUS @@ -987,7 +1088,7 @@ impl Transceivers { } let error = !success; - ModuleResultNoFailure::new(success, error).unwrap() + ModuleResultNoFailure::new(success, error).unwrap_lite() } /// For a given `local_port`, return the Addr where its read buffer begins @@ -1089,30 +1190,31 @@ impl Transceivers { let mut physical_failure = FpgaPortMasks::default(); let mut physical_error = FpgaPortMasks::default(); let phys_mask: FpgaPortMasks = mask.into(); + let mut failure_types = LogicalPortFailureTypes::default(); #[derive(AsBytes, Default, FromBytes)] #[repr(C)] - struct StatusAndErr { + struct BusyAndPortStatus { busy: u16, - err: [u8; 8], + status: [u8; 8], } for fpga_index in phys_mask.iter_fpgas() { let fpga = self.fpga(fpga_index); // This loop should break immediately, because I2C is fast - let status = loop { - // Two bytes of BUSY, followed by 8 bytes of error status - let status = match fpga.read(Addr::QSFP_I2C_BUSY0) { + let status_all = loop { + // Two bytes of BUSY, followed by 816bytes of port status + let status_all = match fpga.read(Addr::QSFP_I2C_BUSY0) { Ok(data) => data, // If there is an FPGA communication error, mark that as an // error on all of that FPGA's ports Err(_) => { *physical_error.get_mut(fpga_index) = PhysicalPortMask(0xffff); - StatusAndErr::default() + BusyAndPortStatus::default() } }; - if status.busy == 0 { - break status; + if status_all.busy == 0 { + break status_all; } userlib::hl::sleep_for(1); }; @@ -1123,25 +1225,30 @@ impl Transceivers { FpgaController::Right => phys_mask.right, }; for port in (0..16).map(PhysicalPort) { + // skip ports we didn't interact with if !phys_mask.is_set(port) { continue; } - // Each error byte packs together two ports - let err = status.err[port.0 as usize / 2] - >> ((port.0 as usize % 2) * 4); - - // For now, check for the presence of an error, but don't bother - // reporting the details. - let has_err = (err & 0b1000) != 0; - if has_err { + + // cast status byte into a type + let failure = FpgaI2CFailure::from_u8( + status_all.status[port.0 as usize] + & Reg::QSFP::PORT0_STATUS::ERROR, + ) + .unwrap_lite(); + // if a failure occurred, mark it and record the failure type + if failure != FpgaI2CFailure::NoError { physical_failure.get_mut(fpga_index).set(port); + let logical_port = + port.to_logical_port(fpga_index).unwrap_lite(); + failure_types.0[logical_port.0 as usize] = failure; } } } let error = mask & LogicalPortMask::from(physical_error); let failure = mask & LogicalPortMask::from(physical_failure); let success = mask & !(error | failure); - ModuleResult::new(success, failure, error).unwrap() + ModuleResult::new(success, failure, error, failure_types).unwrap_lite() } } diff --git a/drv/transceivers-api/src/lib.rs b/drv/transceivers-api/src/lib.rs index 8f476d2b9..0fe6188af 100644 --- a/drv/transceivers-api/src/lib.rs +++ b/drv/transceivers-api/src/lib.rs @@ -22,6 +22,7 @@ pub enum TransceiversError { InvalidPowerState, InvalidModuleResult, LedI2cError, + InvalidPhysicalToLogicalMap, #[idol(server_death)] ServerRestarted, diff --git a/drv/transceivers-server/src/main.rs b/drv/transceivers-server/src/main.rs index a3b8fae54..051011f16 100644 --- a/drv/transceivers-server/src/main.rs +++ b/drv/transceivers-server/src/main.rs @@ -8,8 +8,7 @@ use drv_fpga_api::FpgaError; use drv_i2c_devices::pca9956b::Error; use drv_sidecar_front_io::{ - leds::FullErrorSummary, - leds::Leds, + leds::{FullErrorSummary, Leds}, transceivers::{LogicalPort, LogicalPortMask, Transceivers}, Reg, }; @@ -26,7 +25,7 @@ use task_thermal_api::{Thermal, ThermalError, ThermalProperties}; use transceiver_messages::{ message::LedState, mgmt::ManagementInterface, MAX_PACKET_SIZE, }; -use userlib::{units::Celsius, *}; +use userlib::{sys_get_timer, task_slot, units::Celsius, FromPrimitive}; use zerocopy::{AsBytes, FromBytes}; mod udp; // UDP API is implemented in a separate file @@ -91,12 +90,22 @@ const MAX_CONSECUTIVE_NACKS: u8 = 3; #[derive(Copy, Clone)] struct LedStates([LedState; NUM_PORTS as usize]); +#[derive(Copy, Clone, PartialEq)] +enum FrontIOStatus { + NotReady, + NotPresent, + Ready, +} + struct ServerImpl { transceivers: Transceivers, leds: Leds, net: task_net_api::Net, modules_present: LogicalPortMask, + /// The Front IO board is not guaranteed to be present and ready + front_io_board_present: FrontIOStatus, + /// State around LED management led_error: FullErrorSummary, leds_initialized: bool, @@ -599,26 +608,6 @@ fn main() -> ! { // before we start doing things with them. A more sophisticated // notification system will be put in place. let seq = Sequencer::from(SEQ.get_task_id()); - loop { - let ready = seq.front_io_board_ready(); - - match ready { - Ok(true) => { - ringbuf_entry!(Trace::FrontIOBoardReady(true)); - break; - } - Err(SeqError::NoFrontIOBoard) => { - ringbuf_entry!(Trace::FrontIOSeqErr( - SeqError::NoFrontIOBoard - )); - break; - } - _ => { - ringbuf_entry!(Trace::FrontIOBoardReady(false)); - userlib::hl::sleep_for(100) - } - } - } let transceivers = Transceivers::new(FRONT_IO.get_task_id()); let leds = Leds::new( @@ -635,6 +624,7 @@ fn main() -> ! { leds, net, modules_present: LogicalPortMask(0), + front_io_board_present: FrontIOStatus::NotReady, led_error: Default::default(), leds_initialized: false, led_states: LedStates([LedState::Off; NUM_PORTS as usize]), @@ -647,13 +637,6 @@ fn main() -> ! { thermal_models: [None; NUM_PORTS as usize], }; - ringbuf_entry!(Trace::LEDInit); - - match server.transceivers.enable_led_controllers() { - Ok(_) => server.led_init(), - Err(e) => ringbuf_entry!(Trace::LEDEnableError(e)), - }; - // There are two timers, one for each communication bus: #[derive(Copy, Clone, Enum)] #[allow(clippy::upper_case_acronyms)] @@ -684,20 +667,59 @@ fn main() -> ! { let mut buffer = [0; idl::INCOMING_SIZE]; loop { + if server.front_io_board_present == FrontIOStatus::NotReady { + server.front_io_board_present = match seq.front_io_board_ready() + { + Ok(true) => { + ringbuf_entry!(Trace::FrontIOBoardReady(true)); + FrontIOStatus::Ready + } + Err(SeqError::NoFrontIOBoard) => { + ringbuf_entry!(Trace::FrontIOSeqErr( + SeqError::NoFrontIOBoard + )); + FrontIOStatus::NotPresent + } + _ => { + ringbuf_entry!(Trace::FrontIOBoardReady(false)); + FrontIOStatus::NotReady + } + }; + + // If a board is present, attempt to initialize its + // LED drivers + if server.front_io_board_present == FrontIOStatus::Ready { + ringbuf_entry!(Trace::LEDInit); + match server.transceivers.enable_led_controllers() { + Ok(_) => server.led_init(), + Err(e) => { + ringbuf_entry!(Trace::LEDEnableError(e)) + } + }; + } + } + multitimer.poll_now(); for t in multitimer.iter_fired() { match t { Timers::I2C => { + // Handle the Front IO status checking as part of this + // loop because the frequency is what we had before and + // the server itself has no knowledge of the sequencer. server.handle_i2c_loop(); } Timers::SPI => { - server.handle_spi_loop(); + if server.front_io_board_present == FrontIOStatus::Ready + { + server.handle_spi_loop(); + } } Timers::Blink => { server.blink_on = !server.blink_on; } } } + server.check_net( tx_data_buf.as_mut_slice(), rx_data_buf.as_mut_slice(), diff --git a/drv/transceivers-server/src/udp.rs b/drv/transceivers-server/src/udp.rs index 9ffa88b16..0348283eb 100644 --- a/drv/transceivers-server/src/udp.rs +++ b/drv/transceivers-server/src/udp.rs @@ -10,12 +10,10 @@ //! //! All of the API types in `transceiver_messages` operate on **physical** //! ports, i.e. an FPGA paired by a physical port index (or mask). -use crate::ServerImpl; -use drv_sidecar_front_io::{ - transceivers::{ - LogicalPort, LogicalPortMask, ModuleResult, ModuleResultNoFailure, - }, - Reg, +use crate::{FrontIOStatus, ServerImpl}; +use drv_sidecar_front_io::transceivers::{ + FpgaI2CFailure, LogicalPort, LogicalPortFailureTypes, LogicalPortMask, + ModuleResult, ModuleResultNoFailure, ModuleResultSlim, PortI2CStatus, }; use hubpack::SerializedSize; use ringbuf::*; @@ -26,6 +24,7 @@ use transceiver_messages::{ mgmt::{ManagementInterface, MemoryRead, MemoryWrite, Page}, ModuleId, }; +use userlib::UnwrapLite; //////////////////////////////////////////////////////////////////////////////// @@ -53,12 +52,15 @@ enum Trace { MacAddrs, GotError(ProtocolError), ResponseSize(ResponseSize), - OperationResult(ModuleResult), + OperationResult(ModuleResultSlim), OperationNoFailResult(ModuleResultNoFailure), ClearPowerFault(ModuleId), LedState(ModuleId), SetLedState(ModuleId, LedState), ClearDisableLatch(ModuleId), + PageSelectI2CFailures(LogicalPort, FpgaI2CFailure), + ReadI2CFailures(LogicalPort, FpgaI2CFailure), + WriteI2CFailures(LogicalPort, FpgaI2CFailure), } ringbuf!(Trace, 16, Trace::None); @@ -304,15 +306,23 @@ impl ServerImpl { HostRequest::Status(modules) => { ringbuf_entry!(Trace::Status(modules)); let mask = LogicalPortMask::from(modules); - let (num_status_bytes, result) = self.get_status(mask, out); + let (data_len, result) = if self.front_io_board_present + == FrontIOStatus::Ready + { + self.get_status(mask, out) + } else { + ( + 0, + ModuleResultNoFailure::new(LogicalPortMask(0), mask) + .unwrap_lite(), + ) + }; + ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); - let (err_len, errored_modules) = self.handle_errors( - modules, - result, - &mut out[num_status_bytes..], - ); - let final_payload_len = num_status_bytes + err_len; + let (err_len, errored_modules) = + self.handle_errors(modules, result, &mut out[data_len..]); + let final_payload_len = data_len + err_len; ( MessageBody::SpResponse(SpResponse::Status { @@ -325,16 +335,23 @@ impl ServerImpl { HostRequest::ExtendedStatus(modules) => { ringbuf_entry!(Trace::ExtendedStatus(modules)); let mask = LogicalPortMask::from(modules); - let (num_status_bytes, result) = - self.get_extended_status(mask, out); + let (data_len, result) = if self.front_io_board_present + == FrontIOStatus::Ready + { + self.get_extended_status(mask, out) + } else { + ( + 0, + ModuleResultNoFailure::new(LogicalPortMask(0), mask) + .unwrap_lite(), + ) + }; + ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); - let (err_len, errored_modules) = self.handle_errors( - modules, - result, - &mut out[num_status_bytes..], - ); - let final_payload_len = num_status_bytes + err_len; + let (err_len, errored_modules) = + self.handle_errors(modules, result, &mut out[data_len..]); + let final_payload_len = data_len + err_len; ( MessageBody::SpResponse(SpResponse::ExtendedStatus { @@ -348,7 +365,10 @@ impl ServerImpl { ringbuf_entry!(Trace::Read(modules, read)); // The host is not setting the the upper 32 bits at this time, // but should that happen we need to know how many HwErrors we - // will serialize due to invalid modules being specified. + // will serialize due to invalid modules being specified. We + // only precalculate error length requirements in Read as that + // is the only message whose response can approach + // MAX_PAYLOAD_SIZE. let num_invalid = ModuleId(modules.0 & 0xffffffff00000000) .selected_transceiver_count(); let mask = LogicalPortMask::from(modules); @@ -365,18 +385,33 @@ impl ServerImpl { ); } - let result = self.read(read, mask & !self.disabled, out); - ringbuf_entry!(Trace::OperationResult(result)); + let (data_len, result) = + if self.front_io_board_present == FrontIOStatus::Ready { + let r = self.read(read, mask & !self.disabled, out); + let len = r.success().count() * read.len() as usize; + (len, r) + } else { + ( + 0, + ModuleResult::new( + LogicalPortMask(0), + LogicalPortMask(0), + mask, + LogicalPortFailureTypes::default(), + ) + .unwrap_lite(), + ) + }; + + ringbuf_entry!(Trace::OperationResult(result.to_slim())); let success = ModuleId::from(result.success()); - let read_bytes = result.success().count() * read.len() as usize; let (err_len, failed_modules) = self .handle_errors_and_failures_and_disabled( modules, result, - HwError::I2cError, - &mut out[read_bytes..], + &mut out[data_len..], ); - let final_payload_len = read_bytes + err_len; + let final_payload_len = data_len + err_len; ( MessageBody::SpResponse(SpResponse::Read { @@ -398,16 +433,26 @@ impl ServerImpl { 0, ); } + let mask = LogicalPortMask::from(modules); - let result = self.write(write, mask & !self.disabled, data); - ringbuf_entry!(Trace::OperationResult(result)); + let result = + if self.front_io_board_present == FrontIOStatus::Ready { + self.write(write, mask & !self.disabled, data) + } else { + ModuleResult::new( + LogicalPortMask(0), + LogicalPortMask(0), + mask, + LogicalPortFailureTypes::default(), + ) + .unwrap_lite() + }; + + ringbuf_entry!(Trace::OperationResult(result.to_slim())); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = self .handle_errors_and_failures_and_disabled( - modules, - result, - HwError::I2cError, - out, + modules, result, out, ); ( @@ -422,7 +467,15 @@ impl ServerImpl { HostRequest::AssertReset(modules) => { ringbuf_entry!(Trace::AssertReset(modules)); let mask = LogicalPortMask::from(modules) & !self.disabled; - let result = self.transceivers.assert_reset(mask); + + let result = + if self.front_io_board_present == FrontIOStatus::Ready { + self.transceivers.assert_reset(mask) + } else { + ModuleResultNoFailure::new(LogicalPortMask(0), mask) + .unwrap_lite() + }; + ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = @@ -439,7 +492,15 @@ impl ServerImpl { HostRequest::DeassertReset(modules) => { ringbuf_entry!(Trace::DeassertReset(modules)); let mask = LogicalPortMask::from(modules) & !self.disabled; - let result = self.transceivers.deassert_reset(mask); + + let result = + if self.front_io_board_present == FrontIOStatus::Ready { + self.transceivers.deassert_reset(mask) + } else { + ModuleResultNoFailure::new(LogicalPortMask(0), mask) + .unwrap_lite() + }; + ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = @@ -456,7 +517,15 @@ impl ServerImpl { HostRequest::AssertLpMode(modules) => { ringbuf_entry!(Trace::AssertLpMode(modules)); let mask = LogicalPortMask::from(modules) & !self.disabled; - let result = self.transceivers.assert_lpmode(mask); + + let result = + if self.front_io_board_present == FrontIOStatus::Ready { + self.transceivers.assert_lpmode(mask) + } else { + ModuleResultNoFailure::new(LogicalPortMask(0), mask) + .unwrap_lite() + }; + ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = @@ -473,7 +542,15 @@ impl ServerImpl { HostRequest::DeassertLpMode(modules) => { ringbuf_entry!(Trace::DeassertLpMode(modules)); let mask = LogicalPortMask::from(modules) & !self.disabled; - let result = self.transceivers.deassert_lpmode(mask); + + let result = + if self.front_io_board_present == FrontIOStatus::Ready { + self.transceivers.deassert_lpmode(mask) + } else { + ModuleResultNoFailure::new(LogicalPortMask(0), mask) + .unwrap_lite() + }; + ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = @@ -490,7 +567,15 @@ impl ServerImpl { HostRequest::EnablePower(modules) => { ringbuf_entry!(Trace::EnablePower(modules)); let mask = LogicalPortMask::from(modules) & !self.disabled; - let result = self.transceivers.enable_power(mask); + + let result = + if self.front_io_board_present == FrontIOStatus::Ready { + self.transceivers.enable_power(mask) + } else { + ModuleResultNoFailure::new(LogicalPortMask(0), mask) + .unwrap_lite() + }; + ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = @@ -507,7 +592,15 @@ impl ServerImpl { HostRequest::DisablePower(modules) => { ringbuf_entry!(Trace::DisablePower(modules)); let mask = LogicalPortMask::from(modules) & !self.disabled; - let result = self.transceivers.disable_power(mask); + + let result = + if self.front_io_board_present == FrontIOStatus::Ready { + self.transceivers.disable_power(mask) + } else { + ModuleResultNoFailure::new(LogicalPortMask(0), mask) + .unwrap_lite() + }; + ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = @@ -550,7 +643,15 @@ impl ServerImpl { HostRequest::ClearPowerFault(modules) => { ringbuf_entry!(Trace::ClearPowerFault(modules)); let mask = LogicalPortMask::from(modules) & !self.disabled; - let result = self.transceivers.clear_power_fault(mask); + + let result = + if self.front_io_board_present == FrontIOStatus::Ready { + self.transceivers.clear_power_fault(mask) + } else { + ModuleResultNoFailure::new(LogicalPortMask(0), mask) + .unwrap_lite() + }; + ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = @@ -567,32 +668,46 @@ impl ServerImpl { HostRequest::LedState(modules) => { ringbuf_entry!(Trace::LedState(modules)); let mask = LogicalPortMask::from(modules); - let (num_led_bytes, result) = - self.get_led_state_response(mask, out); + let (data_len, result) = if self.front_io_board_present + == FrontIOStatus::Ready + { + self.get_led_state_response(mask, out) + } else { + ( + 0, + ModuleResultNoFailure::new(LogicalPortMask(0), mask) + .unwrap_lite(), + ) + }; + // This operation just queries internal SP state, so it is // always successful. However, invalid modules may been // specified by the host so we need to check that anyway. let success = ModuleId::from(result.success()); - let (num_err_bytes, failed_modules) = self.handle_errors( - modules, - result, - &mut out[num_led_bytes..], - ); + let (num_err_bytes, failed_modules) = + self.handle_errors(modules, result, &mut out[data_len..]); ( MessageBody::SpResponse(SpResponse::LedState { modules: success, failed_modules, }), - num_led_bytes + num_err_bytes, + data_len + num_err_bytes, ) } HostRequest::SetLedState { modules, state } => { ringbuf_entry!(Trace::SetLedState(modules, state)); let mask = LogicalPortMask::from(modules); - self.set_led_state(mask, state); + let result = - ModuleResultNoFailure::new(mask, LogicalPortMask(0)) - .unwrap(); + if self.front_io_board_present == FrontIOStatus::Ready { + self.set_led_state(mask, state); + ModuleResultNoFailure::new(mask, LogicalPortMask(0)) + .unwrap_lite() + } else { + ModuleResultNoFailure::new(LogicalPortMask(0), mask) + .unwrap_lite() + }; + // This operation just sets internal SP state, so it is always // successful. However, invalid modules may been specified by // the host so we need to check that anyway. @@ -637,11 +752,14 @@ impl ServerImpl { /// operation before sending the response. For results where a /// `ModuleResultNoFailure` is returned, use `handle_errors` or /// `handle_errors_and_disabled` instead. + /// + /// Panics: This function can panic if a module has been marked to have had + /// a failure but the failure type enum is NoError. In practice this should + /// not happen. fn handle_errors_and_failures_and_disabled( &mut self, modules: ModuleId, result: ModuleResult, - failure_type: HwError, out: &mut [u8], ) -> (usize, ModuleId) { let mut error_idx: usize = 0; @@ -663,6 +781,21 @@ impl ServerImpl { .unwrap(); error_idx += err_size; } else if result.failure().is_set(module) { + let failure_type = match result.failure_types()[module] { + FpgaI2CFailure::NoModule => HwError::NotPresent, + FpgaI2CFailure::NoPower => HwError::NotPowered, + FpgaI2CFailure::PowerFault => HwError::PowerFault, + FpgaI2CFailure::NotInitialized => { + HwError::NotInitialized + } + FpgaI2CFailure::I2cAddressNack => { + HwError::I2cAddressNack + } + FpgaI2CFailure::I2cByteNack => HwError::I2cByteNack, + // We only mark failures when an error is set, so this + // branch should never match. + FpgaI2CFailure::NoError => unreachable!(), + }; // failure: whatever `HwError` specified by `failure_type` let err_size = hubpack::serialize( &mut out[error_idx..], @@ -671,12 +804,10 @@ impl ServerImpl { .unwrap(); error_idx += err_size; } else if result.error().is_set(module) { - // error: fpga communication issue - let err_size = hubpack::serialize( - &mut out[error_idx..], - &HwError::FpgaError, - ) - .unwrap(); + let err_type = self.resolve_error_type(); + let err_size = + hubpack::serialize(&mut out[error_idx..], &err_type) + .unwrap(); error_idx += err_size; } } else if requested_invalid_modules.is_set(module.0).unwrap() { @@ -722,12 +853,10 @@ impl ServerImpl { if module <= LogicalPortMask::MAX_PORT_INDEX && result.error().is_set(module) { - // error: fpga communication issue - let err_size = hubpack::serialize( - &mut out[error_idx..], - &HwError::FpgaError, - ) - .unwrap(); + let err_type = self.resolve_error_type(); + let err_size = + hubpack::serialize(&mut out[error_idx..], &err_type) + .unwrap(); error_idx += err_size; } else if requested_invalid_modules.is_set(module.0).unwrap() { // let the host know it requested unsupported modules @@ -748,6 +877,17 @@ impl ServerImpl { ) } + // shared logic between the various functions which handle errors + fn resolve_error_type(&self) -> HwError { + match self.front_io_board_present { + // Front IO is present and ready, so the only other error + // path currently is if we handle an FpgaError. + FrontIOStatus::Ready => HwError::FpgaError, + FrontIOStatus::NotPresent => HwError::NoFrontIo, + FrontIOStatus::NotReady => HwError::FrontIoNotReady, + } + } + /// This function reads a `ModuleResultNoFailure` and populates error /// information at the end of the trailing data buffer, taking `self.data` /// into account. This means it should be called as the last operation @@ -931,9 +1071,13 @@ impl ServerImpl { const BANK_SELECT: u8 = 0x7E; const PAGE_SELECT: u8 = 0x7F; - let mut result = - ModuleResult::new(mask, LogicalPortMask(0), LogicalPortMask(0)) - .unwrap(); + let mut result = ModuleResult::new( + mask, + LogicalPortMask(0), + LogicalPortMask(0), + LogicalPortFailureTypes::default(), + ) + .unwrap_lite(); // We can always write the lower page; upper pages require modifying // registers in the transceiver to select it. @@ -947,9 +1091,13 @@ impl ServerImpl { result = result.chain(self.wait_and_check_i2c(result.success())); } else { // If the request is to the lower page it is always successful - result = - ModuleResult::new(mask, LogicalPortMask(0), LogicalPortMask(0)) - .unwrap(); + result = ModuleResult::new( + mask, + LogicalPortMask(0), + LogicalPortMask(0), + LogicalPortFailureTypes::default(), + ) + .unwrap_lite(); } if let Some(bank) = page.bank() { @@ -961,6 +1109,13 @@ impl ServerImpl { )); result = result.chain(self.wait_and_check_i2c(result.success())); } + + for port in result.failure().to_indices() { + ringbuf_entry!(Trace::PageSelectI2CFailures( + port, + result.failure_types()[port] + )) + } result } @@ -994,6 +1149,7 @@ impl ServerImpl { let mut error = LogicalPortMask(0); let mut idx = 0; let buf_len = mem.len() as usize; + let mut failure_types = LogicalPortFailureTypes::default(); for port in result.success().to_indices() { // The status register is contiguous with the output buffer, so @@ -1017,15 +1173,16 @@ impl ServerImpl { break; }; - let status = buf[0]; + let status = PortI2CStatus::new(buf[0]); // Use QSFP::PORT0 for constants, since they're all identical - if status & Reg::QSFP::PORT0_STATUS::BUSY == 0 { + if status.done { // Check error mask - if status & Reg::QSFP::PORT0_STATUS::ERROR != 0 { + if status.error != FpgaI2CFailure::NoError { // Record which port the error ocurred at so we can // give the host a more meaningful error. failure.set(port); + failure_types.0[port.0 as usize] = status.error } else { // Add data to payload success.set(port); @@ -1039,8 +1196,18 @@ impl ServerImpl { userlib::hl::sleep_for(1); } } - let final_result = ModuleResult::new(success, failure, error).unwrap(); - result.chain(final_result) + let mut final_result = + ModuleResult::new(success, failure, error, failure_types) + .unwrap_lite(); + final_result = result.chain(final_result); + + for port in final_result.failure().to_indices() { + ringbuf_entry!(Trace::ReadI2CFailures( + port, + final_result.failure_types()[port] + )) + } + final_result } // The `LogicalPortMask` indicates which of the requested ports the @@ -1063,7 +1230,15 @@ impl ServerImpl { mem.len(), result.success(), )); - result.chain(self.wait_and_check_i2c(result.success())) + result = result.chain(self.wait_and_check_i2c(result.success())); + + for port in result.failure().to_indices() { + ringbuf_entry!(Trace::WriteI2CFailures( + port, + result.failure_types()[port] + )) + } + result } fn get_led_state_response(