Skip to content

Commit

Permalink
Decouple transceivers server from Front IO readiness (#1588)
Browse files Browse the repository at this point in the history
Currently the `transceivers` server will loop and await the Front IO
board to be ready before entering is service loop. This means that in
the case where the Front IO board is not present (or ready),
`transceivers` server cannot handle any incoming requests, making the
host software that is sending those requests unhappy. This change
addresses that.

Additionally, this commit pulls in the richer HwError variants from
the transceiver-messages crate. By interrogating the FPGAs about
what precisely went wrong, the SP can now pass along much more
detailed fault information to the host OS via these variants.

Fixes #1562, closes #1572
  • Loading branch information
Aaron-Hartwig committed Mar 25, 2024
1 parent 4d268d9 commit 1145c39
Show file tree
Hide file tree
Showing 7 changed files with 445 additions and 139 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Expand Up @@ -135,5 +135,5 @@ sprockets-common = { git = "https://github.com/oxidecomputer/sprockets.git", def
sprockets-rot = { git = "https://github.com/oxidecomputer/sprockets.git", default-features = false }
tlvc = { git = "https://github.com/oxidecomputer/tlvc", default-features = false, version = "0.3.1" }
tlvc-text = { git = "https://github.com/oxidecomputer/tlvc", default-features = false, version = "0.3.0" }
transceiver-messages = { git = "https://github.com/oxidecomputer/transceiver-control/", default-features = false }
transceiver-messages = { git = "https://github.com/oxidecomputer/transceiver-control/", default-features = false}
vsc7448-pac = { git = "https://github.com/oxidecomputer/vsc7448", default-features = false }
3 changes: 2 additions & 1 deletion drv/sidecar-front-io/src/controller.rs
Expand Up @@ -4,6 +4,7 @@

use crate::{Addr, SIDECAR_IO_BITSTREAM_CHECKSUM};
use drv_fpga_api::*;
use userlib::UnwrapLite;

pub struct FrontIOController {
fpga: Fpga,
Expand Down Expand Up @@ -90,6 +91,6 @@ impl FrontIOController {
/// Returns the expected (short) checksum, which simply a prefix of the full
/// SHA3-256 hash of the bitstream.
pub fn short_checksum() -> [u8; 4] {
SIDECAR_IO_BITSTREAM_CHECKSUM[..4].try_into().unwrap()
SIDECAR_IO_BITSTREAM_CHECKSUM[..4].try_into().unwrap_lite()
}
}
161 changes: 134 additions & 27 deletions drv/sidecar-front-io/src/transceivers.rs
Expand Up @@ -6,6 +6,7 @@ use crate::{Addr, Reg};
use drv_fpga_api::{FpgaError, FpgaUserDesign, WriteOp};
use drv_transceivers_api::{ModuleStatus, TransceiversError, NUM_PORTS};
use transceiver_messages::ModuleId;
use userlib::{FromPrimitive, UnwrapLite};
use zerocopy::{byteorder, AsBytes, FromBytes, Unaligned, U16};

// The transceiver modules are split across two FPGAs on the QSFP Front IO
Expand All @@ -24,7 +25,7 @@ pub enum FpgaController {
}

/// Physical port location
#[derive(Copy, Clone)]
#[derive(Copy, Clone, PartialEq)]
pub struct PortLocation {
pub controller: FpgaController,
pub port: PhysicalPort,
Expand All @@ -37,7 +38,7 @@ impl From<LogicalPort> for PortLocation {
}

/// Physical port location within a particular FPGA, as a 0-15 index
#[derive(Copy, Clone)]
#[derive(Copy, Clone, PartialEq)]
pub struct PhysicalPort(pub u8);
impl PhysicalPort {
pub fn as_mask(&self) -> PhysicalPortMask {
Expand All @@ -47,6 +48,20 @@ impl PhysicalPort {
pub fn get(&self) -> u8 {
self.0
}

pub fn to_logical_port(
&self,
fpga: FpgaController,
) -> Result<LogicalPort, TransceiversError> {
let loc = PortLocation {
controller: fpga,
port: *self,
};
match PORT_MAP.into_iter().position(|&l| l == loc) {
Some(p) => Ok(LogicalPort(p as u8)),
None => Err(TransceiversError::InvalidPhysicalToLogicalMap),
}
}
}

/// Physical port mask within a particular FPGA, as a 16-bit bitfield
Expand Down Expand Up @@ -479,11 +494,26 @@ pub struct ModuleResult {
success: LogicalPortMask,
failure: LogicalPortMask,
error: LogicalPortMask,
failure_types: LogicalPortFailureTypes,
}

/// This is a slimmed down version of ModuleResult for use in the ringbuf
#[derive(Copy, Clone, PartialEq)]
pub struct ModuleResultSlim {
success: LogicalPortMask,
failure: LogicalPortMask,
error: LogicalPortMask,
}

impl From<ModuleResultNoFailure> for ModuleResult {
fn from(r: ModuleResultNoFailure) -> Self {
ModuleResult::new(r.success(), LogicalPortMask(0), r.error()).unwrap()
ModuleResult::new(
r.success(),
LogicalPortMask(0),
r.error(),
LogicalPortFailureTypes::default(),
)
.unwrap_lite()
}
}

Expand All @@ -493,6 +523,7 @@ impl ModuleResult {
success: LogicalPortMask,
failure: LogicalPortMask,
error: LogicalPortMask,
failure_types: LogicalPortFailureTypes,
) -> Result<Self, TransceiversError> {
if !(success & failure).is_empty()
|| !(success & error).is_empty()
Expand All @@ -504,6 +535,7 @@ impl ModuleResult {
success,
failure,
error,
failure_types,
})
}

Expand All @@ -519,6 +551,10 @@ impl ModuleResult {
self.error
}

pub fn failure_types(&self) -> LogicalPortFailureTypes {
self.failure_types
}

/// Combines two `ModuleResults`
///
/// Intended to be used for a sequence of `ModuleResult` yielding function
Expand Down Expand Up @@ -555,7 +591,29 @@ impl ModuleResult {
// has subsequently occurred.
let failure = (self.failure() | next.failure()) & !self.error();

Self::new(success, failure, error).unwrap()
// merge the failure types observed, prefering newer failure types
// should both results have a failure at the same port.
let mut combined_failures = LogicalPortFailureTypes::default();
for p in failure.to_indices() {
if next.failure().is_set(p) {
combined_failures.0[p.0 as usize] =
next.failure_types().0[p.0 as usize];
} else if self.failure().is_set(p) {
combined_failures.0[p.0 as usize] =
self.failure_types().0[p.0 as usize];
}
}

Self::new(success, failure, error, combined_failures).unwrap_lite()
}

/// Helper to provide a nice way to get a ModuleResultSlim from this result
pub fn to_slim(&self) -> ModuleResultSlim {
ModuleResultSlim {
success: self.success(),
failure: self.failure(),
error: self.error(),
}
}
}

Expand Down Expand Up @@ -592,6 +650,49 @@ impl ModuleResultNoFailure {
}
}

/// A type to provide more ergonomic access to the FPGA generated type
pub type FpgaI2CFailure = Reg::QSFP::PORT0_STATUS::Encoded;

/// A type to consolidate per-module failure types.
///
/// Currently the only types of operations that can be considered failures are
/// those that involve the FPGA doing I2C. Thus, that is the only supported type
/// right now.
#[derive(Copy, Clone, PartialEq)]
pub struct LogicalPortFailureTypes(pub [FpgaI2CFailure; NUM_PORTS as usize]);

impl Default for LogicalPortFailureTypes {
fn default() -> Self {
Self([FpgaI2CFailure::NoError; NUM_PORTS as usize])
}
}

impl core::ops::Index<LogicalPort> for LogicalPortFailureTypes {
type Output = FpgaI2CFailure;

fn index(&self, i: LogicalPort) -> &Self::Output {
&self.0[i.0 as usize]
}
}

#[derive(Copy, Clone)]
pub struct PortI2CStatus {
pub done: bool,
pub error: FpgaI2CFailure,
}

impl PortI2CStatus {
pub fn new(status: u8) -> Self {
Self {
done: (status & Reg::QSFP::PORT0_STATUS::BUSY) == 0,
error: FpgaI2CFailure::from_u8(
status & Reg::QSFP::PORT0_STATUS::ERROR,
)
.unwrap_lite(),
}
}
}

impl Transceivers {
pub fn new(fpga_task: userlib::TaskId) -> Self {
Self {
Expand Down Expand Up @@ -641,7 +742,7 @@ impl Transceivers {
// only have an error where there was a requested module in mask
error &= mask;

ModuleResultNoFailure::new(success, error).unwrap()
ModuleResultNoFailure::new(success, error).unwrap_lite()
}

/// Set power enable bits per the specified `mask`.
Expand Down Expand Up @@ -769,8 +870,8 @@ impl Transceivers {
let error = !success;

(
ModuleStatus::read_from(status_masks.as_bytes()).unwrap(),
ModuleResultNoFailure::new(success, error).unwrap(),
ModuleStatus::read_from(status_masks.as_bytes()).unwrap_lite(),
ModuleResultNoFailure::new(success, error).unwrap_lite(),
)
}

Expand Down Expand Up @@ -816,7 +917,7 @@ impl Transceivers {
// only have an error where there was a requested module in mask
error &= mask;

ModuleResultNoFailure::new(success, error).unwrap()
ModuleResultNoFailure::new(success, error).unwrap_lite()
}

/// Initiate an I2C random read on all ports per the specified `mask`.
Expand Down Expand Up @@ -912,7 +1013,7 @@ impl Transceivers {
success &= mask;
let error = mask & !success;

ModuleResultNoFailure::new(success, error).unwrap()
ModuleResultNoFailure::new(success, error).unwrap_lite()
}

/// Read the value of the QSFP_PORTx_STATUS
Expand Down Expand Up @@ -987,7 +1088,7 @@ impl Transceivers {
}
let error = !success;

ModuleResultNoFailure::new(success, error).unwrap()
ModuleResultNoFailure::new(success, error).unwrap_lite()
}

/// For a given `local_port`, return the Addr where its read buffer begins
Expand Down Expand Up @@ -1089,30 +1190,31 @@ impl Transceivers {
let mut physical_failure = FpgaPortMasks::default();
let mut physical_error = FpgaPortMasks::default();
let phys_mask: FpgaPortMasks = mask.into();
let mut failure_types = LogicalPortFailureTypes::default();

#[derive(AsBytes, Default, FromBytes)]
#[repr(C)]
struct StatusAndErr {
struct BusyAndPortStatus {
busy: u16,
err: [u8; 8],
status: [u8; 8],
}
for fpga_index in phys_mask.iter_fpgas() {
let fpga = self.fpga(fpga_index);
// This loop should break immediately, because I2C is fast
let status = loop {
// Two bytes of BUSY, followed by 8 bytes of error status
let status = match fpga.read(Addr::QSFP_I2C_BUSY0) {
let status_all = loop {
// Two bytes of BUSY, followed by 816bytes of port status
let status_all = match fpga.read(Addr::QSFP_I2C_BUSY0) {
Ok(data) => data,
// If there is an FPGA communication error, mark that as an
// error on all of that FPGA's ports
Err(_) => {
*physical_error.get_mut(fpga_index) =
PhysicalPortMask(0xffff);
StatusAndErr::default()
BusyAndPortStatus::default()
}
};
if status.busy == 0 {
break status;
if status_all.busy == 0 {
break status_all;
}
userlib::hl::sleep_for(1);
};
Expand All @@ -1123,25 +1225,30 @@ impl Transceivers {
FpgaController::Right => phys_mask.right,
};
for port in (0..16).map(PhysicalPort) {
// skip ports we didn't interact with
if !phys_mask.is_set(port) {
continue;
}
// Each error byte packs together two ports
let err = status.err[port.0 as usize / 2]
>> ((port.0 as usize % 2) * 4);

// For now, check for the presence of an error, but don't bother
// reporting the details.
let has_err = (err & 0b1000) != 0;
if has_err {

// cast status byte into a type
let failure = FpgaI2CFailure::from_u8(
status_all.status[port.0 as usize]
& Reg::QSFP::PORT0_STATUS::ERROR,
)
.unwrap_lite();
// if a failure occurred, mark it and record the failure type
if failure != FpgaI2CFailure::NoError {
physical_failure.get_mut(fpga_index).set(port);
let logical_port =
port.to_logical_port(fpga_index).unwrap_lite();
failure_types.0[logical_port.0 as usize] = failure;
}
}
}
let error = mask & LogicalPortMask::from(physical_error);
let failure = mask & LogicalPortMask::from(physical_failure);
let success = mask & !(error | failure);
ModuleResult::new(success, failure, error).unwrap()
ModuleResult::new(success, failure, error, failure_types).unwrap_lite()
}
}

Expand Down
1 change: 1 addition & 0 deletions drv/transceivers-api/src/lib.rs
Expand Up @@ -22,6 +22,7 @@ pub enum TransceiversError {
InvalidPowerState,
InvalidModuleResult,
LedI2cError,
InvalidPhysicalToLogicalMap,

#[idol(server_death)]
ServerRestarted,
Expand Down

0 comments on commit 1145c39

Please sign in to comment.