Skip to content

Commit

Permalink
Add hw_counters for infiniband device. (#549)
Browse files Browse the repository at this point in the history
Signed-off-by: aztecher <mikiyaf.business@gmail.com>
  • Loading branch information
aztecher committed Sep 22, 2023
1 parent b2168a3 commit ce949a5
Show file tree
Hide file tree
Showing 3 changed files with 537 additions and 0 deletions.
148 changes: 148 additions & 0 deletions sysfs/class_infiniband.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,42 @@ type InfiniBandCounters struct {
VL15Dropped *uint64 // counters/VL15_dropped
}

// InfiniBandHwCounters holds the counter values read from the files under
// /sys/class/infiniband/<Name>/ports/<Port>/hw_counters
// for one port of a single InfiniBand device.
//
// Every field is a pointer and carries the name of the file it mirrors in its
// trailing comment. A field stays nil when its file was not read.
type InfiniBandHwCounters struct {
	DuplicateRequest        *uint64 // from hw_counters/duplicate_request
	ImpliedNakSeqErr        *uint64 // from hw_counters/implied_nak_seq_err
	Lifespan                *uint64 // from hw_counters/lifespan
	LocalAckTimeoutErr      *uint64 // from hw_counters/local_ack_timeout_err
	NpCnpSent               *uint64 // from hw_counters/np_cnp_sent
	NpEcnMarkedRocePackets  *uint64 // from hw_counters/np_ecn_marked_roce_packets
	OutOfBuffer             *uint64 // from hw_counters/out_of_buffer
	OutOfSequence           *uint64 // from hw_counters/out_of_sequence
	PacketSeqErr            *uint64 // from hw_counters/packet_seq_err
	ReqCqeError             *uint64 // from hw_counters/req_cqe_error
	ReqCqeFlushError        *uint64 // from hw_counters/req_cqe_flush_error
	ReqRemoteAccessErrors   *uint64 // from hw_counters/req_remote_access_errors
	ReqRemoteInvalidRequest *uint64 // from hw_counters/req_remote_invalid_request
	RespCqeError            *uint64 // from hw_counters/resp_cqe_error
	RespCqeFlushError       *uint64 // from hw_counters/resp_cqe_flush_error
	RespLocalLengthError    *uint64 // from hw_counters/resp_local_length_error
	RespRemoteAccessErrors  *uint64 // from hw_counters/resp_remote_access_errors
	RnrNakRetryErr          *uint64 // from hw_counters/rnr_nak_retry_err
	RoceAdpRetrans          *uint64 // from hw_counters/roce_adp_retrans
	RoceAdpRetransTo        *uint64 // from hw_counters/roce_adp_retrans_to
	RoceSlowRestart         *uint64 // from hw_counters/roce_slow_restart
	RoceSlowRestartCnps     *uint64 // from hw_counters/roce_slow_restart_cnps
	RoceSlowRestartTrans    *uint64 // from hw_counters/roce_slow_restart_trans
	RpCnpHandled            *uint64 // from hw_counters/rp_cnp_handled
	RpCnpIgnored            *uint64 // from hw_counters/rp_cnp_ignored
	RxAtomicRequests        *uint64 // from hw_counters/rx_atomic_requests
	RxDctConnect            *uint64 // from hw_counters/rx_dct_connect
	RxIcrcEncapsulated      *uint64 // from hw_counters/rx_icrc_encapsulated
	RxReadRequests          *uint64 // from hw_counters/rx_read_requests
	RxWriteRequests         *uint64 // from hw_counters/rx_write_requests
}

// InfiniBandPort contains info from files in
// /sys/class/infiniband/<Name>/ports/<Port>
// for a single port of one InfiniBand device.
Expand All @@ -79,6 +115,7 @@ type InfiniBandPort struct {
PhysStateID uint // String representation from /sys/class/infiniband/<Name>/ports/<Port>/phys_state
Rate uint64 // in bytes/second from /sys/class/infiniband/<Name>/ports/<Port>/rate
Counters InfiniBandCounters
HwCounters InfiniBandHwCounters
}

// InfiniBandDevice contains info from files in /sys/class/infiniband for a
Expand Down Expand Up @@ -248,6 +285,14 @@ func (fs FS) parseInfiniBandPort(name string, port string) (*InfiniBandPort, err
}
ibp.Counters = *counters

if strings.Contains(ibp.Name, "mlx5") {
hwCounters, err := parseInfiniBandHwCounters(portPath)
if err != nil {
return nil, err
}
ibp.HwCounters = *hwCounters
}

return &ibp, nil
}

Expand Down Expand Up @@ -411,3 +456,106 @@ func parseInfiniBandCounters(portPath string) (*InfiniBandCounters, error) {

return &counters, nil
}

// parseInfiniBandHwCounters reads the regular files found in the
// "hw_counters" directory below portPath and stores the value of every
// recognised counter file in the matching InfiniBandHwCounters field.
//
// Directory entries that are not regular files are ignored, and files whose
// name is not in the lookup table are read but leave no trace in the result.
// A file that cannot be read because it does not exist, is not permitted, is
// reported as "operation not supported" or is invalid is skipped; any other
// read error is returned wrapped together with the file name. A value that
// fails to parse aborts the whole call unless it contains "N/A (no PMA)".
func parseInfiniBandHwCounters(portPath string) (*InfiniBandHwCounters, error) {
	var hwCounters InfiniBandHwCounters

	// fieldByFile maps each known hw_counters file name to the address of the
	// struct field that receives its parsed value.
	fieldByFile := map[string]**uint64{
		"duplicate_request":          &hwCounters.DuplicateRequest,
		"implied_nak_seq_err":        &hwCounters.ImpliedNakSeqErr,
		"lifespan":                   &hwCounters.Lifespan,
		"local_ack_timeout_err":      &hwCounters.LocalAckTimeoutErr,
		"np_cnp_sent":                &hwCounters.NpCnpSent,
		"np_ecn_marked_roce_packets": &hwCounters.NpEcnMarkedRocePackets,
		"out_of_buffer":              &hwCounters.OutOfBuffer,
		"out_of_sequence":            &hwCounters.OutOfSequence,
		"packet_seq_err":             &hwCounters.PacketSeqErr,
		"req_cqe_error":              &hwCounters.ReqCqeError,
		"req_cqe_flush_error":        &hwCounters.ReqCqeFlushError,
		"req_remote_access_errors":   &hwCounters.ReqRemoteAccessErrors,
		"req_remote_invalid_request": &hwCounters.ReqRemoteInvalidRequest,
		"resp_cqe_error":             &hwCounters.RespCqeError,
		"resp_cqe_flush_error":       &hwCounters.RespCqeFlushError,
		"resp_local_length_error":    &hwCounters.RespLocalLengthError,
		"resp_remote_access_errors":  &hwCounters.RespRemoteAccessErrors,
		"rnr_nak_retry_err":          &hwCounters.RnrNakRetryErr,
		"roce_adp_retrans":           &hwCounters.RoceAdpRetrans,
		"roce_adp_retrans_to":        &hwCounters.RoceAdpRetransTo,
		"roce_slow_restart":          &hwCounters.RoceSlowRestart,
		"roce_slow_restart_cnps":     &hwCounters.RoceSlowRestartCnps,
		"roce_slow_restart_trans":    &hwCounters.RoceSlowRestartTrans,
		"rp_cnp_handled":             &hwCounters.RpCnpHandled,
		"rp_cnp_ignored":             &hwCounters.RpCnpIgnored,
		"rx_atomic_requests":         &hwCounters.RxAtomicRequests,
		"rx_dct_connect":             &hwCounters.RxDctConnect,
		"rx_icrc_encapsulated":       &hwCounters.RxIcrcEncapsulated,
		"rx_read_requests":           &hwCounters.RxReadRequests,
		"rx_write_requests":          &hwCounters.RxWriteRequests,
	}

	dir := filepath.Join(portPath, "hw_counters")
	entries, err := os.ReadDir(dir)
	if err != nil {
		return nil, err
	}

	for _, entry := range entries {
		if !entry.Type().IsRegular() {
			continue
		}

		file := filepath.Join(dir, entry.Name())
		value, readErr := util.SysReadFile(file)
		if readErr != nil {
			// These failures are tolerated: the counter is simply left unset.
			switch {
			case os.IsNotExist(readErr),
				os.IsPermission(readErr),
				readErr.Error() == "operation not supported",
				errors.Is(readErr, os.ErrInvalid):
				continue
			}
			return nil, fmt.Errorf("failed to read file %q: %w", file, readErr)
		}

		vp := util.NewValueParser(value)
		if field, known := fieldByFile[entry.Name()]; known {
			*field = vp.PUInt64()
		}

		// Workaround for https://github.com/prometheus/node_exporter/issues/966:
		// a value containing "N/A (no PMA)" is not a number, so the resulting
		// parse error is ignored instead of failing the whole port.
		// A kernel-side fix was submitted, see
		// https://www.spinics.net/lists/linux-rdma/msg68596.html
		// Remove this once the fix has landed in the enterprise distros.
		parseErr := vp.Err()
		if parseErr == nil || strings.Contains(value, "N/A (no PMA)") {
			continue
		}
		return nil, parseErr
	}

	return &hwCounters, nil
}
114 changes: 114 additions & 0 deletions sysfs/class_infiniband_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,53 @@ func TestInfiniBandClass(t *testing.T) {
mlx4Port2PortXmitWait uint64 = 3846
mlx4Port2SymbolError uint64
mlx4Port2VL15Dropped uint64

mlx5Port1ExcessiveBufferOverrunErrors uint64
mlx5Port1LinkDowned uint64
mlx5Port1LinkErrorRecovery uint64
mlx5Port1LocalLinkIntegrityErrors uint64
mlx5Port1MulticastRcvPackets uint64
mlx5Port1MulticastXmitPackets uint64
mlx5Port1PortRcvConstraintErrors uint64
mlx5Port1PortRcvData uint64 = 72505381512
mlx5Port1PortRcvErrors uint64
mlx5Port1PortRcvPackets uint64 = 541889824
mlx5Port1PortRcvRemotePhysicalErrors uint64
mlx5Port1PortRcvSwitchRelayErrors uint64
mlx5Port1PortXmitConstraintErrors uint64
mlx5Port1PortXmitData uint64 = 11523046035392
mlx5Port1PortXmitDiscards uint64
mlx5Port1PortXmitPackets uint64 = 10907922116
mlx5Port1PortXmitWait uint64
mlx5Port1SymbolError uint64
mlx5Port1UnicastRcvPackets uint64 = 541889824
mlx5Port1UnicastXmitPackets uint64 = 10907922116
mlx5Port1VL15Dropped uint64
mlx5Port1DuplicateRequest uint64 = 41
mlx5Port1ImpliedNakSeqErr uint64
mlx5Port1Lifespan uint64 = 10
mlx5Port1LocalAckTimeoutErr uint64 = 131
mlx5Port1OutOfBuffer uint64
mlx5Port1OutOfSequence uint64 = 1
mlx5Port1PacketSeqErr uint64 = 1
mlx5Port1ReqCqeError uint64 = 3481
mlx5Port1ReqCqeFlushError uint64 = 80
mlx5Port1ReqRemoteAccessErrors uint64
mlx5Port1ReqRemoteInvalidRequest uint64
mlx5Port1RespCqeError uint64 = 8109
mlx5Port1RespCqeFlushError uint64 = 4708
mlx5Port1RespLocalLengthError uint64
mlx5Port1RespRemoteAccessErrors uint64
mlx5Port1RnrNakRetryErr uint64
mlx5Port1RoceAdpRetrans uint64 = 99
mlx5Port1RoceAdpRetransTo uint64 = 4
mlx5Port1RoceSlowRestart uint64
mlx5Port1RoceSlowRestartCnps uint64 = 131
mlx5Port1RoceSlowRestartTrans uint64
mlx5Port1RxAtomicRequests uint64
mlx5Port1RxDctConnect uint64
mlx5Port1RxReadRequests uint64 = 175528982
mlx5Port1RxWriteRequests uint64 = 742114
)

want := InfiniBandClass{
Expand Down Expand Up @@ -220,6 +267,73 @@ func TestInfiniBandClass(t *testing.T) {
},
},
},
"mlx5_0": InfiniBandDevice{
Name: "mlx5_0",
BoardID: "SM_2001000001034",
FirmwareVersion: "14.28.2006",
HCAType: "MT4118",
Ports: map[uint]InfiniBandPort{
1: {
Name: "mlx5_0",
Port: 1,
State: "ACTIVE",
StateID: 4,
PhysState: "ACTIVE",
PhysStateID: 4,
Rate: 3125000000,
Counters: InfiniBandCounters{
ExcessiveBufferOverrunErrors: &mlx5Port1ExcessiveBufferOverrunErrors,
LinkDowned: &mlx5Port1LinkDowned,
LinkErrorRecovery: &mlx5Port1LinkErrorRecovery,
LocalLinkIntegrityErrors: &mlx5Port1LocalLinkIntegrityErrors,
MulticastRcvPackets: &mlx5Port1MulticastRcvPackets,
MulticastXmitPackets: &mlx5Port1MulticastXmitPackets,
PortRcvConstraintErrors: &mlx5Port1PortRcvConstraintErrors,
PortRcvData: &mlx5Port1PortRcvData,
PortRcvErrors: &mlx5Port1PortRcvErrors,
PortRcvPackets: &mlx5Port1PortRcvPackets,
PortRcvRemotePhysicalErrors: &mlx5Port1PortRcvRemotePhysicalErrors,
PortRcvSwitchRelayErrors: &mlx5Port1PortRcvSwitchRelayErrors,
PortXmitConstraintErrors: &mlx5Port1PortXmitConstraintErrors,
PortXmitData: &mlx5Port1PortXmitData,
PortXmitDiscards: &mlx5Port1PortXmitDiscards,
PortXmitPackets: &mlx5Port1PortXmitPackets,
PortXmitWait: &mlx5Port1PortXmitWait,
SymbolError: &mlx5Port1SymbolError,
UnicastRcvPackets: &mlx5Port1UnicastRcvPackets,
UnicastXmitPackets: &mlx5Port1UnicastXmitPackets,
VL15Dropped: &mlx5Port1VL15Dropped,
},
HwCounters: InfiniBandHwCounters{
DuplicateRequest: &mlx5Port1DuplicateRequest,
ImpliedNakSeqErr: &mlx5Port1ImpliedNakSeqErr,
Lifespan: &mlx5Port1Lifespan,
LocalAckTimeoutErr: &mlx5Port1LocalAckTimeoutErr,
OutOfBuffer: &mlx5Port1OutOfBuffer,
OutOfSequence: &mlx5Port1OutOfSequence,
PacketSeqErr: &mlx5Port1PacketSeqErr,
ReqCqeError: &mlx5Port1ReqCqeError,
ReqCqeFlushError: &mlx5Port1ReqCqeFlushError,
ReqRemoteAccessErrors: &mlx5Port1ReqRemoteAccessErrors,
ReqRemoteInvalidRequest: &mlx5Port1ReqRemoteInvalidRequest,
RespCqeError: &mlx5Port1RespCqeError,
RespCqeFlushError: &mlx5Port1RespCqeFlushError,
RespLocalLengthError: &mlx5Port1RespLocalLengthError,
RespRemoteAccessErrors: &mlx5Port1RespRemoteAccessErrors,
RnrNakRetryErr: &mlx5Port1RnrNakRetryErr,
RoceAdpRetrans: &mlx5Port1RoceAdpRetrans,
RoceAdpRetransTo: &mlx5Port1RoceAdpRetransTo,
RoceSlowRestart: &mlx5Port1RoceSlowRestart,
RoceSlowRestartCnps: &mlx5Port1RoceSlowRestartCnps,
RoceSlowRestartTrans: &mlx5Port1RoceSlowRestartTrans,
RxAtomicRequests: &mlx5Port1RxAtomicRequests,
RxDctConnect: &mlx5Port1RxDctConnect,
RxReadRequests: &mlx5Port1RxReadRequests,
RxWriteRequests: &mlx5Port1RxWriteRequests,
},
},
},
},
}

if diff := cmp.Diff(want, got); diff != "" {
Expand Down

0 comments on commit ce949a5

Please sign in to comment.