Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add hw_counters for infiniband device. #549

Merged
merged 1 commit into from
Sep 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
148 changes: 148 additions & 0 deletions sysfs/class_infiniband.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,42 @@ type InfiniBandCounters struct {
VL15Dropped *uint64 // counters/VL15_dropped
}

// InfiniBandHwCounters holds the values read from the files under
// /sys/class/infiniband/<Name>/ports/<Port>/hw_counters
// for one port of a single InfiniBand device.
//
// Every field is a pointer, so a counter that was never populated (nil) can
// be told apart from a counter whose value is zero. The trailing comment on
// each field names the file inside hw_counters that the field corresponds to.
type InfiniBandHwCounters struct {
	DuplicateRequest   *uint64 // file: duplicate_request
	ImpliedNakSeqErr   *uint64 // file: implied_nak_seq_err
	Lifespan           *uint64 // file: lifespan
	LocalAckTimeoutErr *uint64 // file: local_ack_timeout_err

	// Files prefixed with np_.
	NpCnpSent              *uint64 // file: np_cnp_sent
	NpEcnMarkedRocePackets *uint64 // file: np_ecn_marked_roce_packets

	OutOfBuffer   *uint64 // file: out_of_buffer
	OutOfSequence *uint64 // file: out_of_sequence
	PacketSeqErr  *uint64 // file: packet_seq_err

	// Files prefixed with req_.
	ReqCqeError             *uint64 // file: req_cqe_error
	ReqCqeFlushError        *uint64 // file: req_cqe_flush_error
	ReqRemoteAccessErrors   *uint64 // file: req_remote_access_errors
	ReqRemoteInvalidRequest *uint64 // file: req_remote_invalid_request

	// Files prefixed with resp_.
	RespCqeError           *uint64 // file: resp_cqe_error
	RespCqeFlushError      *uint64 // file: resp_cqe_flush_error
	RespLocalLengthError   *uint64 // file: resp_local_length_error
	RespRemoteAccessErrors *uint64 // file: resp_remote_access_errors

	RnrNakRetryErr *uint64 // file: rnr_nak_retry_err

	// Files prefixed with roce_.
	RoceAdpRetrans       *uint64 // file: roce_adp_retrans
	RoceAdpRetransTo     *uint64 // file: roce_adp_retrans_to
	RoceSlowRestart      *uint64 // file: roce_slow_restart
	RoceSlowRestartCnps  *uint64 // file: roce_slow_restart_cnps
	RoceSlowRestartTrans *uint64 // file: roce_slow_restart_trans

	// Files prefixed with rp_.
	RpCnpHandled *uint64 // file: rp_cnp_handled
	RpCnpIgnored *uint64 // file: rp_cnp_ignored

	// Files prefixed with rx_.
	RxAtomicRequests   *uint64 // file: rx_atomic_requests
	RxDctConnect       *uint64 // file: rx_dct_connect
	RxIcrcEncapsulated *uint64 // file: rx_icrc_encapsulated
	RxReadRequests     *uint64 // file: rx_read_requests
	RxWriteRequests    *uint64 // file: rx_write_requests
}

// InfiniBandPort contains info from files in
// /sys/class/infiniband/<Name>/ports/<Port>
// for a single port of one InfiniBand device.
Expand All @@ -79,6 +115,7 @@ type InfiniBandPort struct {
PhysStateID uint // String representation from /sys/class/infiniband/<Name>/ports/<Port>/phys_state
Rate uint64 // in bytes/second from /sys/class/infiniband/<Name>/ports/<Port>/rate
Counters InfiniBandCounters
HwCounters InfiniBandHwCounters
}

// InfiniBandDevice contains info from files in /sys/class/infiniband for a
Expand Down Expand Up @@ -242,6 +279,14 @@ func (fs FS) parseInfiniBandPort(name string, port string) (*InfiniBandPort, err
}
ibp.Counters = *counters

if strings.Contains(ibp.Name, "mlx5") {
hwCounters, err := parseInfiniBandHwCounters(portPath)
if err != nil {
return nil, err
}
ibp.HwCounters = *hwCounters
}

return &ibp, nil
}

Expand Down Expand Up @@ -405,3 +450,106 @@ func parseInfiniBandCounters(portPath string) (*InfiniBandCounters, error) {

return &counters, nil
}

// parseInfiniBandHwCounters reads the files in the hw_counters directory
// below portPath and stores the parsed value of every recognised counter in
// an InfiniBandHwCounters.
//
// Only regular files are considered. A file whose read fails because it does
// not exist, is not permitted, is not supported, or is invalid is skipped, and
// the matching field stays nil; any other read error is returned, wrapped with
// the file path. Files whose names are not listed in the switch below are
// read but otherwise ignored.
//
// A value that cannot be parsed as an unsigned integer causes an error
// wrapped with the file path, unless the value contains "N/A (no PMA)", in
// which case the file is skipped.
//
// An error is also returned when the hw_counters directory itself cannot be
// listed.
func parseInfiniBandHwCounters(portPath string) (*InfiniBandHwCounters, error) {
	var hwCounters InfiniBandHwCounters

	path := filepath.Join(portPath, "hw_counters")
	files, err := os.ReadDir(path)
	if err != nil {
		return nil, err
	}

	for _, f := range files {
		if !f.Type().IsRegular() {
			continue
		}

		name := filepath.Join(path, f.Name())
		value, err := util.SysReadFile(name)
		if err != nil {
			// Best effort: a counter file that is missing, unreadable or
			// unsupported is skipped instead of failing the whole port.
			if os.IsNotExist(err) || os.IsPermission(err) || err.Error() == "operation not supported" || errors.Is(err, os.ErrInvalid) {
				continue
			}
			return nil, fmt.Errorf("failed to read file %q: %w", name, err)
		}

		vp := util.NewValueParser(value)

		switch f.Name() {
		case "duplicate_request":
			hwCounters.DuplicateRequest = vp.PUInt64()
		case "implied_nak_seq_err":
			hwCounters.ImpliedNakSeqErr = vp.PUInt64()
		case "lifespan":
			hwCounters.Lifespan = vp.PUInt64()
		case "local_ack_timeout_err":
			hwCounters.LocalAckTimeoutErr = vp.PUInt64()
		case "np_cnp_sent":
			hwCounters.NpCnpSent = vp.PUInt64()
		case "np_ecn_marked_roce_packets":
			hwCounters.NpEcnMarkedRocePackets = vp.PUInt64()
		case "out_of_buffer":
			hwCounters.OutOfBuffer = vp.PUInt64()
		case "out_of_sequence":
			hwCounters.OutOfSequence = vp.PUInt64()
		case "packet_seq_err":
			hwCounters.PacketSeqErr = vp.PUInt64()
		case "req_cqe_error":
			hwCounters.ReqCqeError = vp.PUInt64()
		case "req_cqe_flush_error":
			hwCounters.ReqCqeFlushError = vp.PUInt64()
		case "req_remote_access_errors":
			hwCounters.ReqRemoteAccessErrors = vp.PUInt64()
		case "req_remote_invalid_request":
			hwCounters.ReqRemoteInvalidRequest = vp.PUInt64()
		case "resp_cqe_error":
			hwCounters.RespCqeError = vp.PUInt64()
		case "resp_cqe_flush_error":
			hwCounters.RespCqeFlushError = vp.PUInt64()
		case "resp_local_length_error":
			hwCounters.RespLocalLengthError = vp.PUInt64()
		case "resp_remote_access_errors":
			hwCounters.RespRemoteAccessErrors = vp.PUInt64()
		case "rnr_nak_retry_err":
			hwCounters.RnrNakRetryErr = vp.PUInt64()
		case "roce_adp_retrans":
			hwCounters.RoceAdpRetrans = vp.PUInt64()
		case "roce_adp_retrans_to":
			hwCounters.RoceAdpRetransTo = vp.PUInt64()
		case "roce_slow_restart":
			hwCounters.RoceSlowRestart = vp.PUInt64()
		case "roce_slow_restart_cnps":
			hwCounters.RoceSlowRestartCnps = vp.PUInt64()
		case "roce_slow_restart_trans":
			hwCounters.RoceSlowRestartTrans = vp.PUInt64()
		case "rp_cnp_handled":
			hwCounters.RpCnpHandled = vp.PUInt64()
		case "rp_cnp_ignored":
			hwCounters.RpCnpIgnored = vp.PUInt64()
		case "rx_atomic_requests":
			hwCounters.RxAtomicRequests = vp.PUInt64()
		case "rx_dct_connect":
			hwCounters.RxDctConnect = vp.PUInt64()
		case "rx_icrc_encapsulated":
			hwCounters.RxIcrcEncapsulated = vp.PUInt64()
		case "rx_read_requests":
			hwCounters.RxReadRequests = vp.PUInt64()
		case "rx_write_requests":
			hwCounters.RxWriteRequests = vp.PUInt64()
		}

		if err := vp.Err(); err != nil {
			// Ugly workaround for handling https://github.com/prometheus/node_exporter/issues/966
			// when counters report `N/A (no PMA)` instead of a number.
			// This was already patched and submitted, see
			// https://www.spinics.net/lists/linux-rdma/msg68596.html
			// Remove this as soon as the fix lands in the enterprise distros.
			if strings.Contains(value, "N/A (no PMA)") {
				continue
			}
			// Wrap with the file path so the caller can tell which counter
			// file held the malformed value; %w keeps the cause inspectable.
			return nil, fmt.Errorf("failed to parse file %q: %w", name, err)
		}
	}
	return &hwCounters, nil
}
114 changes: 114 additions & 0 deletions sysfs/class_infiniband_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,53 @@ func TestInfiniBandClass(t *testing.T) {
mlx4Port2PortXmitWait uint64 = 3846
mlx4Port2SymbolError uint64
mlx4Port2VL15Dropped uint64

mlx5Port1ExcessiveBufferOverrunErrors uint64
mlx5Port1LinkDowned uint64
mlx5Port1LinkErrorRecovery uint64
mlx5Port1LocalLinkIntegrityErrors uint64
mlx5Port1MulticastRcvPackets uint64
mlx5Port1MulticastXmitPackets uint64
mlx5Port1PortRcvConstraintErrors uint64
mlx5Port1PortRcvData uint64 = 72505381512
mlx5Port1PortRcvErrors uint64
mlx5Port1PortRcvPackets uint64 = 541889824
mlx5Port1PortRcvRemotePhysicalErrors uint64
mlx5Port1PortRcvSwitchRelayErrors uint64
mlx5Port1PortXmitConstraintErrors uint64
mlx5Port1PortXmitData uint64 = 11523046035392
mlx5Port1PortXmitDiscards uint64
mlx5Port1PortXmitPackets uint64 = 10907922116
mlx5Port1PortXmitWait uint64
mlx5Port1SymbolError uint64
mlx5Port1UnicastRcvPackets uint64 = 541889824
mlx5Port1UnicastXmitPackets uint64 = 10907922116
mlx5Port1VL15Dropped uint64
mlx5Port1DuplicateRequest uint64 = 41
mlx5Port1ImpliedNakSeqErr uint64
mlx5Port1Lifespan uint64 = 10
mlx5Port1LocalAckTimeoutErr uint64 = 131
mlx5Port1OutOfBuffer uint64
mlx5Port1OutOfSequence uint64 = 1
mlx5Port1PacketSeqErr uint64 = 1
mlx5Port1ReqCqeError uint64 = 3481
mlx5Port1ReqCqeFlushError uint64 = 80
mlx5Port1ReqRemoteAccessErrors uint64
mlx5Port1ReqRemoteInvalidRequest uint64
mlx5Port1RespCqeError uint64 = 8109
mlx5Port1RespCqeFlushError uint64 = 4708
mlx5Port1RespLocalLengthError uint64
mlx5Port1RespRemoteAccessErrors uint64
mlx5Port1RnrNakRetryErr uint64
mlx5Port1RoceAdpRetrans uint64 = 99
mlx5Port1RoceAdpRetransTo uint64 = 4
mlx5Port1RoceSlowRestart uint64
mlx5Port1RoceSlowRestartCnps uint64 = 131
mlx5Port1RoceSlowRestartTrans uint64
mlx5Port1RxAtomicRequests uint64
mlx5Port1RxDctConnect uint64
mlx5Port1RxReadRequests uint64 = 175528982
mlx5Port1RxWriteRequests uint64 = 742114
)

want := InfiniBandClass{
Expand Down Expand Up @@ -220,6 +267,73 @@ func TestInfiniBandClass(t *testing.T) {
},
},
},
"mlx5_0": InfiniBandDevice{
Name: "mlx5_0",
BoardID: "SM_2001000001034",
FirmwareVersion: "14.28.2006",
HCAType: "MT4118",
Ports: map[uint]InfiniBandPort{
1: {
Name: "mlx5_0",
Port: 1,
State: "ACTIVE",
StateID: 4,
PhysState: "ACTIVE",
PhysStateID: 4,
Rate: 3125000000,
Counters: InfiniBandCounters{
ExcessiveBufferOverrunErrors: &mlx5Port1ExcessiveBufferOverrunErrors,
LinkDowned: &mlx5Port1LinkDowned,
LinkErrorRecovery: &mlx5Port1LinkErrorRecovery,
LocalLinkIntegrityErrors: &mlx5Port1LocalLinkIntegrityErrors,
MulticastRcvPackets: &mlx5Port1MulticastRcvPackets,
MulticastXmitPackets: &mlx5Port1MulticastXmitPackets,
PortRcvConstraintErrors: &mlx5Port1PortRcvConstraintErrors,
PortRcvData: &mlx5Port1PortRcvData,
PortRcvErrors: &mlx5Port1PortRcvErrors,
PortRcvPackets: &mlx5Port1PortRcvPackets,
PortRcvRemotePhysicalErrors: &mlx5Port1PortRcvRemotePhysicalErrors,
PortRcvSwitchRelayErrors: &mlx5Port1PortRcvSwitchRelayErrors,
PortXmitConstraintErrors: &mlx5Port1PortXmitConstraintErrors,
PortXmitData: &mlx5Port1PortXmitData,
PortXmitDiscards: &mlx5Port1PortXmitDiscards,
PortXmitPackets: &mlx5Port1PortXmitPackets,
PortXmitWait: &mlx5Port1PortXmitWait,
SymbolError: &mlx5Port1SymbolError,
UnicastRcvPackets: &mlx5Port1UnicastRcvPackets,
UnicastXmitPackets: &mlx5Port1UnicastXmitPackets,
VL15Dropped: &mlx5Port1VL15Dropped,
},
HwCounters: InfiniBandHwCounters{
DuplicateRequest: &mlx5Port1DuplicateRequest,
ImpliedNakSeqErr: &mlx5Port1ImpliedNakSeqErr,
Lifespan: &mlx5Port1Lifespan,
LocalAckTimeoutErr: &mlx5Port1LocalAckTimeoutErr,
OutOfBuffer: &mlx5Port1OutOfBuffer,
OutOfSequence: &mlx5Port1OutOfSequence,
PacketSeqErr: &mlx5Port1PacketSeqErr,
ReqCqeError: &mlx5Port1ReqCqeError,
ReqCqeFlushError: &mlx5Port1ReqCqeFlushError,
ReqRemoteAccessErrors: &mlx5Port1ReqRemoteAccessErrors,
ReqRemoteInvalidRequest: &mlx5Port1ReqRemoteInvalidRequest,
RespCqeError: &mlx5Port1RespCqeError,
RespCqeFlushError: &mlx5Port1RespCqeFlushError,
RespLocalLengthError: &mlx5Port1RespLocalLengthError,
RespRemoteAccessErrors: &mlx5Port1RespRemoteAccessErrors,
RnrNakRetryErr: &mlx5Port1RnrNakRetryErr,
RoceAdpRetrans: &mlx5Port1RoceAdpRetrans,
RoceAdpRetransTo: &mlx5Port1RoceAdpRetransTo,
RoceSlowRestart: &mlx5Port1RoceSlowRestart,
RoceSlowRestartCnps: &mlx5Port1RoceSlowRestartCnps,
RoceSlowRestartTrans: &mlx5Port1RoceSlowRestartTrans,
RxAtomicRequests: &mlx5Port1RxAtomicRequests,
RxDctConnect: &mlx5Port1RxDctConnect,
RxReadRequests: &mlx5Port1RxReadRequests,
RxWriteRequests: &mlx5Port1RxWriteRequests,
},
},
},
},
}

if diff := cmp.Diff(want, got); diff != "" {
Expand Down