blob: 456ff8034a2e29c93fb9869eaaf66ea0962660da [file] [log] [blame] [edit]
/*
* SPDX-FileCopyrightText: Copyright OpenBMC Authors
* SPDX-License-Identifier: Apache-2.0
*/
#include "NvidiaPciePortMetrics.hpp"
#include "NvidiaUtils.hpp"
#include "Utils.hpp"
#include <bits/basic_string.h>
#include <MctpRequester.hpp>
#include <NvidiaGpuMctpVdm.hpp>
#include <NvidiaPcieDevice.hpp>
#include <OcpMctpVdm.hpp>
#include <phosphor-logging/lg2.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
#include <sdbusplus/message/native_types.hpp>
#include <cstddef>
#include <cstdint>
#include <format>
#include <functional>
#include <memory>
#include <span>
#include <string>
#include <system_error>
#include <vector>
// Bring std::string and the standard literal suffixes into scope; the `s`
// string-literal suffix is used below when registering the "Unit" property.
using std::string;
using namespace std::literals;
// Name of the D-Bus interface added for each metric's value object.
constexpr const char* metricInterface = "xyz.openbmc_project.Metric.Value";
// Creates the D-Bus objects for one scalar telemetry group of a PCIe port.
//
// For every {id, name} entry in `metricsInfo`, two interfaces are added on the
// object path `metricPath + "port_<pcieDeviceName>_<name>" + <metric name>`:
//   - the metric value interface, with "Unit" set to Count and "Value"
//     starting at 0.0, stored in metricValueInterfaces[id];
//   - an association interface ("measuring" / "measured_by") pointing at the
//     port object built from pcieDevicePathPrefix, pcieDeviceName and name,
//     stored in metricAssociationInterfaces[id].
// A failure to initialize either interface is logged; construction continues.
//
// The identifying arguments (eid, portType, upstreamPortNumber, portNumber,
// scalarGroupId, path) together with conn and mctpRequester are stored in the
// corresponding members.
NvidiaPciePortMetrics::NvidiaPciePortMetrics(
    std::shared_ptr<sdbusplus::asio::connection>& conn,
    mctp::MctpRequester& mctpRequester, const std::string& name,
    const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
    gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
    sdbusplus::asio::object_server& objectServer, uint8_t scalarGroupId,
    const std::vector<NvidiaMetricInfo>& metricsInfo) :
    eid(eid), portType(portType), upstreamPortNumber(upstreamPortNumber),
    portNumber(portNumber), scalarGroupId(scalarGroupId), path(path),
    conn(conn), mctpRequester(mctpRequester)
{
    const std::string metricsDbusPathPrefix =
        metricPath + std::format("port_{}_{}", pcieDeviceName, name);

    const sdbusplus::message::object_path portDbusPath =
        sdbusplus::message::object_path(pcieDevicePathPrefix) / pcieDeviceName /
        name;

    // Every metric of this port is associated with the same port object, so
    // the association list is built once and reused for each metric.
    std::vector<Association> portAssociations;
    portAssociations.emplace_back("measuring", "measured_by", portDbusPath);
    const std::vector<Association>& associations = portAssociations;

    // The binding is called metricName (not name) so that it does not shadow
    // the constructor's `name` parameter, which identifies the port.
    for (const auto& [id, metricName] : metricsInfo)
    {
        const std::string metricsDbusPath = metricsDbusPathPrefix + metricName;

        auto& valueInterface = metricValueInterfaces[id];
        valueInterface =
            objectServer.add_interface(metricsDbusPath, metricInterface);
        valueInterface->register_property(
            "Unit", "xyz.openbmc_project.Metric.Value.Unit.Count"s);
        valueInterface->register_property("Value", 0.0);

        auto& associationInterface = metricAssociationInterfaces[id];
        associationInterface =
            objectServer.add_interface(metricsDbusPath, association::interface);
        associationInterface->register_property("Associations", associations);

        if (!valueInterface->initialize())
        {
            lg2::error(
                "Error initializing PCIe Port Metric Interface for EID={EID}, "
                "PortType={PT}, PortNumber={PN}, ScalarGroup={SG}, Metric={MN}",
                "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
                portNumber, "SG", scalarGroupId, "MN", metricName);
        }

        if (!associationInterface->initialize())
        {
            lg2::error(
                "Error initializing PCIe Port Metric Association Interface for EID={EID}, "
                "PortType={PT}, PortNumber={PN}, ScalarGroup={SG}, Metric={MN}",
                "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
                portNumber, "SG", scalarGroupId, "MN", metricName);
        }
    }
}
// Completion handler for the telemetry request issued by update().
//
// sendRecvMsgResult: transport result; when set, the failure is logged and the
//                    response is ignored.
// response:          raw response bytes handed to the decoder.
//
// On a successful decode with a SUCCESS completion code, the i-th decoded
// telemetry value is written to the "Value" property of
// metricValueInterfaces[i]; indices that have no interface are skipped.
void NvidiaPciePortMetrics::processResponse(
    const std::error_code& sendRecvMsgResult, std::span<const uint8_t> response)
{
    if (sendRecvMsgResult)
    {
        lg2::error(
            "Error updating PCIe Port Metrics: sending message over MCTP failed, "
            "rc={RC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
            "RC", sendRecvMsgResult.message(), "EID", eid, "PT",
            static_cast<uint8_t>(portType), "PN", portNumber, "SG",
            scalarGroupId);
        return;
    }

    ocp::accelerator_management::CompletionCode cc{};
    uint16_t reasonCode = 0;
    size_t numTelemetryValue = 0;

    const int rc = gpu::decodeQueryScalarGroupTelemetryV2Response(
        response, cc, reasonCode, numTelemetryValue, telemetryValues);

    if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
    {
        // This class serves every metric group (errors, counters, ...), so the
        // message refers to "Metrics" like the transport-failure message above.
        lg2::error(
            "Error updating PCIe Port Metrics: decode failed, "
            "rc={RC}, cc={CC}, reasonCode={RESC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
            "RC", rc, "CC", static_cast<uint8_t>(cc), "RESC", reasonCode, "EID",
            eid, "PT", static_cast<uint8_t>(portType), "PN", portNumber, "SG",
            scalarGroupId);
        return;
    }

    // NOTE(review): this assumes numTelemetryValue never exceeds what
    // telemetryValues and metricValueInterfaces can be indexed with - confirm
    // that the decoder enforces this.
    for (size_t i = 0; i < numTelemetryValue; ++i)
    {
        // Only ids that were registered in the constructor have an interface.
        if (metricValueInterfaces[i] != nullptr)
        {
            metricValueInterfaces[i]->set_property(
                "Value", static_cast<double>(telemetryValues[i]));
        }
    }
}
void NvidiaPciePortMetrics::update()
{
auto rc = gpu::encodeQueryScalarGroupTelemetryV2Request(
0, portType, upstreamPortNumber, portNumber, scalarGroupId, request);
if (rc != 0)
{
lg2::error(
"Error updating PCIe Port Errors: encode failed, rc={RC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
"RC", rc, "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
portNumber, "SG", scalarGroupId);
return;
}
mctpRequester.sendRecvMsg(
eid, request,
[weak{weak_from_this()}](const std::error_code& ec,
std::span<const uint8_t> buffer) {
std::shared_ptr<NvidiaPciePortMetrics> self = weak.lock();
if (!self)
{
lg2::error("Invalid reference to NvidiaPciePortErrors");
return;
}
self->processResponse(ec, buffer);
});
}
// Factory for the PCIe port error counters.
//
// Returns a NvidiaPciePortMetrics bound to scalar group 2 that exposes the
// non-fatal, fatal, unsupported-request and correctable error counts
// (telemetry ids 0 through 3). All other arguments are forwarded unchanged to
// the NvidiaPciePortMetrics constructor.
std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortErrors(
    std::shared_ptr<sdbusplus::asio::connection>& conn,
    mctp::MctpRequester& mctpRequester, const std::string& name,
    const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
    gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
    sdbusplus::asio::object_server& objectServer)
{
    constexpr uint8_t errorGroupId = 2;

    // Telemetry id -> D-Bus path suffix of the metric.
    const std::vector<NvidiaMetricInfo> errorMetrics{
        {0, "/pcie/non_fatal_error_count"},
        {1, "/pcie/fatal_error_count"},
        {2, "/pcie/unsupported_request_count"},
        {3, "/pcie/correctable_error_count"},
    };

    return std::make_shared<NvidiaPciePortMetrics>(
        conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
        upstreamPortNumber, portNumber, objectServer, errorGroupId,
        errorMetrics);
}
// Factory for the PCIe port link counters.
//
// Returns a NvidiaPciePortMetrics bound to scalar group 4 that exposes the
// NAK received, NAK sent, replay rollover and replay counts (telemetry ids
// 1, 2, 4 and 6). All other arguments are forwarded unchanged to the
// NvidiaPciePortMetrics constructor.
std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortCounters(
    std::shared_ptr<sdbusplus::asio::connection>& conn,
    mctp::MctpRequester& mctpRequester, const std::string& name,
    const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
    gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
    sdbusplus::asio::object_server& objectServer)
{
    constexpr uint8_t counterGroupId = 4;

    // Telemetry id -> D-Bus path suffix of the metric; only these ids of the
    // group are exported.
    const std::vector<NvidiaMetricInfo> counterMetrics{
        {1, "/pcie/nak_received_count"},
        {2, "/pcie/nak_sent_count"},
        {4, "/pcie/replay_rollover_count"},
        {6, "/pcie/replay_count"},
    };

    return std::make_shared<NvidiaPciePortMetrics>(
        conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
        upstreamPortNumber, portNumber, objectServer, counterGroupId,
        counterMetrics);
}
// Factory for the PCIe port L0-to-recovery counter.
//
// Returns a NvidiaPciePortMetrics bound to scalar group 3 that exposes a
// single metric, the L0-to-recovery count (telemetry id 0). All other
// arguments are forwarded unchanged to the NvidiaPciePortMetrics constructor.
std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortL0ToRecoveryCount(
    std::shared_ptr<sdbusplus::asio::connection>& conn,
    mctp::MctpRequester& mctpRequester, const std::string& name,
    const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
    gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
    sdbusplus::asio::object_server& objectServer)
{
    constexpr uint8_t l0ToRecoveryGroupId = 3;

    // Telemetry id -> D-Bus path suffix of the metric.
    const std::vector<NvidiaMetricInfo> l0ToRecoveryMetrics{
        {0, "/pcie/l0_to_recovery_count"},
    };

    return std::make_shared<NvidiaPciePortMetrics>(
        conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
        upstreamPortNumber, portNumber, objectServer, l0ToRecoveryGroupId,
        l0ToRecoveryMetrics);
}