// blob: ee56a72a7cb1d1d7c57f48d4034a84ce8f0d4ad2 [file] [log] [blame] [edit]
/*
* SPDX-FileCopyrightText: Copyright OpenBMC Authors
* SPDX-License-Identifier: Apache-2.0
*/
#include "NvidiaPcieDevice.hpp"
#include "NvidiaDeviceDiscovery.hpp"
#include "NvidiaEthPort.hpp"
#include "NvidiaGpuMctpVdm.hpp"
#include "NvidiaPcieInterface.hpp"
#include "NvidiaPciePort.hpp"
#include "NvidiaPciePortMetrics.hpp"
#include "Utils.hpp"
#include <MctpRequester.hpp>
#include <OcpMctpVdm.hpp>
#include <boost/asio/io_context.hpp>
#include <phosphor-logging/lg2.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
#include <sdbusplus/message/native_types.hpp>
#include <chrono>
#include <cstdint>
#include <format>
#include <memory>
#include <span>
#include <string>
#include <system_error>
#include <utility>
#include <vector>
/**
 * @brief Construct a PcieDevice by storing its configuration and
 *        collaborators.
 *
 * The constructor body is empty: it only initializes members. D-Bus
 * interfaces are created and MCTP requests are issued later, from init().
 *
 * @param configs       Sensor configuration. configs.pollRate is used to
 *                      build the std::chrono::milliseconds poll interval
 *                      (sensorPollMs); a copy of configs is also kept.
 * @param name          Device name; stored after being passed through
 *                      escapeName().
 * @param path          Path string stored as given.
 * @param conn          Shared D-Bus connection, stored for later use.
 * @param eid           MCTP endpoint id used for requests to this device.
 * @param io            io_context on which the poll timer (waitTimer) is
 *                      created; the timer is constructed with a zero
 *                      duration.
 * @param mctpRequester Requester used to send/receive MCTP messages.
 * @param objectServer  Object server used to add D-Bus interfaces.
 */
PcieDevice::PcieDevice(const SensorConfigs& configs, const std::string& name,
const std::string& path,
const std::shared_ptr<sdbusplus::asio::connection>& conn,
uint8_t eid, boost::asio::io_context& io,
mctp::MctpRequester& mctpRequester,
sdbusplus::asio::object_server& objectServer) :
eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
waitTimer(io, std::chrono::steady_clock::duration(0)),
mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
configs(configs), name(escapeName(name)), path(path)
{}
/**
 * @brief Publish the network adapter inventory object and start discovery.
 *
 * Steps, in order:
 *  1. Add the "xyz.openbmc_project.Inventory.Item.NetworkAdapter" interface
 *     at <nicPathPrefix>/<name>_NIC.
 *  2. Add an association interface on the same object path with a
 *     "contained_by"/"containing" association to the parent of `path`.
 *  3. Initialize both interfaces. A failure is logged but does not abort the
 *     remaining steps.
 *  4. Request the PCIe port counts, then request the network addresses of
 *     every NIC port, numbered 1..configs.nicNetworkPortCount.
 */
void PcieDevice::init()
{
    sdbusplus::message::object_path networkAdapterPath =
        sdbusplus::message::object_path(nicPathPrefix) / (name + "_NIC");

    networkAdapterInterface = objectServer.add_interface(
        networkAdapterPath,
        "xyz.openbmc_project.Inventory.Item.NetworkAdapter");

    std::vector<Association> associations;
    associations.emplace_back(
        "contained_by", "containing",
        sdbusplus::message::object_path(path).parent_path());

    networkAdapterAssociationInterface =
        objectServer.add_interface(networkAdapterPath, association::interface);
    networkAdapterAssociationInterface->register_property(
        "Associations", associations);

    if (!networkAdapterInterface->initialize())
    {
        lg2::error(
            "Failed to initialize network adapter interface for eid {EID}",
            "EID", eid);
    }

    if (!networkAdapterAssociationInterface->initialize())
    {
        lg2::error(
            "Error initializing Association Interface for Network Adapter for eid {EID}",
            "EID", eid);
    }

    getPciePortCounts();

    // Port numbers sent to the device are 1-based, hence k + 1.
    for (uint64_t k = 0; k < configs.nicNetworkPortCount; ++k)
    {
        getNetworkPortAddresses(static_cast<uint16_t>(k + 1));
    }
}
void PcieDevice::getPciePortCounts()
{
const int rc = gpu::encodeListPciePortsRequest(0, getPciePortCountsRequest);
if (rc != 0)
{
lg2::error(
"Error updating PCIe Port Counts: encode failed, rc={RC}, EID={EID}",
"RC", rc, "EID", eid);
return;
}
mctpRequester.sendRecvMsg(
eid, getPciePortCountsRequest,
[weak{weak_from_this()}](const std::error_code& ec,
std::span<const uint8_t> buffer) {
std::shared_ptr<PcieDevice> self = weak.lock();
if (!self)
{
lg2::error("Invalid reference to PcieDevice");
return;
}
self->processPciePortCountsResponse(ec, buffer);
});
}
void PcieDevice::processPciePortCountsResponse(
const std::error_code& ec, std::span<const uint8_t> response)
{
if (ec)
{
lg2::error(
"Error processing PCIe Port Counts response: sending message over MCTP failed, rc={RC}, EID={EID}",
"RC", ec.message(), "EID", eid);
return;
}
ocp::accelerator_management::CompletionCode cc{};
uint16_t reasonCode = 0;
const int rc = gpu::decodeListPciePortsResponse(
response, cc, reasonCode, pcieDeviceInfo.numUpstreamPorts,
pcieDeviceInfo.numDownstreamPorts);
if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
{
lg2::error(
"Error processing PCIe Port Counts response: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}, EID={EID}",
"RC", rc, "CC", static_cast<uint8_t>(cc), "RESC", reasonCode, "EID",
eid);
return;
}
lg2::info("PCIe Device with eid {EID} has {UP} upstream ports.", "EID", eid,
"UP", pcieDeviceInfo.numUpstreamPorts);
makeSensors();
}
void PcieDevice::getNetworkPortAddresses(const uint16_t portNumber)
{
const int rc = gpu::encodeGetPortNetworkAddressesRequest(
0, portNumber, getPortNetworkAddressesRequest);
if (rc != 0)
{
lg2::error(
"Error updating Network Port Addresses: encode failed, rc={RC}, EID={EID}",
"RC", rc, "EID", eid);
return;
}
mctpRequester.sendRecvMsg(
eid, getPortNetworkAddressesRequest,
[portNumber, weak{weak_from_this()}](const std::error_code& ec,
std::span<const uint8_t> buffer) {
std::shared_ptr<PcieDevice> self = weak.lock();
if (!self)
{
lg2::error("Invalid reference to PcieDevice, EID={EID}", "EID",
self->eid);
return;
}
self->processGetNetworkPortAddressesResponse(portNumber, ec,
buffer);
});
}
/**
 * @brief Handle the reply to a "get port network addresses" request.
 *
 * Transport errors and decode/completion-code failures are logged and end
 * processing. When the decoded link type is ETHERNET, an
 * NvidiaEthPortMetrics object named "Port_<portNumber>" is created under
 * the "<name>_NIC" device and appended to ethPortMetrics. Other link types
 * are ignored.
 *
 * @param portNumber Port number the request was issued for.
 * @param ec         Result of the MCTP send/receive.
 * @param response   Raw response bytes to decode.
 */
void PcieDevice::processGetNetworkPortAddressesResponse(
    const uint16_t portNumber, const std::error_code& ec,
    std::span<const uint8_t> response)
{
    using CompletionCode = ocp::accelerator_management::CompletionCode;

    if (ec)
    {
        lg2::error(
            "Error processing Network Port Addresses response: sending message over MCTP failed, rc={RC}, EID={EID}",
            "RC", ec.message(), "EID", eid);
        return;
    }

    CompletionCode completion{};
    uint16_t reason = 0;
    gpu::NetworkPortLinkType portLinkType = gpu::NetworkPortLinkType::UNKNOWN;
    // The decoder fills this list; only the link type is used below.
    std::vector<std::pair<uint8_t, uint64_t>> portAddresses;

    const int decodeStatus = gpu::decodeGetPortNetworkAddressesResponse(
        response, completion, reason, portLinkType, portAddresses);

    const bool decodedOk =
        (decodeStatus == 0) && (completion == CompletionCode::SUCCESS);
    if (!decodedOk)
    {
        lg2::error(
            "Error processing Network Port Addresses response: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}, EID={EID}",
            "RC", decodeStatus, "CC", static_cast<uint8_t>(completion),
            "RESC", reason, "EID", eid);
        return;
    }

    if (portLinkType != gpu::NetworkPortLinkType::ETHERNET)
    {
        return;
    }

    lg2::info(
        "Port {PN} of PCIe Device with eid {EID} is of type Ethernet.",
        "EID", eid, "PN", portNumber);

    const std::string nicDeviceName = name + "_NIC";
    const std::string ethPortName = std::format("Port_{}", portNumber);

    auto ethMetrics = std::make_shared<NvidiaEthPortMetrics>(
        conn, mctpRequester, ethPortName, nicDeviceName, path, eid, portNumber,
        objectServer);
    ethPortMetrics.emplace_back(std::move(ethMetrics));
}
/**
 * @brief Create the PCIe interface and per-port objects, then start polling.
 *
 * Creates the NvidiaPcieInterface for "<name>_PCIe". Then, for every
 * upstream port index i in [0, numUpstreamPorts), it registers "UP_<i>"
 * followed by that port's downstream ports. Downstream ports are named
 * "DOWN_<n>", where n is a running index shared across all upstream ports.
 * Each registered port gets, in this order: a port-info object (pciePorts)
 * and error, counter and L0-to-recovery metric objects (pciePortMetrics).
 * Finally read() is called to start the polling cycle.
 */
void PcieDevice::makeSensors()
{
    const std::string pcieDeviceName = name + "_PCIe";

    pcieInterface = std::make_shared<NvidiaPcieInterface>(
        conn, mctpRequester, pcieDeviceName, path, eid, objectServer);

    // Registers the four objects that back a single port. Both port kinds
    // share this sequence, so it lives in one place instead of being
    // duplicated per kind.
    // NOTE(review): the two numeric arguments are forwarded positionally;
    // the names below are inferred from the call sites - confirm against the
    // NvidiaPciePortInfo / makeNvidiaPciePort* signatures.
    const auto registerPort = [this, &pcieDeviceName](
                                  const std::string& portName,
                                  gpu::PciePortType portType,
                                  uint64_t upstreamIndex, uint64_t portIndex) {
        pciePorts.emplace_back(std::make_shared<NvidiaPciePortInfo>(
            conn, mctpRequester, portName, pcieDeviceName, path, eid, portType,
            upstreamIndex, portIndex, objectServer));
        pciePortMetrics.emplace_back(makeNvidiaPciePortErrors(
            conn, mctpRequester, portName, pcieDeviceName, path, eid, portType,
            upstreamIndex, portIndex, objectServer));
        pciePortMetrics.emplace_back(makeNvidiaPciePortCounters(
            conn, mctpRequester, portName, pcieDeviceName, path, eid, portType,
            upstreamIndex, portIndex, objectServer));
        pciePortMetrics.emplace_back(makeNvidiaPciePortL0ToRecoveryCount(
            conn, mctpRequester, portName, pcieDeviceName, path, eid, portType,
            upstreamIndex, portIndex, objectServer));
    };

    uint64_t downstreamPortIndex = 0;

    for (uint64_t i = 0; i < pcieDeviceInfo.numUpstreamPorts; ++i)
    {
        registerPort(std::format("UP_{}", i), gpu::PciePortType::UPSTREAM, i,
                     i);

        // NOTE(review): assumes numDownstreamPorts holds at least
        // numUpstreamPorts entries after a successful decode - confirm in
        // decodeListPciePortsResponse.
        for (uint64_t j = 0; j < pcieDeviceInfo.numDownstreamPorts[i]; ++j)
        {
            registerPort(std::format("DOWN_{}", downstreamPortIndex),
                         gpu::PciePortType::DOWNSTREAM, i,
                         downstreamPortIndex);
            ++downstreamPortIndex;
        }
    }

    lg2::info("Added PCIe {NAME} Sensors with chassis path: {PATH}.", "NAME",
              name, "PATH", path);

    read();
}
void PcieDevice::read()
{
pcieInterface->update();
for (auto& port : pciePorts)
{
port->update();
}
for (auto& portMetrics : pciePortMetrics)
{
portMetrics->update();
}
for (auto& ethPortMetric : ethPortMetrics)
{
ethPortMetric->update();
}
waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
waitTimer.async_wait([this](const boost::system::error_code& ec) {
if (ec)
{
return;
}
read();
});
}