Nvidia-Gpu: Support for Nvidia GPU Serial Number, Part Number
Support for fetching the serial number and part number is added to the
Inventory class, which uses the Get Inventory Information command. Each
property is currently attempted up to 3 times, 5 seconds apart, to
account for any failure to get a response from the GPU device.
Tested:
- Verified that Serial Number and Part Number are populated from the GPU device
```
busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/inventory/NVIDIA_GB200_GPU_0
NAME TYPE SIGNATURE RESULT/VALUE FLAGS
org.freedesktop.DBus.Introspectable interface - - -
.Introspect method - s -
org.freedesktop.DBus.Peer interface - - -
.GetMachineId method - s -
.Ping method - - -
org.freedesktop.DBus.Properties interface - - -
.Get method ss v -
.GetAll method s a{sv} -
.Set method ssv - -
.PropertiesChanged signal sa{sv}as - -
xyz.openbmc_project.Inventory.Decorator.Asset interface - - -
.PartNumber property s "699-2G153-0210-TS1" emits-change
.SerialNumber property s "1330325220002" emits-change
xyz.openbmc_project.Inventory.Item.Accelerator interface - - -
.Type property s "GPU" emits-change
```
Change-Id: Id2b33a66ff6d5480f8e229fa233528afc0bdcfc0
Signed-off-by: Rohit PAI <ropai@nvidia.com>
diff --git a/src/nvidia-gpu/Inventory.cpp b/src/nvidia-gpu/Inventory.cpp
index 901eca1..c7aa153 100644
--- a/src/nvidia-gpu/Inventory.cpp
+++ b/src/nvidia-gpu/Inventory.cpp
@@ -2,41 +2,225 @@
#include "Utils.hpp"
+#include <MctpRequester.hpp>
#include <NvidiaGpuMctpVdm.hpp>
+#include <OcpMctpVdm.hpp>
+#include <boost/asio/io_context.hpp>
#include <phosphor-logging/lg2.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
-#include <exception>
+#include <cstdint>
#include <memory>
+#include <optional>
#include <string>
+#include <unordered_map>
+#include <variant>
constexpr const char* inventoryPrefix = "/xyz/openbmc_project/inventory/";
constexpr const char* acceleratorIfaceName =
"xyz.openbmc_project.Inventory.Item.Accelerator";
+static constexpr const char* assetIfaceName =
+ "xyz.openbmc_project.Inventory.Decorator.Asset"; // D-Bus interface on which Manufacturer, SerialNumber and PartNumber are registered
Inventory::Inventory(
const std::shared_ptr<sdbusplus::asio::connection>& /*conn*/,
sdbusplus::asio::object_server& objectServer,
- const std::string& inventoryName,
- const gpu::DeviceIdentification deviceType) :
- name(escapeName(inventoryName))
+ const std::string& inventoryName, mctp::MctpRequester& mctpRequester,
+ const gpu::DeviceIdentification deviceTypeIn, const uint8_t eid,
+ boost::asio::io_context& io) :
+ name(escapeName(inventoryName)), mctpRequester(mctpRequester),
+ deviceType(deviceTypeIn), eid(eid), retryTimer(io) // retryTimer runs on the caller-supplied io_context
{
+ requestBuffer = std::make_shared<InventoryRequestBuffer>(); // One request buffer, reused for every property request
+ responseBuffer = std::make_shared<InventoryResponseBuffer>(); // One response buffer, reused for every property response
+
+ std::string path = inventoryPrefix + name; // Object path shared by the Asset and Accelerator interfaces
+ assetIface = objectServer.add_interface(path, assetIfaceName); // NOTE(review): the try/catch that used to wrap interface creation is removed by this change, so an exception thrown here now propagates to the caller - confirm intended
+ assetIface->register_property("Manufacturer", std::string("NVIDIA")); // Static value, never fetched from the device
+ // Register properties which need to be fetched from the device
+ registerProperty(gpu::InventoryPropertyId::SERIAL_NUMBER, assetIface,
+ "SerialNumber");
+ registerProperty(gpu::InventoryPropertyId::BOARD_PART_NUMBER, assetIface,
+ "PartNumber");
+ assetIface->initialize(); // Called only after every Asset property has been registered
+
+ // Static properties
if (deviceType == gpu::DeviceIdentification::DEVICE_GPU)
{
- std::string path = inventoryPrefix + name;
- try
+ acceleratorInterface =
+ objectServer.add_interface(path, acceleratorIfaceName);
+ acceleratorInterface->register_property("Type", std::string("GPU"));
+ acceleratorInterface->initialize();
+ }
+
+ processNextProperty(); // Start the fetch chain: both registered properties begin in the pending state
+}
+
+void Inventory::registerProperty(
+ gpu::InventoryPropertyId propertyId,
+ const std::shared_ptr<sdbusplus::asio::dbus_interface>& interface,
+ const std::string& propertyName)
+{ // Exposes propertyName on the given D-Bus interface and queues propertyId to be fetched from the device.
+ if (interface) // A null interface is silently ignored: nothing is registered or tracked.
+ {
+ interface->register_property(propertyName, std::string{}); // Empty string is the placeholder until a value is received
+ properties[propertyId] = {interface, propertyName, 0, true}; // retryCount = 0, isPending = true; overwrites any existing entry for this ID
+ }
+}
+
+void Inventory::processInventoryProperty(gpu::InventoryPropertyId propertyId)
+{ // Re-queues a registered property for fetching and starts the request chain when that property is the next one due.
+ auto it = properties.find(propertyId);
+ if (it != properties.end()) // IDs that were never registered are ignored without logging
+ {
+ markPropertyPending(it); // Sets isPending and resets retryCount to 0
+ std::optional<gpu::InventoryPropertyId> nextProperty =
+ getNextPendingProperty();
+ if (nextProperty && *nextProperty == propertyId) // NOTE(review): if a request for this property is already in flight, this issues a second one on the same buffers - confirm callers cannot hit that case
{
- acceleratorInterface =
- objectServer.add_interface(path, acceleratorIfaceName);
- acceleratorInterface->register_property("Type", std::string("GPU"));
- acceleratorInterface->initialize();
+ processNextProperty();
}
- catch (const std::exception& e)
+ }
+}
+
+void Inventory::markPropertyPending(
+ std::unordered_map<gpu::InventoryPropertyId, PropertyInfo>::iterator it)
+{ // Flags the entry as needing a fetch; `it` must be a valid, dereferenceable iterator.
+ it->second.isPending = true;
+ it->second.retryCount = 0; // A re-queued property gets a fresh set of attempts
+}
+
+void Inventory::markPropertyProcessed(
+ std::unordered_map<gpu::InventoryPropertyId, PropertyInfo>::iterator it)
+{ // Clears the pending flag so the entry is no longer selected; `it` must be a valid, dereferenceable iterator.
+ it->second.isPending = false; // retryCount is intentionally left as-is here
+}
+
+std::optional<gpu::InventoryPropertyId> Inventory::getNextPendingProperty()
+ const
+{ // Returns the ID of the first entry still marked pending, or std::nullopt when none is.
+ for (const auto& [propertyId, info] : properties) // `properties` is an unordered_map, so "first" follows its unspecified iteration order
+ {
+ if (info.isPending)
+ {
+ return propertyId;
+ }
+ }
+ return std::nullopt;
+}
+
+void Inventory::sendInventoryPropertyRequest(
+ gpu::InventoryPropertyId propertyId)
+{ // Encodes a Get Inventory Information request for propertyId and sends it to the device at `eid`.
+ int rc = gpu::encodeGetInventoryInformationRequest(
+ 0, static_cast<uint8_t>(propertyId), *requestBuffer); // Writes the encoded request into the shared request buffer
+ if (rc != 0)
+ {
+ lg2::error(
+ "Failed to encode property ID {PROP_ID} request for {NAME}: rc={RC}",
+ "PROP_ID", static_cast<uint8_t>(propertyId), "NAME", name, "RC",
+ rc);
+ return; // NOTE(review): the property stays pending and nothing is rescheduled on this path, so the fetch chain stops here - confirm intended
+ }
+
+ lg2::info(
+ "Sending inventory request for property ID {PROP_ID} to EID {EID} for {NAME}",
+ "PROP_ID", static_cast<uint8_t>(propertyId), "EID", eid, "NAME", name);
+
+ mctpRequester.sendRecvMsg(eid, *requestBuffer, *responseBuffer, // presumably fills responseBuffer before invoking the callback - verify against MctpRequester
+ [this, propertyId](int sendRecvMsgResult) { // NOTE(review): captures raw `this`; confirm the Inventory object outlives the outstanding request
+ this->handleInventoryPropertyResponse(
+ propertyId, sendRecvMsgResult);
+ });
+}
+
+void Inventory::handleInventoryPropertyResponse(
+ gpu::InventoryPropertyId propertyId, int sendRecvMsgResult)
+{ // Completion handler for one request: publishes the value on success, otherwise schedules a retry or gives up, then continues with the next pending property.
+ auto it = properties.find(propertyId);
+ if (it == properties.end())
+ {
+ lg2::error("Property ID {PROP_ID} for {NAME} not found", "PROP_ID",
+ static_cast<uint8_t>(propertyId), "NAME", name);
+ processNextProperty(); // Nothing to update for an unknown ID; keep the chain moving
+ return;
+ }
+
+ bool success = false;
+ if (sendRecvMsgResult == 0) // The response buffer is decoded only when the send/receive call reported 0
+ {
+ ocp::accelerator_management::CompletionCode cc{};
+ uint16_t reasonCode = 0;
+ gpu::InventoryValue info;
+ int rc = gpu::decodeGetInventoryInformationResponse(
+ *responseBuffer, cc, reasonCode, propertyId, info);
+
+ lg2::info(
+ "Response for property ID {PROP_ID} from {NAME}, sendRecvMsgResult: {RESULT}, decode_rc: {RC}, completion_code: {CC}, reason_code: {REASON}",
+ "PROP_ID", static_cast<uint8_t>(propertyId), "NAME", name, "RESULT",
+ sendRecvMsgResult, "RC", rc, "CC", static_cast<uint8_t>(cc),
+ "REASON", reasonCode);
+
+ if (rc == 0 &&
+ cc == ocp::accelerator_management::CompletionCode::SUCCESS &&
+ std::holds_alternative<std::string>(info)) // Success needs a clean decode, a SUCCESS completion code and a string payload
+ {
+ std::string value = std::get<std::string>(info);
+ it->second.interface->set_property(it->second.propertyName, value); // Publish the fetched value on D-Bus
+ lg2::info(
+ "Successfully received property ID {PROP_ID} for {NAME} with value: {VALUE}",
+ "PROP_ID", static_cast<uint8_t>(propertyId), "NAME", name,
+ "VALUE", value);
+ success = true;
+ }
+ }
+
+ if (!success)
+ {
+ it->second.retryCount++;
+ if (it->second.retryCount >= maxRetryAttempts) // retryCount counts failed attempts including the first, so this caps total attempts at maxRetryAttempts
{
lg2::error(
- "Failed to add accelerator interface. path='{PATH}', error='{ERROR}'",
- "PATH", path, "ERROR", e.what());
+ "Property ID {PROP_ID} for {NAME} failed after {ATTEMPTS} attempts",
+ "PROP_ID", static_cast<uint8_t>(propertyId), "NAME", name,
+ "ATTEMPTS", maxRetryAttempts);
+ markPropertyProcessed(it); // Give up: clear the pending flag so the chain can move to other properties
}
+ else
+ {
+ retryTimer.expires_after(retryDelay);
+ retryTimer.async_wait([this](const boost::system::error_code& ec) { // NOTE(review): captures raw `this`; confirm the Inventory object outlives the pending wait
+ if (ec)
+ {
+ lg2::error("Retry timer error for {NAME}: {ERROR}", "NAME",
+ name, "ERROR", ec.message());
+ return; // NOTE(review): on a timer error no further request is issued, so remaining pending properties are not fetched - confirm intended
+ }
+ this->processNextProperty(); // The failed property is still pending, so it remains eligible to be picked again
+ });
+ return; // Skip the processNextProperty() at the end; the timer callback resumes the chain
+ }
+ }
+ else
+ {
+ markPropertyProcessed(it);
+ }
+
+ processNextProperty(); // Reached after a success or after giving up on this property
+}
+
+void Inventory::processNextProperty()
+{ // Sends a request for the next pending property, or logs that nothing is left to fetch.
+ std::optional<gpu::InventoryPropertyId> nextProperty =
+ getNextPendingProperty();
+ if (nextProperty)
+ {
+ sendInventoryPropertyRequest(*nextProperty); // At most one request is started per call
+ }
+ else
+ {
+ lg2::info("No pending properties found to process for {NAME}", "NAME",
+ name);
}
}
diff --git a/src/nvidia-gpu/Inventory.hpp b/src/nvidia-gpu/Inventory.hpp
index 8de490d..1d2587b 100644
--- a/src/nvidia-gpu/Inventory.hpp
+++ b/src/nvidia-gpu/Inventory.hpp
@@ -1,23 +1,72 @@
#pragma once
+#include "MctpRequester.hpp"
#include "NvidiaGpuMctpVdm.hpp"
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/steady_timer.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
+#include <array>
+#include <chrono>
+#include <cstdint>
#include <memory>
+#include <optional>
#include <string>
+#include <unordered_map>
-class Inventory
+using InventoryRequestBuffer =
+ std::array<uint8_t, sizeof(gpu::GetInventoryInformationRequest)>; // Raw byte buffer sized to exactly one request struct
+using InventoryResponseBuffer =
+ std::array<uint8_t, sizeof(gpu::GetInventoryInformationResponse)>; // Raw byte buffer sized to exactly one response struct
+
+class Inventory : public std::enable_shared_from_this<Inventory> // Publishes a device's inventory on D-Bus and fetches the Asset values from the device over MCTP
{
public:
Inventory(const std::shared_ptr<sdbusplus::asio::connection>& conn,
sdbusplus::asio::object_server& objectServer,
const std::string& inventoryName,
- gpu::DeviceIdentification deviceType);
+ mctp::MctpRequester& mctpRequester,
+ gpu::DeviceIdentification deviceType, uint8_t eid,
+ boost::asio::io_context& io);
private:
+ struct PropertyInfo // Per-property bookkeeping for the fetch/retry chain
+ {
+ std::shared_ptr<sdbusplus::asio::dbus_interface> interface; // D-Bus interface that hosts the property
+ std::string propertyName; // Property name on that interface
+ int retryCount{0}; // Failed attempts since the property was last marked pending
+ bool isPending{false}; // True while the value still has to be fetched
+ };
+ void sendInventoryPropertyRequest(gpu::InventoryPropertyId propertyId);
+ void handleInventoryPropertyResponse(gpu::InventoryPropertyId propertyId,
+ int sendRecvMsgResult);
+ void processNextProperty();
+ void processInventoryProperty(gpu::InventoryPropertyId propertyId);
+ void registerProperty(
+ gpu::InventoryPropertyId propertyId,
+ const std::shared_ptr<sdbusplus::asio::dbus_interface>& interface,
+ const std::string& propertyName);
+ std::optional<gpu::InventoryPropertyId> getNextPendingProperty() const;
+ static void markPropertyPending(
+ std::unordered_map<gpu::InventoryPropertyId, PropertyInfo>::iterator
+ it);
+ static void markPropertyProcessed(
+ std::unordered_map<gpu::InventoryPropertyId, PropertyInfo>::iterator
+ it);
+
+ std::shared_ptr<sdbusplus::asio::dbus_interface> assetIface;
std::shared_ptr<sdbusplus::asio::dbus_interface> acceleratorInterface;
std::string name;
+ mctp::MctpRequester& mctpRequester; // Held by reference; the requester must outlive this object
+ gpu::DeviceIdentification deviceType;
+ uint8_t eid; // Endpoint ID passed to sendRecvMsg for every request
+ boost::asio::steady_timer retryTimer; // Delays the next attempt after a failed fetch
+ std::unordered_map<gpu::InventoryPropertyId, PropertyInfo> properties; // Keyed by property ID; iteration order is unspecified
+ std::shared_ptr<InventoryRequestBuffer> requestBuffer;
+ std::shared_ptr<InventoryResponseBuffer> responseBuffer;
+ static constexpr std::chrono::seconds retryDelay{5}; // Wait between attempts
+ static constexpr int maxRetryAttempts = 3; // A property is abandoned once its retryCount reaches this value
};
diff --git a/src/nvidia-gpu/NvidiaGpuDevice.cpp b/src/nvidia-gpu/NvidiaGpuDevice.cpp
index d7ad846..a13bcfe 100644
--- a/src/nvidia-gpu/NvidiaGpuDevice.cpp
+++ b/src/nvidia-gpu/NvidiaGpuDevice.cpp
@@ -45,7 +45,8 @@
configs(configs), name(escapeName(name)), path(path)
{
inventory = std::make_shared<Inventory>(
- conn, objectServer, name, gpu::DeviceIdentification::DEVICE_GPU);
+ conn, objectServer, name, mctpRequester,
+ gpu::DeviceIdentification::DEVICE_GPU, eid, io);
makeSensors();
}