gpu : add support for communication to the endpoint This commit uses the MCTP VDM protocol to read the temperature sensor value from the GPU. The MCTP VDM protocol is an extension of the OCP Accelerator Management Interface specification - ''' https://www.opencompute.org/documents/ocp-gpu-accelerator-management-interfaces-v1-pdf ''' Tested. Build an image for the gb200nvl-obmc machine with the following patches cherry-picked. These patches are needed to enable the MCTP stack. https://gerrit.openbmc.org/c/openbmc/openbmc/+/79312 https://gerrit.openbmc.org/c/openbmc/openbmc/+/79410 https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422 Copy the configuration file to the gb200nvl-obmc machine and restart the entity-manager service. ``` root@gb200nvl-obmc:~# rm -rf /var/configuration/ root@gb200nvl-obmc:~# systemctl restart xyz.openbmc_project.EntityManager.service ``` Copy the gpusensor app and run it. ``` root@gb200nvl-obmc:~# ./gpusensor ``` The app detects the entity-manager configuration on the gb200nvl-obmc machine. The app is also able to detect all the endpoints from the MCTP service D-Bus tree. The app reads the temperature sensor value from the GPU correctly, and the temperature sensor is also present on Redfish. 
``` $ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU { "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU", "@odata.type": "#Sensor.v1_2_0.Sensor", "Id": "temperature_NVIDIA_GB200_GPU", "Name": "NVIDIA GB200 GPU", "Reading": 36.4375, "ReadingRangeMax": 127.0, "ReadingRangeMin": -128.0, "ReadingType": "Temperature", "ReadingUnits": "Cel", "Status": { "Health": "OK", "State": "Enabled" } }% root@gb200nvl-obmc:~# busctl tree xyz.openbmc_project.GpuSensor └─ /xyz └─ /xyz/openbmc_project └─ /xyz/openbmc_project/sensors └─ /xyz/openbmc_project/sensors/temperature └─ /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU root@gb200nvl-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU NAME TYPE SIGNATURE RESULT/VALUE FLAGS org.freedesktop.DBus.Introspectable interface - - - .Introspect method - s - org.freedesktop.DBus.Peer interface - - - .GetMachineId method - s - .Ping method - - - org.freedesktop.DBus.Properties interface - - - .Get method ss v - .GetAll method s a{sv} - .Set method ssv - - .PropertiesChanged signal sa{sv}as - - xyz.openbmc_project.Association.Definitions interface - - - .Associations property a(sss) 1 "chassis" "all_sensors" "/xyz/openbmc… emits-change xyz.openbmc_project.Sensor.Value interface - - - .MaxValue property d 127 emits-change .MinValue property d -128 emits-change .Unit property s "xyz.openbmc_project.Sensor.Value.Unit.… emits-change .Value property d 36.3125 emits-change writable xyz.openbmc_project.Sensor.ValueMutability interface - - - .Mutable property b true emits-change xyz.openbmc_project.State.Decorator.Availability interface - - - .Available property b true emits-change writable xyz.openbmc_project.State.Decorator.OperationalStatus interface - - - .Functional property b true emits-change ``` Change-Id: Ied938b9e5c19751ee283b4b948e16c905c78fb48 Signed-off-by: 
Harshit Aghera <haghera@nvidia.com>

commit: a3f24f40488c67c382a0050d12e9002d4a886ab8 [log] [tgz]
author: Harshit Aghera <haghera@nvidia.com> Mon Apr 21 20:04:56 2025 +0530
committer: Harshit Aghera <haghera@nvidia.com> Wed Apr 23 18:16:57 2025 +0530
tree: 3181a1a59cab9efa6b88a50816b9fbaeda379a13
parent: acd375ab07dc78049c052f17f34f9407ed975f88 [diff]
diff --git a/src/gpu/tests/GpuSensorTest.cpp b/src/gpu/tests/GpuSensorTest.cpp
new file mode 100644
index 0000000..e4d6d34
--- /dev/null
+++ b/src/gpu/tests/GpuSensorTest.cpp

@@ -0,0 +1,423 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "GpuMctpVdm.hpp"
+#include "OcpMctpVdm.hpp"
+
+#include <endian.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+// Packs a valid request header and checks every field written to the
+// binding header.
+TEST(PackMessage, goodPathTest)
+{
+    namespace oam = ocp::accelerator_management;
+
+    // Describe a request with instance ID 0 and message type 0x04.
+    oam::BindingPciVidInfo info{};
+    info.msg_type = 0x04;
+    info.instance_id = 0;
+    info.ocp_accelerator_management_msg_type =
+        static_cast<uint8_t>(oam::MessageType::REQUEST);
+
+    uint16_t vendorId{0x10de};
+    oam::BindingPciVid packed{};
+
+    const auto status = oam::packHeader(vendorId, info, packed);
+    EXPECT_EQ(status, oam::CompletionCode::SUCCESS);
+
+    // The vendor ID is compared against its big-endian form; the remaining
+    // fields must mirror the request described above.
+    EXPECT_EQ(packed.pci_vendor_id, htobe16(vendorId));
+    EXPECT_EQ(packed.reserved, 0);
+    EXPECT_EQ(packed.datagram, 0);
+    EXPECT_EQ(packed.request, 1);
+    EXPECT_EQ(packed.ocp_type, oam::type);
+    EXPECT_EQ(packed.ocp_version, oam::version);
+    EXPECT_EQ(packed.ocp_accelerator_management_msg_type, info.msg_type);
+    EXPECT_EQ(packed.instance_id, info.instance_id);
+}
+
+// Checks that packHeader refuses an instance ID of 32.
+TEST(PackMessage, badPathTest)
+{
+    namespace oam = ocp::accelerator_management;
+
+    // The message is passed by reference, so the former null-message case
+    // cannot be expressed; only the instance ID check is exercised here.
+    oam::BindingPciVidInfo info{};
+    info.ocp_accelerator_management_msg_type =
+        static_cast<uint8_t>(oam::MessageType::REQUEST);
+    info.instance_id = 32; // instance ID out of range
+
+    uint16_t vendorId{};
+    oam::BindingPciVid packed{};
+
+    const auto status = oam::packHeader(vendorId, info, packed);
+    EXPECT_EQ(status, oam::CompletionCode::ERR_INVALID_DATA);
+}
+
+TEST(encodeReasonCode, testGoodEncodeReasonCode)
+{
+    std::vector<uint8_t> responseMsg(
+        sizeof(ocp::accelerator_management::BindingPciVid) +
+        sizeof(ocp::accelerator_management::CommonNonSuccessResponse));
+    auto* response = new (responseMsg.data())
+        ocp::accelerator_management::Message;
+
+    uint8_t cc = static_cast<uint8_t>(
+        ocp::accelerator_management::CompletionCode::ERROR);
+    uint16_t reasonCode = static_cast<uint16_t>(
+        ocp::accelerator_management::ReasonCode::REASON_NONE);
+
+    auto rc = ocp::accelerator_management::encodeReasonCode(
+        cc, reasonCode, 0x00, *response);
+
+    ocp::accelerator_management::CommonNonSuccessResponse resp{};
+    std::memcpy(&resp, &response->data, sizeof(resp));
+
+    EXPECT_EQ(rc, ocp::accelerator_management::CompletionCode::SUCCESS);
+    EXPECT_EQ(static_cast<uint8_t>(
+                  ocp::accelerator_management::CompletionCode::ERROR),
+              resp.completion_code);
+    EXPECT_EQ(0x00, resp.command);
+    EXPECT_EQ(static_cast<uint16_t>(
+                  ocp::accelerator_management::ReasonCode::REASON_NONE),
+              le16toh(resp.reason_code));
+}
+
+// Placeholder for the former null-pointer failure case of encodeReasonCode.
+TEST(encodeReasonCode, testBadEncodeReasonCode)
+{
+    // The message is now passed by reference, so a null message cannot be
+    // expressed. Report the test as skipped instead of letting an empty body
+    // count as a pass while asserting nothing.
+    GTEST_SKIP() << "null-pointer case is not applicable with references";
+}
+
+// Decodes a response whose completion code is not SUCCESS and checks the
+// completion code and reason code that come back.
+TEST(decodeReasonCodeCC, testGoodDecodeReasonCode)
+{
+    namespace oam = ocp::accelerator_management;
+
+    std::vector<uint8_t> raw{
+        0x10, 0xDE, // PCI VID
+        0x00,       // RQ=0, D=0, RSVD=0, INSTANCE_ID=0
+        0x89,       // OCP_TYPE=8, OCP_VER=9
+        0x00,       // MSG_TYPE
+        0x09,       // command
+        0x01,       // completion code != CompletionCode::SUCCESS
+        0x00, 0x00, // reason code
+    };
+
+    auto* parsed = new (raw.data()) oam::Message;
+    const size_t length = raw.size();
+
+    auto completion = static_cast<uint8_t>(oam::CompletionCode::ERROR);
+    auto reason = static_cast<uint16_t>(oam::ReasonCode::REASON_NONE);
+
+    const auto status =
+        oam::decodeReasonCodeAndCC(*parsed, length, completion, reason);
+
+    EXPECT_EQ(status, oam::CompletionCode::SUCCESS);
+    EXPECT_EQ(completion, static_cast<uint8_t>(oam::CompletionCode::ERROR));
+    EXPECT_EQ(reason, 0x0000);
+}
+
+// Decodes a response whose completion code is SUCCESS; the reason code is
+// expected to stay at REASON_NONE even though the trailing bytes are non-zero.
+TEST(decodeReasonCodeCC, testGoodDecodeCompletionCode)
+{
+    namespace oam = ocp::accelerator_management;
+
+    std::vector<uint8_t> raw{
+        0x10, 0xDE, // PCI VID
+        0x00,       // RQ=0, D=0, RSVD=0, INSTANCE_ID=0
+        0x89,       // OCP_TYPE=8, OCP_VER=9
+        0x00,       // MSG_TYPE
+        0x09,       // command
+        0x00,       // completion code = CompletionCode::SUCCESS
+        0x00, 0x02, // reason code
+    };
+
+    auto* parsed = new (raw.data()) oam::Message;
+    const size_t length = raw.size();
+
+    // Start from ERROR so a successful decode has to overwrite it.
+    auto completion = static_cast<uint8_t>(oam::CompletionCode::ERROR);
+    auto reason = static_cast<uint16_t>(oam::ReasonCode::REASON_NONE);
+
+    const auto status =
+        oam::decodeReasonCodeAndCC(*parsed, length, completion, reason);
+
+    EXPECT_EQ(status, oam::CompletionCode::SUCCESS);
+    EXPECT_EQ(completion, static_cast<uint8_t>(oam::CompletionCode::SUCCESS));
+    EXPECT_EQ(reason, static_cast<uint16_t>(oam::ReasonCode::REASON_NONE));
+}
+
+// Passes a message length two bytes shorter than the buffer and expects the
+// decode to report an invalid data length.
+TEST(decodeReasonCode, testBadDecodeReasonCode)
+{
+    namespace oam = ocp::accelerator_management;
+
+    std::vector<uint8_t> raw{
+        0x10, 0xDE, // PCI VID
+        0x00,       // RQ=0, D=0, RSVD=0, INSTANCE_ID=0
+        0x89,       // OCP_TYPE=8, OCP_VER=9
+        0x00,       // MSG_TYPE
+        0x09,       // command
+        0x01,       // completion code
+        0x00, 0x00, // reason code
+    };
+
+    auto* parsed = new (raw.data()) oam::Message;
+    const size_t length = raw.size();
+
+    auto completion = static_cast<uint8_t>(oam::CompletionCode::SUCCESS);
+    auto reason = static_cast<uint16_t>(oam::ReasonCode::REASON_NONE);
+
+    // Null-pointer cases no longer apply now that references are used, so
+    // only the "message length less than expected" case is checked.
+    const auto status =
+        oam::decodeReasonCodeAndCC(*parsed, length - 2, completion, reason);
+    EXPECT_EQ(status, oam::CompletionCode::ERR_INVALID_DATA_LENGTH);
+}
+
+// Exercises gpu::packHeader with a valid instance ID and then with 32.
+TEST(GpuCommonPackTest, PackHeader)
+{
+    namespace oam = ocp::accelerator_management;
+
+    oam::BindingPciVid packed{};
+    oam::BindingPciVidInfo info{};
+    info.msg_type = 0x03;
+    info.instance_id = 0x04;
+    info.ocp_accelerator_management_msg_type =
+        static_cast<uint8_t>(oam::MessageType::REQUEST);
+
+    // Valid header: packing succeeds and stamps the OCP version.
+    auto status = gpu::packHeader(info, packed);
+    EXPECT_EQ(status, oam::CompletionCode::SUCCESS);
+    EXPECT_EQ(packed.ocp_version, oam::version);
+
+    // The header is passed by reference, so there is no null case to test.
+
+    // Instance ID out of range.
+    info.instance_id = 32;
+    status = gpu::packHeader(info, packed);
+    EXPECT_EQ(status, oam::CompletionCode::ERR_INVALID_DATA);
+}
+
+// Fixture for the gpu:: reason-code helper tests.
+//
+// SetUp() gives each test a zero-filled 1024-byte buffer with a Message
+// placed at its start. The helper methods set the header fields that the
+// DecodeReasonCode test relies on.
+class GpuCommonTest : public ::testing::Test
+{
+  protected:
+    ocp::accelerator_management::BindingPciVidInfo hdr{};
+    ocp::accelerator_management::Message* msg{}; // points into buf
+    std::vector<uint8_t> buf;                    // backing storage for msg
+    uint8_t instance_id{};
+    uint8_t type{};
+    uint8_t command{};
+    uint8_t cc{};
+    uint16_t reason_code{};
+    uint16_t data_size{};
+    size_t msg_len{}; // size of buf, set in SetUp()
+    uint16_t pci_vendor_id = gpu::nvidiaPciVendorId;
+
+    // Allocates the zeroed buffer and places the Message at its start.
+    void SetUp() override
+    {
+        buf.resize(1024, 0);
+        msg_len = buf.size();
+        msg = new (buf.data()) ocp::accelerator_management::Message;
+    }
+
+    // Fills in the OCP type/version and the NVIDIA PCI vendor ID.
+    void setOcpVersionAndVendorId()
+    {
+        msg->hdr.ocp_type = ocp::accelerator_management::type;
+        msg->hdr.ocp_version = ocp::accelerator_management::version;
+        // The field is written here, so the conversion is host-to-big-endian
+        // (htobe16); be16toh is the direction for reading it back.
+        msg->hdr.pci_vendor_id = htobe16(gpu::nvidiaPciVendorId);
+    }
+
+    // Overwrites the vendor ID with a value other than the NVIDIA one.
+    void changeVendorId()
+    {
+        msg->hdr.pci_vendor_id = 0x1234;
+    }
+};
+
+// Encoding with the fixture's default (zero) values must succeed.
+TEST_F(GpuCommonTest, EncodeReasonCode)
+{
+    namespace oam = ocp::accelerator_management;
+
+    const auto status = gpu::encodeReasonCode(cc, reason_code, command, *msg);
+    EXPECT_EQ(status, oam::CompletionCode::SUCCESS);
+}
+
+// Decodes once with the expected header and once with a changed vendor ID.
+TEST_F(GpuCommonTest, DecodeReasonCode)
+{
+    namespace oam = ocp::accelerator_management;
+
+    // Expected OCP type/version and vendor ID: decode succeeds.
+    setOcpVersionAndVendorId();
+    auto status = gpu::decodeReasonCodeAndCC(*msg, msg_len, cc, reason_code);
+    EXPECT_EQ(status, oam::CompletionCode::SUCCESS);
+
+    // Changed vendor ID: decode reports invalid data.
+    changeVendorId();
+    status = gpu::decodeReasonCodeAndCC(*msg, msg_len, cc, reason_code);
+    EXPECT_EQ(status, oam::CompletionCode::ERR_INVALID_DATA);
+}
+
+// Fixture for the GPU sensor encode/decode tests: default field values plus
+// a zero-filled 1024-byte buffer with a Message placed at its start.
+class GpuSensorsTest : public ::testing::Test
+{
+  protected:
+    // Prepares the buffer, the Message view into it, and its length.
+    void SetUp() override
+    {
+        buf.resize(1024, 0);
+        msg_len = buf.size();
+        msg = new (buf.data()) ocp::accelerator_management::Message;
+    }
+
+    ocp::accelerator_management::BindingPciVidInfo hdr{};
+    ocp::accelerator_management::Message* msg{};
+    std::vector<uint8_t> buf;
+    uint8_t instance_id{0};
+    uint8_t device_instance{1};
+    uint8_t device_id{
+        static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU)};
+    uint8_t cc{static_cast<uint8_t>(
+        ocp::accelerator_management::CompletionCode::SUCCESS)};
+    uint16_t reason_code{static_cast<uint16_t>(
+        ocp::accelerator_management::ReasonCode::REASON_NONE)};
+    uint8_t sensor_id{0};
+    double temperature{25.5};
+    size_t msg_len{};
+};
+
+// Encodes a Query Device Identification request and inspects the header and
+// payload that were written.
+TEST_F(GpuSensorsTest, QueryDeviceIdentificationRequestEncode)
+{
+    namespace oam = ocp::accelerator_management;
+
+    const auto status =
+        gpu::encodeQueryDeviceIdentificationRequest(instance_id, *msg);
+    EXPECT_EQ(status, oam::CompletionCode::SUCCESS);
+
+    // Header fields.
+    EXPECT_EQ(msg->hdr.ocp_type, oam::type);
+    EXPECT_EQ(msg->hdr.ocp_version, oam::version);
+    EXPECT_EQ(msg->hdr.instance_id, instance_id);
+
+    // Payload, copied out of the message buffer before inspection.
+    gpu::QueryDeviceIdentificationRequest decoded{};
+    std::memcpy(&decoded, &msg->data, sizeof(decoded));
+
+    const auto expectedCommand = static_cast<uint8_t>(
+        gpu::DeviceCapabilityDiscoveryCommands::QUERY_DEVICE_IDENTIFICATION);
+    EXPECT_EQ(decoded.hdr.command, expectedCommand);
+    EXPECT_EQ(decoded.hdr.data_size, 0);
+}
+
+TEST_F(GpuSensorsTest, QueryDeviceIdentificationResponseEncode)
+{
+    auto rc = gpu::encodeQueryDeviceIdentificationResponse(
+        instance_id, cc, reason_code, device_id, device_instance, *msg);
+    EXPECT_EQ(rc, ocp::accelerator_management::CompletionCode::SUCCESS);
+
+    // Test with error condition
+    uint8_t errorCc = static_cast<uint8_t>(
+        ocp::accelerator_management::CompletionCode::ERROR);
+    rc = gpu::encodeQueryDeviceIdentificationResponse(
+        instance_id, errorCc, reason_code, device_id, device_instance, *msg);
+    EXPECT_EQ(rc, ocp::accelerator_management::CompletionCode::SUCCESS);
+}
+
+// Round-trips a Query Device Identification response: encode with the
+// fixture values, decode, and compare every decoded field.
+TEST_F(GpuSensorsTest, QueryDeviceIdentificationResponseDecode)
+{
+    // First encode a response. This is a precondition for the decode below,
+    // so abort the test if it fails instead of decoding a stale buffer.
+    auto rc = gpu::encodeQueryDeviceIdentificationResponse(
+        instance_id, cc, reason_code, device_id, device_instance, *msg);
+    ASSERT_EQ(rc, ocp::accelerator_management::CompletionCode::SUCCESS);
+
+    // Then decode it
+    uint8_t decodedCc{};
+    uint16_t decodedReasonCode{};
+    uint8_t decodedDeviceId{};
+    uint8_t decodedDeviceInstance{};
+
+    rc = gpu::decodeQueryDeviceIdentificationResponse(
+        *msg, msg_len, decodedCc, decodedReasonCode, decodedDeviceId,
+        decodedDeviceInstance);
+    EXPECT_EQ(rc, ocp::accelerator_management::CompletionCode::SUCCESS);
+    EXPECT_EQ(decodedCc, cc);
+    EXPECT_EQ(decodedReasonCode, reason_code);
+    EXPECT_EQ(decodedDeviceId, device_id);
+    EXPECT_EQ(decodedDeviceInstance, device_instance);
+}
+
+// Encodes a Get Temperature Reading request and inspects the header and
+// payload that were written.
+TEST_F(GpuSensorsTest, GetTemperatureReadingRequestEncode)
+{
+    namespace oam = ocp::accelerator_management;
+
+    const auto status =
+        gpu::encodeGetTemperatureReadingRequest(instance_id, sensor_id, *msg);
+    EXPECT_EQ(status, oam::CompletionCode::SUCCESS);
+
+    // Header fields.
+    EXPECT_EQ(msg->hdr.ocp_type, oam::type);
+    EXPECT_EQ(msg->hdr.ocp_version, oam::version);
+    EXPECT_EQ(msg->hdr.instance_id, instance_id);
+
+    // Payload, copied out of the message buffer before inspection.
+    gpu::GetTemperatureReadingRequest decoded{};
+    std::memcpy(&decoded, &msg->data, sizeof(decoded));
+
+    const auto expectedCommand = static_cast<uint8_t>(
+        gpu::PlatformEnvironmentalCommands::GET_TEMPERATURE_READING);
+    EXPECT_EQ(decoded.hdr.command, expectedCommand);
+    EXPECT_EQ(decoded.hdr.data_size, sizeof(sensor_id));
+    EXPECT_EQ(decoded.sensor_id, sensor_id);
+}
+
+// Round-trips a Get Temperature Reading request: encode with the fixture
+// sensor ID, decode, and compare the decoded sensor ID.
+TEST_F(GpuSensorsTest, GetTemperatureReadingRequestDecode)
+{
+    // First encode a request. This is a precondition for the decode below,
+    // so abort the test if it fails instead of decoding a stale buffer.
+    auto rc =
+        gpu::encodeGetTemperatureReadingRequest(instance_id, sensor_id, *msg);
+    ASSERT_EQ(rc, ocp::accelerator_management::CompletionCode::SUCCESS);
+
+    // Then decode it
+    uint8_t decodedSensorId = 0;
+    rc =
+        gpu::decodeGetTemperatureReadingRequest(*msg, msg_len, decodedSensorId);
+    EXPECT_EQ(rc, ocp::accelerator_management::CompletionCode::SUCCESS);
+    EXPECT_EQ(decodedSensorId, sensor_id);
+}
+
+TEST_F(GpuSensorsTest, GetTemperatureReadingResponseEncode)
+{
+    auto rc = gpu::encodeGetTemperatureReadingResponse(
+        instance_id, cc, reason_code, temperature, *msg);
+    EXPECT_EQ(rc, ocp::accelerator_management::CompletionCode::SUCCESS);
+
+    // Test with error condition
+    uint8_t errorCc = static_cast<uint8_t>(
+        ocp::accelerator_management::CompletionCode::ERROR);
+    rc = gpu::encodeGetTemperatureReadingResponse(
+        instance_id, errorCc, reason_code, temperature, *msg);
+    EXPECT_EQ(rc, ocp::accelerator_management::CompletionCode::SUCCESS);
+}
+
+// Round-trips a Get Temperature Reading response: encode with the fixture
+// values, decode, and compare every decoded field.
+TEST_F(GpuSensorsTest, GetTemperatureReadingResponseDecode)
+{
+    // First encode a response. This is a precondition for the decode below,
+    // so abort the test if it fails instead of decoding a stale buffer.
+    auto rc = gpu::encodeGetTemperatureReadingResponse(
+        instance_id, cc, reason_code, temperature, *msg);
+    ASSERT_EQ(rc, ocp::accelerator_management::CompletionCode::SUCCESS);
+
+    // Then decode it
+    uint8_t decodedCc{};
+    uint16_t decodedReasonCode{};
+    double decodedTemperature{};
+
+    rc = gpu::decodeGetTemperatureReadingResponse(
+        *msg, msg_len, decodedCc, decodedReasonCode, decodedTemperature);
+    EXPECT_EQ(rc, ocp::accelerator_management::CompletionCode::SUCCESS);
+    EXPECT_EQ(decodedCc, cc);
+    EXPECT_EQ(decodedReasonCode, reason_code);
+    EXPECT_DOUBLE_EQ(decodedTemperature, temperature);
+}

diff --git a/src/gpu/tests/meson.build b/src/gpu/tests/meson.build
new file mode 100644
index 0000000..e823de2
--- /dev/null
+++ b/src/gpu/tests/meson.build

@@ -0,0 +1,29 @@
+# Build definition for the gpusensor unit tests.
+#
+# Look up gtest (with its main) and gmock as optional dependencies first.
+gtest_dep = dependency('gtest', main: true, disabler: true, required: false)
+gmock_dep = dependency('gmock', disabler: true, required: false)
+# If either lookup failed, build both from the 'googletest' CMake subproject.
+if not gtest_dep.found() or not gmock_dep.found()
+    gtest_proj = import('cmake').subproject('googletest', required: true)
+    gtest_dep = declare_dependency(
+        dependencies: [
+            dependency('threads'),
+            gtest_proj.dependency('gtest'),
+            gtest_proj.dependency('gtest_main'),
+        ],
+    )
+    gmock_dep = gtest_proj.dependency('gmock')
+endif
+
+# gpusensor_include_dir is not defined in this file; presumably it is set by
+# a parent meson.build - verify against the including build file.
+gpusensor_test_include_dirs = [gpusensor_include_dir]
+
+# Single test executable: the test source is compiled together with the two
+# implementation sources it exercises, taken from the parent directory.
+test(
+    'gpusensor_test',
+    executable(
+        'gpusensor_test',
+        'GpuSensorTest.cpp',
+        '../OcpMctpVdm.cpp',
+        '../GpuMctpVdm.cpp',
+        implicit_include_directories: false,
+        include_directories: gpusensor_test_include_dirs,
+        dependencies: [gtest_dep, gmock_dep],
+    ),
+    workdir: meson.current_source_dir(),
+)
commit	a3f24f40488c67c382a0050d12e9002d4a886ab8	[log] [tgz]
author	Harshit Aghera <haghera@nvidia.com>	Mon Apr 21 20:04:56 2025 +0530
committer	Harshit Aghera <haghera@nvidia.com>	Wed Apr 23 18:16:57 2025 +0530
tree	3181a1a59cab9efa6b88a50816b9fbaeda379a13
parent	acd375ab07dc78049c052f17f34f9407ed975f88 [diff]