gpu : add support for communication to the endpoint
This commit uses the MCTP VDM protocol to read the temperature sensor
value from the GPU.
The MCTP VDM protocol is an extension of the OCP Accelerator Management
Interface specification -
'''
https://www.opencompute.org/documents/ocp-gpu-accelerator-management-interfaces-v1-pdf
'''
Tested.
Build an image for the gb200nvl-obmc machine with the following patches
cherry-picked. These patches are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79312
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79410
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
Copy the configuration file to the gb200nvl-obmc machine and restart the
entity-manager service.
```
root@gb200nvl-obmc:~# rm -rf /var/configuration/
root@gb200nvl-obmc:~# systemctl restart xyz.openbmc_project.EntityManager.service
```
Copy the gpusensor app and run it.
```
root@gb200nvl-obmc:~# ./gpusensor
```
The app detects the entity-manager configuration on the gb200nvl-obmc
machine. It also detects all the endpoints from the MCTP service D-Bus
tree. The app reads the temperature sensor value from the GPU correctly,
and the temperature sensor is also present on Redfish.
```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU",
"Name": "NVIDIA GB200 GPU",
"Reading": 36.4375,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}%
root@gb200nvl-obmc:~# busctl tree xyz.openbmc_project.GpuSensor
└─ /xyz
└─ /xyz/openbmc_project
└─ /xyz/openbmc_project/sensors
└─ /xyz/openbmc_project/sensors/temperature
└─ /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU
root@gb200nvl-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU
NAME TYPE SIGNATURE RESULT/VALUE FLAGS
org.freedesktop.DBus.Introspectable interface - - -
.Introspect method - s -
org.freedesktop.DBus.Peer interface - - -
.GetMachineId method - s -
.Ping method - - -
org.freedesktop.DBus.Properties interface - - -
.Get method ss v -
.GetAll method s a{sv} -
.Set method ssv - -
.PropertiesChanged signal sa{sv}as - -
xyz.openbmc_project.Association.Definitions interface - - -
.Associations property a(sss) 1 "chassis" "all_sensors" "/xyz/openbmc… emits-change
xyz.openbmc_project.Sensor.Value interface - - -
.MaxValue property d 127 emits-change
.MinValue property d -128 emits-change
.Unit property s "xyz.openbmc_project.Sensor.Value.Unit.… emits-change
.Value property d 36.3125 emits-change writable
xyz.openbmc_project.Sensor.ValueMutability interface - - -
.Mutable property b true emits-change
xyz.openbmc_project.State.Decorator.Availability interface - - -
.Available property b true emits-change writable
xyz.openbmc_project.State.Decorator.OperationalStatus interface - - -
.Functional property b true emits-change
```
Change-Id: Ied938b9e5c19751ee283b4b948e16c905c78fb48
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/gpu/OcpMctpVdm.hpp b/src/gpu/OcpMctpVdm.hpp
new file mode 100644
index 0000000..8c6a0b4
--- /dev/null
+++ b/src/gpu/OcpMctpVdm.hpp
@@ -0,0 +1,215 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <asm/byteorder.h>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace ocp
+{
+namespace accelerator_management
+{
+
+/** @brief OCP MCTP VDM Message Type (v1 spec section 3.6.1.2.1)
+ *
+ * A plain byte constant; distinct from the MessageType enum in this file.
+ */
+constexpr uint8_t messageType = 0x7E;
+
+/**
+ * @defgroup ocp_version OCP Version
+ *
+ * v1 spec section 3.6.1.2.1
+ * @{
+ */
+constexpr uint8_t type = 8; //!< OCP type value (fits a 4-bit field)
+constexpr uint8_t version = 9; //!< OCP version value (fits a 4-bit field)
+/** @} */
+
+/**
+ * @defgroup ocp_instance_id OCP MCTP VDM Instance Id
+ *
+ * v1 spec section 3.6.1.2.1
+ * @{
+ */
+constexpr uint8_t instanceMin = 0; //!< Lowest instance ID
+constexpr uint8_t instanceIdMask = 0x1F; //!< Low five bits (instance_id:5)
+constexpr uint8_t instanceMax = 31; //!< Highest instance ID (equals the mask)
+/** @} */
+
+/** @brief OCP MCTP VDM completion codes (v1 spec section 3.6.2)
+ *
+ * Byte-sized, like the completion_code member of the response structs here.
+ */
+enum class CompletionCode : uint8_t
+{
+ SUCCESS = 0x00,
+ ERROR = 0x01,
+ ERR_INVALID_DATA = 0x02,
+ ERR_INVALID_DATA_LENGTH = 0x03,
+ ERR_NOT_READY = 0x04,
+ ERR_UNSUPPORTED_COMMAND_CODE = 0x05,
+ ERR_UNSUPPORTED_MSG_TYPE = 0x06,
+ ERR_BUS_ACCESS = 0x7f, // values 0x07-0x7e are not defined by this enum
+ ERR_NULL = 0x80,
+};
+
+/** @brief OCP MCTP VDM reason codes (v1 spec section 3.6.3)
+ *
+ * Two bytes wide, like CommonNonSuccessResponse::reason_code.
+ */
+enum class ReasonCode : uint16_t
+{
+ REASON_NONE = 0x00, //!< The only reason code defined in this file
+};
+
+/** @brief OCP MCTP VDM MessageType (v1 spec section 3.6.1.2.1)
+ *
+ * Request/response selector; separate from the messageType byte constant.
+ */
+enum class MessageType : uint8_t
+{
+ RESPONSE = 0, //!< OCP MCTP VDM response message
+ REQUEST = 2, //!< OCP MCTP VDM request message
+};
+
+/** @struct BindingPciVid
+ *
+ * Packed OCP MCTP VDM header for the PCI vendor ID binding (v1 spec section
+ * 3.6.1.2). Bit-field order is mirrored between the two endianness cases.
+ */
+struct BindingPciVid
+{
+ uint16_t pci_vendor_id; //!< PCI defined vendor ID
+
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ uint8_t instance_id:5; //!< Instance ID
+ uint8_t reserved:1; //!< Reserved
+ uint8_t datagram:1; //!< Datagram bit
+ uint8_t request:1; //!< Request bit
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ uint8_t request:1; //!< Request bit
+ uint8_t datagram:1; //!< Datagram bit
+ uint8_t reserved:1; //!< Reserved
+ uint8_t instance_id:5; //!< Instance ID
+#endif // NOTE(review): no #else, fields vanish if neither macro is defined
+
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ uint8_t ocp_version:4; //!< OCP version
+ uint8_t ocp_type:4; //!< OCP type
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ uint8_t ocp_type:4; //!< OCP type
+ uint8_t ocp_version:4; //!< OCP version
+#endif // NOTE(review): same silent omission applies here
+
+ uint8_t ocp_accelerator_management_msg_type; //!< Message Type
+} __attribute__((packed));
+
+/** @struct Message
+ *
+ * Packed OCP MCTP VDM message: header followed by the payload's first byte.
+ * v1 spec section 3.6.1.2
+ */
+struct Message
+{
+ BindingPciVid hdr; //!< OCP MCTP VDM message header
+ char data; //!< beginning of the payload (one-byte placeholder)
+} __attribute__((packed));
+
+/** @struct BindingPciVidInfo
+ *
+ * The information needed to prepare the OCP MCTP VDM header; an instance is
+ * passed to packHeader() as its hdr argument. v1 spec section 3.6.1.2
+ */
+struct BindingPciVidInfo
+{
+ uint8_t ocp_accelerator_management_msg_type;
+ uint8_t instance_id; //!< Instance ID (5 bits wide in BindingPciVid)
+ uint8_t msg_type; // NOTE(review): document how this differs from the above
+};
+
+/** @struct CommonRequest
+ *
+ * Packed OCP MCTP VDM request without data (OCP version 1): two single bytes.
+ * v1 spec section 3.6.1.4.1
+ */
+struct CommonRequest
+{
+ uint8_t command; //!< Command code
+ uint8_t data_size; //!< Data size (8-bit here, 16-bit in CommonResponse)
+} __attribute__((packed));
+
+/** @struct CommonResponse
+ *
+ * Packed OCP MCTP VDM response with data (six bytes of fixed fields).
+ * v1 spec section 3.6.1.4.4
+ */
+struct CommonResponse
+{
+ uint8_t command; //!< Command code
+ uint8_t completion_code; //!< Completion code (byte-sized)
+ uint16_t reserved; //!< Reserved
+ uint16_t data_size; //!< Data size (16-bit, unlike CommonRequest)
+} __attribute__((packed));
+
+/** @struct CommonNonSuccessResponse
+ *
+ * Packed OCP MCTP VDM response carrying a reason code, used when the
+ * completion code is not Success. v1 spec section 3.6.1.4.5
+ */
+struct CommonNonSuccessResponse
+{
+ uint8_t command; //!< Command code
+ uint8_t completion_code; //!< Completion code (byte-sized)
+ uint16_t reason_code; //!< Reason code (two bytes, as in ReasonCode)
+} __attribute__((packed));
+
+/**
+ * @brief Populate the OCP MCTP VDM message with the OCP MCTP VDM header. The
+ * caller allocates the buffer for the header when forming the OCP MCTP VDM
+ * message and passes it to this API to be packed. The OCP Version field is
+ * stated to be populated with value 1.
+ * NOTE(review): the `version` constant in this file is 9 - confirm which.
+ *
+ * @param[in] pciVendorId - PCI Vendor ID
+ * @param[in] hdr - Reference to the OCP MCTP VDM header information
+ * @param[out] msg - Reference to OCP MCTP VDM message header
+ *
+ * @return CompletionCode::SUCCESS on success, otherwise appropriate error
+ * code.
+ * @note Caller is responsible for alloc and dealloc of msg
+ * and hdr params
+ */
+CompletionCode packHeader(uint16_t pciVendorId, const BindingPciVidInfo& hdr,
+ BindingPciVid& msg);
+
+/** @brief Encode reason code into an OCP MCTP VDM response message.
+ * This function does not populate or modify the message header.
+ *
+ * @param[in] cc - Completion Code
+ * @param[in] reasonCode - reason code
+ * @param[in] commandCode - command code
+ * @param[out] msg - Reference to message
+ * @return CompletionCode
+ */
+CompletionCode encodeReasonCode(uint8_t cc, uint16_t reasonCode,
+ uint8_t commandCode, Message& msg);
+
+/** @brief Decode the reason code and completion code from a response
+ *
+ * @param[in] msg - response message
+ * @param[in] msgLen - Length of response message
+ * @param[out] cc - reference to completion code
+ * @param[out] reasonCode - reference to reason code
+ * @return CompletionCode
+ */
+CompletionCode decodeReasonCodeAndCC(const Message& msg, size_t msgLen,
+ uint8_t& cc, uint16_t& reasonCode);
+
+} // namespace accelerator_management
+} // namespace ocp