gpu : add support for communication to the endpoint
This commit uses the MCTP VDM protocol to read the temperature sensor
value from the GPU.
The MCTP VDM protocol is an extension of the OCP Accelerator Management
Interface specification -
'''
https://www.opencompute.org/documents/ocp-gpu-accelerator-management-interfaces-v1-pdf
'''
Tested.
Build an image for the gb200nvl-obmc machine with the following patches
cherry-picked. These patches are needed to enable the MCTP stack.
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79312
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79410
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422
Copy the configuration file to the gb200nvl-obmc machine and restart the
entity-manager service.
```
root@gb200nvl-obmc:~# rm -rf /var/configuration/
root@gb200nvl-obmc:~# systemctl restart xyz.openbmc_project.EntityManager.service
```
Copy the gpusensor app and run it.
```
root@gb200nvl-obmc:~# ./gpusensor
```
The app detects the entity-manager configuration on the gb200nvl-obmc
machine. It also detects all the endpoints from the MCTP service D-Bus
tree. The app reads the temperature sensor value from the GPU
correctly, and the temperature sensor is also present on Redfish.
```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU
{
"@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU",
"@odata.type": "#Sensor.v1_2_0.Sensor",
"Id": "temperature_NVIDIA_GB200_GPU",
"Name": "NVIDIA GB200 GPU",
"Reading": 36.4375,
"ReadingRangeMax": 127.0,
"ReadingRangeMin": -128.0,
"ReadingType": "Temperature",
"ReadingUnits": "Cel",
"Status": {
"Health": "OK",
"State": "Enabled"
}
}%
root@gb200nvl-obmc:~# busctl tree xyz.openbmc_project.GpuSensor
└─ /xyz
└─ /xyz/openbmc_project
└─ /xyz/openbmc_project/sensors
└─ /xyz/openbmc_project/sensors/temperature
└─ /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU
root@gb200nvl-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU
NAME TYPE SIGNATURE RESULT/VALUE FLAGS
org.freedesktop.DBus.Introspectable interface - - -
.Introspect method - s -
org.freedesktop.DBus.Peer interface - - -
.GetMachineId method - s -
.Ping method - - -
org.freedesktop.DBus.Properties interface - - -
.Get method ss v -
.GetAll method s a{sv} -
.Set method ssv - -
.PropertiesChanged signal sa{sv}as - -
xyz.openbmc_project.Association.Definitions interface - - -
.Associations property a(sss) 1 "chassis" "all_sensors" "/xyz/openbmc… emits-change
xyz.openbmc_project.Sensor.Value interface - - -
.MaxValue property d 127 emits-change
.MinValue property d -128 emits-change
.Unit property s "xyz.openbmc_project.Sensor.Value.Unit.… emits-change
.Value property d 36.3125 emits-change writable
xyz.openbmc_project.Sensor.ValueMutability interface - - -
.Mutable property b true emits-change
xyz.openbmc_project.State.Decorator.Availability interface - - -
.Available property b true emits-change writable
xyz.openbmc_project.State.Decorator.OperationalStatus interface - - -
.Functional property b true emits-change
```
Change-Id: Ied938b9e5c19751ee283b4b948e16c905c78fb48
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/gpu/GpuMctpVdm.hpp b/src/gpu/GpuMctpVdm.hpp
new file mode 100644
index 0000000..21c69cd
--- /dev/null
+++ b/src/gpu/GpuMctpVdm.hpp
@@ -0,0 +1,246 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <asm/byteorder.h>
+
+#include <OcpMctpVdm.hpp>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace gpu
+{
+
+/** @brief NVIDIA's 16-bit PCI vendor ID (0x10de) */
+constexpr uint16_t nvidiaPciVendorId = 0x10de;
+
+/** @brief GPU message types
+ *
+ * Message type codes used in the GPU protocol. Each type groups one class of
+ * commands: type 0 has the DeviceCapabilityDiscoveryCommands and type 3 has
+ * the PlatformEnvironmentalCommands.
+ */
+enum class MessageType : uint8_t
+{
+ DEVICE_CAPABILITY_DISCOVERY = 0,
+ PLATFORM_ENVIRONMENTAL = 3
+};
+
+/** @brief Command codes for MessageType::DEVICE_CAPABILITY_DISCOVERY (type 0)
+ */
+enum class DeviceCapabilityDiscoveryCommands : uint8_t
+{
+ QUERY_DEVICE_IDENTIFICATION = 0x09,
+};
+
+/** @brief Command codes for MessageType::PLATFORM_ENVIRONMENTAL (type 3)
+ */
+enum class PlatformEnvironmentalCommands : uint8_t
+{
+ GET_TEMPERATURE_READING = 0x00,
+};
+
+/** @brief Device identification types
+ *
+ * Kinds of device that can be identified in the system, used to tell
+ * components apart during device discovery. Only DEVICE_GPU is defined
+ * here.
+ */
+enum class DeviceIdentification : uint8_t
+{
+ DEVICE_GPU = 0
+};
+
+/** @struct QueryDeviceIdentificationRequest
+ *
+ * Packed query device identification request: common request header only.
+ */
+struct QueryDeviceIdentificationRequest
+{
+ ocp::accelerator_management::CommonRequest hdr;
+} __attribute__((packed));
+
+/** @struct QueryDeviceIdentificationResponse
+ *
+ * Packed response: common header, device_identification, instance_id.
+ */
+struct QueryDeviceIdentificationResponse
+{
+ ocp::accelerator_management::CommonResponse hdr;
+ uint8_t device_identification;
+ uint8_t instance_id;
+} __attribute__((packed));
+
+/** @struct GetNumericSensorReadingRequest
+ *
+ * Packed request for a numeric sensor reading: the common request header
+ * followed by a one-byte sensor id.
+ */
+struct GetNumericSensorReadingRequest
+{
+ ocp::accelerator_management::CommonRequest hdr;
+ uint8_t sensor_id;
+} __attribute__((packed));
+
+/** @typedef GetTemperatureReadingRequest
+ *
+ * Get temperature reading request; alias of GetNumericSensorReadingRequest.
+ */
+using GetTemperatureReadingRequest = GetNumericSensorReadingRequest;
+
+/** @struct GetTemperatureReadingResponse
+ *
+ * Packed response: common response header, then the int32_t reading field.
+ */
+struct GetTemperatureReadingResponse
+{
+ ocp::accelerator_management::CommonResponse hdr;
+ int32_t reading;
+} __attribute__((packed));
+
+/**
+ * @brief Populate the GPU message with the GPU header.
+ * The caller of this API allocates the buffer for the GPU header
+ * when forming the GPU message.
+ * The buffer is passed to this API, which packs the GPU header into it.
+ *
+ * @param[in] hdr - Reference to the OCP MCTP VDM header information
+ * @param[out] msg - Reference to the GPU message header to be populated
+ *
+ * @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ * otherwise appropriate error code.
+ * @note Caller is responsible for allocation and deallocation of the msg
+ * and hdr params
+ */
+ocp::accelerator_management::CompletionCode packHeader(
+ const ocp::accelerator_management::BindingPciVidInfo& hdr,
+ ocp::accelerator_management::BindingPciVid& msg);
+
+/** @brief Encode reason code
+ *
+ * @param[in] cc - completion code
+ * @param[in] reasonCode - reason code
+ * @param[in] commandCode - command code
+ * @param[out] msg - Reference to message that will be written to
+ * @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ * otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode encodeReasonCode(
+ uint8_t cc, uint16_t reasonCode, uint8_t commandCode,
+ ocp::accelerator_management::Message& msg);
+
+/** @brief Decode to get reason code and completion code
+ *
+ * @param[in] msg - response message
+ * @param[in] msgLen - Length of response message
+ * @param[out] cc - reference to completion code
+ * @param[out] reasonCode - reference to reason code
+ * @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ * otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode decodeReasonCodeAndCC(
+ const ocp::accelerator_management::Message& msg, size_t msgLen, uint8_t& cc,
+ uint16_t& reasonCode);
+
+/** @brief Encode a Query device identification request message
+ *
+ * @param[in] instanceId - instance ID
+ * @param[out] msg - Reference to message that will be written to
+ * @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ * otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode
+ encodeQueryDeviceIdentificationRequest(
+ uint8_t instanceId, ocp::accelerator_management::Message& msg);
+
+/** @brief Encode a Query device identification response message
+ *
+ * @param[in] instanceId - instance ID
+ * @param[in] cc - completion code
+ * @param[in] reasonCode - reason code
+ * @param[in] deviceIdentification - device identification
+ * @param[in] deviceInstance - device instance id
+ * @param[out] msg - Reference to message that will be written to
+ * @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ * otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode
+ encodeQueryDeviceIdentificationResponse(
+ uint8_t instanceId, uint8_t cc, uint16_t reasonCode,
+ uint8_t deviceIdentification, uint8_t deviceInstance,
+ ocp::accelerator_management::Message& msg);
+
+/** @brief Decode a Query device identification response message
+ *
+ * @param[in] msg - response message
+ * @param[in] msgLen - Length of response message
+ * @param[out] cc - reference to completion code
+ * @param[out] reasonCode - reference to reason code
+ * @param[out] deviceIdentification - reference to device identification
+ * @param[out] deviceInstance - reference to device instance id
+ * @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ * otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode
+ decodeQueryDeviceIdentificationResponse(
+ const ocp::accelerator_management::Message& msg, size_t msgLen,
+ uint8_t& cc, uint16_t& reasonCode, uint8_t& deviceIdentification,
+ uint8_t& deviceInstance);
+
+/** @brief Encode a Get temperature reading request message
+ *
+ * @param[in] instanceId - instance ID
+ * @param[in] sensorId - sensor id
+ * @param[out] msg - Reference to message that will be written to
+ * @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ * otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode encodeGetTemperatureReadingRequest(
+ uint8_t instanceId, uint8_t sensorId,
+ ocp::accelerator_management::Message& msg);
+
+/** @brief Decode a Get temperature reading request message
+ *
+ * @param[in] msg - request message
+ * @param[in] msgLen - Length of request message
+ * @param[out] sensorId - reference to sensor id
+ * @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ * otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode decodeGetTemperatureReadingRequest(
+ const ocp::accelerator_management::Message& msg, size_t msgLen,
+ uint8_t& sensorId);
+
+/** @brief Encode a Get temperature reading response message
+ *
+ * @param[in] instanceId - instance ID
+ * @param[in] cc - response message completion code
+ * @param[in] reasonCode - reason code
+ * @param[in] temperatureReading - temperature reading
+ * @param[out] msg - Reference to message that will be written to
+ * @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ * otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode encodeGetTemperatureReadingResponse(
+ uint8_t instanceId, uint8_t cc, uint16_t reasonCode,
+ double temperatureReading, ocp::accelerator_management::Message& msg);
+
+/** @brief Decode a Get temperature reading response message
+ *
+ * @param[in] msg - response message
+ * @param[in] msgLen - Length of response message
+ * @param[out] cc - reference to response message completion code
+ * @param[out] reasonCode - reference to reason code
+ * @param[out] temperatureReading - reference to temperature reading
+ * @return ocp::accelerator_management::CompletionCode::SUCCESS on success,
+ * otherwise appropriate error code.
+ */
+ocp::accelerator_management::CompletionCode decodeGetTemperatureReadingResponse(
+ const ocp::accelerator_management::Message& msg, size_t msgLen, uint8_t& cc,
+ uint16_t& reasonCode, double& temperatureReading);
+
+} // namespace gpu