gpu : add support for communication to the endpoint

This commit uses the MCTP VDM protocol to read the temperature sensor
value from the GPU.

The MCTP VDM protocol is an extension of the OCP Accelerator Management
Interface specification -
'''
  https://www.opencompute.org/documents/ocp-gpu-accelerator-management-interfaces-v1-pdf
'''

Tested.

Build an image for the gb200nvl-obmc machine with the following patches
cherry-picked. These patches are needed to enable the MCTP stack.

https://gerrit.openbmc.org/c/openbmc/openbmc/+/79312
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79410
https://gerrit.openbmc.org/c/openbmc/openbmc/+/79422

Copy the configuration file to the gb200nvl-obmc machine and restart the
entity-manager service.
```
root@gb200nvl-obmc:~# rm -rf /var/configuration/
root@gb200nvl-obmc:~# systemctl restart xyz.openbmc_project.EntityManager.service
```

Copy the gpusensor app and run it.
```
root@gb200nvl-obmc:~# ./gpusensor
```

The app detects the entity-manager configuration on the gb200nvl-obmc
machine. The app is also able to detect all the endpoints from the MCTP
service D-Bus tree. The app reads the temperature sensor value from the
GPU correctly, and the temperature sensor is also present on Redfish.

```
$ curl -k -u 'root:0penBmc' https://10.137.203.137/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU
{
  "@odata.id": "/redfish/v1/Chassis/NVIDIA_GB200_1/Sensors/temperature_NVIDIA_GB200_GPU",
  "@odata.type": "#Sensor.v1_2_0.Sensor",
  "Id": "temperature_NVIDIA_GB200_GPU",
  "Name": "NVIDIA GB200 GPU",
  "Reading": 36.4375,
  "ReadingRangeMax": 127.0,
  "ReadingRangeMin": -128.0,
  "ReadingType": "Temperature",
  "ReadingUnits": "Cel",
  "Status": {
    "Health": "OK",
    "State": "Enabled"
  }
}

root@gb200nvl-obmc:~# busctl tree xyz.openbmc_project.GpuSensor
└─ /xyz
  └─ /xyz/openbmc_project
    └─ /xyz/openbmc_project/sensors
      └─ /xyz/openbmc_project/sensors/temperature
        └─ /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU

root@gb200nvl-obmc:~# busctl introspect xyz.openbmc_project.GpuSensor /xyz/openbmc_project/sensors/temperature/NVIDIA_GB200_GPU
NAME                                                  TYPE      SIGNATURE RESULT/VALUE                             FLAGS
org.freedesktop.DBus.Introspectable                   interface -         -                                        -
.Introspect                                           method    -         s                                        -
org.freedesktop.DBus.Peer                             interface -         -                                        -
.GetMachineId                                         method    -         s                                        -
.Ping                                                 method    -         -                                        -
org.freedesktop.DBus.Properties                       interface -         -                                        -
.Get                                                  method    ss        v                                        -
.GetAll                                               method    s         a{sv}                                    -
.Set                                                  method    ssv       -                                        -
.PropertiesChanged                                    signal    sa{sv}as  -                                        -
xyz.openbmc_project.Association.Definitions           interface -         -                                        -
.Associations                                         property  a(sss)    1 "chassis" "all_sensors" "/xyz/openbmc… emits-change
xyz.openbmc_project.Sensor.Value                      interface -         -                                        -
.MaxValue                                             property  d         127                                      emits-change
.MinValue                                             property  d         -128                                     emits-change
.Unit                                                 property  s         "xyz.openbmc_project.Sensor.Value.Unit.… emits-change
.Value                                                property  d         36.3125                                  emits-change writable
xyz.openbmc_project.Sensor.ValueMutability            interface -         -                                        -
.Mutable                                              property  b         true                                     emits-change
xyz.openbmc_project.State.Decorator.Availability      interface -         -                                        -
.Available                                            property  b         true                                     emits-change writable
xyz.openbmc_project.State.Decorator.OperationalStatus interface -         -                                        -
.Functional                                           property  b         true                                     emits-change
```

Change-Id: Ied938b9e5c19751ee283b4b948e16c905c78fb48
Signed-off-by: Harshit Aghera <haghera@nvidia.com>
diff --git a/src/gpu/GpuSensor.cpp b/src/gpu/GpuSensor.cpp
index ed81339..119554d 100644
--- a/src/gpu/GpuSensor.cpp
+++ b/src/gpu/GpuSensor.cpp
@@ -5,12 +5,16 @@
 
 #include "GpuSensor.hpp"
 
+#include "SensorPaths.hpp"
 #include "Thresholds.hpp"
 #include "Utils.hpp"
 #include "sensor.hpp"
 
 #include <bits/basic_string.h>
 
+#include <GpuMctpVdm.hpp>
+#include <MctpRequester.hpp>
+#include <OcpMctpVdm.hpp>
 #include <boost/asio/io_context.hpp>
 #include <boost/container/flat_map.hpp>
 #include <phosphor-logging/lg2.hpp>
@@ -23,6 +27,7 @@
 #include <chrono>
 #include <cstddef>
 #include <cstdint>
+#include <functional>
 #include <map>
 #include <memory>
 #include <string>
@@ -32,20 +37,24 @@
 
 using namespace std::literals;
 
+constexpr uint8_t gpuTempSensorId{0};
+constexpr std::chrono::milliseconds samplingInterval{1000ms};
 static constexpr double gpuTempSensorMaxReading = 127;
 static constexpr double gpuTempSensorMinReading = -128;
 
 GpuTempSensor::GpuTempSensor(
     std::shared_ptr<sdbusplus::asio::connection>& conn,
-    boost::asio::io_context& io, const std::string& name,
-    const std::string& sensorConfiguration,
+    boost::asio::io_context& io, mctp::MctpRequester& mctpRequester,
+    const std::string& name, const std::string& sensorConfiguration,
     sdbusplus::asio::object_server& objectServer,
-    std::vector<thresholds::Threshold>&& thresholdData) :
+    std::vector<thresholds::Threshold>&& thresholdData,
+    std::chrono::milliseconds pollRate) :
     Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
            "temperature", false, true, gpuTempSensorMaxReading,
            gpuTempSensorMinReading, conn),
-    waitTimer(io, std::chrono::steady_clock::duration(0)), conn(conn),
-    objectServer(objectServer)
+    sensorId{gpuTempSensorId}, sensorPollMs(pollRate),
+    waitTimer(io, std::chrono::steady_clock::duration(0)),
+    mctpRequester(mctpRequester), conn(conn), objectServer(objectServer)
 {
     std::string dbusPath =
         sensorPathPrefix + "temperature/"s + escapeName(name);
@@ -86,6 +95,147 @@
     discoverGpus();
 }
 
+void GpuTempSensor::read() // One poll cycle: update(), then re-arm the timer.
+{
+    update(); // Sends the request; its reply is handled in update()'s callback.
+    // Re-arm waitTimer for sensorPollMs and call read() again when it fires.
+    waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
+    waitTimer.async_wait([this](const boost::system::error_code& ec) {
+        if (ec) // Any timer error stops the polling loop.
+        {
+            return;
+        }
+        read();
+    }); // NOTE(review): captures raw `this`; confirm the sensor outlives it.
+}
+
+void GpuTempSensor::update() // Request one temperature reading from `eid`.
+{
+    std::vector<uint8_t> reqMsg( // Byte buffer the request is encoded into.
+        sizeof(ocp::accelerator_management::BindingPciVid) +
+        sizeof(gpu::GetTemperatureReadingRequest));
+    // Treat the buffer as a Message via placement new (no allocation).
+    auto* msg = new (reqMsg.data()) ocp::accelerator_management::Message;
+    // NOTE(review): the literal 0 is presumably the instance id - confirm.
+    auto rc = gpu::encodeGetTemperatureReadingRequest(0, sensorId, *msg);
+    if (rc != ocp::accelerator_management::CompletionCode::SUCCESS)
+    {
+        lg2::error(
+            "GpuTempSensor::update(): gpuEncodeGetTemperatureReadingRequest failed, rc={RC}",
+            "RC", static_cast<int>(rc));
+        return; // Nothing is sent when encoding fails.
+    }
+    // Send the request; the lambda receives a result code and reply bytes.
+    mctpRequester.sendRecvMsg(
+        eid, reqMsg,
+        [this](int sendRecvMsgResult, std::vector<uint8_t> respMsg) {
+            if (sendRecvMsgResult != 0) // Non-zero result: log and stop.
+            {
+                lg2::error(
+                    "GpuTempSensor::update(): MctpRequester::sendRecvMsg() failed, rc={RC}",
+                    "RC", sendRecvMsgResult);
+                return;
+            }
+            // An empty reply is also reported as a failure.
+            if (respMsg.empty())
+            {
+                lg2::error(
+                    "GpuTempSensor::update(): MctpRequester::sendRecvMsg() failed, respMsgLen=0");
+                return;
+            }
+            // Passed to the decoder below, which presumably fills them in.
+            uint8_t cc = 0;
+            uint16_t reasonCode = 0;
+            double tempValue = 0;
+            // NOTE(review): placement new on reply; confirm Message is trivial.
+            auto rc = gpu::decodeGetTemperatureReadingResponse(
+                *new (respMsg.data()) ocp::accelerator_management::Message,
+                respMsg.size(), cc, reasonCode, tempValue);
+            // Both the decoder result and the reply's cc must be SUCCESS.
+            if (rc != ocp::accelerator_management::CompletionCode::SUCCESS ||
+                cc != static_cast<uint8_t>(
+                          ocp::accelerator_management::CompletionCode::SUCCESS))
+            {
+                lg2::error(
+                    "GpuTempSensor::update(): gpuDecodeGetTemperatureReadingResponse() failed, rc={RC} cc={CC} reasonCode={RESC}",
+                    "RC", static_cast<int>(rc), "CC", cc, "RESC", reasonCode);
+                return; // The sensor value is left untouched on failure.
+            }
+            // Hand the decoded temperature to updateValue().
+            updateValue(tempValue);
+        }); // NOTE(review): captures raw `this`; confirm lifetime vs. reply.
+}
+
+void GpuTempSensor::processGpuEndpoint(uint8_t eid) // Probe `eid` for a GPU.
+{
+    std::vector<uint8_t> reqMsg( // Byte buffer the request is encoded into.
+        sizeof(ocp::accelerator_management::BindingPciVid) +
+        sizeof(gpu::QueryDeviceIdentificationRequest));
+    // Treat the buffer as a Message via placement new (no allocation).
+    auto* msg = new (reqMsg.data()) ocp::accelerator_management::Message;
+    // NOTE(review): the literal 0 is presumably the instance id - confirm.
+    auto rc = gpu::encodeQueryDeviceIdentificationRequest(0, *msg);
+    if (rc != ocp::accelerator_management::CompletionCode::SUCCESS)
+    {
+        lg2::error(
+            "GpuTempSensor::processGpuEndPoint(): gpuEncodeQueryDeviceIdentificationRequest failed, rc={RC}",
+            "RC", static_cast<int>(rc));
+        return; // Nothing is sent when encoding fails.
+    }
+    // Send the query; the lambda receives a result code and reply bytes.
+    mctpRequester.sendRecvMsg(
+        eid, reqMsg,
+        [this, eid](int sendRecvMsgResult, std::vector<uint8_t> respMsg) {
+            if (sendRecvMsgResult != 0) // Non-zero result: log and stop.
+            {
+                lg2::error(
+                    "GpuTempSensor::processGpuEndPoint(): MctpRequester::sendRecvMsg() failed, rc={RC}",
+                    "RC", sendRecvMsgResult);
+                return;
+            }
+            // An empty reply is also reported as a failure.
+            if (respMsg.empty())
+            {
+                lg2::error(
+                    "GpuTempSensor::processGpuEndPoint(): MctpRequester::sendRecvMsg() failed, respMsgLen=0");
+                return;
+            }
+            // Passed to the decoder below, which presumably fills them in.
+            uint8_t cc = 0;
+            uint16_t reasonCode = 0;
+            uint8_t responseDeviceType = 0;
+            uint8_t responseInstanceId = 0;
+            // NOTE(review): placement new on reply; confirm Message is trivial.
+            auto rc = gpu::decodeQueryDeviceIdentificationResponse(
+                *new (respMsg.data()) ocp::accelerator_management::Message,
+                respMsg.size(), cc, reasonCode, responseDeviceType,
+                responseInstanceId);
+            // Both the decoder result and the reply's cc must be SUCCESS.
+            if (rc != ocp::accelerator_management::CompletionCode::SUCCESS ||
+                cc != static_cast<uint8_t>(
+                          ocp::accelerator_management::CompletionCode::SUCCESS))
+            {
+                lg2::error(
+                    "GpuTempSensor::processGpuEndPoint(): gpuDecodeQueryDeviceIdentificationResponse() failed, rc={RC} cc={CC} reasonCode={RESC}",
+                    "RC", static_cast<int>(rc), "CC", cc, "RESC", reasonCode);
+                return;
+            }
+            // Only a DEVICE_GPU reply starts polling; other types do nothing.
+            if (responseDeviceType ==
+                static_cast<uint8_t>(gpu::DeviceIdentification::DEVICE_GPU))
+            {
+                lg2::info(
+                    "GpuTempSensor::processGpuEndPoint(): found the GPU with EID {EID}, DeviceType {DEVTYPE}, InstanceId {IID}.",
+                    "EID", eid, "DEVTYPE", responseDeviceType, "IID",
+                    responseInstanceId);
+                // Remember the endpoint, then start the read() polling loop.
+                this->eid = eid;
+                setInitialProperties(sensor_paths::unitDegreesC);
+                this->read(); // NOTE(review): no guard if a 2nd GPU replies.
+            }
+        }); // NOTE(review): captures raw `this`; confirm lifetime vs. reply.
+}
+
 void GpuTempSensor::processMctpEndpoints(const boost::system::error_code& ec,
                                          const getSubTreeRet& ret)
 {
@@ -132,7 +282,7 @@
         return;
     }
 
-    [[maybe_unused]] uint8_t eid{};
+    uint8_t eid{};
     std::vector<uint8_t> mctpTypes{};
 
     auto hasEid = configs.find("EID");
@@ -180,9 +330,14 @@
         return;
     }
 
-    // if the OCP MCTP VDM Message type (0x7E) is found in mctpTypes
-    // process the endpoint further.
-    (void)this;
+    if (std::find(mctpTypes.begin(), mctpTypes.end(),
+                  ocp::accelerator_management::messageType) != mctpTypes.end())
+    {
+        lg2::info(
+            "GpuTempSensor::discoverGpus(): Found OCP MCTP VDM Endpoint with ID {EID}",
+            "EID", eid);
+        this->processGpuEndpoint(eid);
+    }
 }
 
 void GpuTempSensor::discoverGpus()
@@ -205,7 +360,7 @@
     boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
         sensors,
     std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
-    const ManagedObjectType& resp)
+    mctp::MctpRequester& mctpRequester, const ManagedObjectType& resp)
 {
     for (const auto& [path, interfaces] : resp)
     {
@@ -219,8 +374,8 @@
             std::string name = loadVariant<std::string>(cfg, "Name");
 
             sensors[name] = std::make_shared<GpuTempSensor>(
-                dbusConnection, io, name, path, objectServer,
-                std::vector<thresholds::Threshold>{});
+                dbusConnection, io, mctpRequester, name, path, objectServer,
+                std::vector<thresholds::Threshold>{}, samplingInterval);
 
             lg2::info(
                 "Added GPU Temperature Sensor {NAME} with chassis path: {PATH}.",
@@ -233,7 +388,8 @@
     boost::asio::io_context& io, sdbusplus::asio::object_server& objectServer,
     boost::container::flat_map<std::string, std::shared_ptr<GpuTempSensor>>&
         sensors,
-    std::shared_ptr<sdbusplus::asio::connection>& dbusConnection)
+    std::shared_ptr<sdbusplus::asio::connection>& dbusConnection,
+    mctp::MctpRequester& mctpRequester)
 {
     if (!dbusConnection)
     {
@@ -241,9 +397,8 @@
         return;
     }
     dbusConnection->async_method_call(
-        [&sensors, &dbusConnection, &io,
-         &objectServer](const boost::system::error_code& ec,
-                        const ManagedObjectType& resp) {
+        [&sensors, &mctpRequester, &dbusConnection, &io, &objectServer](
+            boost::system::error_code ec, const ManagedObjectType& resp) {
             if (ec)
             {
                 lg2::error("Error contacting entity manager");
@@ -251,7 +406,7 @@
             }
 
             processSensorConfigs(io, objectServer, sensors, dbusConnection,
-                                 resp);
+                                 mctpRequester, resp);
         },
         entityManagerName, "/xyz/openbmc_project/inventory",
         "org.freedesktop.DBus.ObjectManager", "GetManagedObjects");