blob: 35f51ac3ea4f740e3f0663c04d346cb2b6ed46f8 [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
6
7#include "NvidiaGpuSensor.hpp"
8
Harshit Aghera560e6af2025-04-21 20:04:56 +05309#include "SensorPaths.hpp"
Harshit Agherad837b562025-04-21 19:50:10 +053010#include "Thresholds.hpp"
11#include "Utils.hpp"
12#include "sensor.hpp"
13
14#include <bits/basic_string.h>
15
Harshit Aghera560e6af2025-04-21 20:04:56 +053016#include <MctpRequester.hpp>
Harshit Agherafa2a5b92025-05-22 11:35:39 +053017#include <NvidiaDeviceDiscovery.hpp>
Harshit Aghera560e6af2025-04-21 20:04:56 +053018#include <NvidiaGpuMctpVdm.hpp>
19#include <OcpMctpVdm.hpp>
Harshit Agherad837b562025-04-21 19:50:10 +053020#include <phosphor-logging/lg2.hpp>
21#include <sdbusplus/asio/connection.hpp>
22#include <sdbusplus/asio/object_server.hpp>
Harshit Agherad837b562025-04-21 19:50:10 +053023
Harshit Agherad837b562025-04-21 19:50:10 +053024#include <cstddef>
25#include <cstdint>
Harshit Aghera560e6af2025-04-21 20:04:56 +053026#include <functional>
Harshit Agherad837b562025-04-21 19:50:10 +053027#include <memory>
28#include <string>
29#include <utility>
Harshit Agherad837b562025-04-21 19:50:10 +053030#include <vector>
31
// Enables the ""s std::string literal suffix used when building D-Bus paths.
using namespace std::literals;

// Sensor id placed in every temperature-reading request issued by this file.
constexpr uint8_t gpuTempSensorId = 0;

// Reading bounds handed to the Sensor base class. The device does not report
// its own limits here, so fixed bounds are used.
static constexpr double gpuTempSensorMaxReading = 127.0;
static constexpr double gpuTempSensorMinReading = -128.0;
37
Harshit Agherafa2a5b92025-05-22 11:35:39 +053038NvidiaGpuTempSensor::NvidiaGpuTempSensor(
Harshit Agherad837b562025-04-21 19:50:10 +053039 std::shared_ptr<sdbusplus::asio::connection>& conn,
Harshit Agherafa2a5b92025-05-22 11:35:39 +053040 mctp::MctpRequester& mctpRequester, const std::string& name,
41 const std::string& sensorConfiguration, const uint8_t eid,
Harshit Agherad837b562025-04-21 19:50:10 +053042 sdbusplus::asio::object_server& objectServer,
Harshit Agherafa2a5b92025-05-22 11:35:39 +053043 std::vector<thresholds::Threshold>&& thresholdData) :
Harshit Agherad837b562025-04-21 19:50:10 +053044 Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
45 "temperature", false, true, gpuTempSensorMaxReading,
46 gpuTempSensorMinReading, conn),
Harshit Agherafa2a5b92025-05-22 11:35:39 +053047 eid(eid), sensorId{gpuTempSensorId}, mctpRequester(mctpRequester),
48 objectServer(objectServer)
Harshit Agherad837b562025-04-21 19:50:10 +053049{
50 std::string dbusPath =
51 sensorPathPrefix + "temperature/"s + escapeName(name);
52
53 sensorInterface = objectServer.add_interface(
54 dbusPath, "xyz.openbmc_project.Sensor.Value");
55
56 for (const auto& threshold : thresholds)
57 {
58 std::string interface = thresholds::getInterface(threshold.level);
59 thresholdInterfaces[static_cast<size_t>(threshold.level)] =
60 objectServer.add_interface(dbusPath, interface);
61 }
62
63 association = objectServer.add_interface(dbusPath, association::interface);
64
Harshit Agherafa2a5b92025-05-22 11:35:39 +053065 // Sensor values are only updated when the difference between the new and
66 // previous value exceeds the hysteresisPublish threshold. This threshold
67 // defaults to ((max - min) * 0.0001). Since this sensor lacks defined
68 // min/max values, theoretical limits are used instead, creating a large
69 // hysteresisPublish value that blocks D-Bus updates. Setting
70 // hysteresisPublish to 0 forces all sensor value changes to be published
71 // to D-Bus.
72 hysteresisPublish = 0;
73
74 setInitialProperties(sensor_paths::unitDegreesC);
Harshit Agherad837b562025-04-21 19:50:10 +053075}
76
Harshit Agherafa2a5b92025-05-22 11:35:39 +053077NvidiaGpuTempSensor::~NvidiaGpuTempSensor()
Harshit Agherad837b562025-04-21 19:50:10 +053078{
Harshit Agherad837b562025-04-21 19:50:10 +053079 for (const auto& iface : thresholdInterfaces)
80 {
81 objectServer.remove_interface(iface);
82 }
83 objectServer.remove_interface(association);
84 objectServer.remove_interface(sensorInterface);
85}
86
Harshit Agherafa2a5b92025-05-22 11:35:39 +053087void NvidiaGpuTempSensor::checkThresholds()
Harshit Agherad837b562025-04-21 19:50:10 +053088{
89 thresholds::checkThresholds(this);
90}
91
Harshit Agherafa2a5b92025-05-22 11:35:39 +053092void NvidiaGpuTempSensor::processResponse(int sendRecvMsgResult)
Harshit Aghera560e6af2025-04-21 20:04:56 +053093{
94 if (sendRecvMsgResult != 0)
95 {
96 lg2::error(
Harshit Agherafa2a5b92025-05-22 11:35:39 +053097 "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
98 "EID", eid, "SID", sensorId, "RC", sendRecvMsgResult);
Harshit Aghera560e6af2025-04-21 20:04:56 +053099 return;
100 }
101
102 ocp::accelerator_management::CompletionCode cc{};
103 uint16_t reasonCode = 0;
104 double tempValue = 0;
105
106 auto rc = gpu::decodeGetTemperatureReadingResponse(
107 getTemperatureReadingResponse, cc, reasonCode, tempValue);
108
109 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
110 {
111 lg2::error(
Harshit Agherafa2a5b92025-05-22 11:35:39 +0530112 "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : decode failed. "
113 "rc={RC}, cc={CC}, reasonCode={RESC}",
114 "EID", eid, "SID", sensorId, "RC", rc, "CC", cc, "RESC",
115 reasonCode);
Harshit Aghera560e6af2025-04-21 20:04:56 +0530116 return;
117 }
118
119 updateValue(tempValue);
120}
121
Harshit Agherafa2a5b92025-05-22 11:35:39 +0530122void NvidiaGpuTempSensor::update()
Harshit Aghera560e6af2025-04-21 20:04:56 +0530123{
124 auto rc = gpu::encodeGetTemperatureReadingRequest(
125 0, sensorId, getTemperatureReadingRequest);
Harshit Agherafa2a5b92025-05-22 11:35:39 +0530126
Harshit Aghera560e6af2025-04-21 20:04:56 +0530127 if (rc != 0)
128 {
Harshit Agherafa2a5b92025-05-22 11:35:39 +0530129 lg2::error(
130 "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : encode failed, rc={RC}",
131 "EID", eid, "SID", sensorId, "RC", rc);
Harshit Aghera560e6af2025-04-21 20:04:56 +0530132 }
133
134 mctpRequester.sendRecvMsg(
135 eid, getTemperatureReadingRequest, getTemperatureReadingResponse,
136 [this](int sendRecvMsgResult) { processResponse(sendRecvMsgResult); });
137}