blob: a3bc43d413550f454ecc0e1b9977ce66431e35f8 [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright OpenBMC Authors
 * SPDX-License-Identifier: Apache-2.0
 */
5
6#include "NvidiaGpuSensor.hpp"
7
Harshit Aghera560e6af2025-04-21 20:04:56 +05308#include "SensorPaths.hpp"
Harshit Agherad837b562025-04-21 19:50:10 +05309#include "Thresholds.hpp"
10#include "Utils.hpp"
11#include "sensor.hpp"
12
13#include <bits/basic_string.h>
14
Harshit Aghera560e6af2025-04-21 20:04:56 +053015#include <MctpRequester.hpp>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053016#include <NvidiaDeviceDiscovery.hpp>
Harshit Aghera560e6af2025-04-21 20:04:56 +053017#include <NvidiaGpuMctpVdm.hpp>
18#include <OcpMctpVdm.hpp>
Harshit Agherad837b562025-04-21 19:50:10 +053019#include <phosphor-logging/lg2.hpp>
20#include <sdbusplus/asio/connection.hpp>
21#include <sdbusplus/asio/object_server.hpp>
Harshit Agherad837b562025-04-21 19:50:10 +053022
Harshit Agherad837b562025-04-21 19:50:10 +053023#include <cstddef>
24#include <cstdint>
Harshit Aghera560e6af2025-04-21 20:04:56 +053025#include <functional>
Harshit Agherad837b562025-04-21 19:50:10 +053026#include <memory>
Marc Olberdingd0125c92025-10-08 14:37:19 -070027#include <span>
Harshit Agherad837b562025-04-21 19:50:10 +053028#include <string>
Marc Olberdingd0125c92025-10-08 14:37:19 -070029#include <system_error>
Harshit Agherad837b562025-04-21 19:50:10 +053030#include <utility>
Harshit Agherad837b562025-04-21 19:50:10 +053031#include <vector>
32
33using namespace std::literals;
34
// Upper and lower reading limits handed to the Sensor base-class constructor
// by NvidiaGpuTempSensor. The sensor publishes its value in degrees Celsius.
static constexpr double gpuTempSensorMaxReading = 127.0;
static constexpr double gpuTempSensorMinReading = -128.0;
37
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053038NvidiaGpuTempSensor::NvidiaGpuTempSensor(
Harshit Agherad837b562025-04-21 19:50:10 +053039 std::shared_ptr<sdbusplus::asio::connection>& conn,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053040 mctp::MctpRequester& mctpRequester, const std::string& name,
Harshit Agheraba138da2025-05-05 12:26:35 +053041 const std::string& sensorConfiguration, const uint8_t eid, uint8_t sensorId,
Harshit Agherad837b562025-04-21 19:50:10 +053042 sdbusplus::asio::object_server& objectServer,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053043 std::vector<thresholds::Threshold>&& thresholdData) :
Harshit Agherad837b562025-04-21 19:50:10 +053044 Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
45 "temperature", false, true, gpuTempSensorMaxReading,
46 gpuTempSensorMinReading, conn),
Harshit Agheraba138da2025-05-05 12:26:35 +053047 eid(eid), sensorId{sensorId}, mctpRequester(mctpRequester),
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053048 objectServer(objectServer)
Harshit Agherad837b562025-04-21 19:50:10 +053049{
50 std::string dbusPath =
51 sensorPathPrefix + "temperature/"s + escapeName(name);
52
53 sensorInterface = objectServer.add_interface(
54 dbusPath, "xyz.openbmc_project.Sensor.Value");
55
56 for (const auto& threshold : thresholds)
57 {
58 std::string interface = thresholds::getInterface(threshold.level);
59 thresholdInterfaces[static_cast<size_t>(threshold.level)] =
60 objectServer.add_interface(dbusPath, interface);
61 }
62
63 association = objectServer.add_interface(dbusPath, association::interface);
64
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053065 setInitialProperties(sensor_paths::unitDegreesC);
Harshit Aghera3f6bc732025-07-23 14:22:01 +053066
67 if (sensorId == gpuTLimitSensorId)
68 {
69 sensorTypeInterface = objectServer.add_interface(
70 dbusPath, "xyz.openbmc_project.Sensor.Type");
71
72 sensorTypeInterface->register_property(
73 "ReadingBasis",
74 "xyz.openbmc_project.Sensor.Type.ReadingBasisType.Headroom"s);
75 sensorTypeInterface->register_property(
76 "Implementation",
77 "xyz.openbmc_project.Sensor.Type.ImplementationType.Synthesized"s);
78
79 if (!sensorTypeInterface->initialize())
80 {
81 lg2::error(
82 "Error initializing Type Interface for Temperature Sensor for eid {EID} and sensor id {SID}",
83 "EID", eid, "SID", sensorId);
84 }
85 }
Harshit Agherad837b562025-04-21 19:50:10 +053086}
87
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053088NvidiaGpuTempSensor::~NvidiaGpuTempSensor()
Harshit Agherad837b562025-04-21 19:50:10 +053089{
Harshit Agherad837b562025-04-21 19:50:10 +053090 for (const auto& iface : thresholdInterfaces)
91 {
92 objectServer.remove_interface(iface);
93 }
94 objectServer.remove_interface(association);
95 objectServer.remove_interface(sensorInterface);
Harshit Aghera3f6bc732025-07-23 14:22:01 +053096 if (sensorTypeInterface)
97 {
98 objectServer.remove_interface(sensorTypeInterface);
99 }
Harshit Agherad837b562025-04-21 19:50:10 +0530100}
101
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530102void NvidiaGpuTempSensor::checkThresholds()
Harshit Agherad837b562025-04-21 19:50:10 +0530103{
104 thresholds::checkThresholds(this);
105}
106
Marc Olberdingd0125c92025-10-08 14:37:19 -0700107void NvidiaGpuTempSensor::processResponse(const std::error_code& ec,
108 std::span<const uint8_t> buffer)
Harshit Aghera560e6af2025-04-21 20:04:56 +0530109{
Marc Olberdingd0125c92025-10-08 14:37:19 -0700110 if (ec)
Harshit Aghera560e6af2025-04-21 20:04:56 +0530111 {
112 lg2::error(
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530113 "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
Marc Olberdingd0125c92025-10-08 14:37:19 -0700114 "EID", eid, "SID", sensorId, "RC", ec.message());
Harshit Aghera560e6af2025-04-21 20:04:56 +0530115 return;
116 }
117
118 ocp::accelerator_management::CompletionCode cc{};
119 uint16_t reasonCode = 0;
120 double tempValue = 0;
121
Marc Olberdingd0125c92025-10-08 14:37:19 -0700122 auto rc = gpu::decodeGetTemperatureReadingResponse(buffer, cc, reasonCode,
123 tempValue);
Harshit Aghera560e6af2025-04-21 20:04:56 +0530124
125 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
126 {
127 lg2::error(
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530128 "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : decode failed. "
129 "rc={RC}, cc={CC}, reasonCode={RESC}",
130 "EID", eid, "SID", sensorId, "RC", rc, "CC", cc, "RESC",
131 reasonCode);
Harshit Aghera560e6af2025-04-21 20:04:56 +0530132 return;
133 }
134
135 updateValue(tempValue);
136}
137
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530138void NvidiaGpuTempSensor::update()
Harshit Aghera560e6af2025-04-21 20:04:56 +0530139{
140 auto rc = gpu::encodeGetTemperatureReadingRequest(
141 0, sensorId, getTemperatureReadingRequest);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530142
Harshit Aghera560e6af2025-04-21 20:04:56 +0530143 if (rc != 0)
144 {
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530145 lg2::error(
146 "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : encode failed, rc={RC}",
147 "EID", eid, "SID", sensorId, "RC", rc);
Harshit Aghera560e6af2025-04-21 20:04:56 +0530148 }
149
150 mctpRequester.sendRecvMsg(
Marc Olberdingd0125c92025-10-08 14:37:19 -0700151 eid, getTemperatureReadingRequest,
Marc Olberdingfd4a3772025-09-24 16:31:02 -0700152 [weak{weak_from_this()}](const std::error_code& ec,
153 std::span<const uint8_t> buffer) {
154 std::shared_ptr<NvidiaGpuTempSensor> self = weak.lock();
155 if (!self)
156 {
157 lg2::error("Invalid reference to NvidiaGpuTempSensor");
158 return;
159 }
160 self->processResponse(ec, buffer);
Marc Olberdingd0125c92025-10-08 14:37:19 -0700161 });
Harshit Aghera560e6af2025-04-21 20:04:56 +0530162}