blob: 42d03badca3e7976dd61dfa3888ddee70e35d8f0 [file] [log] [blame]
Harshit Agherad837b562025-04-21 19:50:10 +05301/*
2 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
Harshit Aghera560e6af2025-04-21 20:04:56 +05303 * AFFILIATES. All rights reserved.
4 * SPDX-License-Identifier: Apache-2.0
Harshit Agherad837b562025-04-21 19:50:10 +05305 */
6
7#include "NvidiaGpuSensor.hpp"
8
Harshit Aghera560e6af2025-04-21 20:04:56 +05309#include "SensorPaths.hpp"
Harshit Agherad837b562025-04-21 19:50:10 +053010#include "Thresholds.hpp"
11#include "Utils.hpp"
12#include "sensor.hpp"
13
14#include <bits/basic_string.h>
15
Harshit Aghera560e6af2025-04-21 20:04:56 +053016#include <MctpRequester.hpp>
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053017#include <NvidiaDeviceDiscovery.hpp>
Harshit Aghera560e6af2025-04-21 20:04:56 +053018#include <NvidiaGpuMctpVdm.hpp>
19#include <OcpMctpVdm.hpp>
Harshit Agherad837b562025-04-21 19:50:10 +053020#include <phosphor-logging/lg2.hpp>
21#include <sdbusplus/asio/connection.hpp>
22#include <sdbusplus/asio/object_server.hpp>
Harshit Agherad837b562025-04-21 19:50:10 +053023
Harshit Agherad837b562025-04-21 19:50:10 +053024#include <cstddef>
25#include <cstdint>
Harshit Aghera560e6af2025-04-21 20:04:56 +053026#include <functional>
Harshit Agherad837b562025-04-21 19:50:10 +053027#include <memory>
Marc Olberdingd0125c92025-10-08 14:37:19 -070028#include <span>
Harshit Agherad837b562025-04-21 19:50:10 +053029#include <string>
Marc Olberdingd0125c92025-10-08 14:37:19 -070030#include <system_error>
Harshit Agherad837b562025-04-21 19:50:10 +053031#include <utility>
Harshit Agherad837b562025-04-21 19:50:10 +053032#include <vector>
33
34using namespace std::literals;
35
// Reading bounds handed to the Sensor base-class constructor for GPU
// temperature sensors (numerically the limits of a signed 8-bit value).
static constexpr double gpuTempSensorMaxReading{127.0};
static constexpr double gpuTempSensorMinReading{-128.0};
38
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053039NvidiaGpuTempSensor::NvidiaGpuTempSensor(
Harshit Agherad837b562025-04-21 19:50:10 +053040 std::shared_ptr<sdbusplus::asio::connection>& conn,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053041 mctp::MctpRequester& mctpRequester, const std::string& name,
Harshit Agheraba138da2025-05-05 12:26:35 +053042 const std::string& sensorConfiguration, const uint8_t eid, uint8_t sensorId,
Harshit Agherad837b562025-04-21 19:50:10 +053043 sdbusplus::asio::object_server& objectServer,
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053044 std::vector<thresholds::Threshold>&& thresholdData) :
Harshit Agherad837b562025-04-21 19:50:10 +053045 Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
46 "temperature", false, true, gpuTempSensorMaxReading,
47 gpuTempSensorMinReading, conn),
Harshit Agheraba138da2025-05-05 12:26:35 +053048 eid(eid), sensorId{sensorId}, mctpRequester(mctpRequester),
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053049 objectServer(objectServer)
Harshit Agherad837b562025-04-21 19:50:10 +053050{
51 std::string dbusPath =
52 sensorPathPrefix + "temperature/"s + escapeName(name);
53
54 sensorInterface = objectServer.add_interface(
55 dbusPath, "xyz.openbmc_project.Sensor.Value");
56
57 for (const auto& threshold : thresholds)
58 {
59 std::string interface = thresholds::getInterface(threshold.level);
60 thresholdInterfaces[static_cast<size_t>(threshold.level)] =
61 objectServer.add_interface(dbusPath, interface);
62 }
63
64 association = objectServer.add_interface(dbusPath, association::interface);
65
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053066 setInitialProperties(sensor_paths::unitDegreesC);
Harshit Aghera3f6bc732025-07-23 14:22:01 +053067
68 if (sensorId == gpuTLimitSensorId)
69 {
70 sensorTypeInterface = objectServer.add_interface(
71 dbusPath, "xyz.openbmc_project.Sensor.Type");
72
73 sensorTypeInterface->register_property(
74 "ReadingBasis",
75 "xyz.openbmc_project.Sensor.Type.ReadingBasisType.Headroom"s);
76 sensorTypeInterface->register_property(
77 "Implementation",
78 "xyz.openbmc_project.Sensor.Type.ImplementationType.Synthesized"s);
79
80 if (!sensorTypeInterface->initialize())
81 {
82 lg2::error(
83 "Error initializing Type Interface for Temperature Sensor for eid {EID} and sensor id {SID}",
84 "EID", eid, "SID", sensorId);
85 }
86 }
Harshit Agherad837b562025-04-21 19:50:10 +053087}
88
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +053089NvidiaGpuTempSensor::~NvidiaGpuTempSensor()
Harshit Agherad837b562025-04-21 19:50:10 +053090{
Harshit Agherad837b562025-04-21 19:50:10 +053091 for (const auto& iface : thresholdInterfaces)
92 {
93 objectServer.remove_interface(iface);
94 }
95 objectServer.remove_interface(association);
96 objectServer.remove_interface(sensorInterface);
Harshit Aghera3f6bc732025-07-23 14:22:01 +053097 if (sensorTypeInterface)
98 {
99 objectServer.remove_interface(sensorTypeInterface);
100 }
Harshit Agherad837b562025-04-21 19:50:10 +0530101}
102
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530103void NvidiaGpuTempSensor::checkThresholds()
Harshit Agherad837b562025-04-21 19:50:10 +0530104{
105 thresholds::checkThresholds(this);
106}
107
Marc Olberdingd0125c92025-10-08 14:37:19 -0700108void NvidiaGpuTempSensor::processResponse(const std::error_code& ec,
109 std::span<const uint8_t> buffer)
Harshit Aghera560e6af2025-04-21 20:04:56 +0530110{
Marc Olberdingd0125c92025-10-08 14:37:19 -0700111 if (ec)
Harshit Aghera560e6af2025-04-21 20:04:56 +0530112 {
113 lg2::error(
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530114 "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
Marc Olberdingd0125c92025-10-08 14:37:19 -0700115 "EID", eid, "SID", sensorId, "RC", ec.message());
Harshit Aghera560e6af2025-04-21 20:04:56 +0530116 return;
117 }
118
119 ocp::accelerator_management::CompletionCode cc{};
120 uint16_t reasonCode = 0;
121 double tempValue = 0;
122
Marc Olberdingd0125c92025-10-08 14:37:19 -0700123 auto rc = gpu::decodeGetTemperatureReadingResponse(buffer, cc, reasonCode,
124 tempValue);
Harshit Aghera560e6af2025-04-21 20:04:56 +0530125
126 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
127 {
128 lg2::error(
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530129 "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : decode failed. "
130 "rc={RC}, cc={CC}, reasonCode={RESC}",
131 "EID", eid, "SID", sensorId, "RC", rc, "CC", cc, "RESC",
132 reasonCode);
Harshit Aghera560e6af2025-04-21 20:04:56 +0530133 return;
134 }
135
136 updateValue(tempValue);
137}
138
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530139void NvidiaGpuTempSensor::update()
Harshit Aghera560e6af2025-04-21 20:04:56 +0530140{
141 auto rc = gpu::encodeGetTemperatureReadingRequest(
142 0, sensorId, getTemperatureReadingRequest);
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530143
Harshit Aghera560e6af2025-04-21 20:04:56 +0530144 if (rc != 0)
145 {
Harshit Aghera4ecdfaa2025-05-22 11:35:39 +0530146 lg2::error(
147 "Error updating Temperature Sensor for eid {EID} and sensor id {SID} : encode failed, rc={RC}",
148 "EID", eid, "SID", sensorId, "RC", rc);
Harshit Aghera560e6af2025-04-21 20:04:56 +0530149 }
150
151 mctpRequester.sendRecvMsg(
Marc Olberdingd0125c92025-10-08 14:37:19 -0700152 eid, getTemperatureReadingRequest,
Marc Olberdingfd4a3772025-09-24 16:31:02 -0700153 [weak{weak_from_this()}](const std::error_code& ec,
154 std::span<const uint8_t> buffer) {
155 std::shared_ptr<NvidiaGpuTempSensor> self = weak.lock();
156 if (!self)
157 {
158 lg2::error("Invalid reference to NvidiaGpuTempSensor");
159 return;
160 }
161 self->processResponse(ec, buffer);
Marc Olberdingd0125c92025-10-08 14:37:19 -0700162 });
Harshit Aghera560e6af2025-04-21 20:04:56 +0530163}