blob: 88955ce6fdcb055d776198e7ce652c68d1a15425 [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
 */
5
6#include "GpuTLimitSensor.hpp"
7
8#include "SensorPaths.hpp"
9#include "Thresholds.hpp"
10#include "UpdatableSensor.hpp"
11#include "Utils.hpp"
12
13#include <bits/basic_string.h>
14
15#include <GpuDevice.hpp>
16#include <GpuMctpVdm.hpp>
17#include <MctpRequester.hpp>
18#include <OcpMctpVdm.hpp>
19#include <phosphor-logging/lg2.hpp>
20#include <sdbusplus/asio/connection.hpp>
21#include <sdbusplus/asio/object_server.hpp>
22
23#include <cstddef>
24#include <cstdint>
25#include <functional>
26#include <memory>
27#include <string>
28#include <utility>
29#include <vector>
30
31using namespace std::literals;
32
// Identifier stored in GpuTLimitSensor::sensorId and placed in the
// temperature-reading request built by update().
static constexpr uint8_t gpuTLimitSensorIdm = 2;

// Reading range handed to the GpuSensor base class by the constructor.
static constexpr double gpuTLimitSensorMaxReading{127.0};
static constexpr double gpuTLimitSensorMinReading{-128.0};
36
37GpuTLimitSensor::GpuTLimitSensor(
38 std::shared_ptr<sdbusplus::asio::connection>& conn,
39 mctp::MctpRequester& mctpRequester, const std::string& name,
40 const std::string& sensorConfiguration, uint8_t eid,
41 sdbusplus::asio::object_server& objectServer,
42 std::vector<thresholds::Threshold>&& thresholdData) :
43 GpuSensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
44 "temperature", false, true, gpuTLimitSensorMaxReading,
45 gpuTLimitSensorMinReading, conn),
46 eid(eid), sensorId{gpuTLimitSensorIdm}, mctpRequester(mctpRequester),
47 objectServer(objectServer)
48{
49 std::string dbusPath =
50 sensorPathPrefix + "temperature/"s + escapeName(name);
51
52 sensorInterface = objectServer.add_interface(
53 dbusPath, "xyz.openbmc_project.Sensor.Value");
54
55 for (const auto& threshold : thresholds)
56 {
57 std::string interface = thresholds::getInterface(threshold.level);
58 thresholdInterfaces[static_cast<size_t>(threshold.level)] =
59 objectServer.add_interface(dbusPath, interface);
60 }
61
62 association = objectServer.add_interface(dbusPath, association::interface);
63
Harshit Aghera5e8a59c2025-05-06 17:39:57 +053064 descriptionInterface = objectServer.add_interface(
65 dbusPath, "xyz.openbmc_project.Inventory.Item");
66
67 descriptionInterface->register_property(
68 "PrettyName",
69 "Thermal Limit(TLIMIT) Temperature is the distance in deg C from the GPU temperature to the first throttle limit."s);
70
71 descriptionInterface->initialize();
72
Harshit Aghera32e3b2b2025-05-05 12:26:35 +053073 setInitialProperties(sensor_paths::unitDegreesC);
74}
75
76GpuTLimitSensor::~GpuTLimitSensor()
77{
78 for (const auto& iface : thresholdInterfaces)
79 {
80 objectServer.remove_interface(iface);
81 }
82 objectServer.remove_interface(sensorInterface);
83 objectServer.remove_interface(association);
Harshit Aghera5e8a59c2025-05-06 17:39:57 +053084 objectServer.remove_interface(descriptionInterface);
Harshit Aghera32e3b2b2025-05-05 12:26:35 +053085}
86
87void GpuTLimitSensor::checkThresholds()
88{
89 thresholds::checkThresholds(this);
90}
91
92void GpuTLimitSensor::update()
93{
94 std::vector<uint8_t> reqMsg(
95 sizeof(ocp::accelerator_management::BindingPciVid) +
96 sizeof(gpu::GetTemperatureReadingRequest));
97
98 auto* msg = new (reqMsg.data()) ocp::accelerator_management::Message;
99
100 auto rc = gpu::encodeGetTemperatureReadingRequest(0, sensorId, *msg);
101 if (rc != ocp::accelerator_management::CompletionCode::SUCCESS)
102 {
103 lg2::error(
104 "GpuTLimitSensor::update(): gpuEncodeGetTemperatureReadingRequest failed, rc={RC}",
105 "RC", static_cast<int>(rc));
106 return;
107 }
108
109 mctpRequester.sendRecvMsg(
110 eid, reqMsg,
111 [this](int sendRecvMsgResult, std::vector<uint8_t> respMsg) {
112 if (sendRecvMsgResult != 0)
113 {
114 lg2::error(
115 "GpuTLimitSensor::update(): MctpRequester::sendRecvMsg() failed, rc={RC}",
116 "RC", sendRecvMsgResult);
117 return;
118 }
119
120 if (respMsg.empty())
121 {
122 lg2::error(
123 "GpuTLimitSensor::update(): MctpRequester::sendRecvMsg() failed, respMsgLen=0");
124 return;
125 }
126
127 uint8_t cc = 0;
128 uint16_t reasonCode = 0;
129 double tempValue = 0;
130
131 auto rc = gpu::decodeGetTemperatureReadingResponse(
132 *new (respMsg.data()) ocp::accelerator_management::Message,
133 respMsg.size(), cc, reasonCode, tempValue);
134
135 if (rc != ocp::accelerator_management::CompletionCode::SUCCESS ||
136 cc != static_cast<uint8_t>(
137 ocp::accelerator_management::CompletionCode::SUCCESS))
138 {
139 lg2::error(
140 "GpuTLimitSensor::update(): gpuDecodeGetTemperatureReadingResponse() failed, rc={RC} cc={CC} reasonCode={RESC}",
141 "RC", static_cast<int>(rc), "CC", cc, "RESC", reasonCode);
142 return;
143 }
144
145 updateValue(tempValue);
146 });
147}