/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
#include "NvidiaGpuDevice.hpp"
#include "Thresholds.hpp"
#include "Utils.hpp"
#include <Inventory.hpp>
#include <MctpRequester.hpp>
#include <NvidiaDeviceDiscovery.hpp>
#include <NvidiaGpuEnergySensor.hpp>
#include <NvidiaGpuMctpVdm.hpp>
#include <NvidiaGpuPowerPeakReading.hpp>
#include <NvidiaGpuPowerSensor.hpp>
#include <NvidiaGpuSensor.hpp>
#include <NvidiaGpuVoltageSensor.hpp>
#include <OcpMctpVdm.hpp>
#include <boost/asio/io_context.hpp>
#include <phosphor-logging/lg2.hpp>
#include <sdbusplus/asio/connection.hpp>
#include <sdbusplus/asio/object_server.hpp>
#include <array>
#include <chrono>
#include <cstdint>
#include <functional>
#include <memory>
#include <span>
#include <string>
#include <system_error>
#include <utility>
#include <vector>
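
// Thermal parameter IDs passed to the Read Thermal Parameters request to
// fetch the GPU TLimit warning, critical and hard-shutdown thresholds.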
static constexpr uint8_t gpuTLimitCriticalThresholdId{1};
static constexpr uint8_t gpuTLimitWarningThresholdId{2};
static constexpr uint8_t gpuTLimitHardshutDownThresholdId{4};

// Note: the order must match the order in which processTLimitThresholds
// consumes the results.
static constexpr std::array<uint8_t, 3> thresholdIds{
    gpuTLimitWarningThresholdId, gpuTLimitCriticalThresholdId,
    gpuTLimitHardshutDownThresholdId};
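
// Construct the device context for a single GPU MCTP endpoint and create its
// Inventory object; sensors are created later via init().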
GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
                     const std::string& path,
                     const std::shared_ptr<sdbusplus::asio::connection>& conn,
                     uint8_t eid, boost::asio::io_context& io,
                     mctp::MctpRequester& mctpRequester,
                     sdbusplus::asio::object_server& objectServer) :
    eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
    waitTimer(io, std::chrono::steady_clock::duration(0)),
    mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
    configs(configs), name(escapeName(name)), path(path)
{
    inventory = std::make_shared<Inventory>(
        conn, objectServer, name, mctpRequester,
        gpu::DeviceIdentification::DEVICE_GPU, eid, io);
}
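
// Create the sensors for this device and initialize its inventory.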
void GpuDevice::init()
{
    makeSensors();
    inventory->init();
}
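
// Create the temperature, DRAM temperature, power, peak-power, energy and
// voltage sensors, request the TLimit thresholds from the device, and start
// the periodic read loop.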
void GpuDevice::makeSensors()
{
    tempSensor = std::make_shared<NvidiaGpuTempSensor>(
        conn, mctpRequester, name + "_TEMP_0", path, eid, gpuTempSensorId,
        objectServer, std::vector<thresholds::Threshold>{});

    dramTempSensor = std::make_shared<NvidiaGpuTempSensor>(
        conn, mctpRequester, name + "_DRAM_0_TEMP_0", path, eid,
        gpuDramTempSensorId, objectServer,
        std::vector<thresholds::Threshold>{thresholds::Threshold{
            thresholds::Level::CRITICAL, thresholds::Direction::HIGH, 95.0}});

    powerSensor = std::make_shared<NvidiaGpuPowerSensor>(
        conn, mctpRequester, name + "_Power_0", path, eid, gpuPowerSensorId,
        objectServer, std::vector<thresholds::Threshold>{});

    peakPower = std::make_shared<NvidiaGpuPowerPeakReading>(
        mctpRequester, name + "_Power_0", eid, gpuPeakPowerSensorId,
        objectServer);

    energySensor = std::make_shared<NvidiaGpuEnergySensor>(
        conn, mctpRequester, name + "_Energy_0", path, eid, gpuEnergySensorId,
        objectServer, std::vector<thresholds::Threshold>{});

    voltageSensor = std::make_shared<NvidiaGpuVoltageSensor>(
        conn, mctpRequester, name + "_Voltage_0", path, eid, gpuVoltageSensorId,
        objectServer, std::vector<thresholds::Threshold>{});

    getTLimitThresholds();

    lg2::info("Added GPU {NAME} Sensors with chassis path: {PATH}.", "NAME",
              name, "PATH", path);

    read();
}
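
// Start fetching the TLimit thresholds: reset the collected values and
// request the first thermal parameter.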
void GpuDevice::getTLimitThresholds()
{
    thresholds = {};
    current_threshold_index = 0;
    getNextThermalParameter();
}
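
// Completion handler for a single thermal parameter read. On success the
// value is stored and the next parameter is requested; once all parameters
// have been read (or an error occurs) the TLimit sensor is created.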
void GpuDevice::readThermalParameterCallback(const std::error_code& ec,
                                             std::span<const uint8_t> buffer)
{
    if (ec)
    {
        lg2::error(
            "Error reading thermal parameter: sending message over MCTP failed, rc={RC}",
            "RC", ec.message());
        processTLimitThresholds(ec);
        return;
    }

    ocp::accelerator_management::CompletionCode cc{};
    uint16_t reasonCode = 0;
    int32_t threshold = 0;

    int rc = gpu::decodeReadThermalParametersResponse(buffer, cc, reasonCode,
                                                      threshold);

    if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
    {
        lg2::error(
            "Error reading thermal parameter: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
            "RC", rc, "CC", cc, "RESC", reasonCode);
        // `ec` is success at this point, so pass an explicit error instead:
        // otherwise processTLimitThresholds would build thresholds from
        // incomplete data.
        processTLimitThresholds(
            std::make_error_code(std::errc::protocol_error));
        return;
    }

    thresholds[current_threshold_index] = threshold;
    current_threshold_index++;

    if (current_threshold_index < thresholdIds.size())
    {
        getNextThermalParameter();
        return;
    }

    processTLimitThresholds(std::error_code{});
}
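
// Encode and send a Read Thermal Parameters request for the threshold ID at
// the current index; the response is handled by readThermalParameterCallback.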
void GpuDevice::getNextThermalParameter()
{
    uint8_t id = thresholdIds[current_threshold_index];

    auto rc =
        gpu::encodeReadThermalParametersRequest(0, id, thermalParamReqMsg);
    if (rc != 0)
    {
        lg2::error(
            "Error reading thermal parameter for eid {EID} and parameter id {PID}: encode failed. rc={RC}",
            "EID", eid, "PID", id, "RC", rc);
        processTLimitThresholds(
            std::make_error_code(static_cast<std::errc>(rc)));
        return;
    }

    mctpRequester.sendRecvMsg(
        eid, thermalParamReqMsg,
        [weak{weak_from_this()}](const std::error_code& ec,
                                 std::span<const uint8_t> buffer) {
            std::shared_ptr<GpuDevice> self = weak.lock();
            if (!self)
            {
                lg2::error("Failed to get lock on GpuDevice");
                return;
            }
            self->readThermalParameterCallback(ec, buffer);
        });
}
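
// Create the TLimit temperature sensor. If the thermal parameters were read
// successfully they become low-going warning, critical and hard-shutdown
// thresholds; otherwise the sensor is created without thresholds.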
void GpuDevice::processTLimitThresholds(const std::error_code& ec)
{
    std::vector<thresholds::Threshold> tLimitThresholds{};
    if (!ec)
    {
        tLimitThresholds = {
            thresholds::Threshold{thresholds::Level::WARNING,
                                  thresholds::Direction::LOW,
                                  static_cast<double>(thresholds[0])},
            thresholds::Threshold{thresholds::Level::CRITICAL,
                                  thresholds::Direction::LOW,
                                  static_cast<double>(thresholds[1])},
            thresholds::Threshold{thresholds::Level::HARDSHUTDOWN,
                                  thresholds::Direction::LOW,
                                  static_cast<double>(thresholds[2])}};
    }

    tLimitSensor = std::make_shared<NvidiaGpuTempSensor>(
        conn, mctpRequester, name + "_TEMP_1", path, eid, gpuTLimitSensorId,
        objectServer, std::move(tLimitThresholds));
}
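
// Update every sensor, then re-arm the poll timer so readings are refreshed
// after another sensorPollMs interval.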
void GpuDevice::read()
{
    tempSensor->update();
    if (tLimitSensor)
    {
        tLimitSensor->update();
    }
    dramTempSensor->update();
    powerSensor->update();
    peakPower->update();
    energySensor->update();
    voltageSensor->update();

    waitTimer.expires_after(sensorPollMs);
    waitTimer.async_wait(
        [weak{weak_from_this()}](const boost::system::error_code& ec) {
            std::shared_ptr<GpuDevice> self = weak.lock();
            if (!self)
            {
                lg2::error("Invalid reference to GpuDevice");
                return;
            }
            if (ec)
            {
                return;
            }
            self->read();
        });
}