Blame - src/nvidia-gpu/NvidiaGpuPowerSensor.cpp - openbmc/dbus-sensors

blob: 9f7c7141b816bfc52ac06b042379d1bd22e8dfd1 [file] [log] [blame]

Harshit Aghera	c8dab72	2025-05-08 15:57:42 +0530	[diff] [blame^]	1	/*
				2	* SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
				3	* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
				4	*/
				5
				6	#include "NvidiaGpuPowerSensor.hpp"
				7
				8	#include "MctpRequester.hpp"
				9	#include "SensorPaths.hpp"
				10	#include "Thresholds.hpp"
				11	#include "Utils.hpp"
				12	#include "sensor.hpp"
				13
				14	#include <bits/basic_string.h>
				15
				16	#include <NvidiaDeviceDiscovery.hpp>
				17	#include <NvidiaGpuMctpVdm.hpp>
				18	#include <OcpMctpVdm.hpp>
				19	#include <phosphor-logging/lg2.hpp>
				20	#include <sdbusplus/asio/connection.hpp>
				21	#include <sdbusplus/asio/object_server.hpp>
				22
				23	#include <cstddef>
				24	#include <cstdint>
				25	#include <functional>
				26	#include <limits>
				27	#include <memory>
				28	#include <string>
				29	#include <utility>
				30	#include <vector>
				31
				32	using namespace std::literals;
				33
				34	// GPU Power Sensor Averaging Interval in seconds, 0 implies default
				35	constexpr uint8_t gpuPowerAveragingIntervalInSec{0};
				36
				37	static constexpr double gpuPowerSensorMaxReading =
				38	std::numeric_limits<uint32_t>::max();
				39	static constexpr double gpuPowerSensorMinReading =
				40	std::numeric_limits<uint32_t>::min();
				41
				42	NvidiaGpuPowerSensor::NvidiaGpuPowerSensor(
				43	std::shared_ptr<sdbusplus::asio::connection>& conn,
				44	mctp::MctpRequester& mctpRequester, const std::string& name,
				45	const std::string& sensorConfiguration, uint8_t eid, uint8_t sensorId,
				46	sdbusplus::asio::object_server& objectServer,
				47	std::vector<thresholds::Threshold>&& thresholdData) :
				48	Sensor(escapeName(name), std::move(thresholdData), sensorConfiguration,
				49	"power", false, true, gpuPowerSensorMaxReading,
				50	gpuPowerSensorMinReading, conn),
				51	eid(eid), sensorId{sensorId},
				52	averagingInterval{gpuPowerAveragingIntervalInSec},
				53	mctpRequester(mctpRequester), objectServer(objectServer)
				54
				55	{
				56	std::string dbusPath = sensorPathPrefix + "power/"s + escapeName(name);
				57
				58	sensorInterface = objectServer.add_interface(
				59	dbusPath, "xyz.openbmc_project.Sensor.Value");
				60
				61	for (const auto& threshold : thresholds)
				62	{
				63	std::string interface = thresholds::getInterface(threshold.level);
				64	thresholdInterfaces[static_cast<size_t>(threshold.level)] =
				65	objectServer.add_interface(dbusPath, interface);
				66	}
				67
				68	association = objectServer.add_interface(dbusPath, association::interface);
				69
				70	// Sensor values are only updated when the difference between the new and
				71	// previous value exceeds the hysteresisPublish threshold. This threshold
				72	// defaults to ((max - min) * 0.0001). Since this sensor lacks defined
				73	// min/max values, theoretical limits are used instead, creating a large
				74	// hysteresisPublish value that blocks D-Bus updates. Setting
				75	// hysteresisPublish to 0 forces all sensor value changes to be published
				76	// to D-Bus.
				77	hysteresisPublish = 0;
				78
				79	setInitialProperties(sensor_paths::unitWatts);
				80	}
				81
				82	NvidiaGpuPowerSensor::~NvidiaGpuPowerSensor()
				83	{
				84	for (const auto& iface : thresholdInterfaces)
				85	{
				86	objectServer.remove_interface(iface);
				87	}
				88	objectServer.remove_interface(association);
				89	objectServer.remove_interface(sensorInterface);
				90	}
				91
				92	void NvidiaGpuPowerSensor::checkThresholds()
				93	{
				94	thresholds::checkThresholds(this);
				95	}
				96
				97	void NvidiaGpuPowerSensor::processResponse(int sendRecvMsgResult)
				98	{
				99	if (sendRecvMsgResult != 0)
				100	{
				101	lg2::error(
				102	"Error updating Power Sensor for eid {EID} and sensor id {SID} : sending message over MCTP failed, rc={RC}",
				103	"EID", eid, "SID", sensorId, "RC", sendRecvMsgResult);
				104	return;
				105	}
				106
				107	ocp::accelerator_management::CompletionCode cc{};
				108	uint16_t reasonCode = 0;
				109	uint32_t power = 0;
				110
				111	auto rc =
				112	gpu::decodeGetCurrentPowerDrawResponse(response, cc, reasonCode, power);
				113
				114	if (rc != 0 \|\| cc != ocp::accelerator_management::CompletionCode::SUCCESS)
				115	{
				116	lg2::error(
				117	"Error updating Power Sensor eid {EID} and sensor id {SID} : decode failed, rc={RC}, cc={CC}, reasonCode={RESC}",
				118	"EID", eid, "SID", sensorId, "RC", rc, "CC", cc, "RESC",
				119	reasonCode);
				120	return;
				121	}
				122
				123	// Reading from the device is in milliwatts and unit set on the dbus
				124	// is watts.
				125	updateValue(power / 1000.0);
				126	}
				127
				128	void NvidiaGpuPowerSensor::update()
				129	{
				130	auto rc = gpu::encodeGetCurrentPowerDrawRequest(0, sensorId,
				131	averagingInterval, request);
				132
				133	if (rc != 0)
				134	{
				135	lg2::error(
				136	"Error updating Temperature Sensor for eid {EID} and sensor id {SID} : encode failed, rc={RC}",
				137	"EID", eid, "SID", sensorId, "RC", rc);
				138	}
				139
				140	mctpRequester.sendRecvMsg(
				141	eid, request, response,
				142	[this](int sendRecvMsgResult) { processResponse(sendRecvMsgResult); });
				143	}