blob: ee56a72a7cb1d1d7c57f48d4034a84ce8f0d4ad2 [file] [log] [blame]
/*
 * SPDX-FileCopyrightText: Copyright OpenBMC Authors
 * SPDX-License-Identifier: Apache-2.0
 */
5
6#include "NvidiaPcieDevice.hpp"
7
8#include "NvidiaDeviceDiscovery.hpp"
Harshit Aghera7427aee2025-10-17 18:59:09 +05309#include "NvidiaEthPort.hpp"
Harshit Aghera68a8e2d2025-09-29 15:22:25 +053010#include "NvidiaGpuMctpVdm.hpp"
Harshit Agherae0b80e12025-08-28 14:05:29 +053011#include "NvidiaPcieInterface.hpp"
Harshit Aghera68a8e2d2025-09-29 15:22:25 +053012#include "NvidiaPciePort.hpp"
Harshit Aghera1180ed42025-09-30 16:55:39 +053013#include "NvidiaPciePortMetrics.hpp"
Harshit Agherae0b80e12025-08-28 14:05:29 +053014#include "Utils.hpp"
15
16#include <MctpRequester.hpp>
Harshit Aghera68a8e2d2025-09-29 15:22:25 +053017#include <OcpMctpVdm.hpp>
Harshit Agherae0b80e12025-08-28 14:05:29 +053018#include <boost/asio/io_context.hpp>
19#include <phosphor-logging/lg2.hpp>
20#include <sdbusplus/asio/connection.hpp>
21#include <sdbusplus/asio/object_server.hpp>
Harshit Aghera7427aee2025-10-17 18:59:09 +053022#include <sdbusplus/message/native_types.hpp>
Harshit Agherae0b80e12025-08-28 14:05:29 +053023
24#include <chrono>
25#include <cstdint>
Harshit Aghera68a8e2d2025-09-29 15:22:25 +053026#include <format>
Harshit Agherae0b80e12025-08-28 14:05:29 +053027#include <memory>
Harshit Aghera68a8e2d2025-09-29 15:22:25 +053028#include <span>
Harshit Agherae0b80e12025-08-28 14:05:29 +053029#include <string>
Harshit Aghera68a8e2d2025-09-29 15:22:25 +053030#include <system_error>
Harshit Aghera7427aee2025-10-17 18:59:09 +053031#include <utility>
32#include <vector>
Harshit Agherae0b80e12025-08-28 14:05:29 +053033
34PcieDevice::PcieDevice(const SensorConfigs& configs, const std::string& name,
35 const std::string& path,
36 const std::shared_ptr<sdbusplus::asio::connection>& conn,
37 uint8_t eid, boost::asio::io_context& io,
38 mctp::MctpRequester& mctpRequester,
39 sdbusplus::asio::object_server& objectServer) :
40 eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
41 waitTimer(io, std::chrono::steady_clock::duration(0)),
42 mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
43 configs(configs), name(escapeName(name)), path(path)
44{}
45
46void PcieDevice::init()
47{
Harshit Aghera7427aee2025-10-17 18:59:09 +053048 sdbusplus::message::object_path networkAdapterPath =
49 sdbusplus::message::object_path(nicPathPrefix) / (name + "_NIC");
50
51 networkAdapterInterface = objectServer.add_interface(
52 networkAdapterPath,
53 "xyz.openbmc_project.Inventory.Item.NetworkAdapter");
54
55 std::vector<Association> associations;
56 associations.emplace_back(
57 "contained_by", "containing",
58 sdbusplus::message::object_path(path).parent_path());
59
60 networkAdapterAssociationInterface =
61 objectServer.add_interface(networkAdapterPath, association::interface);
62 networkAdapterAssociationInterface->register_property(
63 "Associations", associations);
64
65 if (!networkAdapterInterface->initialize())
66 {
67 lg2::error(
68 "Failed to initialize network adapter interface for for eid {EID}",
69 "EID", eid);
70 }
71
72 if (!networkAdapterAssociationInterface->initialize())
73 {
74 lg2::error(
75 "Error initializing Association Interface for Network Adapter for eid {EID}",
76 "EID", eid);
77 }
78
Harshit Aghera68a8e2d2025-09-29 15:22:25 +053079 getPciePortCounts();
Harshit Aghera7427aee2025-10-17 18:59:09 +053080
81 for (uint64_t k = 0; k < configs.nicNetworkPortCount; ++k)
82 {
83 getNetworkPortAddresses(static_cast<uint16_t>(k + 1));
84 }
Harshit Aghera68a8e2d2025-09-29 15:22:25 +053085}
86
87void PcieDevice::getPciePortCounts()
88{
89 const int rc = gpu::encodeListPciePortsRequest(0, getPciePortCountsRequest);
90
91 if (rc != 0)
92 {
93 lg2::error(
94 "Error updating PCIe Port Counts: encode failed, rc={RC}, EID={EID}",
95 "RC", rc, "EID", eid);
96 return;
97 }
98
99 mctpRequester.sendRecvMsg(
100 eid, getPciePortCountsRequest,
101 [weak{weak_from_this()}](const std::error_code& ec,
102 std::span<const uint8_t> buffer) {
103 std::shared_ptr<PcieDevice> self = weak.lock();
104 if (!self)
105 {
106 lg2::error("Invalid reference to PcieDevice");
107 return;
108 }
109 self->processPciePortCountsResponse(ec, buffer);
110 });
111}
112
113void PcieDevice::processPciePortCountsResponse(
114 const std::error_code& ec, std::span<const uint8_t> response)
115{
116 if (ec)
117 {
118 lg2::error(
119 "Error processing PCIe Port Counts response: sending message over MCTP failed, rc={RC}, EID={EID}",
120 "RC", ec.message(), "EID", eid);
121 return;
122 }
123
124 ocp::accelerator_management::CompletionCode cc{};
125 uint16_t reasonCode = 0;
126
127 const int rc = gpu::decodeListPciePortsResponse(
128 response, cc, reasonCode, pcieDeviceInfo.numUpstreamPorts,
129 pcieDeviceInfo.numDownstreamPorts);
130
131 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
132 {
133 lg2::error(
134 "Error processing PCIe Port Counts response: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}, EID={EID}",
135 "RC", rc, "CC", static_cast<uint8_t>(cc), "RESC", reasonCode, "EID",
136 eid);
137 return;
138 }
139
140 lg2::info("PCIe Device with eid {EID} has {UP} upstream ports.", "EID", eid,
141 "UP", pcieDeviceInfo.numUpstreamPorts);
142
Harshit Agherae0b80e12025-08-28 14:05:29 +0530143 makeSensors();
144}
145
Harshit Aghera7427aee2025-10-17 18:59:09 +0530146void PcieDevice::getNetworkPortAddresses(const uint16_t portNumber)
147{
148 const int rc = gpu::encodeGetPortNetworkAddressesRequest(
149 0, portNumber, getPortNetworkAddressesRequest);
150
151 if (rc != 0)
152 {
153 lg2::error(
154 "Error updating Network Port Addresses: encode failed, rc={RC}, EID={EID}",
155 "RC", rc, "EID", eid);
156 return;
157 }
158
159 mctpRequester.sendRecvMsg(
160 eid, getPortNetworkAddressesRequest,
161 [portNumber, weak{weak_from_this()}](const std::error_code& ec,
162 std::span<const uint8_t> buffer) {
163 std::shared_ptr<PcieDevice> self = weak.lock();
164 if (!self)
165 {
166 lg2::error("Invalid reference to PcieDevice, EID={EID}", "EID",
167 self->eid);
168 return;
169 }
170 self->processGetNetworkPortAddressesResponse(portNumber, ec,
171 buffer);
172 });
173}
174
/**
 * @brief Handle the reply to a network port address request.
 *
 * Decodes the response and, when the port's link type is Ethernet, creates an
 * NvidiaEthPortMetrics object named "Port_<portNumber>" under "<name>_NIC" and
 * appends it to ethPortMetrics. Ports with any other link type are left
 * untouched. Transport errors and decode failures are logged and end
 * processing.
 *
 * @param portNumber Port number the request was issued for.
 * @param ec         Result of the MCTP exchange.
 * @param response   Raw response bytes received from the device.
 */
void PcieDevice::processGetNetworkPortAddressesResponse(
    const uint16_t portNumber, const std::error_code& ec,
    std::span<const uint8_t> response)
{
    if (ec)
    {
        lg2::error(
            "Error processing Network Port Addresses response: sending message over MCTP failed, rc={RC}, EID={EID}",
            "RC", ec.message(), "EID", eid);
        return;
    }

    ocp::accelerator_management::CompletionCode completion{};
    uint16_t reason = 0;
    gpu::NetworkPortLinkType portLinkType = gpu::NetworkPortLinkType::UNKNOWN;
    // The decoder fills this in; only the link type is used below.
    std::vector<std::pair<uint8_t, uint64_t>> decodedAddresses;

    const int decodeStatus = gpu::decodeGetPortNetworkAddressesResponse(
        response, completion, reason, portLinkType, decodedAddresses);

    const bool decoded =
        (decodeStatus == 0) &&
        (completion == ocp::accelerator_management::CompletionCode::SUCCESS);

    if (!decoded)
    {
        lg2::error(
            "Error processing Network Port Addresses response: decode failed, rc={RC}, cc={CC}, reasonCode={RESC}, EID={EID}",
            "RC", decodeStatus, "CC", static_cast<uint8_t>(completion), "RESC",
            reason, "EID", eid);
        return;
    }

    if (portLinkType != gpu::NetworkPortLinkType::ETHERNET)
    {
        return;
    }

    lg2::info("Port {PN} of PCIe Device with eid {EID} is of type Ethernet.",
              "EID", eid, "PN", portNumber);

    const std::string adapterName = name + "_NIC";
    const std::string ethPortName = std::format("Port_{}", portNumber);

    ethPortMetrics.emplace_back(std::make_shared<NvidiaEthPortMetrics>(
        conn, mctpRequester, ethPortName, adapterName, path, eid, portNumber,
        objectServer));
}
219
/**
 * @brief Create the PCIe interface and per-port objects, then start polling.
 *
 * Builds one NvidiaPcieInterface for the device. For every upstream port
 * ("UP_<n>") and every downstream port beneath it ("DOWN_<m>"), it creates a
 * port-info object plus three metric objects (errors, counters,
 * L0-to-recovery count). Downstream ports are numbered with a single running
 * index across all upstream ports. Finishes by calling read().
 */
void PcieDevice::makeSensors()
{
    const std::string pcieDeviceName = name + "_PCIe";

    pcieInterface = std::make_shared<NvidiaPcieInterface>(
        conn, mctpRequester, pcieDeviceName, path, eid, objectServer);

    // Registers the info object and the three metric objects for one port.
    const auto addPort = [this, &pcieDeviceName](
                             const std::string& portName,
                             gpu::PciePortType portType,
                             uint64_t upstreamIndex, uint64_t portIndex) {
        pciePorts.emplace_back(std::make_shared<NvidiaPciePortInfo>(
            conn, mctpRequester, portName, pcieDeviceName, path, eid, portType,
            upstreamIndex, portIndex, objectServer));

        pciePortMetrics.emplace_back(makeNvidiaPciePortErrors(
            conn, mctpRequester, portName, pcieDeviceName, path, eid, portType,
            upstreamIndex, portIndex, objectServer));

        pciePortMetrics.emplace_back(makeNvidiaPciePortCounters(
            conn, mctpRequester, portName, pcieDeviceName, path, eid, portType,
            upstreamIndex, portIndex, objectServer));

        pciePortMetrics.emplace_back(makeNvidiaPciePortL0ToRecoveryCount(
            conn, mctpRequester, portName, pcieDeviceName, path, eid, portType,
            upstreamIndex, portIndex, objectServer));
    };

    // Running index shared by all downstream ports of the device.
    uint64_t nextDownstreamIndex = 0;

    for (uint64_t upIdx = 0; upIdx < pcieDeviceInfo.numUpstreamPorts; ++upIdx)
    {
        addPort(std::format("UP_{}", upIdx), gpu::PciePortType::UPSTREAM,
                upIdx, upIdx);

        // NOTE(review): assumes numDownstreamPorts has an entry for every
        // upstream port index -- confirm against the decoder.
        for (uint64_t downIdx = 0;
             downIdx < pcieDeviceInfo.numDownstreamPorts[upIdx]; ++downIdx)
        {
            addPort(std::format("DOWN_{}", nextDownstreamIndex),
                    gpu::PciePortType::DOWNSTREAM, upIdx, nextDownstreamIndex);

            ++nextDownstreamIndex;
        }
    }

    lg2::info("Added PCIe {NAME} Sensors with chassis path: {PATH}.", "NAME",
              name, "PATH", path);

    read();
}
283
284void PcieDevice::read()
285{
286 pcieInterface->update();
287
Harshit Aghera68a8e2d2025-09-29 15:22:25 +0530288 for (auto& port : pciePorts)
289 {
290 port->update();
291 }
292
Harshit Aghera1180ed42025-09-30 16:55:39 +0530293 for (auto& portMetrics : pciePortMetrics)
294 {
295 portMetrics->update();
296 }
297
Harshit Aghera7427aee2025-10-17 18:59:09 +0530298 for (auto& ethPortMetric : ethPortMetrics)
299 {
300 ethPortMetric->update();
301 }
302
Harshit Agherae0b80e12025-08-28 14:05:29 +0530303 waitTimer.expires_after(std::chrono::milliseconds(sensorPollMs));
304 waitTimer.async_wait([this](const boost::system::error_code& ec) {
305 if (ec)
306 {
307 return;
308 }
309 read();
310 });
311}