blob: 456ff8034a2e29c93fb9869eaaf66ea0962660da [file] [log] [blame]
Harshit Aghera1180ed42025-09-30 16:55:39 +05301/*
2 * SPDX-FileCopyrightText: Copyright OpenBMC Authors
3 * SPDX-License-Identifier: Apache-2.0
4 */
5
6#include "NvidiaPciePortMetrics.hpp"
7
8#include "NvidiaUtils.hpp"
9#include "Utils.hpp"
10
11#include <bits/basic_string.h>
12
13#include <MctpRequester.hpp>
14#include <NvidiaGpuMctpVdm.hpp>
15#include <NvidiaPcieDevice.hpp>
16#include <OcpMctpVdm.hpp>
17#include <phosphor-logging/lg2.hpp>
18#include <sdbusplus/asio/connection.hpp>
19#include <sdbusplus/asio/object_server.hpp>
20#include <sdbusplus/message/native_types.hpp>
21
22#include <cstddef>
23#include <cstdint>
24#include <format>
25#include <functional>
26#include <memory>
27#include <span>
28#include <string>
29#include <system_error>
30#include <vector>
31
32using std::string;
33
34using namespace std::literals;
35
/// D-Bus interface name on which each metric's "Unit" and "Value" properties
/// are registered.
constexpr const char* metricInterface{"xyz.openbmc_project.Metric.Value"};
37
/**
 * @brief Create the D-Bus metric objects for one scalar group of a PCIe port.
 *
 * For every entry of @p metricsInfo, a metric value interface (with "Unit"
 * set to Count and "Value" set to 0.0) and an association interface pointing
 * at the port's D-Bus object are added to @p objectServer and initialized.
 * A failed initialization is logged; construction continues with the
 * remaining metrics.
 *
 * @param conn               D-Bus connection, stored in the object.
 * @param mctpRequester      MCTP requester used later by update().
 * @param name               Port name; used in the metric and port paths.
 * @param pcieDeviceName     Name of the PCIe device owning the port.
 * @param path               Stored in the object; not used here otherwise.
 * @param eid                MCTP endpoint ID of the device.
 * @param portType           PCIe port type passed in telemetry requests.
 * @param upstreamPortNumber Upstream port number passed in telemetry requests.
 * @param portNumber         Port number passed in telemetry requests.
 * @param objectServer       Object server that hosts the created interfaces.
 * @param scalarGroupId      Scalar group queried by update().
 * @param metricsInfo        (id, name-suffix) pairs; the id selects the slot
 *                           in the interface containers, the suffix is
 *                           appended to the metric's D-Bus path prefix.
 */
NvidiaPciePortMetrics::NvidiaPciePortMetrics(
    std::shared_ptr<sdbusplus::asio::connection>& conn,
    mctp::MctpRequester& mctpRequester, const std::string& name,
    const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
    gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
    sdbusplus::asio::object_server& objectServer, uint8_t scalarGroupId,
    const std::vector<NvidiaMetricInfo>& metricsInfo) :
    eid(eid), portType(portType), upstreamPortNumber(upstreamPortNumber),
    portNumber(portNumber), scalarGroupId(scalarGroupId), path(path),
    conn(conn), mctpRequester(mctpRequester)
{
    const std::string metricsDbusPathPrefix =
        metricPath + std::format("port_{}_{}", pcieDeviceName, name);

    // Object path of the port itself; every metric is associated with it.
    const sdbusplus::message::object_path portDbusPath =
        sdbusplus::message::object_path(pcieDevicePathPrefix) / pcieDeviceName /
        name;

    // The binding is called metricName so it does not shadow the `name`
    // constructor parameter (the port name).
    for (const auto& [id, metricName] : metricsInfo)
    {
        const std::string metricsDbusPath = metricsDbusPathPrefix + metricName;

        metricValueInterfaces[id] =
            objectServer.add_interface(metricsDbusPath, metricInterface);
        metricValueInterfaces[id]->register_property(
            "Unit", "xyz.openbmc_project.Metric.Value.Unit.Count"s);
        metricValueInterfaces[id]->register_property("Value", 0.0);

        std::vector<Association> associations;
        associations.emplace_back("measuring", "measured_by", portDbusPath);

        metricAssociationInterfaces[id] =
            objectServer.add_interface(metricsDbusPath, association::interface);
        metricAssociationInterfaces[id]->register_property("Associations",
                                                           associations);

        if (!metricValueInterfaces[id]->initialize())
        {
            // Each structured-log key is passed exactly once.
            lg2::error(
                "Error initializing PCIe Port Metric Interface for EID={EID}, "
                "PortType={PT}, PortNumber={PN}, ScalarGroup={SG}, Metric={MN}",
                "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
                portNumber, "SG", scalarGroupId, "MN", metricName);
        }

        if (!metricAssociationInterfaces[id]->initialize())
        {
            lg2::error(
                "Error initializing PCIe Port Metric Association Interface for EID={EID}, "
                "PortType={PT}, PortNumber={PN}, ScalarGroup={SG}, Metric={MN}",
                "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
                portNumber, "SG", scalarGroupId, "MN", metricName);
        }
    }
}
95
96void NvidiaPciePortMetrics::processResponse(
97 const std::error_code& sendRecvMsgResult, std::span<const uint8_t> response)
98{
99 if (sendRecvMsgResult)
100 {
101 lg2::error(
102 "Error updating PCIe Port Metrics: sending message over MCTP failed, "
103 "rc={RC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
104 "RC", sendRecvMsgResult.message(), "EID", eid, "PT",
105 static_cast<uint8_t>(portType), "PN", portNumber, "SG",
106 scalarGroupId);
107 return;
108 }
109
110 ocp::accelerator_management::CompletionCode cc{};
111 uint16_t reasonCode = 0;
112 size_t numTelemetryValue = 0;
113
114 int rc = gpu::decodeQueryScalarGroupTelemetryV2Response(
115 response, cc, reasonCode, numTelemetryValue, telemetryValues);
116
117 if (rc != 0 || cc != ocp::accelerator_management::CompletionCode::SUCCESS)
118 {
119 lg2::error(
120 "Error updating PCIe Port Errors: decode failed, "
121 "rc={RC}, cc={CC}, reasonCode={RESC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
122 "RC", rc, "CC", static_cast<uint8_t>(cc), "RESC", reasonCode, "EID",
123 eid, "PT", static_cast<uint8_t>(portType), "PN", portNumber, "SG",
124 scalarGroupId);
125 return;
126 }
127
128 for (size_t i = 0; i < numTelemetryValue; ++i)
129 {
130 if (metricValueInterfaces[i] != nullptr)
131 {
132 metricValueInterfaces[i]->set_property(
133 "Value", static_cast<double>(telemetryValues[i]));
134 }
135 }
136}
137
138void NvidiaPciePortMetrics::update()
139{
140 auto rc = gpu::encodeQueryScalarGroupTelemetryV2Request(
141 0, portType, upstreamPortNumber, portNumber, scalarGroupId, request);
142
143 if (rc != 0)
144 {
145 lg2::error(
146 "Error updating PCIe Port Errors: encode failed, rc={RC}, EID={EID}, PortType={PT}, PortNumber={PN}, ScalarGroup={SG}",
147 "RC", rc, "EID", eid, "PT", static_cast<uint8_t>(portType), "PN",
148 portNumber, "SG", scalarGroupId);
149 return;
150 }
151
152 mctpRequester.sendRecvMsg(
153 eid, request,
154 [weak{weak_from_this()}](const std::error_code& ec,
155 std::span<const uint8_t> buffer) {
156 std::shared_ptr<NvidiaPciePortMetrics> self = weak.lock();
157 if (!self)
158 {
159 lg2::error("Invalid reference to NvidiaPciePortErrors");
160 return;
161 }
162 self->processResponse(ec, buffer);
163 });
164}
165
166std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortErrors(
167 std::shared_ptr<sdbusplus::asio::connection>& conn,
168 mctp::MctpRequester& mctpRequester, const std::string& name,
169 const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
170 gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
171 sdbusplus::asio::object_server& objectServer)
172{
173 static constexpr uint8_t nvidiaPciePortErrorScalarGroupId = 2;
174
175 return std::make_shared<NvidiaPciePortMetrics>(
176 conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
177 upstreamPortNumber, portNumber, objectServer,
178 nvidiaPciePortErrorScalarGroupId,
179 std::vector<NvidiaMetricInfo>{
180 {0, "/pcie/non_fatal_error_count"},
181 {1, "/pcie/fatal_error_count"},
182 {2, "/pcie/unsupported_request_count"},
183 {3, "/pcie/correctable_error_count"},
184 });
185}
186
187std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortCounters(
188 std::shared_ptr<sdbusplus::asio::connection>& conn,
189 mctp::MctpRequester& mctpRequester, const std::string& name,
190 const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
191 gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
192 sdbusplus::asio::object_server& objectServer)
193{
194 static constexpr uint8_t nvidiaPciePortCounterScalarGroupId = 4;
195
196 return std::make_shared<NvidiaPciePortMetrics>(
197 conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
198 upstreamPortNumber, portNumber, objectServer,
199 nvidiaPciePortCounterScalarGroupId,
200 std::vector<NvidiaMetricInfo>{
201 {1, "/pcie/nak_received_count"},
202 {2, "/pcie/nak_sent_count"},
203 {4, "/pcie/replay_rollover_count"},
204 {6, "/pcie/replay_count"},
205 });
206}
207
208std::shared_ptr<NvidiaPciePortMetrics> makeNvidiaPciePortL0ToRecoveryCount(
209 std::shared_ptr<sdbusplus::asio::connection>& conn,
210 mctp::MctpRequester& mctpRequester, const std::string& name,
211 const std::string& pcieDeviceName, const std::string& path, uint8_t eid,
212 gpu::PciePortType portType, uint8_t upstreamPortNumber, uint8_t portNumber,
213 sdbusplus::asio::object_server& objectServer)
214{
215 static constexpr uint8_t nvidiaPciePortL0ToRecoveryCountScalarGroupId = 3;
216
217 return std::make_shared<NvidiaPciePortMetrics>(
218 conn, mctpRequester, name, pcieDeviceName, path, eid, portType,
219 upstreamPortNumber, portNumber, objectServer,
220 nvidiaPciePortL0ToRecoveryCountScalarGroupId,
221 std::vector<NvidiaMetricInfo>{
222 {0, "/pcie/l0_to_recovery_count"},
223 });
224}