blob: 59954c432e571d7099fff32919cb6ead6b1578fc [file] [log] [blame]
Brandon Kimdab96f12021-02-18 11:21:37 -08001// Copyright 2021 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Sui Chen03eba282021-02-11 11:35:56 -080015#include "metric.hpp"
16
William A. Kennington III7f493702024-02-08 01:09:11 -080017#include "metricblob.pb.n.h"
Sui Chen03eba282021-02-11 11:35:56 -080018
19#include "util.hpp"
20
William A. Kennington III7f493702024-02-08 01:09:11 -080021#include <pb_encode.h>
Sui Chen03eba282021-02-11 11:35:56 -080022#include <sys/statvfs.h>
23
24#include <phosphor-logging/log.hpp>
25
26#include <cstdint>
27#include <filesystem>
28#include <sstream>
29#include <string>
30#include <string_view>
31
32namespace metric_blob
33{
34
35using phosphor::logging::entry;
36using phosphor::logging::log;
37using level = phosphor::logging::level;
38
39BmcHealthSnapshot::BmcHealthSnapshot() :
40 done(false), stringId(0), ticksPerSec(0)
41{}
42
William A. Kennington III7f493702024-02-08 01:09:11 -080043template <typename T>
44static constexpr auto pbEncodeStr = [](pb_ostream_t* stream,
45 const pb_field_iter_t* field,
46 void* const* arg) noexcept {
47 static_assert(sizeof(*std::declval<T>().data()) == sizeof(pb_byte_t));
48 const auto& s = *reinterpret_cast<const T*>(*arg);
49 return pb_encode_tag_for_field(stream, field) &&
50 pb_encode_string(
51 stream, reinterpret_cast<const pb_byte_t*>(s.data()), s.size());
52};
53
54template <typename T>
55static pb_callback_t pbStrEncoder(const T& t) noexcept
56{
57 return {{.encode = pbEncodeStr<T>}, const_cast<T*>(&t)};
58}
59
60template <auto fields, typename T>
61static constexpr auto pbEncodeSubs = [](pb_ostream_t* stream,
62 const pb_field_iter_t* field,
63 void* const* arg) noexcept {
64 for (const auto& sub : *reinterpret_cast<const std::vector<T>*>(*arg))
65 {
66 if (!pb_encode_tag_for_field(stream, field) ||
67 !pb_encode_submessage(stream, fields, &sub))
68 {
69 return false;
70 }
71 }
72 return true;
73};
74
75template <auto fields, typename T>
76static pb_callback_t pbSubsEncoder(const std::vector<T>& t)
77{
78 return {{.encode = pbEncodeSubs<fields, T>},
79 const_cast<std::vector<T>*>(&t)};
80}
81
// CPU time accounting for one process, as gathered from /proc.
struct ProcStatEntry
{
    std::string cmdline;
    std::string tcomm;
    float utime;
    float stime;

    // Ordering used when sorting: the entry with the larger utime + stime
    // sorts first. Entries with equal totals fall back to ascending cmdline,
    // then ascending tcomm.
    bool operator<(const ProcStatEntry& other) const
    {
        const float total = utime + stime;
        const float otherTotal = other.utime + other.stime;
        if (total > otherTotal)
        {
            return true;
        }
        if (otherTotal > total)
        {
            return false;
        }
        return std::tie(cmdline, tcomm) <
               std::tie(other.cmdline, other.tcomm);
    }
};
99
William A. Kennington III7f493702024-02-08 01:09:11 -0800100static bmcmetrics_metricproto_BmcProcStatMetric getProcStatMetric(
101 BmcHealthSnapshot& obj, long ticksPerSec,
102 std::vector<bmcmetrics_metricproto_BmcProcStatMetric_BmcProcStat>& procs,
103 bool& use) noexcept
Sui Chen03eba282021-02-11 11:35:56 -0800104{
William A. Kennington III7f493702024-02-08 01:09:11 -0800105 if (ticksPerSec == 0)
106 {
107 return {};
108 }
Sui Chen03eba282021-02-11 11:35:56 -0800109 constexpr std::string_view procPath = "/proc/";
110
Sui Chen03eba282021-02-11 11:35:56 -0800111 std::vector<ProcStatEntry> entries;
112
113 for (const auto& procEntry : std::filesystem::directory_iterator(procPath))
114 {
115 const std::string& path = procEntry.path();
116 int pid = -1;
117 if (isNumericPath(path, pid))
118 {
119 ProcStatEntry entry;
120
121 try
122 {
123 entry.cmdline = getCmdLine(pid);
124 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec);
125 entry.tcomm = t.tcomm;
126 entry.utime = t.utime;
127 entry.stime = t.stime;
128
129 entries.push_back(entry);
130 }
131 catch (const std::exception& e)
132 {
133 log<level::ERR>("Could not obtain process stats");
134 }
135 }
136 }
137
138 std::sort(entries.begin(), entries.end());
139
140 bool isOthers = false;
141 ProcStatEntry others;
142 others.cmdline = "(Others)";
143 others.utime = others.stime = 0;
144
145 // Only show this many processes and aggregate all remaining ones into
146 // "others" in order to keep the size of the snapshot reasonably small.
147 // With 10 process stat entries and 10 FD count entries, the size of the
148 // snapshot reaches around 1.5KiB. This is non-trivial, and we have to set
149 // the collection interval long enough so as not to over-stress the IPMI
150 // interface and the data collection service. The value of 10 is chosen
151 // empirically, it might be subject to adjustments when the system is
152 // launched later.
153 constexpr int topN = 10;
154
155 for (size_t i = 0; i < entries.size(); ++i)
156 {
157 if (i >= topN)
158 {
159 isOthers = true;
160 }
161
William A. Kennington III7f493702024-02-08 01:09:11 -0800162 const ProcStatEntry& entry = entries[i];
Sui Chen03eba282021-02-11 11:35:56 -0800163
164 if (isOthers)
165 {
166 others.utime += entry.utime;
167 others.stime += entry.stime;
168 }
169 else
170 {
Sui Chen03eba282021-02-11 11:35:56 -0800171 std::string fullCmdline = entry.cmdline;
172 if (entry.tcomm.size() > 0)
173 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800174 fullCmdline += " ";
175 fullCmdline += entry.tcomm;
Sui Chen03eba282021-02-11 11:35:56 -0800176 }
William A. Kennington III7f493702024-02-08 01:09:11 -0800177 procs.emplace_back(
178 bmcmetrics_metricproto_BmcProcStatMetric_BmcProcStat{
179 .sidx_cmdline = obj.getStringID(fullCmdline),
180 .utime = entry.utime,
181 .stime = entry.stime,
182 });
Sui Chen03eba282021-02-11 11:35:56 -0800183 }
184 }
185
186 if (isOthers)
187 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800188 procs.emplace_back(bmcmetrics_metricproto_BmcProcStatMetric_BmcProcStat{
189 .sidx_cmdline = obj.getStringID(others.cmdline),
190 .utime = others.utime,
191 .stime = others.stime,
192
193 });
Sui Chen03eba282021-02-11 11:35:56 -0800194 }
195
William A. Kennington III7f493702024-02-08 01:09:11 -0800196 use = true;
197 return bmcmetrics_metricproto_BmcProcStatMetric{
198 .stats = pbSubsEncoder<
199 bmcmetrics_metricproto_BmcProcStatMetric_BmcProcStat_fields>(procs),
200 };
Sui Chen03eba282021-02-11 11:35:56 -0800201}
202
// Counts the directory entries under /proc/<pid>/fd.
//
// Throws whatever std::filesystem::directory_iterator throws when that
// directory cannot be opened or iterated.
int getFdCount(int pid)
{
    std::string fdDir = "/proc/";
    fdDir += std::to_string(pid);
    fdDir += "/fd";

    const std::filesystem::directory_iterator first(fdDir);
    const std::filesystem::directory_iterator last;
    return std::distance(first, last);
}
209
// Open file descriptor count for one process, as gathered from /proc.
struct FdStatEntry
{
    int fdCount;
    std::string cmdline;
    std::string tcomm;

    // Ordering used when sorting: the entry with the larger fdCount sorts
    // first. Equal counts fall back to ascending cmdline, then ascending
    // tcomm.
    bool operator<(const FdStatEntry& other) const
    {
        if (fdCount != other.fdCount)
        {
            return fdCount > other.fdCount;
        }
        return std::tie(cmdline, tcomm) <
               std::tie(other.cmdline, other.tcomm);
    }
};
226
William A. Kennington III7f493702024-02-08 01:09:11 -0800227static bmcmetrics_metricproto_BmcFdStatMetric getFdStatMetric(
228 BmcHealthSnapshot& obj, long ticksPerSec,
229 std::vector<bmcmetrics_metricproto_BmcFdStatMetric_BmcFdStat>& fds,
230 bool& use) noexcept
Sui Chen03eba282021-02-11 11:35:56 -0800231{
William A. Kennington III7f493702024-02-08 01:09:11 -0800232 if (ticksPerSec == 0)
233 {
234 return {};
235 }
Sui Chen03eba282021-02-11 11:35:56 -0800236
237 // Sort by fd count, no tie-breaking
238 std::vector<FdStatEntry> entries;
239
240 const std::string_view procPath = "/proc/";
241 for (const auto& procEntry : std::filesystem::directory_iterator(procPath))
242 {
243 const std::string& path = procEntry.path();
244 int pid = 0;
245 FdStatEntry entry;
246 if (isNumericPath(path, pid))
247 {
248 try
249 {
250 entry.fdCount = getFdCount(pid);
251 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec);
252 entry.cmdline = getCmdLine(pid);
253 entry.tcomm = t.tcomm;
254 entries.push_back(entry);
255 }
256 catch (const std::exception& e)
257 {
258 log<level::ERR>("Could not get file descriptor stats");
259 }
260 }
261 }
262
263 std::sort(entries.begin(), entries.end());
264
265 bool isOthers = false;
266
267 // Only report the detailed fd count and cmdline for the top 10 entries,
268 // and collapse all others into "others".
269 constexpr int topN = 10;
270
271 FdStatEntry others;
272 others.cmdline = "(Others)";
273 others.fdCount = 0;
274
275 for (size_t i = 0; i < entries.size(); ++i)
276 {
277 if (i >= topN)
278 {
279 isOthers = true;
280 }
281
282 const FdStatEntry& entry = entries[i];
283 if (isOthers)
284 {
285 others.fdCount += entry.fdCount;
286 }
287 else
288 {
Sui Chen03eba282021-02-11 11:35:56 -0800289 std::string fullCmdline = entry.cmdline;
290 if (entry.tcomm.size() > 0)
291 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800292 fullCmdline += " ";
293 fullCmdline += entry.tcomm;
Sui Chen03eba282021-02-11 11:35:56 -0800294 }
William A. Kennington III7f493702024-02-08 01:09:11 -0800295 fds.emplace_back(bmcmetrics_metricproto_BmcFdStatMetric_BmcFdStat{
296 .sidx_cmdline = obj.getStringID(fullCmdline),
297 .fd_count = entry.fdCount,
298 });
Sui Chen03eba282021-02-11 11:35:56 -0800299 }
300 }
301
302 if (isOthers)
303 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800304 fds.emplace_back(bmcmetrics_metricproto_BmcFdStatMetric_BmcFdStat{
305 .sidx_cmdline = obj.getStringID(others.cmdline),
306 .fd_count = others.fdCount,
307 });
Sui Chen03eba282021-02-11 11:35:56 -0800308 }
309
William A. Kennington III7f493702024-02-08 01:09:11 -0800310 use = true;
311 return bmcmetrics_metricproto_BmcFdStatMetric{
312 .stats = pbSubsEncoder<
313 bmcmetrics_metricproto_BmcFdStatMetric_BmcFdStat_fields>(fds),
314 };
315}
316
Willy Tu4dba2202024-04-08 21:27:20 +0000317static bmcmetrics_metricproto_BmcECCMetric getECCMetric(bool& use) noexcept
318{
319 EccCounts eccCounts;
320 use = getECCErrorCounts(eccCounts);
321 if (!use)
322 {
323 return {};
324 }
325 return bmcmetrics_metricproto_BmcECCMetric{
326 .correctable_error_count = eccCounts.correctableErrCount,
327 .uncorrectable_error_count = eccCounts.uncorrectableErrCount,
328 };
329}
330
William A. Kennington III7f493702024-02-08 01:09:11 -0800331static bmcmetrics_metricproto_BmcMemoryMetric getMemMetric() noexcept
332{
333 bmcmetrics_metricproto_BmcMemoryMetric ret = {};
334 auto data = readFileThenGrepIntoString("/proc/meminfo");
335 int value;
336 if (parseMeminfoValue(data, "MemAvailable:", value))
337 {
338 ret.mem_available = value;
339 }
340 if (parseMeminfoValue(data, "Slab:", value))
341 {
342 ret.slab = value;
343 }
344
345 if (parseMeminfoValue(data, "KernelStack:", value))
346 {
347 ret.kernel_stack = value;
348 }
Sui Chen03eba282021-02-11 11:35:56 -0800349 return ret;
350}
351
William A. Kennington III7f493702024-02-08 01:09:11 -0800352static bmcmetrics_metricproto_BmcUptimeMetric
353 getUptimeMetric(bool& use) noexcept
Sui Chen03eba282021-02-11 11:35:56 -0800354{
William A. Kennington III7f493702024-02-08 01:09:11 -0800355 bmcmetrics_metricproto_BmcUptimeMetric ret = {};
Sui Chen03eba282021-02-11 11:35:56 -0800356
Michael Shenb63d6312021-04-26 13:30:57 +0800357 double uptime = 0;
Michael Shenb63d6312021-04-26 13:30:57 +0800358 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800359 auto data = readFileThenGrepIntoString("/proc/uptime");
360 double idleProcessTime = 0;
361 if (!parseProcUptime(data, uptime, idleProcessTime))
362 {
363 log<level::ERR>("Error parsing /proc/uptime");
364 return ret;
365 }
366 ret.uptime = uptime;
367 ret.idle_process_time = idleProcessTime;
Michael Shenb63d6312021-04-26 13:30:57 +0800368 }
William A. Kennington III7f493702024-02-08 01:09:11 -0800369
370 BootTimesMonotonic btm;
371 if (!getBootTimesMonotonic(btm))
Michael Shenb63d6312021-04-26 13:30:57 +0800372 {
373 log<level::ERR>("Could not get boot time");
William A. Kennington III7f493702024-02-08 01:09:11 -0800374 return ret;
375 }
376 if (btm.firmwareTime == 0 && btm.powerOnSecCounterTime != 0)
377 {
378 ret.firmware_boot_time_sec =
379 static_cast<double>(btm.powerOnSecCounterTime) - uptime;
Michael Shenb63d6312021-04-26 13:30:57 +0800380 }
381 else
Sui Chen03eba282021-02-11 11:35:56 -0800382 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800383 ret.firmware_boot_time_sec =
384 static_cast<double>(btm.firmwareTime - btm.loaderTime) / 1e6;
385 }
386 ret.loader_boot_time_sec = static_cast<double>(btm.loaderTime) / 1e6;
387 if (btm.initrdTime != 0)
388 {
389 ret.kernel_boot_time_sec = static_cast<double>(btm.initrdTime) / 1e6;
390 ret.initrd_boot_time_sec =
391 static_cast<double>(btm.userspaceTime - btm.initrdTime) / 1e6;
392 ret.userspace_boot_time_sec =
393 static_cast<double>(btm.finishTime - btm.userspaceTime) / 1e6;
394 }
395 else
396 {
397 ret.kernel_boot_time_sec = static_cast<double>(btm.userspaceTime) / 1e6;
398 ret.initrd_boot_time_sec = 0;
399 ret.userspace_boot_time_sec =
400 static_cast<double>(btm.finishTime - btm.userspaceTime) / 1e6;
Sui Chen03eba282021-02-11 11:35:56 -0800401 }
Sui Chen03eba282021-02-11 11:35:56 -0800402
William A. Kennington III7f493702024-02-08 01:09:11 -0800403 use = true;
404 return ret;
405}
406
407static bmcmetrics_metricproto_BmcDiskSpaceMetric
408 getStorageMetric(bool& use) noexcept
409{
410 bmcmetrics_metricproto_BmcDiskSpaceMetric ret = {};
Sui Chen03eba282021-02-11 11:35:56 -0800411 struct statvfs fiData;
William A. Kennington III7f493702024-02-08 01:09:11 -0800412 if (statvfs("/", &fiData) < 0)
Sui Chen03eba282021-02-11 11:35:56 -0800413 {
414 log<level::ERR>("Could not call statvfs");
415 }
416 else
417 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800418 ret.rwfs_kib_available = (fiData.f_bsize * fiData.f_bfree) / 1024;
419 use = true;
Sui Chen03eba282021-02-11 11:35:56 -0800420 }
William A. Kennington III7f493702024-02-08 01:09:11 -0800421 return ret;
422}
Sui Chen03eba282021-02-11 11:35:56 -0800423
// Collects every metric, serializes the snapshot into pbDump and marks the
// snapshot as done. On an encoding failure the error is logged and `done`
// stays false.
void BmcHealthSnapshot::doWork()
{
    // The process and fd metrics require a sane ticks_per_sec value, typically
    // 100 on the BMC. In the very rare circumstance when it's 0, their
    // collectors return early and the snapshot is produced without them.
    ticksPerSec = getTicksPerSec();

    // Encodes the string table: one StringEntry submessage per string, in
    // ascending string ID order.
    static constexpr auto encodeStringTable =
        [](pb_ostream_t* stream, const pb_field_t* field,
           void* const* arg) noexcept {
        auto& self = *reinterpret_cast<BmcHealthSnapshot*>(*arg);

        // Invert the string -> ID map so strings can be emitted by ID.
        std::vector<std::string_view> ordered(self.stringTable.size());
        for (const auto& [text, index] : self.stringTable)
        {
            ordered[index] = text;
        }

        for (auto& text : ordered)
        {
            bmcmetrics_metricproto_BmcStringTable_StringEntry entry = {
                .value = pbStrEncoder(text),
            };
            const bool written =
                pb_encode_tag_for_field(stream, field) &&
                pb_encode_submessage(
                    stream,
                    bmcmetrics_metricproto_BmcStringTable_StringEntry_fields,
                    &entry);
            if (!written)
            {
                return false;
            }
        }
        return true;
    };

    // Backing storage referenced by the procstat / fdstat encode callbacks;
    // both must stay alive until the encode passes below have finished.
    std::vector<bmcmetrics_metricproto_BmcProcStatMetric_BmcProcStat> procs;
    std::vector<bmcmetrics_metricproto_BmcFdStatMetric_BmcFdStat> fds;

    // Each has_* flag is initialized to false and then switched on by the
    // collector that follows it, through the reference it is given. This
    // relies on the members being initialized in the order written.
    bmcmetrics_metricproto_BmcMetricSnapshot snapshot = {
        .has_string_table = true,
        .string_table =
            {
                .entries = {{.encode = encodeStringTable}, this},
            },
        .has_memory_metric = true,
        .memory_metric = getMemMetric(),
        .has_uptime_metric = false,
        .uptime_metric = getUptimeMetric(snapshot.has_uptime_metric),
        .has_storage_space_metric = false,
        .storage_space_metric =
            getStorageMetric(snapshot.has_storage_space_metric),
        .has_procstat_metric = false,
        .procstat_metric = getProcStatMetric(*this, ticksPerSec, procs,
                                             snapshot.has_procstat_metric),
        .has_fdstat_metric = false,
        .fdstat_metric = getFdStatMetric(*this, ticksPerSec, fds,
                                         snapshot.has_fdstat_metric),
        .has_ecc_metric = false,
        .ecc_metric = getECCMetric(snapshot.has_ecc_metric),
    };

    // First pass: encode into a zero-initialized stream purely to learn the
    // number of bytes the message needs.
    pb_ostream_t sizingStream = {};
    if (!pb_encode(&sizingStream,
                   bmcmetrics_metricproto_BmcMetricSnapshot_fields, &snapshot))
    {
        auto msg = std::format("Getting pb size: {}",
                               PB_GET_ERROR(&sizingStream));
        log<level::ERR>(msg.c_str());
        return;
    }

    // Second pass: encode for real into a buffer of exactly that size.
    pbDump.resize(sizingStream.bytes_written);
    auto outStream = pb_ostream_from_buffer(
        reinterpret_cast<pb_byte_t*>(pbDump.data()), pbDump.size());
    if (!pb_encode(&outStream,
                   bmcmetrics_metricproto_BmcMetricSnapshot_fields, &snapshot))
    {
        auto msg = std::format("Writing pb msg: {}", PB_GET_ERROR(&outStream));
        log<level::ERR>(msg.c_str());
        return;
    }
    done = true;
}
500
501// BmcBlobSessionStat (9) but passing meta as reference instead of pointer,
502// since the metadata must not be null at this point.
503bool BmcHealthSnapshot::stat(blobs::BlobMeta& meta)
504{
505 if (!done)
506 {
507 // Bits 8~15 are blob-specific state flags.
508 // For this blob, bit 8 is set when metric collection is still in
509 // progress.
510 meta.blobState |= (1 << 8);
511 }
512 else
513 {
514 meta.blobState = 0;
515 meta.blobState = blobs::StateFlags::open_read;
516 meta.size = pbDump.size();
517 }
518 return true;
519}
520
521std::string_view BmcHealthSnapshot::read(uint32_t offset,
522 uint32_t requestedSize)
523{
524 uint32_t size = static_cast<uint32_t>(pbDump.size());
525 if (offset >= size)
526 {
527 return {};
528 }
529 return std::string_view(pbDump.data() + offset,
530 std::min(requestedSize, size - offset));
531}
532
533int BmcHealthSnapshot::getStringID(const std::string_view s)
534{
535 int ret = 0;
536 auto itr = stringTable.find(s.data());
537 if (itr == stringTable.end())
538 {
539 stringTable[s.data()] = stringId;
540 ret = stringId;
541 ++stringId;
542 }
543 else
544 {
545 ret = itr->second;
546 }
547 return ret;
548}
549
Patrick Williams2be45232023-05-10 07:51:22 -0500550} // namespace metric_blob