blob: d3fe0e218948e52850f9ae8ae6bcbf09dc3e5449 [file] [log] [blame]
Brandon Kimdab96f12021-02-18 11:21:37 -08001// Copyright 2021 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Sui Chen03eba282021-02-11 11:35:56 -080015#include "metric.hpp"
16
William A. Kennington III7f493702024-02-08 01:09:11 -080017#include "metricblob.pb.n.h"
Sui Chen03eba282021-02-11 11:35:56 -080018
19#include "util.hpp"
20
William A. Kennington III7f493702024-02-08 01:09:11 -080021#include <pb_encode.h>
Sui Chen03eba282021-02-11 11:35:56 -080022#include <sys/statvfs.h>
23
24#include <phosphor-logging/log.hpp>
25
26#include <cstdint>
27#include <filesystem>
28#include <sstream>
29#include <string>
30#include <string_view>
31
32namespace metric_blob
33{
34
35using phosphor::logging::entry;
36using phosphor::logging::log;
37using level = phosphor::logging::level;
38
39BmcHealthSnapshot::BmcHealthSnapshot() :
40 done(false), stringId(0), ticksPerSec(0)
41{}
42
William A. Kennington III7f493702024-02-08 01:09:11 -080043template <typename T>
Patrick Williamsc66ebc32024-08-16 15:21:56 -040044static constexpr auto pbEncodeStr =
45 [](pb_ostream_t* stream, const pb_field_iter_t* field,
46 void* const* arg) noexcept {
47 static_assert(sizeof(*std::declval<T>().data()) == sizeof(pb_byte_t));
48 const auto& s = *reinterpret_cast<const T*>(*arg);
49 return pb_encode_tag_for_field(stream, field) &&
50 pb_encode_string(stream,
51 reinterpret_cast<const pb_byte_t*>(s.data()),
52 s.size());
53 };
William A. Kennington III7f493702024-02-08 01:09:11 -080054
55template <typename T>
56static pb_callback_t pbStrEncoder(const T& t) noexcept
57{
58 return {{.encode = pbEncodeStr<T>}, const_cast<T*>(&t)};
59}
60
61template <auto fields, typename T>
Patrick Williamsc66ebc32024-08-16 15:21:56 -040062static constexpr auto pbEncodeSubs =
63 [](pb_ostream_t* stream, const pb_field_iter_t* field,
64 void* const* arg) noexcept {
65 for (const auto& sub : *reinterpret_cast<const std::vector<T>*>(*arg))
William A. Kennington III7f493702024-02-08 01:09:11 -080066 {
Patrick Williamsc66ebc32024-08-16 15:21:56 -040067 if (!pb_encode_tag_for_field(stream, field) ||
68 !pb_encode_submessage(stream, fields, &sub))
69 {
70 return false;
71 }
William A. Kennington III7f493702024-02-08 01:09:11 -080072 }
Patrick Williamsc66ebc32024-08-16 15:21:56 -040073 return true;
74 };
William A. Kennington III7f493702024-02-08 01:09:11 -080075
76template <auto fields, typename T>
77static pb_callback_t pbSubsEncoder(const std::vector<T>& t)
78{
79 return {{.encode = pbEncodeSubs<fields, T>},
80 const_cast<std::vector<T>*>(&t)};
81}
82
// Command line, thread name and CPU times of one process, used to rank
// processes by CPU usage.
struct ProcStatEntry
{
    std::string cmdline;
    std::string tcomm;
    // Zero-initialised so a default-constructed entry never holds
    // indeterminate values.
    float utime = 0.0f;
    float stime = 0.0f;

    // Processes with the longest utime + stime are ranked first.
    // Tie breaking is done with cmdline then tcomm.
    bool operator<(const ProcStatEntry& other) const
    {
        const float total = utime + stime;
        const float otherTotal = other.utime + other.stime;
        // The totals are deliberately swapped between the two tuples: a
        // larger total must compare as "less" so that it sorts first, while
        // cmdline and tcomm keep their natural ascending order.
        return std::tie(otherTotal, cmdline, tcomm) <
               std::tie(total, other.cmdline, other.tcomm);
    }
};
100
William A. Kennington III7f493702024-02-08 01:09:11 -0800101static bmcmetrics_metricproto_BmcProcStatMetric getProcStatMetric(
102 BmcHealthSnapshot& obj, long ticksPerSec,
103 std::vector<bmcmetrics_metricproto_BmcProcStatMetric_BmcProcStat>& procs,
104 bool& use) noexcept
Sui Chen03eba282021-02-11 11:35:56 -0800105{
William A. Kennington III7f493702024-02-08 01:09:11 -0800106 if (ticksPerSec == 0)
107 {
108 return {};
109 }
Sui Chen03eba282021-02-11 11:35:56 -0800110 constexpr std::string_view procPath = "/proc/";
111
Sui Chen03eba282021-02-11 11:35:56 -0800112 std::vector<ProcStatEntry> entries;
113
114 for (const auto& procEntry : std::filesystem::directory_iterator(procPath))
115 {
116 const std::string& path = procEntry.path();
117 int pid = -1;
118 if (isNumericPath(path, pid))
119 {
120 ProcStatEntry entry;
121
122 try
123 {
124 entry.cmdline = getCmdLine(pid);
125 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec);
126 entry.tcomm = t.tcomm;
127 entry.utime = t.utime;
128 entry.stime = t.stime;
129
130 entries.push_back(entry);
131 }
132 catch (const std::exception& e)
133 {
134 log<level::ERR>("Could not obtain process stats");
135 }
136 }
137 }
138
139 std::sort(entries.begin(), entries.end());
140
141 bool isOthers = false;
142 ProcStatEntry others;
143 others.cmdline = "(Others)";
144 others.utime = others.stime = 0;
145
146 // Only show this many processes and aggregate all remaining ones into
147 // "others" in order to keep the size of the snapshot reasonably small.
148 // With 10 process stat entries and 10 FD count entries, the size of the
149 // snapshot reaches around 1.5KiB. This is non-trivial, and we have to set
150 // the collection interval long enough so as not to over-stress the IPMI
151 // interface and the data collection service. The value of 10 is chosen
152 // empirically, it might be subject to adjustments when the system is
153 // launched later.
154 constexpr int topN = 10;
155
156 for (size_t i = 0; i < entries.size(); ++i)
157 {
158 if (i >= topN)
159 {
160 isOthers = true;
161 }
162
William A. Kennington III7f493702024-02-08 01:09:11 -0800163 const ProcStatEntry& entry = entries[i];
Sui Chen03eba282021-02-11 11:35:56 -0800164
165 if (isOthers)
166 {
167 others.utime += entry.utime;
168 others.stime += entry.stime;
169 }
170 else
171 {
Sui Chen03eba282021-02-11 11:35:56 -0800172 std::string fullCmdline = entry.cmdline;
173 if (entry.tcomm.size() > 0)
174 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800175 fullCmdline += " ";
176 fullCmdline += entry.tcomm;
Sui Chen03eba282021-02-11 11:35:56 -0800177 }
William A. Kennington III7f493702024-02-08 01:09:11 -0800178 procs.emplace_back(
179 bmcmetrics_metricproto_BmcProcStatMetric_BmcProcStat{
180 .sidx_cmdline = obj.getStringID(fullCmdline),
181 .utime = entry.utime,
182 .stime = entry.stime,
183 });
Sui Chen03eba282021-02-11 11:35:56 -0800184 }
185 }
186
187 if (isOthers)
188 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800189 procs.emplace_back(bmcmetrics_metricproto_BmcProcStatMetric_BmcProcStat{
190 .sidx_cmdline = obj.getStringID(others.cmdline),
191 .utime = others.utime,
192 .stime = others.stime,
193
194 });
Sui Chen03eba282021-02-11 11:35:56 -0800195 }
196
William A. Kennington III7f493702024-02-08 01:09:11 -0800197 use = true;
198 return bmcmetrics_metricproto_BmcProcStatMetric{
199 .stats = pbSubsEncoder<
200 bmcmetrics_metricproto_BmcProcStatMetric_BmcProcStat_fields>(procs),
201 };
Sui Chen03eba282021-02-11 11:35:56 -0800202}
203
// Returns the number of directory entries under /proc/<pid>/fd.
// Any error raised by std::filesystem while opening or walking that
// directory propagates to the caller.
int getFdCount(int pid)
{
    std::string fdDir = "/proc/";
    fdDir += std::to_string(pid);
    fdDir += "/fd";

    int count = 0;
    for (std::filesystem::directory_iterator it(fdDir), end; it != end; ++it)
    {
        ++count;
    }
    return count;
}
210
// Open file descriptor count of one process together with its command line
// and thread name, used to rank processes by fd usage.
struct FdStatEntry
{
    // Zero-initialised so a default-constructed entry never holds an
    // indeterminate count.
    int fdCount = 0;
    std::string cmdline;
    std::string tcomm;

    // Processes with the largest fdCount goes first.
    // Tie-breaking using cmdline then tcomm.
    bool operator<(const FdStatEntry& other) const
    {
        // The counts are deliberately swapped between the two tuples so that
        // a larger count compares as "less". Unlike negating the counts,
        // this is well defined for every int value.
        return std::tie(other.fdCount, cmdline, tcomm) <
               std::tie(fdCount, other.cmdline, other.tcomm);
    }
};
227
William A. Kennington III7f493702024-02-08 01:09:11 -0800228static bmcmetrics_metricproto_BmcFdStatMetric getFdStatMetric(
229 BmcHealthSnapshot& obj, long ticksPerSec,
230 std::vector<bmcmetrics_metricproto_BmcFdStatMetric_BmcFdStat>& fds,
231 bool& use) noexcept
Sui Chen03eba282021-02-11 11:35:56 -0800232{
William A. Kennington III7f493702024-02-08 01:09:11 -0800233 if (ticksPerSec == 0)
234 {
235 return {};
236 }
Sui Chen03eba282021-02-11 11:35:56 -0800237
238 // Sort by fd count, no tie-breaking
239 std::vector<FdStatEntry> entries;
240
241 const std::string_view procPath = "/proc/";
242 for (const auto& procEntry : std::filesystem::directory_iterator(procPath))
243 {
244 const std::string& path = procEntry.path();
245 int pid = 0;
246 FdStatEntry entry;
247 if (isNumericPath(path, pid))
248 {
249 try
250 {
251 entry.fdCount = getFdCount(pid);
252 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec);
253 entry.cmdline = getCmdLine(pid);
254 entry.tcomm = t.tcomm;
255 entries.push_back(entry);
256 }
257 catch (const std::exception& e)
258 {
259 log<level::ERR>("Could not get file descriptor stats");
260 }
261 }
262 }
263
264 std::sort(entries.begin(), entries.end());
265
266 bool isOthers = false;
267
268 // Only report the detailed fd count and cmdline for the top 10 entries,
269 // and collapse all others into "others".
270 constexpr int topN = 10;
271
272 FdStatEntry others;
273 others.cmdline = "(Others)";
274 others.fdCount = 0;
275
276 for (size_t i = 0; i < entries.size(); ++i)
277 {
278 if (i >= topN)
279 {
280 isOthers = true;
281 }
282
283 const FdStatEntry& entry = entries[i];
284 if (isOthers)
285 {
286 others.fdCount += entry.fdCount;
287 }
288 else
289 {
Sui Chen03eba282021-02-11 11:35:56 -0800290 std::string fullCmdline = entry.cmdline;
291 if (entry.tcomm.size() > 0)
292 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800293 fullCmdline += " ";
294 fullCmdline += entry.tcomm;
Sui Chen03eba282021-02-11 11:35:56 -0800295 }
William A. Kennington III7f493702024-02-08 01:09:11 -0800296 fds.emplace_back(bmcmetrics_metricproto_BmcFdStatMetric_BmcFdStat{
297 .sidx_cmdline = obj.getStringID(fullCmdline),
298 .fd_count = entry.fdCount,
299 });
Sui Chen03eba282021-02-11 11:35:56 -0800300 }
301 }
302
303 if (isOthers)
304 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800305 fds.emplace_back(bmcmetrics_metricproto_BmcFdStatMetric_BmcFdStat{
306 .sidx_cmdline = obj.getStringID(others.cmdline),
307 .fd_count = others.fdCount,
308 });
Sui Chen03eba282021-02-11 11:35:56 -0800309 }
310
William A. Kennington III7f493702024-02-08 01:09:11 -0800311 use = true;
312 return bmcmetrics_metricproto_BmcFdStatMetric{
313 .stats = pbSubsEncoder<
314 bmcmetrics_metricproto_BmcFdStatMetric_BmcFdStat_fields>(fds),
315 };
316}
317
Willy Tu4dba2202024-04-08 21:27:20 +0000318static bmcmetrics_metricproto_BmcECCMetric getECCMetric(bool& use) noexcept
319{
320 EccCounts eccCounts;
321 use = getECCErrorCounts(eccCounts);
322 if (!use)
323 {
324 return {};
325 }
326 return bmcmetrics_metricproto_BmcECCMetric{
327 .correctable_error_count = eccCounts.correctableErrCount,
328 .uncorrectable_error_count = eccCounts.uncorrectableErrCount,
329 };
330}
331
William A. Kennington III7f493702024-02-08 01:09:11 -0800332static bmcmetrics_metricproto_BmcMemoryMetric getMemMetric() noexcept
333{
334 bmcmetrics_metricproto_BmcMemoryMetric ret = {};
335 auto data = readFileThenGrepIntoString("/proc/meminfo");
336 int value;
337 if (parseMeminfoValue(data, "MemAvailable:", value))
338 {
339 ret.mem_available = value;
340 }
341 if (parseMeminfoValue(data, "Slab:", value))
342 {
343 ret.slab = value;
344 }
345
346 if (parseMeminfoValue(data, "KernelStack:", value))
347 {
348 ret.kernel_stack = value;
349 }
Sui Chen03eba282021-02-11 11:35:56 -0800350 return ret;
351}
352
William A. Kennington III7f493702024-02-08 01:09:11 -0800353static bmcmetrics_metricproto_BmcUptimeMetric
354 getUptimeMetric(bool& use) noexcept
Sui Chen03eba282021-02-11 11:35:56 -0800355{
William A. Kennington III7f493702024-02-08 01:09:11 -0800356 bmcmetrics_metricproto_BmcUptimeMetric ret = {};
Sui Chen03eba282021-02-11 11:35:56 -0800357
Michael Shenb63d6312021-04-26 13:30:57 +0800358 double uptime = 0;
Michael Shenb63d6312021-04-26 13:30:57 +0800359 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800360 auto data = readFileThenGrepIntoString("/proc/uptime");
361 double idleProcessTime = 0;
362 if (!parseProcUptime(data, uptime, idleProcessTime))
363 {
364 log<level::ERR>("Error parsing /proc/uptime");
365 return ret;
366 }
367 ret.uptime = uptime;
368 ret.idle_process_time = idleProcessTime;
Michael Shenb63d6312021-04-26 13:30:57 +0800369 }
William A. Kennington III7f493702024-02-08 01:09:11 -0800370
371 BootTimesMonotonic btm;
372 if (!getBootTimesMonotonic(btm))
Michael Shenb63d6312021-04-26 13:30:57 +0800373 {
374 log<level::ERR>("Could not get boot time");
William A. Kennington III7f493702024-02-08 01:09:11 -0800375 return ret;
376 }
377 if (btm.firmwareTime == 0 && btm.powerOnSecCounterTime != 0)
378 {
379 ret.firmware_boot_time_sec =
380 static_cast<double>(btm.powerOnSecCounterTime) - uptime;
Michael Shenb63d6312021-04-26 13:30:57 +0800381 }
382 else
Sui Chen03eba282021-02-11 11:35:56 -0800383 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800384 ret.firmware_boot_time_sec =
385 static_cast<double>(btm.firmwareTime - btm.loaderTime) / 1e6;
386 }
387 ret.loader_boot_time_sec = static_cast<double>(btm.loaderTime) / 1e6;
388 if (btm.initrdTime != 0)
389 {
390 ret.kernel_boot_time_sec = static_cast<double>(btm.initrdTime) / 1e6;
391 ret.initrd_boot_time_sec =
392 static_cast<double>(btm.userspaceTime - btm.initrdTime) / 1e6;
393 ret.userspace_boot_time_sec =
394 static_cast<double>(btm.finishTime - btm.userspaceTime) / 1e6;
395 }
396 else
397 {
398 ret.kernel_boot_time_sec = static_cast<double>(btm.userspaceTime) / 1e6;
399 ret.initrd_boot_time_sec = 0;
400 ret.userspace_boot_time_sec =
401 static_cast<double>(btm.finishTime - btm.userspaceTime) / 1e6;
Sui Chen03eba282021-02-11 11:35:56 -0800402 }
Sui Chen03eba282021-02-11 11:35:56 -0800403
William A. Kennington III7f493702024-02-08 01:09:11 -0800404 use = true;
405 return ret;
406}
407
408static bmcmetrics_metricproto_BmcDiskSpaceMetric
409 getStorageMetric(bool& use) noexcept
410{
411 bmcmetrics_metricproto_BmcDiskSpaceMetric ret = {};
Abby119e7922024-10-18 21:28:36 +0000412 struct statvfs rwFiData, tmpFiData;
413 if (statvfs("/", &rwFiData) < 0)
Sui Chen03eba282021-02-11 11:35:56 -0800414 {
415 log<level::ERR>("Could not call statvfs");
416 }
417 else
418 {
Abby119e7922024-10-18 21:28:36 +0000419 ret.rwfs_kib_available = (rwFiData.f_bsize * rwFiData.f_bfree) / 1024;
420 use = true;
421 }
422 if (statvfs("/tmp", &tmpFiData) < 0)
423 {
424 log<level::ERR>("Could not call statvfs");
425 }
426 else
427 {
428 ret.tmpfs_kib_available =
429 (tmpFiData.f_bsize * tmpFiData.f_bfree) / 1024;
William A. Kennington III7f493702024-02-08 01:09:11 -0800430 use = true;
Sui Chen03eba282021-02-11 11:35:56 -0800431 }
William A. Kennington III7f493702024-02-08 01:09:11 -0800432 return ret;
433}
Sui Chen03eba282021-02-11 11:35:56 -0800434
William A. Kennington III7f493702024-02-08 01:09:11 -0800435void BmcHealthSnapshot::doWork()
436{
Sui Chen03eba282021-02-11 11:35:56 -0800437 // The next metrics require a sane ticks_per_sec value, typically 100 on
438 // the BMC. In the very rare circumstance when it's 0, exit early and return
439 // a partially complete snapshot (no process).
440 ticksPerSec = getTicksPerSec();
441
William A. Kennington III7f493702024-02-08 01:09:11 -0800442 static constexpr auto stcb = [](pb_ostream_t* stream,
443 const pb_field_t* field,
444 void* const* arg) noexcept {
445 auto& self = *reinterpret_cast<BmcHealthSnapshot*>(*arg);
446 std::vector<std::string_view> strs(self.stringTable.size());
447 for (const auto& [str, i] : self.stringTable)
448 {
449 strs[i] = str;
450 }
451 for (auto& str : strs)
452 {
453 bmcmetrics_metricproto_BmcStringTable_StringEntry msg = {
454 .value = pbStrEncoder(str),
455 };
456 if (!pb_encode_tag_for_field(stream, field) ||
457 !pb_encode_submessage(
458 stream,
459 bmcmetrics_metricproto_BmcStringTable_StringEntry_fields,
460 &msg))
461 {
462 return false;
463 }
464 }
465 return true;
466 };
467 std::vector<bmcmetrics_metricproto_BmcProcStatMetric_BmcProcStat> procs;
468 std::vector<bmcmetrics_metricproto_BmcFdStatMetric_BmcFdStat> fds;
469 bmcmetrics_metricproto_BmcMetricSnapshot snapshot = {
470 .has_string_table = true,
471 .string_table =
472 {
473 .entries = {{.encode = stcb}, this},
474 },
475 .has_memory_metric = true,
476 .memory_metric = getMemMetric(),
477 .has_uptime_metric = false,
478 .uptime_metric = getUptimeMetric(snapshot.has_uptime_metric),
479 .has_storage_space_metric = false,
480 .storage_space_metric =
481 getStorageMetric(snapshot.has_storage_space_metric),
482 .has_procstat_metric = false,
483 .procstat_metric = getProcStatMetric(*this, ticksPerSec, procs,
484 snapshot.has_procstat_metric),
485 .has_fdstat_metric = false,
486 .fdstat_metric = getFdStatMetric(*this, ticksPerSec, fds,
487 snapshot.has_fdstat_metric),
Willy Tu4dba2202024-04-08 21:27:20 +0000488 .has_ecc_metric = false,
489 .ecc_metric = getECCMetric(snapshot.has_ecc_metric),
William A. Kennington III7f493702024-02-08 01:09:11 -0800490 };
491 pb_ostream_t nost = {};
492 if (!pb_encode(&nost, bmcmetrics_metricproto_BmcMetricSnapshot_fields,
493 &snapshot))
Sui Chen03eba282021-02-11 11:35:56 -0800494 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800495 auto msg = std::format("Getting pb size: {}", PB_GET_ERROR(&nost));
496 log<level::ERR>(msg.c_str());
Sui Chen03eba282021-02-11 11:35:56 -0800497 return;
498 }
William A. Kennington III7f493702024-02-08 01:09:11 -0800499 pbDump.resize(nost.bytes_written);
500 auto ost = pb_ostream_from_buffer(
501 reinterpret_cast<pb_byte_t*>(pbDump.data()), pbDump.size());
502 if (!pb_encode(&ost, bmcmetrics_metricproto_BmcMetricSnapshot_fields,
503 &snapshot))
Sui Chen03eba282021-02-11 11:35:56 -0800504 {
William A. Kennington III7f493702024-02-08 01:09:11 -0800505 auto msg = std::format("Writing pb msg: {}", PB_GET_ERROR(&ost));
506 log<level::ERR>(msg.c_str());
507 return;
Sui Chen03eba282021-02-11 11:35:56 -0800508 }
Sui Chen03eba282021-02-11 11:35:56 -0800509 done = true;
510}
511
512// BmcBlobSessionStat (9) but passing meta as reference instead of pointer,
513// since the metadata must not be null at this point.
514bool BmcHealthSnapshot::stat(blobs::BlobMeta& meta)
515{
516 if (!done)
517 {
518 // Bits 8~15 are blob-specific state flags.
519 // For this blob, bit 8 is set when metric collection is still in
520 // progress.
521 meta.blobState |= (1 << 8);
522 }
523 else
524 {
525 meta.blobState = 0;
526 meta.blobState = blobs::StateFlags::open_read;
527 meta.size = pbDump.size();
528 }
529 return true;
530}
531
532std::string_view BmcHealthSnapshot::read(uint32_t offset,
533 uint32_t requestedSize)
534{
535 uint32_t size = static_cast<uint32_t>(pbDump.size());
536 if (offset >= size)
537 {
538 return {};
539 }
540 return std::string_view(pbDump.data() + offset,
541 std::min(requestedSize, size - offset));
542}
543
544int BmcHealthSnapshot::getStringID(const std::string_view s)
545{
546 int ret = 0;
547 auto itr = stringTable.find(s.data());
548 if (itr == stringTable.end())
549 {
550 stringTable[s.data()] = stringId;
551 ret = stringId;
552 ++stringId;
553 }
554 else
555 {
556 ret = itr->second;
557 }
558 return ret;
559}
560
Patrick Williams2be45232023-05-10 07:51:22 -0500561} // namespace metric_blob