blob: bc8ac943706c74c2ff69c14805a8b83fb14fdde8 [file] [log] [blame]
Brandon Kimdab96f12021-02-18 11:21:37 -08001// Copyright 2021 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Sui Chen03eba282021-02-11 11:35:56 -080015#include "metric.hpp"
16
17#include "metricblob.pb.h"
18
19#include "util.hpp"
20
21#include <sys/statvfs.h>
22
23#include <phosphor-logging/log.hpp>
24
25#include <cstdint>
26#include <filesystem>
27#include <sstream>
28#include <string>
29#include <string_view>
30
31namespace metric_blob
32{
33
34using phosphor::logging::entry;
35using phosphor::logging::log;
36using level = phosphor::logging::level;
37
38BmcHealthSnapshot::BmcHealthSnapshot() :
39 done(false), stringId(0), ticksPerSec(0)
40{}
41
// Identity and CPU usage of one process, as collected for the process list.
struct ProcStatEntry
{
    std::string cmdline;
    std::string tcomm;
    float utime;
    float stime;

    // Sort order: the entry with the larger utime + stime comes first.
    // Equal totals are ordered by cmdline, then by tcomm (both ascending).
    bool operator<(const ProcStatEntry& other) const
    {
        const float total = utime + stime;
        const float otherTotal = other.utime + other.stime;
        if (total > otherTotal)
        {
            return true;
        }
        if (otherTotal > total)
        {
            return false;
        }

        const int cmdlineOrder = cmdline.compare(other.cmdline);
        if (cmdlineOrder != 0)
        {
            return cmdlineOrder < 0;
        }
        return tcomm < other.tcomm;
    }
};
59
60bmcmetrics::metricproto::BmcProcStatMetric BmcHealthSnapshot::getProcStatList()
61{
62 constexpr std::string_view procPath = "/proc/";
63
64 bmcmetrics::metricproto::BmcProcStatMetric ret;
65 std::vector<ProcStatEntry> entries;
66
67 for (const auto& procEntry : std::filesystem::directory_iterator(procPath))
68 {
69 const std::string& path = procEntry.path();
70 int pid = -1;
71 if (isNumericPath(path, pid))
72 {
73 ProcStatEntry entry;
74
75 try
76 {
77 entry.cmdline = getCmdLine(pid);
78 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec);
79 entry.tcomm = t.tcomm;
80 entry.utime = t.utime;
81 entry.stime = t.stime;
82
83 entries.push_back(entry);
84 }
85 catch (const std::exception& e)
86 {
87 log<level::ERR>("Could not obtain process stats");
88 }
89 }
90 }
91
92 std::sort(entries.begin(), entries.end());
93
94 bool isOthers = false;
95 ProcStatEntry others;
96 others.cmdline = "(Others)";
97 others.utime = others.stime = 0;
98
99 // Only show this many processes and aggregate all remaining ones into
100 // "others" in order to keep the size of the snapshot reasonably small.
101 // With 10 process stat entries and 10 FD count entries, the size of the
102 // snapshot reaches around 1.5KiB. This is non-trivial, and we have to set
103 // the collection interval long enough so as not to over-stress the IPMI
104 // interface and the data collection service. The value of 10 is chosen
105 // empirically, it might be subject to adjustments when the system is
106 // launched later.
107 constexpr int topN = 10;
108
109 for (size_t i = 0; i < entries.size(); ++i)
110 {
111 if (i >= topN)
112 {
113 isOthers = true;
114 }
115
116 ProcStatEntry& entry = entries[i];
117
118 if (isOthers)
119 {
120 others.utime += entry.utime;
121 others.stime += entry.stime;
122 }
123 else
124 {
125 bmcmetrics::metricproto::BmcProcStatMetric::BmcProcStat s;
126 std::string fullCmdline = entry.cmdline;
127 if (entry.tcomm.size() > 0)
128 {
129 fullCmdline += " " + entry.tcomm;
130 }
131 s.set_sidx_cmdline(getStringID(fullCmdline));
132 s.set_utime(entry.utime);
133 s.set_stime(entry.stime);
134 *(ret.add_stats()) = s;
135 }
136 }
137
138 if (isOthers)
139 {
140 bmcmetrics::metricproto::BmcProcStatMetric::BmcProcStat s;
141 s.set_sidx_cmdline(getStringID(others.cmdline));
142 s.set_utime(others.utime);
143 s.set_stime(others.stime);
144 *(ret.add_stats()) = s;
145 }
146
147 return ret;
148}
149
// Returns the number of directory entries under /proc/<pid>/fd.
// Any std::filesystem error raised while opening or walking that directory
// propagates to the caller.
int getFdCount(int pid)
{
    std::string fdDir("/proc/");
    fdDir += std::to_string(pid);
    fdDir += "/fd";

    int count = 0;
    for (std::filesystem::directory_iterator it(fdDir), last; it != last; ++it)
    {
        ++count;
    }
    return count;
}
156
// Identity and open file-descriptor count of one process.
struct FdStatEntry
{
    int fdCount;
    std::string cmdline;
    std::string tcomm;

    // Sort order: the entry with the larger fdCount comes first.
    // Equal counts are ordered by cmdline, then by tcomm (both ascending).
    bool operator<(const FdStatEntry& other) const
    {
        if (fdCount != other.fdCount)
        {
            return fdCount > other.fdCount;
        }

        const int cmdlineOrder = cmdline.compare(other.cmdline);
        if (cmdlineOrder != 0)
        {
            return cmdlineOrder < 0;
        }
        return tcomm < other.tcomm;
    }
};
173
174bmcmetrics::metricproto::BmcFdStatMetric BmcHealthSnapshot::getFdStatList()
175{
176 bmcmetrics::metricproto::BmcFdStatMetric ret;
177
178 // Sort by fd count, no tie-breaking
179 std::vector<FdStatEntry> entries;
180
181 const std::string_view procPath = "/proc/";
182 for (const auto& procEntry : std::filesystem::directory_iterator(procPath))
183 {
184 const std::string& path = procEntry.path();
185 int pid = 0;
186 FdStatEntry entry;
187 if (isNumericPath(path, pid))
188 {
189 try
190 {
191 entry.fdCount = getFdCount(pid);
192 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec);
193 entry.cmdline = getCmdLine(pid);
194 entry.tcomm = t.tcomm;
195 entries.push_back(entry);
196 }
197 catch (const std::exception& e)
198 {
199 log<level::ERR>("Could not get file descriptor stats");
200 }
201 }
202 }
203
204 std::sort(entries.begin(), entries.end());
205
206 bool isOthers = false;
207
208 // Only report the detailed fd count and cmdline for the top 10 entries,
209 // and collapse all others into "others".
210 constexpr int topN = 10;
211
212 FdStatEntry others;
213 others.cmdline = "(Others)";
214 others.fdCount = 0;
215
216 for (size_t i = 0; i < entries.size(); ++i)
217 {
218 if (i >= topN)
219 {
220 isOthers = true;
221 }
222
223 const FdStatEntry& entry = entries[i];
224 if (isOthers)
225 {
226 others.fdCount += entry.fdCount;
227 }
228 else
229 {
230 bmcmetrics::metricproto::BmcFdStatMetric::BmcFdStat s;
231 std::string fullCmdline = entry.cmdline;
232 if (entry.tcomm.size() > 0)
233 {
234 fullCmdline += " " + entry.tcomm;
235 }
236 s.set_sidx_cmdline(getStringID(fullCmdline));
237 s.set_fd_count(entry.fdCount);
238 *(ret.add_stats()) = s;
239 }
240 }
241
242 if (isOthers)
243 {
244 bmcmetrics::metricproto::BmcFdStatMetric::BmcFdStat s;
245 s.set_sidx_cmdline(getStringID(others.cmdline));
246 s.set_fd_count(others.fdCount);
247 *(ret.add_stats()) = s;
248 }
249
250 return ret;
251}
252
253void BmcHealthSnapshot::serializeSnapshotToArray(
254 const bmcmetrics::metricproto::BmcMetricSnapshot& snapshot)
255{
256 size_t size = snapshot.ByteSizeLong();
257 if (size > 0)
258 {
259 pbDump.resize(size);
260 if (!snapshot.SerializeToArray(pbDump.data(), size))
261 {
262 log<level::ERR>("Could not serialize protobuf to array");
263 }
264 }
265}
266
267void BmcHealthSnapshot::doWork()
268{
269 bmcmetrics::metricproto::BmcMetricSnapshot snapshot;
270
271 // Memory info
Michael Shenb63d6312021-04-26 13:30:57 +0800272 std::string meminfoBuffer = readFileThenGrepIntoString("/proc/meminfo");
Sui Chen03eba282021-02-11 11:35:56 -0800273
274 {
275 bmcmetrics::metricproto::BmcMemoryMetric m;
276
277 std::string_view sv(meminfoBuffer.data());
278 // MemAvailable
279 int value;
280 bool ok = parseMeminfoValue(sv, "MemAvailable:", value);
281 if (ok)
282 {
283 m.set_mem_available(value);
284 }
285
286 ok = parseMeminfoValue(sv, "Slab:", value);
287 if (ok)
288 {
289 m.set_slab(value);
290 }
291
292 ok = parseMeminfoValue(sv, "KernelStack:", value);
293 if (ok)
294 {
295 m.set_kernel_stack(value);
296 }
297
298 *(snapshot.mutable_memory_metric()) = m;
299 }
300
301 // Uptime
Michael Shenb63d6312021-04-26 13:30:57 +0800302 std::string uptimeBuffer = readFileThenGrepIntoString("/proc/uptime");
303 double uptime = 0;
304 double idleProcessTime = 0;
305 BootTimesMonotonic btm;
306 if (!parseProcUptime(uptimeBuffer, uptime, idleProcessTime))
307 {
308 log<level::ERR>("Error parsing /proc/uptime");
309 }
310 else if (!getBootTimesMonotonic(btm))
311 {
312 log<level::ERR>("Could not get boot time");
313 }
314 else
Sui Chen03eba282021-02-11 11:35:56 -0800315 {
316 bmcmetrics::metricproto::BmcUptimeMetric m1;
317 m1.set_uptime(uptime);
318 m1.set_idle_process_time(idleProcessTime);
Michael Shenb63d6312021-04-26 13:30:57 +0800319 if (btm.firmwareTime == 0 && btm.powerOnSecCounterTime != 0)
320 {
321 m1.set_firmware_boot_time_sec(
322 static_cast<double>(btm.powerOnSecCounterTime) - uptime);
323 }
324 else
325 {
326 m1.set_firmware_boot_time_sec(
327 static_cast<double>(btm.firmwareTime - btm.loaderTime) / 1e6);
328 }
329 m1.set_loader_boot_time_sec(static_cast<double>(btm.loaderTime) / 1e6);
330 // initrf presents
331 if (btm.initrdTime != 0)
332 {
333 m1.set_kernel_boot_time_sec(static_cast<double>(btm.initrdTime) /
334 1e6);
335 m1.set_initrd_boot_time_sec(
336 static_cast<double>(btm.userspaceTime - btm.initrdTime) / 1e6);
337 m1.set_userspace_boot_time_sec(
338 static_cast<double>(btm.finishTime - btm.userspaceTime) / 1e6);
339 }
340 else
341 {
342 m1.set_kernel_boot_time_sec(static_cast<double>(btm.userspaceTime) /
343 1e6);
344 m1.set_initrd_boot_time_sec(0);
345 m1.set_userspace_boot_time_sec(
346 static_cast<double>(btm.finishTime - btm.userspaceTime) / 1e6);
347 }
Sui Chen03eba282021-02-11 11:35:56 -0800348 *(snapshot.mutable_uptime_metric()) = m1;
349 }
Sui Chen03eba282021-02-11 11:35:56 -0800350
351 // Storage space
352 struct statvfs fiData;
353 if ((statvfs("/", &fiData)) < 0)
354 {
355 log<level::ERR>("Could not call statvfs");
356 }
357 else
358 {
359 uint64_t kib = (fiData.f_bsize * fiData.f_bfree) / 1024;
360 bmcmetrics::metricproto::BmcDiskSpaceMetric m2;
361 m2.set_rwfs_kib_available(static_cast<int>(kib));
362 *(snapshot.mutable_storage_space_metric()) = m2;
363 }
364
365 // The next metrics require a sane ticks_per_sec value, typically 100 on
366 // the BMC. In the very rare circumstance when it's 0, exit early and return
367 // a partially complete snapshot (no process).
368 ticksPerSec = getTicksPerSec();
369
370 // FD stat
371 *(snapshot.mutable_fdstat_metric()) = getFdStatList();
372
373 if (ticksPerSec == 0)
374 {
375 log<level::ERR>("ticksPerSec is 0, skipping the process list metric");
376 serializeSnapshotToArray(snapshot);
377 done = true;
378 return;
379 }
380
381 // Proc stat
382 *(snapshot.mutable_procstat_metric()) = getProcStatList();
383
384 // String table
385 std::vector<std::string_view> strings(stringTable.size());
386 for (const auto& [s, i] : stringTable)
387 {
388 strings[i] = s;
389 }
390
391 bmcmetrics::metricproto::BmcStringTable st;
392 for (size_t i = 0; i < strings.size(); ++i)
393 {
394 bmcmetrics::metricproto::BmcStringTable::StringEntry entry;
395 entry.set_value(strings[i].data());
396 *(st.add_entries()) = entry;
397 }
398 *(snapshot.mutable_string_table()) = st;
399
400 // Save to buffer
401 serializeSnapshotToArray(snapshot);
402 done = true;
403}
404
405// BmcBlobSessionStat (9) but passing meta as reference instead of pointer,
406// since the metadata must not be null at this point.
407bool BmcHealthSnapshot::stat(blobs::BlobMeta& meta)
408{
409 if (!done)
410 {
411 // Bits 8~15 are blob-specific state flags.
412 // For this blob, bit 8 is set when metric collection is still in
413 // progress.
414 meta.blobState |= (1 << 8);
415 }
416 else
417 {
418 meta.blobState = 0;
419 meta.blobState = blobs::StateFlags::open_read;
420 meta.size = pbDump.size();
421 }
422 return true;
423}
424
425std::string_view BmcHealthSnapshot::read(uint32_t offset,
426 uint32_t requestedSize)
427{
428 uint32_t size = static_cast<uint32_t>(pbDump.size());
429 if (offset >= size)
430 {
431 return {};
432 }
433 return std::string_view(pbDump.data() + offset,
434 std::min(requestedSize, size - offset));
435}
436
437int BmcHealthSnapshot::getStringID(const std::string_view s)
438{
439 int ret = 0;
440 auto itr = stringTable.find(s.data());
441 if (itr == stringTable.end())
442 {
443 stringTable[s.data()] = stringId;
444 ret = stringId;
445 ++stringId;
446 }
447 else
448 {
449 ret = itr->second;
450 }
451 return ret;
452}
453
454} // namespace metric_blob