blob: 078b3e1c206de8863804acec657d951d1a40fc87 [file] [log] [blame]
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
Sui Chen03eba282021-02-11 11:35:56 -080015#include "metric.hpp"
16
17#include "metricblob.pb.h"
18
19#include "util.hpp"
20
21#include <sys/statvfs.h>
22
23#include <phosphor-logging/log.hpp>
24
25#include <cstdint>
26#include <filesystem>
27#include <sstream>
28#include <string>
29#include <string_view>
30
31namespace metric_blob
32{
33
34using phosphor::logging::entry;
35using phosphor::logging::log;
36using level = phosphor::logging::level;
37
38BmcHealthSnapshot::BmcHealthSnapshot() :
39 done(false), stringId(0), ticksPerSec(0)
40{}
41
/** @brief utime/stime figures of a single process together with the strings
 *         that identify it; sortable so that the busiest processes come first.
 */
struct ProcStatEntry
{
    std::string cmdline;
    std::string tcomm;
    // Zero-initialized so that a default-constructed entry (for example an
    // aggregate bucket) can be summed into and compared without first reading
    // indeterminate values.
    float utime = 0.0f;
    float stime = 0.0f;

    /** @brief Ordering used when sorting entries.
     *
     *  Processes with the longest utime + stime are ranked first.
     *  Ties are broken with cmdline, then tcomm, both in ascending order.
     *
     *  @param[in] other - The entry to compare against.
     *  @return true if this entry should be placed before `other`.
     */
    bool operator<(const ProcStatEntry& other) const
    {
        const float total = utime + stime;
        const float otherTotal = other.utime + other.stime;
        // The totals are deliberately swapped between the two tuples: this
        // gives descending order on time while keeping ascending order on
        // the string tie-breakers.
        return std::tie(otherTotal, cmdline, tcomm) <
               std::tie(total, other.cmdline, other.tcomm);
    }
};
59
60bmcmetrics::metricproto::BmcProcStatMetric BmcHealthSnapshot::getProcStatList()
61{
62 constexpr std::string_view procPath = "/proc/";
63
64 bmcmetrics::metricproto::BmcProcStatMetric ret;
65 std::vector<ProcStatEntry> entries;
66
67 for (const auto& procEntry : std::filesystem::directory_iterator(procPath))
68 {
69 const std::string& path = procEntry.path();
70 int pid = -1;
71 if (isNumericPath(path, pid))
72 {
73 ProcStatEntry entry;
74
75 try
76 {
77 entry.cmdline = getCmdLine(pid);
78 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec);
79 entry.tcomm = t.tcomm;
80 entry.utime = t.utime;
81 entry.stime = t.stime;
82
83 entries.push_back(entry);
84 }
85 catch (const std::exception& e)
86 {
87 log<level::ERR>("Could not obtain process stats");
88 }
89 }
90 }
91
92 std::sort(entries.begin(), entries.end());
93
94 bool isOthers = false;
95 ProcStatEntry others;
96 others.cmdline = "(Others)";
97 others.utime = others.stime = 0;
98
99 // Only show this many processes and aggregate all remaining ones into
100 // "others" in order to keep the size of the snapshot reasonably small.
101 // With 10 process stat entries and 10 FD count entries, the size of the
102 // snapshot reaches around 1.5KiB. This is non-trivial, and we have to set
103 // the collection interval long enough so as not to over-stress the IPMI
104 // interface and the data collection service. The value of 10 is chosen
105 // empirically, it might be subject to adjustments when the system is
106 // launched later.
107 constexpr int topN = 10;
108
109 for (size_t i = 0; i < entries.size(); ++i)
110 {
111 if (i >= topN)
112 {
113 isOthers = true;
114 }
115
116 ProcStatEntry& entry = entries[i];
117
118 if (isOthers)
119 {
120 others.utime += entry.utime;
121 others.stime += entry.stime;
122 }
123 else
124 {
125 bmcmetrics::metricproto::BmcProcStatMetric::BmcProcStat s;
126 std::string fullCmdline = entry.cmdline;
127 if (entry.tcomm.size() > 0)
128 {
129 fullCmdline += " " + entry.tcomm;
130 }
131 s.set_sidx_cmdline(getStringID(fullCmdline));
132 s.set_utime(entry.utime);
133 s.set_stime(entry.stime);
134 *(ret.add_stats()) = s;
135 }
136 }
137
138 if (isOthers)
139 {
140 bmcmetrics::metricproto::BmcProcStatMetric::BmcProcStat s;
141 s.set_sidx_cmdline(getStringID(others.cmdline));
142 s.set_utime(others.utime);
143 s.set_stime(others.stime);
144 *(ret.add_stats()) = s;
145 }
146
147 return ret;
148}
149
/** @brief Counts the entries of /proc/<pid>/fd.
 *
 *  @param[in] pid - Process ID whose fd directory is inspected.
 *  @return Number of entries the directory iterator yields for that directory.
 *  @throws std::filesystem::filesystem_error if the directory cannot be
 *          opened or iterated (for example when the process is gone).
 */
int getFdCount(int pid)
{
    std::string fdDir = "/proc/";
    fdDir += std::to_string(pid);
    fdDir += "/fd";

    int count = 0;
    for ([[maybe_unused]] const auto& fdEntry :
         std::filesystem::directory_iterator(fdDir))
    {
        ++count;
    }
    return count;
}
156
/** @brief File descriptor count of a single process together with the strings
 *         that identify it; sortable so that the largest counts come first.
 */
struct FdStatEntry
{
    // Zero-initialized so that a default-constructed entry (for example an
    // aggregate bucket) never exposes an indeterminate count.
    int fdCount = 0;
    std::string cmdline;
    std::string tcomm;

    /** @brief Ordering used when sorting entries.
     *
     *  Processes with the largest fdCount go first.
     *  Ties are broken with cmdline, then tcomm, both in ascending order.
     *
     *  @param[in] other - The entry to compare against.
     *  @return true if this entry should be placed before `other`.
     */
    bool operator<(const FdStatEntry& other) const
    {
        // The counts are deliberately swapped between the two tuples: this
        // gives descending order on fdCount without negating it (negation
        // would overflow for the most negative int) while the string
        // tie-breakers stay ascending.
        return std::tie(other.fdCount, cmdline, tcomm) <
               std::tie(fdCount, other.cmdline, other.tcomm);
    }
};
173
174bmcmetrics::metricproto::BmcFdStatMetric BmcHealthSnapshot::getFdStatList()
175{
176 bmcmetrics::metricproto::BmcFdStatMetric ret;
177
178 // Sort by fd count, no tie-breaking
179 std::vector<FdStatEntry> entries;
180
181 const std::string_view procPath = "/proc/";
182 for (const auto& procEntry : std::filesystem::directory_iterator(procPath))
183 {
184 const std::string& path = procEntry.path();
185 int pid = 0;
186 FdStatEntry entry;
187 if (isNumericPath(path, pid))
188 {
189 try
190 {
191 entry.fdCount = getFdCount(pid);
192 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec);
193 entry.cmdline = getCmdLine(pid);
194 entry.tcomm = t.tcomm;
195 entries.push_back(entry);
196 }
197 catch (const std::exception& e)
198 {
199 log<level::ERR>("Could not get file descriptor stats");
200 }
201 }
202 }
203
204 std::sort(entries.begin(), entries.end());
205
206 bool isOthers = false;
207
208 // Only report the detailed fd count and cmdline for the top 10 entries,
209 // and collapse all others into "others".
210 constexpr int topN = 10;
211
212 FdStatEntry others;
213 others.cmdline = "(Others)";
214 others.fdCount = 0;
215
216 for (size_t i = 0; i < entries.size(); ++i)
217 {
218 if (i >= topN)
219 {
220 isOthers = true;
221 }
222
223 const FdStatEntry& entry = entries[i];
224 if (isOthers)
225 {
226 others.fdCount += entry.fdCount;
227 }
228 else
229 {
230 bmcmetrics::metricproto::BmcFdStatMetric::BmcFdStat s;
231 std::string fullCmdline = entry.cmdline;
232 if (entry.tcomm.size() > 0)
233 {
234 fullCmdline += " " + entry.tcomm;
235 }
236 s.set_sidx_cmdline(getStringID(fullCmdline));
237 s.set_fd_count(entry.fdCount);
238 *(ret.add_stats()) = s;
239 }
240 }
241
242 if (isOthers)
243 {
244 bmcmetrics::metricproto::BmcFdStatMetric::BmcFdStat s;
245 s.set_sidx_cmdline(getStringID(others.cmdline));
246 s.set_fd_count(others.fdCount);
247 *(ret.add_stats()) = s;
248 }
249
250 return ret;
251}
252
253void BmcHealthSnapshot::serializeSnapshotToArray(
254 const bmcmetrics::metricproto::BmcMetricSnapshot& snapshot)
255{
256 size_t size = snapshot.ByteSizeLong();
257 if (size > 0)
258 {
259 pbDump.resize(size);
260 if (!snapshot.SerializeToArray(pbDump.data(), size))
261 {
262 log<level::ERR>("Could not serialize protobuf to array");
263 }
264 }
265}
266
267void BmcHealthSnapshot::doWork()
268{
269 bmcmetrics::metricproto::BmcMetricSnapshot snapshot;
270
271 // Memory info
272 std::string meminfoBuffer = readFileIntoString("/proc/meminfo");
273
274 {
275 bmcmetrics::metricproto::BmcMemoryMetric m;
276
277 std::string_view sv(meminfoBuffer.data());
278 // MemAvailable
279 int value;
280 bool ok = parseMeminfoValue(sv, "MemAvailable:", value);
281 if (ok)
282 {
283 m.set_mem_available(value);
284 }
285
286 ok = parseMeminfoValue(sv, "Slab:", value);
287 if (ok)
288 {
289 m.set_slab(value);
290 }
291
292 ok = parseMeminfoValue(sv, "KernelStack:", value);
293 if (ok)
294 {
295 m.set_kernel_stack(value);
296 }
297
298 *(snapshot.mutable_memory_metric()) = m;
299 }
300
301 // Uptime
302 std::string uptimeBuffer = readFileIntoString("/proc/uptime");
303 double uptime = 0, idleProcessTime = 0;
304 if (parseProcUptime(uptimeBuffer, uptime, idleProcessTime))
305 {
306 bmcmetrics::metricproto::BmcUptimeMetric m1;
307 m1.set_uptime(uptime);
308 m1.set_idle_process_time(idleProcessTime);
309 *(snapshot.mutable_uptime_metric()) = m1;
310 }
311 else
312 {
313 log<level::ERR>("Error parsing /proc/uptime");
314 }
315
316 // Storage space
317 struct statvfs fiData;
318 if ((statvfs("/", &fiData)) < 0)
319 {
320 log<level::ERR>("Could not call statvfs");
321 }
322 else
323 {
324 uint64_t kib = (fiData.f_bsize * fiData.f_bfree) / 1024;
325 bmcmetrics::metricproto::BmcDiskSpaceMetric m2;
326 m2.set_rwfs_kib_available(static_cast<int>(kib));
327 *(snapshot.mutable_storage_space_metric()) = m2;
328 }
329
330 // The next metrics require a sane ticks_per_sec value, typically 100 on
331 // the BMC. In the very rare circumstance when it's 0, exit early and return
332 // a partially complete snapshot (no process).
333 ticksPerSec = getTicksPerSec();
334
335 // FD stat
336 *(snapshot.mutable_fdstat_metric()) = getFdStatList();
337
338 if (ticksPerSec == 0)
339 {
340 log<level::ERR>("ticksPerSec is 0, skipping the process list metric");
341 serializeSnapshotToArray(snapshot);
342 done = true;
343 return;
344 }
345
346 // Proc stat
347 *(snapshot.mutable_procstat_metric()) = getProcStatList();
348
349 // String table
350 std::vector<std::string_view> strings(stringTable.size());
351 for (const auto& [s, i] : stringTable)
352 {
353 strings[i] = s;
354 }
355
356 bmcmetrics::metricproto::BmcStringTable st;
357 for (size_t i = 0; i < strings.size(); ++i)
358 {
359 bmcmetrics::metricproto::BmcStringTable::StringEntry entry;
360 entry.set_value(strings[i].data());
361 *(st.add_entries()) = entry;
362 }
363 *(snapshot.mutable_string_table()) = st;
364
365 // Save to buffer
366 serializeSnapshotToArray(snapshot);
367 done = true;
368}
369
370// BmcBlobSessionStat (9) but passing meta as reference instead of pointer,
371// since the metadata must not be null at this point.
372bool BmcHealthSnapshot::stat(blobs::BlobMeta& meta)
373{
374 if (!done)
375 {
376 // Bits 8~15 are blob-specific state flags.
377 // For this blob, bit 8 is set when metric collection is still in
378 // progress.
379 meta.blobState |= (1 << 8);
380 }
381 else
382 {
383 meta.blobState = 0;
384 meta.blobState = blobs::StateFlags::open_read;
385 meta.size = pbDump.size();
386 }
387 return true;
388}
389
390std::string_view BmcHealthSnapshot::read(uint32_t offset,
391 uint32_t requestedSize)
392{
393 uint32_t size = static_cast<uint32_t>(pbDump.size());
394 if (offset >= size)
395 {
396 return {};
397 }
398 return std::string_view(pbDump.data() + offset,
399 std::min(requestedSize, size - offset));
400}
401
402int BmcHealthSnapshot::getStringID(const std::string_view s)
403{
404 int ret = 0;
405 auto itr = stringTable.find(s.data());
406 if (itr == stringTable.end())
407 {
408 stringTable[s.data()] = stringId;
409 ret = stringId;
410 ++stringId;
411 }
412 else
413 {
414 ret = itr->second;
415 }
416 return ret;
417}
418
419} // namespace metric_blob