blob: 9da828b144184271f171d7f1513f20d5ca5d4622 [file] [log] [blame]
Matthew Barthc95c5272020-06-15 19:51:13 -05001/**
Mike Capps7b34ee02022-05-04 14:16:12 -04002 * Copyright © 2022 IBM Corporation
Matthew Barthc95c5272020-06-15 19:51:13 -05003 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
Matthew Barthc95c5272020-06-15 19:51:13 -050016#include "system.hpp"
17
18#include "fan.hpp"
19#include "fan_defs.hpp"
20#include "tach_sensor.hpp"
21#include "trust_manager.hpp"
22#include "types.hpp"
Mike Cappsfdcd5db2021-05-20 12:47:10 -040023#include "utility.hpp"
Matthew Barthc95c5272020-06-15 19:51:13 -050024#ifdef MONITOR_USE_JSON
Mike Cappsb4379a12021-10-11 14:18:06 -040025#include "json_config.hpp"
Matthew Barthc95c5272020-06-15 19:51:13 -050026#include "json_parser.hpp"
27#endif
28
Matt Spinlerc8d3c512021-01-06 14:22:25 -060029#include "config.h"
30
Matt Spinlerbb449c12021-06-14 11:45:28 -060031#include "hwmon_ffdc.hpp"
32
Matthew Barthc95c5272020-06-15 19:51:13 -050033#include <nlohmann/json.hpp>
Matthew Barthd06905c2020-06-12 08:13:06 -050034#include <phosphor-logging/log.hpp>
Matthew Barthc95c5272020-06-15 19:51:13 -050035#include <sdbusplus/bus.hpp>
36#include <sdeventplus/event.hpp>
Matthew Barthd06905c2020-06-12 08:13:06 -050037#include <sdeventplus/source/signal.hpp>
Matthew Barthc95c5272020-06-15 19:51:13 -050038
39namespace phosphor::fan::monitor
40{
41
42using json = nlohmann::json;
Matt Spinlerf13b42e2020-10-26 15:29:49 -050043using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
44
Matthew Barthd06905c2020-06-12 08:13:06 -050045using namespace phosphor::logging;
Matthew Barthc95c5272020-06-15 19:51:13 -050046
47System::System(Mode mode, sdbusplus::bus::bus& bus,
48 const sdeventplus::Event& event) :
49 _mode(mode),
Matt Spinlerc8d3c512021-01-06 14:22:25 -060050 _bus(bus), _event(event),
51 _powerState(std::make_unique<PGoodState>(
Matt Spinlere892e392020-10-14 13:21:31 -050052 bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
Matt Spinlerc8d3c512021-01-06 14:22:25 -060053 std::placeholders::_1))),
54 _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
Matt Spinler7d135642021-02-04 12:44:17 -060055{}
Matt Spinlere892e392020-10-14 13:21:31 -050056
Matthew Barth823bc492021-06-21 14:19:09 -050057void System::start()
Matt Spinler7d135642021-02-04 12:44:17 -060058{
Mike Cappsb4379a12021-10-11 14:18:06 -040059 namespace match = sdbusplus::bus::match;
60
61 // must be done before service detection
62 _inventoryMatch = std::make_unique<match::match>(
63 _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC),
64 std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1));
65
66 bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>(
67 _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus",
68 "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC);
69
70 if (invServiceRunning)
71 {
72 _inventoryMatch.reset();
73
74 if (!_loaded)
75 {
76 load();
77 }
78 }
79}
80
81void System::load()
82{
Matthew Barthc95c5272020-06-15 19:51:13 -050083 json jsonObj = json::object();
84#ifdef MONITOR_USE_JSON
Mike Cappsb4379a12021-10-11 14:18:06 -040085 try
86 {
Mike Capps808d7fe2022-06-13 10:12:16 -040087 jsonObj = getJsonObj();
Matthew Barthc95c5272020-06-15 19:51:13 -050088#endif
Mike Cappsb4379a12021-10-11 14:18:06 -040089 auto trustGrps = getTrustGroups(jsonObj);
90 auto fanDefs = getFanDefinitions(jsonObj);
91 // Retrieve and set trust groups within the trust manager
92 setTrustMgr(getTrustGroups(jsonObj));
93 // Clear/set configured fan definitions
94 _fans.clear();
95 _fanHealth.clear();
96 // Retrieve fan definitions and create fan objects to be monitored
97 setFans(fanDefs);
98 setFaultConfig(jsonObj);
99 log<level::INFO>("Configuration loaded");
100
101 _loaded = true;
102#ifdef MONITOR_USE_JSON
103 }
104 catch (const phosphor::fan::NoConfigFound&)
105 {}
106#endif
Matt Spinlere892e392020-10-14 13:21:31 -0500107
Matt Spinlere892e392020-10-14 13:21:31 -0500108 if (_powerState->isPowerOn())
109 {
Matt Spinler752f24e2022-07-06 15:57:54 -0500110 // Fans could be missing on startup, so check the power off rules.
111 // Tach sensors default to functional, so they wouldn't cause a power
112 // off here.
Matt Spinlere892e392020-10-14 13:21:31 -0500113 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
114 [this](auto& rule) {
Matt Spinlere892e392020-10-14 13:21:31 -0500115 rule->check(PowerRuleState::runtime, _fanHealth);
116 });
117 }
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400118
Mike Cappsb4379a12021-10-11 14:18:06 -0400119 subscribeSensorsToServices();
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400120}
121
Mike Capps25f03272021-09-13 13:38:44 -0400122void System::subscribeSensorsToServices()
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400123{
Mike Capps25f03272021-09-13 13:38:44 -0400124 namespace match = sdbusplus::bus::match;
125
Mike Cappsb4379a12021-10-11 14:18:06 -0400126 _sensorMatch.clear();
127
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400128 SensorMapType sensorMap;
129
130 // build a list of all interfaces, always including the value interface
131 // using set automatically guards against duplicates
132 std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF};
133
134 for (const auto& fan : _fans)
135 {
136 for (const auto& sensor : fan->sensors())
137 {
138 unique_interfaces.insert(sensor->getInterface());
139 }
140 }
141 // convert them to vector to pass into getSubTreeRaw
142 std::vector<std::string> interfaces(unique_interfaces.begin(),
143 unique_interfaces.end());
144
Mike Capps25f03272021-09-13 13:38:44 -0400145 try
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400146 {
Mike Capps25f03272021-09-13 13:38:44 -0400147 // get service information for all service names that are
148 // hosting these interfaces
149 auto serviceObjects = util::SDBusPlus::getSubTreeRaw(
150 _bus, FAN_SENSOR_PATH, interfaces, 0);
151
152 for (const auto& fan : _fans)
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400153 {
Mike Capps25f03272021-09-13 13:38:44 -0400154 // For every sensor in each fan
155 for (const auto& sensor : fan->sensors())
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400156 {
Mike Capps25f03272021-09-13 13:38:44 -0400157 const auto itServ = serviceObjects.find(sensor->name());
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400158
Mike Capps25f03272021-09-13 13:38:44 -0400159 if (serviceObjects.end() == itServ || itServ->second.empty())
160 {
161 getLogger().log(
162 fmt::format("Fan sensor entry {} not found in D-Bus",
163 sensor->name()),
164 Logger::error);
165 continue;
166 }
167
168 for (const auto& [serviceName, unused] : itServ->second)
169 {
170 // associate service name with sensor
171 sensorMap[serviceName].insert(sensor);
172 }
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400173 }
174 }
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400175
Mike Capps25f03272021-09-13 13:38:44 -0400176 // only create 1 match per service
177 for (const auto& [serviceName, unused] : sensorMap)
178 {
179 // map its service name to the sensor
180 _sensorMatch.emplace_back(std::make_unique<match::match>(
181 _bus, match::rules::nameOwnerChanged(serviceName),
182 std::bind(&System::tachSignalOffline, this,
183 std::placeholders::_1, sensorMap)));
184 }
185 }
186 catch (const util::DBusError&)
187 {
188 // catch exception from getSubTreeRaw() when fan sensor paths don't
189 // exist yet
190 }
Matthew Barthd06905c2020-06-12 08:13:06 -0500191}
192
Mike Cappsb4379a12021-10-11 14:18:06 -0400193void System::inventoryOnlineCb(sdbusplus::message::message& msg)
194{
195 namespace match = sdbusplus::bus::match;
196
197 std::string iface;
198 msg.read(iface);
199
200 if (util::INVENTORY_INTF != iface)
201 {
202 return;
203 }
204
205 std::string oldName;
206 msg.read(oldName);
207
208 std::string newName;
209 msg.read(newName);
210
211 // newName should never be empty since match was reset on the first
212 // nameOwnerChanged signal received from the service.
213 if (!_loaded && !newName.empty())
214 {
215 load();
216 }
217
218 // cancel any further notifications about the service state
219 _inventoryMatch.reset();
220}
221
Matthew Barthd06905c2020-06-12 08:13:06 -0500222void System::sighupHandler(sdeventplus::source::Signal&,
223 const struct signalfd_siginfo*)
224{
225 try
Matthew Barthc95c5272020-06-15 19:51:13 -0500226 {
Mike Cappsb4379a12021-10-11 14:18:06 -0400227 load();
Matthew Barthd06905c2020-06-12 08:13:06 -0500228 }
Mike Cappsb4379a12021-10-11 14:18:06 -0400229 catch (std::runtime_error& re)
Matthew Barthd06905c2020-06-12 08:13:06 -0500230 {
231 log<level::ERR>("Error reloading config, no config changes made",
232 entry("LOAD_ERROR=%s", re.what()));
Matthew Barthc95c5272020-06-15 19:51:13 -0500233 }
234}
235
236const std::vector<CreateGroupFunction>
Mike Capps808d7fe2022-06-13 10:12:16 -0400237 System::getTrustGroups([[maybe_unused]] const json& jsonObj)
Matthew Barthc95c5272020-06-15 19:51:13 -0500238{
239#ifdef MONITOR_USE_JSON
240 return getTrustGrps(jsonObj);
241#else
242 return trustGroups;
243#endif
244}
245
Matthew Barthd06905c2020-06-12 08:13:06 -0500246void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
247{
248 _trust = std::make_unique<trust::Manager>(groupFuncs);
249}
250
Mike Capps808d7fe2022-06-13 10:12:16 -0400251const std::vector<FanDefinition>
252 System::getFanDefinitions([[maybe_unused]] const json& jsonObj)
Matthew Barthc95c5272020-06-15 19:51:13 -0500253{
254#ifdef MONITOR_USE_JSON
255 return getFanDefs(jsonObj);
256#else
257 return fanDefinitions;
258#endif
259}
260
Matthew Barthd06905c2020-06-12 08:13:06 -0500261void System::setFans(const std::vector<FanDefinition>& fanDefs)
262{
263 for (const auto& fanDef : fanDefs)
264 {
265 // Check if a condition exists on the fan
266 auto condition = std::get<conditionField>(fanDef);
267 if (condition)
268 {
269 // Condition exists, skip adding fan if it fails
270 if (!(*condition)(_bus))
271 {
272 continue;
273 }
274 }
275 _fans.emplace_back(
Matt Spinlerb0412d02020-10-12 16:53:52 -0500276 std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));
Matt Spinlerb63aa092020-10-14 09:45:11 -0500277
278 updateFanHealth(*(_fans.back()));
Matthew Barthd06905c2020-06-12 08:13:06 -0500279 }
280}
281
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400282// callback indicating a service went [on|off]line.
283// Determine on/offline status, set all sensors for that service
284// to new state
285//
286void System::tachSignalOffline(sdbusplus::message::message& msg,
287 SensorMapType const& sensorMap)
288{
289 std::string serviceName, oldOwner, newOwner;
290
291 msg.read(serviceName);
292 msg.read(oldOwner);
293 msg.read(newOwner);
294
295 // true if sensor server came back online, false -> went offline
296 bool hasOwner = !newOwner.empty() && oldOwner.empty();
297
298 std::string stateStr(hasOwner ? "online" : "offline");
299 getLogger().log(fmt::format("Changing sensors for service {} to {}",
300 serviceName, stateStr),
301 Logger::info);
302
303 auto sensorItr(sensorMap.find(serviceName));
304
305 if (sensorItr != sensorMap.end())
306 {
307 // set all sensors' owner state to not-owned
308 for (auto& sensor : sensorItr->second)
309 {
310 sensor->setOwner(hasOwner);
311 sensor->getFan().process(*sensor);
312 }
313 }
314}
315
Matt Spinlerb63aa092020-10-14 09:45:11 -0500316void System::updateFanHealth(const Fan& fan)
317{
318 std::vector<bool> sensorStatus;
319 for (const auto& sensor : fan.sensors())
320 {
321 sensorStatus.push_back(sensor->functional());
322 }
323
324 _fanHealth[fan.getName()] =
325 std::make_tuple(fan.present(), std::move(sensorStatus));
326}
327
Matt Spinler4283c5d2021-03-01 15:56:00 -0600328void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
Matt Spinlerb63aa092020-10-14 09:45:11 -0500329{
330 updateFanHealth(fan);
Matt Spinlere892e392020-10-14 13:21:31 -0500331
Matt Spinler4283c5d2021-03-01 15:56:00 -0600332 if (_powerState->isPowerOn() && !skipRulesCheck)
Matt Spinlere892e392020-10-14 13:21:31 -0500333 {
334 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
335 [this](auto& rule) {
336 rule->check(PowerRuleState::runtime, _fanHealth);
337 });
338 }
339}
340
Mike Capps808d7fe2022-06-13 10:12:16 -0400341void System::setFaultConfig([[maybe_unused]] const json& jsonObj)
Matt Spinlere892e392020-10-14 13:21:31 -0500342{
343#ifdef MONITOR_USE_JSON
344 std::shared_ptr<PowerInterfaceBase> powerInterface =
Matt Spinlerba3ee9a2021-01-06 14:45:50 -0600345 std::make_shared<PowerInterface>(_thermalAlert);
Matt Spinlere892e392020-10-14 13:21:31 -0500346
Matt Spinlerac1efc12020-10-27 10:20:11 -0500347 PowerOffAction::PrePowerOffFunc func =
348 std::bind(std::mem_fn(&System::logShutdownError), this);
349
350 _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
Matt Spinlerf13b42e2020-10-26 15:29:49 -0500351
352 _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
Matt Spinlere892e392020-10-14 13:21:31 -0500353#endif
354}
355
356void System::powerStateChanged(bool powerStateOn)
357{
Matt Spinler7d135642021-02-04 12:44:17 -0600358 std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) {
359 fan->powerStateChanged(powerStateOn);
360 });
361
Matt Spinlere892e392020-10-14 13:21:31 -0500362 if (powerStateOn)
363 {
Mike Cappsb4379a12021-10-11 14:18:06 -0400364 if (!_loaded)
Matt Spinler7d135642021-02-04 12:44:17 -0600365 {
366 log<level::ERR>("No conf file found at power on");
Matthew Barthba53d3e2021-02-24 07:48:37 -0600367 throw std::runtime_error("No conf file found at power on");
Matt Spinler7d135642021-02-04 12:44:17 -0600368 }
369
Matt Spinlerbb449c12021-06-14 11:45:28 -0600370 // If no fan has its sensors on D-Bus, then there is a problem
371 // with the fan controller. Log an error and shut down.
372 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
373 return fan->numSensorsOnDBusAtPowerOn() == 0;
374 }))
375 {
376 handleOfflineFanController();
377 return;
378 }
379
Mike Capps25f03272021-09-13 13:38:44 -0400380 if (_sensorMatch.empty())
381 {
382 subscribeSensorsToServices();
383 }
384
Matt Spinlere892e392020-10-14 13:21:31 -0500385 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
386 [this](auto& rule) {
387 rule->check(PowerRuleState::atPgood, _fanHealth);
388 });
389 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
390 [this](auto& rule) {
391 rule->check(PowerRuleState::runtime, _fanHealth);
392 });
393 }
394 else
395 {
Matt Spinlerc8d3c512021-01-06 14:22:25 -0600396 _thermalAlert.enabled(false);
397
Matt Spinlere892e392020-10-14 13:21:31 -0500398 // Cancel any in-progress power off actions
399 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
400 [this](auto& rule) { rule->cancel(); });
401 }
Matt Spinlerb63aa092020-10-14 09:45:11 -0500402}
403
Matt Spinlerf13b42e2020-10-26 15:29:49 -0500404void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
405{
406 std::string fanPath{util::INVENTORY_PATH + fan.getName()};
407
408 getLogger().log(
409 fmt::format("Creating event log for faulted fan {} sensor {}", fanPath,
410 sensor.name()),
411 Logger::error);
412
413 // In order to know if the event log should have a severity of error or
414 // informational, count the number of existing nonfunctional sensors and
415 // compare it to _numNonfuncSensorsBeforeError.
416 size_t nonfuncSensors = 0;
417 for (const auto& fan : _fans)
418 {
419 for (const auto& s : fan->sensors())
420 {
421 // Don't count nonfunctional sensors that still have their
422 // error timer running as nonfunctional since they haven't
423 // had event logs created for those errors yet.
424 if (!s->functional() && !s->errorTimerRunning())
425 {
426 nonfuncSensors++;
427 }
428 }
429 }
430
431 Severity severity = Severity::Error;
432 if (nonfuncSensors < _numNonfuncSensorsBeforeError)
433 {
434 severity = Severity::Informational;
435 }
436
437 auto error =
438 std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
439 fanPath, sensor.name(), severity);
440
441 auto sensorData = captureSensorData();
442 error->commit(sensorData);
443
Matt Spinlerac1efc12020-10-27 10:20:11 -0500444 // Save the error so it can be committed again on a power off.
445 _lastError = std::move(error);
Matt Spinlerf13b42e2020-10-26 15:29:49 -0500446}
447
Matt Spinler27f6b682020-10-27 08:43:37 -0500448void System::fanMissingErrorTimerExpired(const Fan& fan)
449{
450 std::string fanPath{util::INVENTORY_PATH + fan.getName()};
451
452 getLogger().log(
453 fmt::format("Creating event log for missing fan {}", fanPath),
454 Logger::error);
455
456 auto error = std::make_unique<FanError>(
457 "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error);
458
459 auto sensorData = captureSensorData();
460 error->commit(sensorData);
461
Matt Spinlerac1efc12020-10-27 10:20:11 -0500462 // Save the error so it can be committed again on a power off.
463 _lastError = std::move(error);
464}
465
466void System::logShutdownError()
467{
468 if (_lastError)
469 {
470 getLogger().log("Re-committing previous fan error before power off");
471
472 // Still use the latest sensor data
473 auto sensorData = captureSensorData();
Matt Spinlerf435eb12021-05-11 14:44:25 -0500474 _lastError->commit(sensorData, true);
Matt Spinlerac1efc12020-10-27 10:20:11 -0500475 }
Matt Spinler27f6b682020-10-27 08:43:37 -0500476}
477
Matt Spinlerf13b42e2020-10-26 15:29:49 -0500478json System::captureSensorData()
479{
480 json data;
481
482 for (const auto& fan : _fans)
483 {
484 for (const auto& sensor : fan->sensors())
485 {
486 json values;
487 values["present"] = fan->present();
488 values["functional"] = sensor->functional();
489 values["tach"] = sensor->getInput();
Mike Capps7b34ee02022-05-04 14:16:12 -0400490
Matt Spinlerf13b42e2020-10-26 15:29:49 -0500491 if (sensor->hasTarget())
492 {
493 values["target"] = sensor->getTarget();
494 }
495
Mike Capps7b34ee02022-05-04 14:16:12 -0400496 // convert between string/json to remove newlines
497 values["prev_tachs"] = json(sensor->getPrevTach()).dump();
498
499 if (sensor->hasTarget())
500 {
501 values["prev_targets"] = json(sensor->getPrevTarget()).dump();
502 }
503
Matt Spinlerf13b42e2020-10-26 15:29:49 -0500504 data["sensors"][sensor->name()] = values;
505 }
506 }
507
508 return data;
509}
510
Matt Spinlerbb449c12021-06-14 11:45:28 -0600511void System::handleOfflineFanController()
512{
513 getLogger().log("The fan controller appears to be offline. Shutting down.",
514 Logger::error);
515
516 auto ffdc = collectHwmonFFDC();
517
518 FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
519 Severity::Critical};
520 error.commit(ffdc, true);
521
522 PowerInterface::executeHardPowerOff();
Mike Capps683a96c2022-04-27 16:46:06 -0400523
524 createBmcDump();
525}
526
527/**
528 * @brief Create a BMC Dump
529 */
530void System::createBmcDump() const
531{
532 try
533 {
534 util::SDBusPlus::callMethod(
535 "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc",
536 "xyz.openbmc_project.Dump.Create", "CreateDump",
537 std::vector<
538 std::pair<std::string, std::variant<std::string, uint64_t>>>());
539 }
Mike Capps477b13b2022-07-11 10:45:46 -0400540 catch (const std::exception& e)
541 {
542 getLogger().log(
543 fmt::format("Caught exception while creating BMC dump: {}",
544 e.what()),
545 Logger::error);
546 }
Matt Spinlerbb449c12021-06-14 11:45:28 -0600547}
548
Matthew Barthc95c5272020-06-15 19:51:13 -0500549} // namespace phosphor::fan::monitor