blob: f3863b0c297505e86d1cc17cb736139813b24cb5 [file] [log] [blame]
/**
 * Copyright © 2021 IBM Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
Matthew Barthc95c5272020-06-15 19:51:13 -050016#include "system.hpp"
17
18#include "fan.hpp"
19#include "fan_defs.hpp"
20#include "tach_sensor.hpp"
21#include "trust_manager.hpp"
22#include "types.hpp"
Mike Cappsfdcd5db2021-05-20 12:47:10 -040023#include "utility.hpp"
Matthew Barthc95c5272020-06-15 19:51:13 -050024#ifdef MONITOR_USE_JSON
25#include "json_parser.hpp"
26#endif
27
Matt Spinlerc8d3c512021-01-06 14:22:25 -060028#include "config.h"
29
Matt Spinlerbb449c12021-06-14 11:45:28 -060030#include "hwmon_ffdc.hpp"
31
Matthew Barthc95c5272020-06-15 19:51:13 -050032#include <nlohmann/json.hpp>
Matthew Barthd06905c2020-06-12 08:13:06 -050033#include <phosphor-logging/log.hpp>
Matthew Barthc95c5272020-06-15 19:51:13 -050034#include <sdbusplus/bus.hpp>
35#include <sdeventplus/event.hpp>
Matthew Barthd06905c2020-06-12 08:13:06 -050036#include <sdeventplus/source/signal.hpp>
Matthew Barthc95c5272020-06-15 19:51:13 -050037
38namespace phosphor::fan::monitor
39{
40
41using json = nlohmann::json;
Matt Spinlerf13b42e2020-10-26 15:29:49 -050042using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
43
Matthew Barthd06905c2020-06-12 08:13:06 -050044using namespace phosphor::logging;
Matthew Barthc95c5272020-06-15 19:51:13 -050045
46System::System(Mode mode, sdbusplus::bus::bus& bus,
47 const sdeventplus::Event& event) :
48 _mode(mode),
Matt Spinlerc8d3c512021-01-06 14:22:25 -060049 _bus(bus), _event(event),
50 _powerState(std::make_unique<PGoodState>(
Matt Spinlere892e392020-10-14 13:21:31 -050051 bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
Matt Spinlerc8d3c512021-01-06 14:22:25 -060052 std::placeholders::_1))),
53 _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
Matt Spinler7d135642021-02-04 12:44:17 -060054{}
Matt Spinlere892e392020-10-14 13:21:31 -050055
Matthew Barth823bc492021-06-21 14:19:09 -050056void System::start()
Matt Spinler7d135642021-02-04 12:44:17 -060057{
58 _started = true;
Matthew Barthc95c5272020-06-15 19:51:13 -050059 json jsonObj = json::object();
60#ifdef MONITOR_USE_JSON
Matthew Barth823bc492021-06-21 14:19:09 -050061 auto confFile =
62 fan::JsonConfig::getConfFile(_bus, confAppName, confFileName);
Matt Spinler7d135642021-02-04 12:44:17 -060063 jsonObj = fan::JsonConfig::load(confFile);
Matthew Barthc95c5272020-06-15 19:51:13 -050064#endif
65 // Retrieve and set trust groups within the trust manager
Matthew Barthd06905c2020-06-12 08:13:06 -050066 setTrustMgr(getTrustGroups(jsonObj));
Matthew Barthc95c5272020-06-15 19:51:13 -050067 // Retrieve fan definitions and create fan objects to be monitored
Matthew Barthd06905c2020-06-12 08:13:06 -050068 setFans(getFanDefinitions(jsonObj));
Matt Spinlere892e392020-10-14 13:21:31 -050069 setFaultConfig(jsonObj);
Matthew Barthd06905c2020-06-12 08:13:06 -050070 log<level::INFO>("Configuration loaded");
Matt Spinlere892e392020-10-14 13:21:31 -050071
Matt Spinlere892e392020-10-14 13:21:31 -050072 if (_powerState->isPowerOn())
73 {
74 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
75 [this](auto& rule) {
Matt Spinlere892e392020-10-14 13:21:31 -050076 rule->check(PowerRuleState::runtime, _fanHealth);
77 });
78 }
Mike Cappsfdcd5db2021-05-20 12:47:10 -040079
Mike Capps25f03272021-09-13 13:38:44 -040080 if (_sensorMatch.empty())
Mike Cappsfdcd5db2021-05-20 12:47:10 -040081 {
Mike Capps25f03272021-09-13 13:38:44 -040082 subscribeSensorsToServices();
Mike Cappsfdcd5db2021-05-20 12:47:10 -040083 }
84}
85
Mike Capps25f03272021-09-13 13:38:44 -040086void System::subscribeSensorsToServices()
Mike Cappsfdcd5db2021-05-20 12:47:10 -040087{
Mike Capps25f03272021-09-13 13:38:44 -040088 namespace match = sdbusplus::bus::match;
89
Mike Cappsfdcd5db2021-05-20 12:47:10 -040090 SensorMapType sensorMap;
91
92 // build a list of all interfaces, always including the value interface
93 // using set automatically guards against duplicates
94 std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF};
95
96 for (const auto& fan : _fans)
97 {
98 for (const auto& sensor : fan->sensors())
99 {
100 unique_interfaces.insert(sensor->getInterface());
101 }
102 }
103 // convert them to vector to pass into getSubTreeRaw
104 std::vector<std::string> interfaces(unique_interfaces.begin(),
105 unique_interfaces.end());
106
Mike Capps25f03272021-09-13 13:38:44 -0400107 try
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400108 {
Mike Capps25f03272021-09-13 13:38:44 -0400109 // get service information for all service names that are
110 // hosting these interfaces
111 auto serviceObjects = util::SDBusPlus::getSubTreeRaw(
112 _bus, FAN_SENSOR_PATH, interfaces, 0);
113
114 for (const auto& fan : _fans)
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400115 {
Mike Capps25f03272021-09-13 13:38:44 -0400116 // For every sensor in each fan
117 for (const auto& sensor : fan->sensors())
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400118 {
Mike Capps25f03272021-09-13 13:38:44 -0400119 const auto itServ = serviceObjects.find(sensor->name());
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400120
Mike Capps25f03272021-09-13 13:38:44 -0400121 if (serviceObjects.end() == itServ || itServ->second.empty())
122 {
123 getLogger().log(
124 fmt::format("Fan sensor entry {} not found in D-Bus",
125 sensor->name()),
126 Logger::error);
127 continue;
128 }
129
130 for (const auto& [serviceName, unused] : itServ->second)
131 {
132 // associate service name with sensor
133 sensorMap[serviceName].insert(sensor);
134 }
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400135 }
136 }
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400137
Mike Capps25f03272021-09-13 13:38:44 -0400138 // only create 1 match per service
139 for (const auto& [serviceName, unused] : sensorMap)
140 {
141 // map its service name to the sensor
142 _sensorMatch.emplace_back(std::make_unique<match::match>(
143 _bus, match::rules::nameOwnerChanged(serviceName),
144 std::bind(&System::tachSignalOffline, this,
145 std::placeholders::_1, sensorMap)));
146 }
147 }
148 catch (const util::DBusError&)
149 {
150 // catch exception from getSubTreeRaw() when fan sensor paths don't
151 // exist yet
152 }
Matthew Barthd06905c2020-06-12 08:13:06 -0500153}
154
155void System::sighupHandler(sdeventplus::source::Signal&,
156 const struct signalfd_siginfo*)
157{
158 try
Matthew Barthc95c5272020-06-15 19:51:13 -0500159 {
Matthew Barthd06905c2020-06-12 08:13:06 -0500160 json jsonObj = json::object();
161#ifdef MONITOR_USE_JSON
162 jsonObj = getJsonObj(_bus);
163#endif
164 auto trustGrps = getTrustGroups(jsonObj);
165 auto fanDefs = getFanDefinitions(jsonObj);
166 // Set configured trust groups
167 setTrustMgr(trustGrps);
168 // Clear/set configured fan definitions
169 _fans.clear();
Matt Spinlerb63aa092020-10-14 09:45:11 -0500170 _fanHealth.clear();
Matthew Barthd06905c2020-06-12 08:13:06 -0500171 setFans(fanDefs);
Matt Spinlere892e392020-10-14 13:21:31 -0500172 setFaultConfig(jsonObj);
Matthew Barthd06905c2020-06-12 08:13:06 -0500173 log<level::INFO>("Configuration reloaded successfully");
Matt Spinlere892e392020-10-14 13:21:31 -0500174
175 if (_powerState->isPowerOn())
176 {
177 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
178 [this](auto& rule) {
179 rule->check(PowerRuleState::runtime, _fanHealth);
180 });
181 }
Mike Capps25f03272021-09-13 13:38:44 -0400182
183 _sensorMatch.clear();
184 subscribeSensorsToServices();
Matthew Barthd06905c2020-06-12 08:13:06 -0500185 }
Patrick Williamsddb773b2021-10-06 11:24:49 -0500186 catch (const std::runtime_error& re)
Matthew Barthd06905c2020-06-12 08:13:06 -0500187 {
188 log<level::ERR>("Error reloading config, no config changes made",
189 entry("LOAD_ERROR=%s", re.what()));
Matthew Barthc95c5272020-06-15 19:51:13 -0500190 }
191}
192
193const std::vector<CreateGroupFunction>
194 System::getTrustGroups(const json& jsonObj)
195{
196#ifdef MONITOR_USE_JSON
197 return getTrustGrps(jsonObj);
198#else
199 return trustGroups;
200#endif
201}
202
Matthew Barthd06905c2020-06-12 08:13:06 -0500203void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
204{
205 _trust = std::make_unique<trust::Manager>(groupFuncs);
206}
207
Matthew Barthc95c5272020-06-15 19:51:13 -0500208const std::vector<FanDefinition> System::getFanDefinitions(const json& jsonObj)
209{
210#ifdef MONITOR_USE_JSON
211 return getFanDefs(jsonObj);
212#else
213 return fanDefinitions;
214#endif
215}
216
Matthew Barthd06905c2020-06-12 08:13:06 -0500217void System::setFans(const std::vector<FanDefinition>& fanDefs)
218{
219 for (const auto& fanDef : fanDefs)
220 {
221 // Check if a condition exists on the fan
222 auto condition = std::get<conditionField>(fanDef);
223 if (condition)
224 {
225 // Condition exists, skip adding fan if it fails
226 if (!(*condition)(_bus))
227 {
228 continue;
229 }
230 }
231 _fans.emplace_back(
Matt Spinlerb0412d02020-10-12 16:53:52 -0500232 std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));
Matt Spinlerb63aa092020-10-14 09:45:11 -0500233
234 updateFanHealth(*(_fans.back()));
Matthew Barthd06905c2020-06-12 08:13:06 -0500235 }
236}
237
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400238// callback indicating a service went [on|off]line.
239// Determine on/offline status, set all sensors for that service
240// to new state
241//
242void System::tachSignalOffline(sdbusplus::message::message& msg,
243 SensorMapType const& sensorMap)
244{
245 std::string serviceName, oldOwner, newOwner;
246
247 msg.read(serviceName);
248 msg.read(oldOwner);
249 msg.read(newOwner);
250
251 // true if sensor server came back online, false -> went offline
252 bool hasOwner = !newOwner.empty() && oldOwner.empty();
253
254 std::string stateStr(hasOwner ? "online" : "offline");
255 getLogger().log(fmt::format("Changing sensors for service {} to {}",
256 serviceName, stateStr),
257 Logger::info);
258
259 auto sensorItr(sensorMap.find(serviceName));
260
261 if (sensorItr != sensorMap.end())
262 {
263 // set all sensors' owner state to not-owned
264 for (auto& sensor : sensorItr->second)
265 {
266 sensor->setOwner(hasOwner);
267 sensor->getFan().process(*sensor);
268 }
269 }
270}
271
Matt Spinlerb63aa092020-10-14 09:45:11 -0500272void System::updateFanHealth(const Fan& fan)
273{
274 std::vector<bool> sensorStatus;
275 for (const auto& sensor : fan.sensors())
276 {
277 sensorStatus.push_back(sensor->functional());
278 }
279
280 _fanHealth[fan.getName()] =
281 std::make_tuple(fan.present(), std::move(sensorStatus));
282}
283
Matt Spinler4283c5d2021-03-01 15:56:00 -0600284void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
Matt Spinlerb63aa092020-10-14 09:45:11 -0500285{
286 updateFanHealth(fan);
Matt Spinlere892e392020-10-14 13:21:31 -0500287
Matt Spinler4283c5d2021-03-01 15:56:00 -0600288 if (_powerState->isPowerOn() && !skipRulesCheck)
Matt Spinlere892e392020-10-14 13:21:31 -0500289 {
290 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
291 [this](auto& rule) {
292 rule->check(PowerRuleState::runtime, _fanHealth);
293 });
294 }
295}
296
297void System::setFaultConfig(const json& jsonObj)
298{
299#ifdef MONITOR_USE_JSON
300 std::shared_ptr<PowerInterfaceBase> powerInterface =
Matt Spinlerba3ee9a2021-01-06 14:45:50 -0600301 std::make_shared<PowerInterface>(_thermalAlert);
Matt Spinlere892e392020-10-14 13:21:31 -0500302
Matt Spinlerac1efc12020-10-27 10:20:11 -0500303 PowerOffAction::PrePowerOffFunc func =
304 std::bind(std::mem_fn(&System::logShutdownError), this);
305
306 _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
Matt Spinlerf13b42e2020-10-26 15:29:49 -0500307
308 _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
Matt Spinlere892e392020-10-14 13:21:31 -0500309#endif
310}
311
312void System::powerStateChanged(bool powerStateOn)
313{
Matt Spinler7d135642021-02-04 12:44:17 -0600314 std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) {
315 fan->powerStateChanged(powerStateOn);
316 });
317
Matt Spinlere892e392020-10-14 13:21:31 -0500318 if (powerStateOn)
319 {
Matt Spinler7d135642021-02-04 12:44:17 -0600320 if (!_started)
321 {
322 log<level::ERR>("No conf file found at power on");
Matthew Barthba53d3e2021-02-24 07:48:37 -0600323 throw std::runtime_error("No conf file found at power on");
Matt Spinler7d135642021-02-04 12:44:17 -0600324 }
325
Matt Spinlerbb449c12021-06-14 11:45:28 -0600326 // If no fan has its sensors on D-Bus, then there is a problem
327 // with the fan controller. Log an error and shut down.
328 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
329 return fan->numSensorsOnDBusAtPowerOn() == 0;
330 }))
331 {
332 handleOfflineFanController();
333 return;
334 }
335
Mike Capps25f03272021-09-13 13:38:44 -0400336 if (_sensorMatch.empty())
337 {
338 subscribeSensorsToServices();
339 }
340
Matt Spinlere892e392020-10-14 13:21:31 -0500341 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
342 [this](auto& rule) {
343 rule->check(PowerRuleState::atPgood, _fanHealth);
344 });
345 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
346 [this](auto& rule) {
347 rule->check(PowerRuleState::runtime, _fanHealth);
348 });
349 }
350 else
351 {
Matt Spinlerc8d3c512021-01-06 14:22:25 -0600352 _thermalAlert.enabled(false);
353
Matt Spinlere892e392020-10-14 13:21:31 -0500354 // Cancel any in-progress power off actions
355 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
356 [this](auto& rule) { rule->cancel(); });
357 }
Matt Spinlerb63aa092020-10-14 09:45:11 -0500358}
359
Matt Spinlerf13b42e2020-10-26 15:29:49 -0500360void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
361{
362 std::string fanPath{util::INVENTORY_PATH + fan.getName()};
363
364 getLogger().log(
365 fmt::format("Creating event log for faulted fan {} sensor {}", fanPath,
366 sensor.name()),
367 Logger::error);
368
369 // In order to know if the event log should have a severity of error or
370 // informational, count the number of existing nonfunctional sensors and
371 // compare it to _numNonfuncSensorsBeforeError.
372 size_t nonfuncSensors = 0;
373 for (const auto& fan : _fans)
374 {
375 for (const auto& s : fan->sensors())
376 {
377 // Don't count nonfunctional sensors that still have their
378 // error timer running as nonfunctional since they haven't
379 // had event logs created for those errors yet.
380 if (!s->functional() && !s->errorTimerRunning())
381 {
382 nonfuncSensors++;
383 }
384 }
385 }
386
387 Severity severity = Severity::Error;
388 if (nonfuncSensors < _numNonfuncSensorsBeforeError)
389 {
390 severity = Severity::Informational;
391 }
392
393 auto error =
394 std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
395 fanPath, sensor.name(), severity);
396
397 auto sensorData = captureSensorData();
398 error->commit(sensorData);
399
Matt Spinlerac1efc12020-10-27 10:20:11 -0500400 // Save the error so it can be committed again on a power off.
401 _lastError = std::move(error);
Matt Spinlerf13b42e2020-10-26 15:29:49 -0500402}
403
Matt Spinler27f6b682020-10-27 08:43:37 -0500404void System::fanMissingErrorTimerExpired(const Fan& fan)
405{
406 std::string fanPath{util::INVENTORY_PATH + fan.getName()};
407
408 getLogger().log(
409 fmt::format("Creating event log for missing fan {}", fanPath),
410 Logger::error);
411
412 auto error = std::make_unique<FanError>(
413 "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error);
414
415 auto sensorData = captureSensorData();
416 error->commit(sensorData);
417
Matt Spinlerac1efc12020-10-27 10:20:11 -0500418 // Save the error so it can be committed again on a power off.
419 _lastError = std::move(error);
420}
421
422void System::logShutdownError()
423{
424 if (_lastError)
425 {
426 getLogger().log("Re-committing previous fan error before power off");
427
428 // Still use the latest sensor data
429 auto sensorData = captureSensorData();
Matt Spinlerf435eb12021-05-11 14:44:25 -0500430 _lastError->commit(sensorData, true);
Matt Spinlerac1efc12020-10-27 10:20:11 -0500431 }
Matt Spinler27f6b682020-10-27 08:43:37 -0500432}
433
Matt Spinlerf13b42e2020-10-26 15:29:49 -0500434json System::captureSensorData()
435{
436 json data;
437
438 for (const auto& fan : _fans)
439 {
440 for (const auto& sensor : fan->sensors())
441 {
442 json values;
443 values["present"] = fan->present();
444 values["functional"] = sensor->functional();
445 values["tach"] = sensor->getInput();
446 if (sensor->hasTarget())
447 {
448 values["target"] = sensor->getTarget();
449 }
450
451 data["sensors"][sensor->name()] = values;
452 }
453 }
454
455 return data;
456}
457
Matt Spinlerbb449c12021-06-14 11:45:28 -0600458void System::handleOfflineFanController()
459{
460 getLogger().log("The fan controller appears to be offline. Shutting down.",
461 Logger::error);
462
463 auto ffdc = collectHwmonFFDC();
464
465 FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
466 Severity::Critical};
467 error.commit(ffdc, true);
468
469 PowerInterface::executeHardPowerOff();
470}
471
Matthew Barthc95c5272020-06-15 19:51:13 -0500472} // namespace phosphor::fan::monitor