blob: 9b27b7e8f86650e648407b64faab5e726b1a7d5f [file] [log] [blame]
/**
 * Copyright © 2021 IBM Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
Matthew Barthc95c5272020-06-15 19:51:13 -050016#include "system.hpp"
17
18#include "fan.hpp"
19#include "fan_defs.hpp"
20#include "tach_sensor.hpp"
21#include "trust_manager.hpp"
22#include "types.hpp"
Mike Cappsfdcd5db2021-05-20 12:47:10 -040023#include "utility.hpp"
Matthew Barthc95c5272020-06-15 19:51:13 -050024#ifdef MONITOR_USE_JSON
25#include "json_parser.hpp"
26#endif
27
Matt Spinlerc8d3c512021-01-06 14:22:25 -060028#include "config.h"
29
Matt Spinlerbb449c12021-06-14 11:45:28 -060030#include "hwmon_ffdc.hpp"
31
Matthew Barthc95c5272020-06-15 19:51:13 -050032#include <nlohmann/json.hpp>
Matthew Barthd06905c2020-06-12 08:13:06 -050033#include <phosphor-logging/log.hpp>
Matthew Barthc95c5272020-06-15 19:51:13 -050034#include <sdbusplus/bus.hpp>
35#include <sdeventplus/event.hpp>
Matthew Barthd06905c2020-06-12 08:13:06 -050036#include <sdeventplus/source/signal.hpp>
Matthew Barthc95c5272020-06-15 19:51:13 -050037
38namespace phosphor::fan::monitor
39{
40
41using json = nlohmann::json;
Matt Spinlerf13b42e2020-10-26 15:29:49 -050042using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
43
Matthew Barthd06905c2020-06-12 08:13:06 -050044using namespace phosphor::logging;
Matthew Barthc95c5272020-06-15 19:51:13 -050045
46System::System(Mode mode, sdbusplus::bus::bus& bus,
47 const sdeventplus::Event& event) :
48 _mode(mode),
Matt Spinlerc8d3c512021-01-06 14:22:25 -060049 _bus(bus), _event(event),
50 _powerState(std::make_unique<PGoodState>(
Matt Spinlere892e392020-10-14 13:21:31 -050051 bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
Matt Spinlerc8d3c512021-01-06 14:22:25 -060052 std::placeholders::_1))),
53 _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
Matt Spinler7d135642021-02-04 12:44:17 -060054{}
Matt Spinlere892e392020-10-14 13:21:31 -050055
Matthew Barth823bc492021-06-21 14:19:09 -050056void System::start()
Matt Spinler7d135642021-02-04 12:44:17 -060057{
58 _started = true;
Matthew Barthc95c5272020-06-15 19:51:13 -050059 json jsonObj = json::object();
60#ifdef MONITOR_USE_JSON
Matthew Barth823bc492021-06-21 14:19:09 -050061 auto confFile =
62 fan::JsonConfig::getConfFile(_bus, confAppName, confFileName);
Matt Spinler7d135642021-02-04 12:44:17 -060063 jsonObj = fan::JsonConfig::load(confFile);
Matthew Barthc95c5272020-06-15 19:51:13 -050064#endif
65 // Retrieve and set trust groups within the trust manager
Matthew Barthd06905c2020-06-12 08:13:06 -050066 setTrustMgr(getTrustGroups(jsonObj));
Matthew Barthc95c5272020-06-15 19:51:13 -050067 // Retrieve fan definitions and create fan objects to be monitored
Matthew Barthd06905c2020-06-12 08:13:06 -050068 setFans(getFanDefinitions(jsonObj));
Matt Spinlere892e392020-10-14 13:21:31 -050069 setFaultConfig(jsonObj);
Matthew Barthd06905c2020-06-12 08:13:06 -050070 log<level::INFO>("Configuration loaded");
Matt Spinlere892e392020-10-14 13:21:31 -050071
Matt Spinlere892e392020-10-14 13:21:31 -050072 if (_powerState->isPowerOn())
73 {
74 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
75 [this](auto& rule) {
Matt Spinlere892e392020-10-14 13:21:31 -050076 rule->check(PowerRuleState::runtime, _fanHealth);
77 });
78 }
Mike Cappsfdcd5db2021-05-20 12:47:10 -040079
80 auto sensorMap = buildNameOwnerChangedMap();
81
82 namespace match = sdbusplus::bus::match;
83
84 // for each service, register a callback handler for nameOwnerChanged
85 for (const auto& service_itr : sensorMap)
86 {
87 _sensorMatch.push_back(std::make_unique<match::match>(
88 _bus, match::rules::nameOwnerChanged(service_itr.first),
89 std::bind(&System::tachSignalOffline, this, std::placeholders::_1,
90 sensorMap)));
91 }
92}
93
94SensorMapType System::buildNameOwnerChangedMap() const
95{
96 SensorMapType sensorMap;
97
98 // build a list of all interfaces, always including the value interface
99 // using set automatically guards against duplicates
100 std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF};
101
102 for (const auto& fan : _fans)
103 {
104 for (const auto& sensor : fan->sensors())
105 {
106 unique_interfaces.insert(sensor->getInterface());
107 }
108 }
109 // convert them to vector to pass into getSubTreeRaw
110 std::vector<std::string> interfaces(unique_interfaces.begin(),
111 unique_interfaces.end());
112
113 // get service information for all service names that are
114 // hosting these interfaces
115 auto serviceObjects =
116 util::SDBusPlus::getSubTreeRaw(_bus, FAN_SENSOR_PATH, interfaces, 0);
117
118 for (const auto& fan : _fans)
119 {
120 // For every sensor in each fan
121 for (const auto& sensor : fan->sensors())
122 {
123 const auto itServ = serviceObjects.find(sensor->name());
124
125 if (serviceObjects.end() == itServ || itServ->second.empty())
126 {
127 getLogger().log(
128 fmt::format("Fan sensor entry {} not found in D-Bus",
129 sensor->name()),
130 Logger::error);
131 continue;
132 }
133
134 for (const auto& [serviceName, unused] : itServ->second)
135 {
136 // map its service name to the sensor
137 sensorMap[serviceName].insert(sensor);
138 }
139 }
140 }
141
142 return sensorMap;
Matthew Barthd06905c2020-06-12 08:13:06 -0500143}
144
145void System::sighupHandler(sdeventplus::source::Signal&,
146 const struct signalfd_siginfo*)
147{
148 try
Matthew Barthc95c5272020-06-15 19:51:13 -0500149 {
Matthew Barthd06905c2020-06-12 08:13:06 -0500150 json jsonObj = json::object();
151#ifdef MONITOR_USE_JSON
152 jsonObj = getJsonObj(_bus);
153#endif
154 auto trustGrps = getTrustGroups(jsonObj);
155 auto fanDefs = getFanDefinitions(jsonObj);
156 // Set configured trust groups
157 setTrustMgr(trustGrps);
158 // Clear/set configured fan definitions
159 _fans.clear();
Matt Spinlerb63aa092020-10-14 09:45:11 -0500160 _fanHealth.clear();
Matthew Barthd06905c2020-06-12 08:13:06 -0500161 setFans(fanDefs);
Matt Spinlere892e392020-10-14 13:21:31 -0500162 setFaultConfig(jsonObj);
Matthew Barthd06905c2020-06-12 08:13:06 -0500163 log<level::INFO>("Configuration reloaded successfully");
Matt Spinlere892e392020-10-14 13:21:31 -0500164
165 if (_powerState->isPowerOn())
166 {
167 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
168 [this](auto& rule) {
169 rule->check(PowerRuleState::runtime, _fanHealth);
170 });
171 }
Matthew Barthd06905c2020-06-12 08:13:06 -0500172 }
173 catch (std::runtime_error& re)
174 {
175 log<level::ERR>("Error reloading config, no config changes made",
176 entry("LOAD_ERROR=%s", re.what()));
Matthew Barthc95c5272020-06-15 19:51:13 -0500177 }
178}
179
180const std::vector<CreateGroupFunction>
181 System::getTrustGroups(const json& jsonObj)
182{
183#ifdef MONITOR_USE_JSON
184 return getTrustGrps(jsonObj);
185#else
186 return trustGroups;
187#endif
188}
189
Matthew Barthd06905c2020-06-12 08:13:06 -0500190void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
191{
192 _trust = std::make_unique<trust::Manager>(groupFuncs);
193}
194
Matthew Barthc95c5272020-06-15 19:51:13 -0500195const std::vector<FanDefinition> System::getFanDefinitions(const json& jsonObj)
196{
197#ifdef MONITOR_USE_JSON
198 return getFanDefs(jsonObj);
199#else
200 return fanDefinitions;
201#endif
202}
203
Matthew Barthd06905c2020-06-12 08:13:06 -0500204void System::setFans(const std::vector<FanDefinition>& fanDefs)
205{
206 for (const auto& fanDef : fanDefs)
207 {
208 // Check if a condition exists on the fan
209 auto condition = std::get<conditionField>(fanDef);
210 if (condition)
211 {
212 // Condition exists, skip adding fan if it fails
213 if (!(*condition)(_bus))
214 {
215 continue;
216 }
217 }
218 _fans.emplace_back(
Matt Spinlerb0412d02020-10-12 16:53:52 -0500219 std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));
Matt Spinlerb63aa092020-10-14 09:45:11 -0500220
221 updateFanHealth(*(_fans.back()));
Matthew Barthd06905c2020-06-12 08:13:06 -0500222 }
223}
224
Mike Cappsfdcd5db2021-05-20 12:47:10 -0400225// callback indicating a service went [on|off]line.
226// Determine on/offline status, set all sensors for that service
227// to new state
228//
229void System::tachSignalOffline(sdbusplus::message::message& msg,
230 SensorMapType const& sensorMap)
231{
232 std::string serviceName, oldOwner, newOwner;
233
234 msg.read(serviceName);
235 msg.read(oldOwner);
236 msg.read(newOwner);
237
238 // true if sensor server came back online, false -> went offline
239 bool hasOwner = !newOwner.empty() && oldOwner.empty();
240
241 std::string stateStr(hasOwner ? "online" : "offline");
242 getLogger().log(fmt::format("Changing sensors for service {} to {}",
243 serviceName, stateStr),
244 Logger::info);
245
246 auto sensorItr(sensorMap.find(serviceName));
247
248 if (sensorItr != sensorMap.end())
249 {
250 // set all sensors' owner state to not-owned
251 for (auto& sensor : sensorItr->second)
252 {
253 sensor->setOwner(hasOwner);
254 sensor->getFan().process(*sensor);
255 }
256 }
257}
258
Matt Spinlerb63aa092020-10-14 09:45:11 -0500259void System::updateFanHealth(const Fan& fan)
260{
261 std::vector<bool> sensorStatus;
262 for (const auto& sensor : fan.sensors())
263 {
264 sensorStatus.push_back(sensor->functional());
265 }
266
267 _fanHealth[fan.getName()] =
268 std::make_tuple(fan.present(), std::move(sensorStatus));
269}
270
Matt Spinler4283c5d2021-03-01 15:56:00 -0600271void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
Matt Spinlerb63aa092020-10-14 09:45:11 -0500272{
273 updateFanHealth(fan);
Matt Spinlere892e392020-10-14 13:21:31 -0500274
Matt Spinler4283c5d2021-03-01 15:56:00 -0600275 if (_powerState->isPowerOn() && !skipRulesCheck)
Matt Spinlere892e392020-10-14 13:21:31 -0500276 {
277 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
278 [this](auto& rule) {
279 rule->check(PowerRuleState::runtime, _fanHealth);
280 });
281 }
282}
283
284void System::setFaultConfig(const json& jsonObj)
285{
286#ifdef MONITOR_USE_JSON
287 std::shared_ptr<PowerInterfaceBase> powerInterface =
Matt Spinlerba3ee9a2021-01-06 14:45:50 -0600288 std::make_shared<PowerInterface>(_thermalAlert);
Matt Spinlere892e392020-10-14 13:21:31 -0500289
Matt Spinlerac1efc12020-10-27 10:20:11 -0500290 PowerOffAction::PrePowerOffFunc func =
291 std::bind(std::mem_fn(&System::logShutdownError), this);
292
293 _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
Matt Spinlerf13b42e2020-10-26 15:29:49 -0500294
295 _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
Matt Spinlere892e392020-10-14 13:21:31 -0500296#endif
297}
298
299void System::powerStateChanged(bool powerStateOn)
300{
Matt Spinler7d135642021-02-04 12:44:17 -0600301 std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) {
302 fan->powerStateChanged(powerStateOn);
303 });
304
Matt Spinlere892e392020-10-14 13:21:31 -0500305 if (powerStateOn)
306 {
Matt Spinler7d135642021-02-04 12:44:17 -0600307 if (!_started)
308 {
309 log<level::ERR>("No conf file found at power on");
Matthew Barthba53d3e2021-02-24 07:48:37 -0600310 throw std::runtime_error("No conf file found at power on");
Matt Spinler7d135642021-02-04 12:44:17 -0600311 }
312
Matt Spinlerbb449c12021-06-14 11:45:28 -0600313 // If no fan has its sensors on D-Bus, then there is a problem
314 // with the fan controller. Log an error and shut down.
315 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
316 return fan->numSensorsOnDBusAtPowerOn() == 0;
317 }))
318 {
319 handleOfflineFanController();
320 return;
321 }
322
Matt Spinlere892e392020-10-14 13:21:31 -0500323 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
324 [this](auto& rule) {
325 rule->check(PowerRuleState::atPgood, _fanHealth);
326 });
327 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
328 [this](auto& rule) {
329 rule->check(PowerRuleState::runtime, _fanHealth);
330 });
331 }
332 else
333 {
Matt Spinlerc8d3c512021-01-06 14:22:25 -0600334 _thermalAlert.enabled(false);
335
Matt Spinlere892e392020-10-14 13:21:31 -0500336 // Cancel any in-progress power off actions
337 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
338 [this](auto& rule) { rule->cancel(); });
339 }
Matt Spinlerb63aa092020-10-14 09:45:11 -0500340}
341
Matt Spinlerf13b42e2020-10-26 15:29:49 -0500342void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
343{
344 std::string fanPath{util::INVENTORY_PATH + fan.getName()};
345
346 getLogger().log(
347 fmt::format("Creating event log for faulted fan {} sensor {}", fanPath,
348 sensor.name()),
349 Logger::error);
350
351 // In order to know if the event log should have a severity of error or
352 // informational, count the number of existing nonfunctional sensors and
353 // compare it to _numNonfuncSensorsBeforeError.
354 size_t nonfuncSensors = 0;
355 for (const auto& fan : _fans)
356 {
357 for (const auto& s : fan->sensors())
358 {
359 // Don't count nonfunctional sensors that still have their
360 // error timer running as nonfunctional since they haven't
361 // had event logs created for those errors yet.
362 if (!s->functional() && !s->errorTimerRunning())
363 {
364 nonfuncSensors++;
365 }
366 }
367 }
368
369 Severity severity = Severity::Error;
370 if (nonfuncSensors < _numNonfuncSensorsBeforeError)
371 {
372 severity = Severity::Informational;
373 }
374
375 auto error =
376 std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
377 fanPath, sensor.name(), severity);
378
379 auto sensorData = captureSensorData();
380 error->commit(sensorData);
381
Matt Spinlerac1efc12020-10-27 10:20:11 -0500382 // Save the error so it can be committed again on a power off.
383 _lastError = std::move(error);
Matt Spinlerf13b42e2020-10-26 15:29:49 -0500384}
385
Matt Spinler27f6b682020-10-27 08:43:37 -0500386void System::fanMissingErrorTimerExpired(const Fan& fan)
387{
388 std::string fanPath{util::INVENTORY_PATH + fan.getName()};
389
390 getLogger().log(
391 fmt::format("Creating event log for missing fan {}", fanPath),
392 Logger::error);
393
394 auto error = std::make_unique<FanError>(
395 "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error);
396
397 auto sensorData = captureSensorData();
398 error->commit(sensorData);
399
Matt Spinlerac1efc12020-10-27 10:20:11 -0500400 // Save the error so it can be committed again on a power off.
401 _lastError = std::move(error);
402}
403
404void System::logShutdownError()
405{
406 if (_lastError)
407 {
408 getLogger().log("Re-committing previous fan error before power off");
409
410 // Still use the latest sensor data
411 auto sensorData = captureSensorData();
Matt Spinlerf435eb12021-05-11 14:44:25 -0500412 _lastError->commit(sensorData, true);
Matt Spinlerac1efc12020-10-27 10:20:11 -0500413 }
Matt Spinler27f6b682020-10-27 08:43:37 -0500414}
415
Matt Spinlerf13b42e2020-10-26 15:29:49 -0500416json System::captureSensorData()
417{
418 json data;
419
420 for (const auto& fan : _fans)
421 {
422 for (const auto& sensor : fan->sensors())
423 {
424 json values;
425 values["present"] = fan->present();
426 values["functional"] = sensor->functional();
427 values["tach"] = sensor->getInput();
428 if (sensor->hasTarget())
429 {
430 values["target"] = sensor->getTarget();
431 }
432
433 data["sensors"][sensor->name()] = values;
434 }
435 }
436
437 return data;
438}
439
Matt Spinlerbb449c12021-06-14 11:45:28 -0600440void System::handleOfflineFanController()
441{
442 getLogger().log("The fan controller appears to be offline. Shutting down.",
443 Logger::error);
444
445 auto ffdc = collectHwmonFFDC();
446
447 FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
448 Severity::Critical};
449 error.commit(ffdc, true);
450
451 PowerInterface::executeHardPowerOff();
452}
453
Matthew Barthc95c5272020-06-15 19:51:13 -0500454} // namespace phosphor::fan::monitor