blob: b79fff4d88142fa6a235b88dc8f6212834f1083f [file] [log] [blame]
Vishwanatha Subbanna307d80b2017-06-28 15:56:09 +05301#include "occ_status.hpp"
Gunnar Mills94df8c92018-09-14 14:50:03 -05002
Chris Cain17257672021-10-22 13:41:03 -05003#include "occ_manager.hpp"
Vishwanatha Subbanna6add0b82017-07-21 19:02:37 +05304#include "occ_sensor.hpp"
Chris Cain78e86012021-03-04 16:15:31 -06005#include "powermode.hpp"
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +05306#include "utils.hpp"
Gunnar Mills94df8c92018-09-14 14:50:03 -05007
Chris Caina8857c52021-01-27 11:53:05 -06008#include <fmt/core.h>
9
Chris Cain78e86012021-03-04 16:15:31 -060010#ifdef POWER10
11#include <com/ibm/Host/Target/server.hpp>
12#endif
Gunnar Mills94df8c92018-09-14 14:50:03 -050013#include <phosphor-logging/log.hpp>
Chris Cain78e86012021-03-04 16:15:31 -060014
Chris Caine2d0a432022-03-28 11:08:49 -050015#include <filesystem>
16
Vishwanatha Subbanna307d80b2017-06-28 15:56:09 +053017namespace open_power
18{
19namespace occ
20{
Chris Cain78e86012021-03-04 16:15:31 -060021
Chris Caina8857c52021-01-27 11:53:05 -060022using namespace phosphor::logging;
Vishwanatha Subbanna307d80b2017-06-28 15:56:09 +053023
24// Handles updates to occActive property
25bool Status::occActive(bool value)
26{
Vishwanatha Subbanna32e84e92017-06-28 19:17:28 +053027 if (value != this->occActive())
28 {
Chris Caina8857c52021-01-27 11:53:05 -060029 log<level::INFO>(fmt::format("Status::occActive OCC{} changed to {}",
30 instance, value)
31 .c_str());
Vishwanatha Subbanna32e84e92017-06-28 19:17:28 +053032 if (value)
33 {
34 // Bind the device
35 device.bind();
Vishwanatha Subbannaee4d83d2017-06-29 18:35:00 +053036
Edward A. James9fd2bdc2017-11-08 16:18:57 -060037 // Start watching for errors
38 addErrorWatch();
39
Chris Caina8857c52021-01-27 11:53:05 -060040 // Reset last OCC state
41 lastState = 0;
42
Chris Cain5d66a0a2022-02-09 08:52:10 -060043 if (device.master())
44 {
Chris Cain5d66a0a2022-02-09 08:52:10 -060045 // Update powercap bounds from OCC
Chris Cain40501a22022-03-14 17:33:27 -050046 manager.updatePcapBounds();
Chris Cain5d66a0a2022-02-09 08:52:10 -060047 }
48
Vishwanatha Subbanna2dc9b1a2017-08-18 18:29:41 +053049 // Call into Manager to let know that we have bound
Chris Cain1be43372021-12-09 19:29:37 -060050 if (this->managerCallBack)
Vishwanatha Subbanna2dc9b1a2017-08-18 18:29:41 +053051 {
Sheldon Bailey373af752022-02-21 15:14:00 -060052 this->managerCallBack(instance, value);
Edward A. James9fd2bdc2017-11-08 16:18:57 -060053 }
Vishwanatha Subbanna32e84e92017-06-28 19:17:28 +053054 }
55 else
56 {
Chris Caina7b74dc2021-11-10 17:03:43 -060057#ifdef POWER10
Chris Cain1be43372021-12-09 19:29:37 -060058 if (pmode && device.master())
Chris Cain36f9cde2021-11-22 11:18:21 -060059 {
60 // Prevent mode changes
61 pmode->setMasterActive(false);
62 }
Chris Caina7b74dc2021-11-10 17:03:43 -060063 if (safeStateDelayTimer.isEnabled())
64 {
65 // stop safe delay timer
66 safeStateDelayTimer.setEnabled(false);
67 }
68#endif
69
Chris Cain36f9cde2021-11-22 11:18:21 -060070 // Call into Manager to let know that we will unbind.
Chris Cain1be43372021-12-09 19:29:37 -060071 if (this->managerCallBack)
Chris Cain36f9cde2021-11-22 11:18:21 -060072 {
Sheldon Bailey373af752022-02-21 15:14:00 -060073 this->managerCallBack(instance, value);
Chris Cain36f9cde2021-11-22 11:18:21 -060074 }
75
Edward A. James9fd2bdc2017-11-08 16:18:57 -060076 // Stop watching for errors
77 removeErrorWatch();
Vishwanatha Subbannaee4d83d2017-06-29 18:35:00 +053078
79 // Do the unbind.
Vishwanatha Subbanna32e84e92017-06-28 19:17:28 +053080 device.unBind();
81 }
82 }
Edward A. James5e177972017-10-25 15:50:31 -050083 else if (value && !device.bound())
84 {
85 // Existing error watch is on a dead file descriptor.
Edward A. James9fd2bdc2017-11-08 16:18:57 -060086 removeErrorWatch();
Edward A. James5e177972017-10-25 15:50:31 -050087
88 /*
89 * In it's constructor, Status checks Device::bound() to see if OCC is
90 * active or not.
91 * Device::bound() checks for occX-dev0 directory.
92 * We will lose occX-dev0 directories during FSI rescan.
93 * So, if we start this application (and construct Status), and then
94 * later do FSI rescan, we will end up with occActive = true and device
95 * NOT bound. Lets correct that situation here.
96 */
97 device.bind();
98
99 // Add error watch again
Edward A. James9fd2bdc2017-11-08 16:18:57 -0600100 addErrorWatch();
Edward A. James5e177972017-10-25 15:50:31 -0500101 }
Eddie James6d6d1b32019-04-22 10:45:08 -0500102 else if (!value && device.bound())
103 {
104 removeErrorWatch();
105
106 // In the event that the application never receives the active signal
107 // even though the OCC is active (this can occur if the BMC is rebooted
108 // with the host on, since the initial OCC driver probe will discover
109 // the OCCs), this application needs to be able to unbind the device
110 // when we get the OCC inactive signal.
111 device.unBind();
112 }
Vishwanatha Subbanna307d80b2017-06-28 15:56:09 +0530113 return Base::Status::occActive(value);
114}
115
Vishwanatha Subbannaee4d83d2017-06-29 18:35:00 +0530116// Callback handler when a device error is reported.
Eddie Jamescbad2192021-10-07 09:39:39 -0500117void Status::deviceError()
Vishwanatha Subbannaee4d83d2017-06-29 18:35:00 +0530118{
Chris Cain36f9cde2021-11-22 11:18:21 -0600119#ifdef POWER10
Chris Cain1be43372021-12-09 19:29:37 -0600120 if (pmode && device.master())
121 {
122 // Prevent mode changes
123 pmode->setMasterActive(false);
124 }
Chris Cain36f9cde2021-11-22 11:18:21 -0600125#endif
126
Eddie Jamescbad2192021-10-07 09:39:39 -0500127 // This would deem OCC inactive
128 this->occActive(false);
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530129
Eddie Jamescbad2192021-10-07 09:39:39 -0500130 // Reset the OCC
131 this->resetOCC();
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530132}
133
134// Sends message to host control command handler to reset OCC
135void Status::resetOCC()
136{
Chris Caina8857c52021-01-27 11:53:05 -0600137 log<level::INFO>(
138 fmt::format(">>Status::resetOCC() - requesting reset for OCC{}",
139 instance)
140 .c_str());
Tom Joseph00325232020-07-29 17:51:48 +0530141#ifdef PLDM
142 if (resetCallBack)
143 {
144 this->resetCallBack(instance);
145 }
146#else
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530147 constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0";
148 constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host";
149
150 // This will throw exception on failure
George Liuf3b75142021-06-10 11:22:50 +0800151 auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF);
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530152
George Liuf3b75142021-06-10 11:22:50 +0800153 auto& bus = utils::getBus();
Gunnar Mills94df8c92018-09-14 14:50:03 -0500154 auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH,
155 CONTROL_HOST_INTF, "Execute");
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530156 // OCC Reset control command
Gunnar Mills94df8c92018-09-14 14:50:03 -0500157 method.append(convertForMessage(Control::Host::Command::OCCReset).c_str());
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530158
159 // OCC Sensor ID for callout reasons
Patrick Williamse0962702020-05-13 17:50:22 -0500160 method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance))));
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530161 bus.call_noreply(method);
162 return;
Tom Joseph00325232020-07-29 17:51:48 +0530163#endif
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530164}
165
166// Handler called by Host control command handler to convey the
167// status of the executed command
168void Status::hostControlEvent(sdbusplus::message::message& msg)
169{
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530170 std::string cmdCompleted{};
171 std::string cmdStatus{};
172
173 msg.read(cmdCompleted, cmdStatus);
174
175 log<level::DEBUG>("Host control signal values",
Gunnar Mills94df8c92018-09-14 14:50:03 -0500176 entry("COMMAND=%s", cmdCompleted.c_str()),
177 entry("STATUS=%s", cmdStatus.c_str()));
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530178
Gunnar Mills94df8c92018-09-14 14:50:03 -0500179 if (Control::Host::convertResultFromString(cmdStatus) !=
180 Control::Host::Result::Success)
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530181 {
Gunnar Mills94df8c92018-09-14 14:50:03 -0500182 if (Control::Host::convertCommandFromString(cmdCompleted) ==
183 Control::Host::Command::OCCReset)
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530184 {
Gunnar Mills85e65202018-04-08 15:01:54 -0500185 // Must be a Timeout. Log an Error trace
Alexander Filippov1d69e192019-03-21 18:12:07 +0300186 log<level::ERR>(
187 "Error resetting the OCC.", entry("PATH=%s", path.c_str()),
188 entry("SENSORID=0x%X", std::get<0>(sensorMap.at(instance))));
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530189 }
190 }
191 return;
Vishwanatha Subbannaee4d83d2017-06-29 18:35:00 +0530192}
193
Sheldon Bailey373af752022-02-21 15:14:00 -0600194// Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
Chris Caina8857c52021-01-27 11:53:05 -0600195void Status::readOccState()
196{
Sheldon Bailey373af752022-02-21 15:14:00 -0600197 currentOccReadRetriesCount = occReadRetries;
198 occReadStateNow();
Chris Caina8857c52021-01-27 11:53:05 -0600199}
200
Chris Cain78e86012021-03-04 16:15:31 -0600201#ifdef POWER10
Chris Cain78e86012021-03-04 16:15:31 -0600202// Special processing that needs to happen once the OCCs change to ACTIVE state
203void Status::occsWentActive()
204{
205 CmdStatus status = CmdStatus::SUCCESS;
206
Chris Cain36f9cde2021-11-22 11:18:21 -0600207 status = pmode->sendModeChange();
Chris Cain78e86012021-03-04 16:15:31 -0600208 if (status != CmdStatus::SUCCESS)
209 {
George Liub5ca1012021-09-10 12:53:11 +0800210 log<level::ERR>(
211 fmt::format(
212 "Status::occsWentActive: OCC mode change failed with status {}",
213 status)
214 .c_str());
Chris Cainc567dc82022-04-01 15:09:17 -0500215
216 // Disable and reset to try recovering
217 deviceError();
Chris Cain78e86012021-03-04 16:15:31 -0600218 }
219
Chris Cain36f9cde2021-11-22 11:18:21 -0600220 status = pmode->sendIpsData();
Chris Cain78e86012021-03-04 16:15:31 -0600221 if (status != CmdStatus::SUCCESS)
222 {
223 log<level::ERR>(
224 fmt::format(
George Liub5ca1012021-09-10 12:53:11 +0800225 "Status::occsWentActive: Sending Idle Power Save Config data failed with status {}",
Chris Cain78e86012021-03-04 16:15:31 -0600226 status)
227 .c_str());
Chris Cainc567dc82022-04-01 15:09:17 -0500228
229 if (status == CmdStatus::COMM_FAILURE)
230 {
231 // Disable and reset to try recovering
232 deviceError();
233 }
Chris Cain78e86012021-03-04 16:15:31 -0600234 }
235}
236
Chris Cain17257672021-10-22 13:41:03 -0500237// Send Ambient and Altitude to the OCC
238CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
239{
240 CmdStatus status = CmdStatus::FAILURE;
241 bool ambientValid = true;
242 uint8_t ambientTemp = inTemp;
243 uint16_t altitude = inAltitude;
244
245 if (ambientTemp == 0xFF)
246 {
247 // Get latest readings from manager
248 manager.getAmbientData(ambientValid, ambientTemp, altitude);
249 log<level::DEBUG>(
250 fmt::format("sendAmbient: valid: {}, Ambient: {}C, altitude: {}m",
251 ambientValid, ambientTemp, altitude)
252 .c_str());
253 }
254
255 std::vector<std::uint8_t> cmd, rsp;
256 cmd.reserve(11);
257 cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
258 cmd.push_back(0x00); // Data Length (2 bytes)
259 cmd.push_back(0x08); //
260 cmd.push_back(0x00); // Version
261 cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
262 cmd.push_back(ambientTemp); // Ambient Temperature
263 cmd.push_back(altitude >> 8); // Altitude in meters (2 bytes)
264 cmd.push_back(altitude & 0xFF); //
265 cmd.push_back(0x00); // Reserved (3 bytes)
266 cmd.push_back(0x00);
267 cmd.push_back(0x00);
268 log<level::DEBUG>(fmt::format("sendAmbient: SEND_AMBIENT "
269 "command to OCC{} ({} bytes)",
270 instance, cmd.size())
271 .c_str());
272 status = occCmd.send(cmd, rsp);
273 if (status == CmdStatus::SUCCESS)
274 {
275 if (rsp.size() == 5)
276 {
277 if (RspStatus::SUCCESS != RspStatus(rsp[2]))
278 {
279 log<level::ERR>(
280 fmt::format(
Chris Cainc567dc82022-04-01 15:09:17 -0500281 "sendAmbient: SEND_AMBIENT failed with rspStatus 0x{:02X}",
Chris Cain17257672021-10-22 13:41:03 -0500282 rsp[2])
283 .c_str());
284 dump_hex(rsp);
285 status = CmdStatus::FAILURE;
286 }
287 }
288 else
289 {
Chris Cainc567dc82022-04-01 15:09:17 -0500290 log<level::ERR>(
291 fmt::format(
292 "sendAmbient: INVALID SEND_AMBIENT response length:{}",
293 rsp.size())
294 .c_str());
Chris Cain17257672021-10-22 13:41:03 -0500295 dump_hex(rsp);
296 status = CmdStatus::FAILURE;
297 }
298 }
299 else
300 {
Chris Cainc567dc82022-04-01 15:09:17 -0500301 log<level::ERR>(
302 fmt::format(
303 "sendAmbient: SEND_AMBIENT FAILED! with status 0x{:02X}",
304 status)
305 .c_str());
306
307 if (status == CmdStatus::COMM_FAILURE)
Chris Cain17257672021-10-22 13:41:03 -0500308 {
Chris Cainc567dc82022-04-01 15:09:17 -0500309 // Disable and reset to try recovering
310 deviceError();
Chris Cain17257672021-10-22 13:41:03 -0500311 }
312 }
313
314 return status;
315}
Chris Caina7b74dc2021-11-10 17:03:43 -0600316
317// Called when safe timer expires to determine if OCCs need to be reset
318void Status::safeStateDelayExpired()
319{
320 if (this->occActive())
321 {
322 log<level::INFO>(
323 fmt::format(
324 "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset",
325 instance)
326 .c_str());
327 // Disable and reset to try recovering
328 deviceError();
329 }
330}
Chris Cain78e86012021-03-04 16:15:31 -0600331#endif // POWER10
332
Chris Caine2d0a432022-03-28 11:08:49 -0500333fs::path Status::getHwmonPath()
Chris Cain5d66a0a2022-02-09 08:52:10 -0600334{
335 using namespace std::literals::string_literals;
336
Chris Caine2d0a432022-03-28 11:08:49 -0500337 if (!fs::exists(hwmonPath))
338 {
339 static bool tracedFail[8] = {0};
Chris Cain5d66a0a2022-02-09 08:52:10 -0600340
Chris Caine2d0a432022-03-28 11:08:49 -0500341 if (!hwmonPath.empty())
342 {
343 log<level::ERR>(
344 fmt::format("Status::getHwmonPath(): path no longer exists: {}",
345 hwmonPath.c_str())
346 .c_str());
347 hwmonPath.clear();
348 }
349
350 // Build the base HWMON path
351 fs::path prefixPath =
352 fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
353 std::to_string(instance + 1) + "/hwmon/"s};
354
355 // Get the hwmonXX directory name
356 try
357 {
358 // there should only be one directory
359 const int numDirs = std::distance(
360 fs::directory_iterator(prefixPath), fs::directory_iterator{});
361 if (numDirs == 1)
362 {
363 hwmonPath = *fs::directory_iterator(prefixPath);
364 tracedFail[instance] = false;
365 }
366 else
367 {
368 if (!tracedFail[instance])
369 {
370 log<level::ERR>(
371 fmt::format(
372 "Status::getHwmonPath(): Found multiple ({}) hwmon paths!",
373 numDirs)
374 .c_str());
375 tracedFail[instance] = true;
376 }
377 }
378 }
379 catch (const fs::filesystem_error& e)
380 {
381 if (!tracedFail[instance])
382 {
383 log<level::ERR>(
384 fmt::format(
385 "Status::getHwmonPath(): error accessing {}: {}",
386 prefixPath.c_str(), e.what())
387 .c_str());
388 tracedFail[instance] = true;
389 }
390 }
391 }
392
393 return hwmonPath;
Chris Cain5d66a0a2022-02-09 08:52:10 -0600394}
395
Sheldon Bailey373af752022-02-21 15:14:00 -0600396// Called to read state and upon failure to read after occReadStateFailTimer.
397void Status::occReadStateNow()
398{
399 unsigned int state;
400 const fs::path filename =
401 fs::path(DEV_PATH) /
402 fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
403
404 std::ifstream file;
405 bool goodFile = false;
406
407 // open file.
408 file.open(filename, std::ios::in);
409 const int openErrno = errno;
410
411 // File is open and state can be used.
412 if (file.is_open() && file.good())
413 {
414 goodFile = true;
415 file >> state;
416
417 if (state != lastState)
418 {
419 // Trace OCC state changes
420 log<level::INFO>(
421 fmt::format("Status::readOccState: OCC{} state 0x{:02X}",
422 instance, state)
423 .c_str());
424 lastState = state;
425#ifdef POWER10
426 if (OccState(state) == OccState::ACTIVE)
427 {
428 if (pmode && device.master())
429 {
430 // Set the master OCC on the PowerMode object
431 pmode->setMasterOcc(path);
432 // Enable mode changes
433 pmode->setMasterActive();
434
435 // Special processing by master OCC when it goes active
436 occsWentActive();
437 }
438
439 CmdStatus status = sendAmbient();
440 if (status != CmdStatus::SUCCESS)
441 {
442 log<level::ERR>(
443 fmt::format(
444 "readOccState: Sending Ambient failed with status {}",
445 status)
446 .c_str());
447 }
448 }
449
450 // If OCC in known Good State.
451 if ((OccState(state) == OccState::ACTIVE) ||
452 (OccState(state) == OccState::CHARACTERIZATION) ||
453 (OccState(state) == OccState::OBSERVATION))
454 {
455 // Good OCC State then sensors valid again
456 stateValid = true;
457
458 if (safeStateDelayTimer.isEnabled())
459 {
460 // stop safe delay timer (no longer in SAFE state)
461 safeStateDelayTimer.setEnabled(false);
462 }
463 }
464 // Else not Valid state We would be in SAFE mode.
465 // This captures both SAFE mode, and 0x00, or other invalid
466 // state values.
467 else
468 {
469 if (!safeStateDelayTimer.isEnabled())
470 {
471 // start safe delay timer (before requesting reset)
472 using namespace std::literals::chrono_literals;
473 safeStateDelayTimer.restartOnce(60s);
474 }
475 // Not valid state, update sensors to Nan & not functional.
476 stateValid = false;
477 }
478#else
479 // Before P10 state not checked, only used good file open.
480 stateValid = true;
481#endif
482 }
483 }
484 file.close();
485
486 // if failed to Read a state or not a valid state -> Attempt retry
487 // after 1 Second delay if allowed.
488 if ((!goodFile) || (!stateValid))
489 {
490 if (!goodFile)
491 {
492 // If not able to read, OCC may be offline
493 log<level::ERR>(
494 fmt::format("Status::readOccState: open failed (errno={})",
495 openErrno)
496 .c_str());
497 }
498 else
499 {
500 // else this failed due to state not valid.
501 log<level::ERR>(
502 fmt::format(
503 "Status::readOccState: OCC{} Invalid state 0x{:02X}",
504 instance, state)
505 .c_str());
506 }
507
508#ifdef READ_OCC_SENSORS
509 manager.setSensorValueToNonFunctional(instance);
510#endif
511
512 // See occReadRetries for number of retry attempts.
513 if (currentOccReadRetriesCount > 0)
514 {
515 --currentOccReadRetriesCount;
516#ifdef POWER10
517 using namespace std::chrono_literals;
518 occReadStateFailTimer.restartOnce(1s);
519#endif
520 }
521 else
522 {
523 // State could not be determined, set it to NO State.
524 lastState = 0;
525
526 // Disable the ability to send Failed actions until OCC is
527 // Active again.
528 stateValid = false;
529
530 // Disable and reset to try recovering
531 deviceError();
532 }
533 }
534}
535
Vishwanatha Subbanna307d80b2017-06-28 15:56:09 +0530536} // namespace occ
537} // namespace open_power