blob: 9e6515561901ec4b854197bb3eb7e31efb319915 [file] [log] [blame]
Vishwanatha Subbanna307d80b2017-06-28 15:56:09 +05301#include "occ_status.hpp"
Gunnar Mills94df8c92018-09-14 14:50:03 -05002
Chris Cain17257672021-10-22 13:41:03 -05003#include "occ_manager.hpp"
Vishwanatha Subbanna6add0b82017-07-21 19:02:37 +05304#include "occ_sensor.hpp"
Chris Cain78e86012021-03-04 16:15:31 -06005#include "powermode.hpp"
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +05306#include "utils.hpp"
Gunnar Mills94df8c92018-09-14 14:50:03 -05007
Chris Caina8857c52021-01-27 11:53:05 -06008#include <fmt/core.h>
9
Chris Cain78e86012021-03-04 16:15:31 -060010#ifdef POWER10
11#include <com/ibm/Host/Target/server.hpp>
12#endif
Gunnar Mills94df8c92018-09-14 14:50:03 -050013#include <phosphor-logging/log.hpp>
Chris Cain78e86012021-03-04 16:15:31 -060014
Chris Caine2d0a432022-03-28 11:08:49 -050015#include <filesystem>
16
Vishwanatha Subbanna307d80b2017-06-28 15:56:09 +053017namespace open_power
18{
19namespace occ
20{
Chris Cain78e86012021-03-04 16:15:31 -060021
Chris Caina8857c52021-01-27 11:53:05 -060022using namespace phosphor::logging;
Vishwanatha Subbanna307d80b2017-06-28 15:56:09 +053023
24// Handles updates to occActive property
25bool Status::occActive(bool value)
26{
Vishwanatha Subbanna32e84e92017-06-28 19:17:28 +053027 if (value != this->occActive())
28 {
Chris Caina8857c52021-01-27 11:53:05 -060029 log<level::INFO>(fmt::format("Status::occActive OCC{} changed to {}",
30 instance, value)
31 .c_str());
Vishwanatha Subbanna32e84e92017-06-28 19:17:28 +053032 if (value)
33 {
34 // Bind the device
35 device.bind();
Vishwanatha Subbannaee4d83d2017-06-29 18:35:00 +053036
Edward A. James9fd2bdc2017-11-08 16:18:57 -060037 // Start watching for errors
38 addErrorWatch();
39
Chris Caina8857c52021-01-27 11:53:05 -060040 // Reset last OCC state
41 lastState = 0;
42
Chris Cain5d66a0a2022-02-09 08:52:10 -060043 if (device.master())
44 {
Chris Cain5d66a0a2022-02-09 08:52:10 -060045 // Update powercap bounds from OCC
Chris Cain40501a22022-03-14 17:33:27 -050046 manager.updatePcapBounds();
Chris Cain5d66a0a2022-02-09 08:52:10 -060047 }
48
Vishwanatha Subbanna2dc9b1a2017-08-18 18:29:41 +053049 // Call into Manager to let know that we have bound
Chris Cain1be43372021-12-09 19:29:37 -060050 if (this->managerCallBack)
Vishwanatha Subbanna2dc9b1a2017-08-18 18:29:41 +053051 {
Sheldon Bailey373af752022-02-21 15:14:00 -060052 this->managerCallBack(instance, value);
Edward A. James9fd2bdc2017-11-08 16:18:57 -060053 }
Vishwanatha Subbanna32e84e92017-06-28 19:17:28 +053054 }
55 else
56 {
Chris Caina7b74dc2021-11-10 17:03:43 -060057#ifdef POWER10
Chris Cain1be43372021-12-09 19:29:37 -060058 if (pmode && device.master())
Chris Cain36f9cde2021-11-22 11:18:21 -060059 {
60 // Prevent mode changes
61 pmode->setMasterActive(false);
62 }
Chris Caina7b74dc2021-11-10 17:03:43 -060063 if (safeStateDelayTimer.isEnabled())
64 {
65 // stop safe delay timer
66 safeStateDelayTimer.setEnabled(false);
67 }
68#endif
69
Chris Cain36f9cde2021-11-22 11:18:21 -060070 // Call into Manager to let know that we will unbind.
Chris Cain1be43372021-12-09 19:29:37 -060071 if (this->managerCallBack)
Chris Cain36f9cde2021-11-22 11:18:21 -060072 {
Sheldon Bailey373af752022-02-21 15:14:00 -060073 this->managerCallBack(instance, value);
Chris Cain36f9cde2021-11-22 11:18:21 -060074 }
75
Edward A. James9fd2bdc2017-11-08 16:18:57 -060076 // Stop watching for errors
77 removeErrorWatch();
Vishwanatha Subbannaee4d83d2017-06-29 18:35:00 +053078
79 // Do the unbind.
Vishwanatha Subbanna32e84e92017-06-28 19:17:28 +053080 device.unBind();
81 }
82 }
Edward A. James5e177972017-10-25 15:50:31 -050083 else if (value && !device.bound())
84 {
85 // Existing error watch is on a dead file descriptor.
Edward A. James9fd2bdc2017-11-08 16:18:57 -060086 removeErrorWatch();
Edward A. James5e177972017-10-25 15:50:31 -050087
88 /*
89 * In it's constructor, Status checks Device::bound() to see if OCC is
90 * active or not.
91 * Device::bound() checks for occX-dev0 directory.
92 * We will lose occX-dev0 directories during FSI rescan.
93 * So, if we start this application (and construct Status), and then
94 * later do FSI rescan, we will end up with occActive = true and device
95 * NOT bound. Lets correct that situation here.
96 */
97 device.bind();
98
99 // Add error watch again
Edward A. James9fd2bdc2017-11-08 16:18:57 -0600100 addErrorWatch();
Edward A. James5e177972017-10-25 15:50:31 -0500101 }
Eddie James6d6d1b32019-04-22 10:45:08 -0500102 else if (!value && device.bound())
103 {
104 removeErrorWatch();
105
106 // In the event that the application never receives the active signal
107 // even though the OCC is active (this can occur if the BMC is rebooted
108 // with the host on, since the initial OCC driver probe will discover
109 // the OCCs), this application needs to be able to unbind the device
110 // when we get the OCC inactive signal.
111 device.unBind();
112 }
Vishwanatha Subbanna307d80b2017-06-28 15:56:09 +0530113 return Base::Status::occActive(value);
114}
115
Vishwanatha Subbannaee4d83d2017-06-29 18:35:00 +0530116// Callback handler when a device error is reported.
Eddie Jamescbad2192021-10-07 09:39:39 -0500117void Status::deviceError()
Vishwanatha Subbannaee4d83d2017-06-29 18:35:00 +0530118{
Chris Cain36f9cde2021-11-22 11:18:21 -0600119#ifdef POWER10
Chris Cain1be43372021-12-09 19:29:37 -0600120 if (pmode && device.master())
121 {
122 // Prevent mode changes
123 pmode->setMasterActive(false);
124 }
Chris Cain36f9cde2021-11-22 11:18:21 -0600125#endif
126
Eddie Jamescbad2192021-10-07 09:39:39 -0500127 // This would deem OCC inactive
128 this->occActive(false);
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530129
Eddie Jamescbad2192021-10-07 09:39:39 -0500130 // Reset the OCC
131 this->resetOCC();
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530132}
133
134// Sends message to host control command handler to reset OCC
135void Status::resetOCC()
136{
Chris Caina8857c52021-01-27 11:53:05 -0600137 log<level::INFO>(
138 fmt::format(">>Status::resetOCC() - requesting reset for OCC{}",
139 instance)
140 .c_str());
Tom Joseph00325232020-07-29 17:51:48 +0530141#ifdef PLDM
142 if (resetCallBack)
143 {
144 this->resetCallBack(instance);
145 }
146#else
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530147 constexpr auto CONTROL_HOST_PATH = "/org/open_power/control/host0";
148 constexpr auto CONTROL_HOST_INTF = "org.open_power.Control.Host";
149
150 // This will throw exception on failure
George Liuf3b75142021-06-10 11:22:50 +0800151 auto service = utils::getService(CONTROL_HOST_PATH, CONTROL_HOST_INTF);
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530152
George Liuf3b75142021-06-10 11:22:50 +0800153 auto& bus = utils::getBus();
Gunnar Mills94df8c92018-09-14 14:50:03 -0500154 auto method = bus.new_method_call(service.c_str(), CONTROL_HOST_PATH,
155 CONTROL_HOST_INTF, "Execute");
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530156 // OCC Reset control command
Gunnar Mills94df8c92018-09-14 14:50:03 -0500157 method.append(convertForMessage(Control::Host::Command::OCCReset).c_str());
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530158
159 // OCC Sensor ID for callout reasons
Patrick Williamse0962702020-05-13 17:50:22 -0500160 method.append(std::variant<uint8_t>(std::get<0>(sensorMap.at(instance))));
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530161 bus.call_noreply(method);
162 return;
Tom Joseph00325232020-07-29 17:51:48 +0530163#endif
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530164}
165
166// Handler called by Host control command handler to convey the
167// status of the executed command
168void Status::hostControlEvent(sdbusplus::message::message& msg)
169{
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530170 std::string cmdCompleted{};
171 std::string cmdStatus{};
172
173 msg.read(cmdCompleted, cmdStatus);
174
175 log<level::DEBUG>("Host control signal values",
Gunnar Mills94df8c92018-09-14 14:50:03 -0500176 entry("COMMAND=%s", cmdCompleted.c_str()),
177 entry("STATUS=%s", cmdStatus.c_str()));
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530178
Gunnar Mills94df8c92018-09-14 14:50:03 -0500179 if (Control::Host::convertResultFromString(cmdStatus) !=
180 Control::Host::Result::Success)
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530181 {
Gunnar Mills94df8c92018-09-14 14:50:03 -0500182 if (Control::Host::convertCommandFromString(cmdCompleted) ==
183 Control::Host::Command::OCCReset)
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530184 {
Gunnar Mills85e65202018-04-08 15:01:54 -0500185 // Must be a Timeout. Log an Error trace
Alexander Filippov1d69e192019-03-21 18:12:07 +0300186 log<level::ERR>(
187 "Error resetting the OCC.", entry("PATH=%s", path.c_str()),
188 entry("SENSORID=0x%X", std::get<0>(sensorMap.at(instance))));
Vishwanatha Subbanna30e329a2017-07-24 23:13:14 +0530189 }
190 }
191 return;
Vishwanatha Subbannaee4d83d2017-06-29 18:35:00 +0530192}
193
Sheldon Bailey373af752022-02-21 15:14:00 -0600194// Called from Manager::pollerTimerExpired() in preperation to POLL OCC.
Chris Caina8857c52021-01-27 11:53:05 -0600195void Status::readOccState()
196{
Sheldon Bailey373af752022-02-21 15:14:00 -0600197 currentOccReadRetriesCount = occReadRetries;
198 occReadStateNow();
Chris Caina8857c52021-01-27 11:53:05 -0600199}
200
Chris Cain78e86012021-03-04 16:15:31 -0600201#ifdef POWER10
Chris Cain78e86012021-03-04 16:15:31 -0600202// Special processing that needs to happen once the OCCs change to ACTIVE state
203void Status::occsWentActive()
204{
205 CmdStatus status = CmdStatus::SUCCESS;
206
Chris Cain36f9cde2021-11-22 11:18:21 -0600207 status = pmode->sendModeChange();
Chris Cain78e86012021-03-04 16:15:31 -0600208 if (status != CmdStatus::SUCCESS)
209 {
George Liub5ca1012021-09-10 12:53:11 +0800210 log<level::ERR>(
211 fmt::format(
212 "Status::occsWentActive: OCC mode change failed with status {}",
213 status)
214 .c_str());
Chris Cain78e86012021-03-04 16:15:31 -0600215 }
216
Chris Cain36f9cde2021-11-22 11:18:21 -0600217 status = pmode->sendIpsData();
Chris Cain78e86012021-03-04 16:15:31 -0600218 if (status != CmdStatus::SUCCESS)
219 {
220 log<level::ERR>(
221 fmt::format(
George Liub5ca1012021-09-10 12:53:11 +0800222 "Status::occsWentActive: Sending Idle Power Save Config data failed with status {}",
Chris Cain78e86012021-03-04 16:15:31 -0600223 status)
224 .c_str());
225 }
226}
227
Chris Cain17257672021-10-22 13:41:03 -0500228// Send Ambient and Altitude to the OCC
229CmdStatus Status::sendAmbient(const uint8_t inTemp, const uint16_t inAltitude)
230{
231 CmdStatus status = CmdStatus::FAILURE;
232 bool ambientValid = true;
233 uint8_t ambientTemp = inTemp;
234 uint16_t altitude = inAltitude;
235
236 if (ambientTemp == 0xFF)
237 {
238 // Get latest readings from manager
239 manager.getAmbientData(ambientValid, ambientTemp, altitude);
240 log<level::DEBUG>(
241 fmt::format("sendAmbient: valid: {}, Ambient: {}C, altitude: {}m",
242 ambientValid, ambientTemp, altitude)
243 .c_str());
244 }
245
246 std::vector<std::uint8_t> cmd, rsp;
247 cmd.reserve(11);
248 cmd.push_back(uint8_t(CmdType::SEND_AMBIENT));
249 cmd.push_back(0x00); // Data Length (2 bytes)
250 cmd.push_back(0x08); //
251 cmd.push_back(0x00); // Version
252 cmd.push_back(ambientValid ? 0 : 0xFF); // Ambient Status
253 cmd.push_back(ambientTemp); // Ambient Temperature
254 cmd.push_back(altitude >> 8); // Altitude in meters (2 bytes)
255 cmd.push_back(altitude & 0xFF); //
256 cmd.push_back(0x00); // Reserved (3 bytes)
257 cmd.push_back(0x00);
258 cmd.push_back(0x00);
259 log<level::DEBUG>(fmt::format("sendAmbient: SEND_AMBIENT "
260 "command to OCC{} ({} bytes)",
261 instance, cmd.size())
262 .c_str());
263 status = occCmd.send(cmd, rsp);
264 if (status == CmdStatus::SUCCESS)
265 {
266 if (rsp.size() == 5)
267 {
268 if (RspStatus::SUCCESS != RspStatus(rsp[2]))
269 {
270 log<level::ERR>(
271 fmt::format(
272 "sendAmbient: SEND_AMBIENT failed with status 0x{:02X}",
273 rsp[2])
274 .c_str());
275 dump_hex(rsp);
276 status = CmdStatus::FAILURE;
277 }
278 }
279 else
280 {
281 log<level::ERR>("sendAmbient: INVALID SEND_AMBIENT response");
282 dump_hex(rsp);
283 status = CmdStatus::FAILURE;
284 }
285 }
286 else
287 {
288 if (status == CmdStatus::OPEN_FAILURE)
289 {
290 // OCC not active yet
291 status = CmdStatus::SUCCESS;
292 }
293 else
294 {
295 log<level::ERR>("sendAmbient: SEND_AMBIENT FAILED!");
296 }
297 }
298
299 return status;
300}
Chris Caina7b74dc2021-11-10 17:03:43 -0600301
302// Called when safe timer expires to determine if OCCs need to be reset
303void Status::safeStateDelayExpired()
304{
305 if (this->occActive())
306 {
307 log<level::INFO>(
308 fmt::format(
309 "safeStateDelayExpired: OCC{} is in SAFE state, requesting reset",
310 instance)
311 .c_str());
312 // Disable and reset to try recovering
313 deviceError();
314 }
315}
Chris Cain78e86012021-03-04 16:15:31 -0600316#endif // POWER10
317
Chris Caine2d0a432022-03-28 11:08:49 -0500318fs::path Status::getHwmonPath()
Chris Cain5d66a0a2022-02-09 08:52:10 -0600319{
320 using namespace std::literals::string_literals;
321
Chris Caine2d0a432022-03-28 11:08:49 -0500322 if (!fs::exists(hwmonPath))
323 {
324 static bool tracedFail[8] = {0};
Chris Cain5d66a0a2022-02-09 08:52:10 -0600325
Chris Caine2d0a432022-03-28 11:08:49 -0500326 if (!hwmonPath.empty())
327 {
328 log<level::ERR>(
329 fmt::format("Status::getHwmonPath(): path no longer exists: {}",
330 hwmonPath.c_str())
331 .c_str());
332 hwmonPath.clear();
333 }
334
335 // Build the base HWMON path
336 fs::path prefixPath =
337 fs::path{OCC_HWMON_PATH + "occ-hwmon."s +
338 std::to_string(instance + 1) + "/hwmon/"s};
339
340 // Get the hwmonXX directory name
341 try
342 {
343 // there should only be one directory
344 const int numDirs = std::distance(
345 fs::directory_iterator(prefixPath), fs::directory_iterator{});
346 if (numDirs == 1)
347 {
348 hwmonPath = *fs::directory_iterator(prefixPath);
349 tracedFail[instance] = false;
350 }
351 else
352 {
353 if (!tracedFail[instance])
354 {
355 log<level::ERR>(
356 fmt::format(
357 "Status::getHwmonPath(): Found multiple ({}) hwmon paths!",
358 numDirs)
359 .c_str());
360 tracedFail[instance] = true;
361 }
362 }
363 }
364 catch (const fs::filesystem_error& e)
365 {
366 if (!tracedFail[instance])
367 {
368 log<level::ERR>(
369 fmt::format(
370 "Status::getHwmonPath(): error accessing {}: {}",
371 prefixPath.c_str(), e.what())
372 .c_str());
373 tracedFail[instance] = true;
374 }
375 }
376 }
377
378 return hwmonPath;
Chris Cain5d66a0a2022-02-09 08:52:10 -0600379}
380
Sheldon Bailey373af752022-02-21 15:14:00 -0600381// Called to read state and upon failure to read after occReadStateFailTimer.
382void Status::occReadStateNow()
383{
384 unsigned int state;
385 const fs::path filename =
386 fs::path(DEV_PATH) /
387 fs::path(sysfsName + "." + std::to_string(instance + 1)) / "occ_state";
388
389 std::ifstream file;
390 bool goodFile = false;
391
392 // open file.
393 file.open(filename, std::ios::in);
394 const int openErrno = errno;
395
396 // File is open and state can be used.
397 if (file.is_open() && file.good())
398 {
399 goodFile = true;
400 file >> state;
401
402 if (state != lastState)
403 {
404 // Trace OCC state changes
405 log<level::INFO>(
406 fmt::format("Status::readOccState: OCC{} state 0x{:02X}",
407 instance, state)
408 .c_str());
409 lastState = state;
410#ifdef POWER10
411 if (OccState(state) == OccState::ACTIVE)
412 {
413 if (pmode && device.master())
414 {
415 // Set the master OCC on the PowerMode object
416 pmode->setMasterOcc(path);
417 // Enable mode changes
418 pmode->setMasterActive();
419
420 // Special processing by master OCC when it goes active
421 occsWentActive();
422 }
423
424 CmdStatus status = sendAmbient();
425 if (status != CmdStatus::SUCCESS)
426 {
427 log<level::ERR>(
428 fmt::format(
429 "readOccState: Sending Ambient failed with status {}",
430 status)
431 .c_str());
432 }
433 }
434
435 // If OCC in known Good State.
436 if ((OccState(state) == OccState::ACTIVE) ||
437 (OccState(state) == OccState::CHARACTERIZATION) ||
438 (OccState(state) == OccState::OBSERVATION))
439 {
440 // Good OCC State then sensors valid again
441 stateValid = true;
442
443 if (safeStateDelayTimer.isEnabled())
444 {
445 // stop safe delay timer (no longer in SAFE state)
446 safeStateDelayTimer.setEnabled(false);
447 }
448 }
449 // Else not Valid state We would be in SAFE mode.
450 // This captures both SAFE mode, and 0x00, or other invalid
451 // state values.
452 else
453 {
454 if (!safeStateDelayTimer.isEnabled())
455 {
456 // start safe delay timer (before requesting reset)
457 using namespace std::literals::chrono_literals;
458 safeStateDelayTimer.restartOnce(60s);
459 }
460 // Not valid state, update sensors to Nan & not functional.
461 stateValid = false;
462 }
463#else
464 // Before P10 state not checked, only used good file open.
465 stateValid = true;
466#endif
467 }
468 }
469 file.close();
470
471 // if failed to Read a state or not a valid state -> Attempt retry
472 // after 1 Second delay if allowed.
473 if ((!goodFile) || (!stateValid))
474 {
475 if (!goodFile)
476 {
477 // If not able to read, OCC may be offline
478 log<level::ERR>(
479 fmt::format("Status::readOccState: open failed (errno={})",
480 openErrno)
481 .c_str());
482 }
483 else
484 {
485 // else this failed due to state not valid.
486 log<level::ERR>(
487 fmt::format(
488 "Status::readOccState: OCC{} Invalid state 0x{:02X}",
489 instance, state)
490 .c_str());
491 }
492
493#ifdef READ_OCC_SENSORS
494 manager.setSensorValueToNonFunctional(instance);
495#endif
496
497 // See occReadRetries for number of retry attempts.
498 if (currentOccReadRetriesCount > 0)
499 {
500 --currentOccReadRetriesCount;
501#ifdef POWER10
502 using namespace std::chrono_literals;
503 occReadStateFailTimer.restartOnce(1s);
504#endif
505 }
506 else
507 {
508 // State could not be determined, set it to NO State.
509 lastState = 0;
510
511 // Disable the ability to send Failed actions until OCC is
512 // Active again.
513 stateValid = false;
514
515 // Disable and reset to try recovering
516 deviceError();
517 }
518 }
519}
520
Vishwanatha Subbanna307d80b2017-06-28 15:56:09 +0530521} // namespace occ
522} // namespace open_power