Prevent termination on OCC open/read/write errors

Instead of creating an elog and terminating, the code will now retry on
an Open/Read/Write failure and, if it continues to fail, return a
CommFailure to the caller. The caller can then determine how to handle
it (reset the PM complex or ignore).
Normally all communication errors with the OCC should trigger an OCC
reset.

Change-Id: Ibf272270156edae565231ca429f15e8fca2f15bc
Signed-off-by: Chris Cain <cjcain@us.ibm.com>
diff --git a/occ_command.cpp b/occ_command.cpp
index d9faea2..aa5ecb6 100644
--- a/occ_command.cpp
+++ b/occ_command.cpp
@@ -82,18 +82,12 @@
     fd = open(devicePath.c_str(), O_RDWR | O_NONBLOCK);
     if (fd < 0)
     {
-        const int open_errno = errno;
+        const int openErrno = errno;
         log<level::ERR>(
             fmt::format(
                 "OccCommand::openDevice: open failed (errno={}, path={})",
-                open_errno, devicePath)
+                openErrno, devicePath)
                 .c_str());
-        // This would log and terminate since its not handled.
-        elog<OpenFailure>(
-            phosphor::logging::org::open_power::OCC::Device::OpenFailure::
-                CALLOUT_ERRNO(open_errno),
-            phosphor::logging::org::open_power::OCC::Device::OpenFailure::
-                CALLOUT_DEVICE_PATH(devicePath.c_str()));
     }
     else
     {
@@ -127,11 +121,11 @@
     if (fd < 0)
     {
         // OCC is inactive; empty response
-        return CmdStatus::OPEN_FAILURE;
+        return CmdStatus::COMM_FAILURE;
     }
 
+    const uint8_t cmd_type = command[0];
 #ifdef TRACE_PACKETS
-    uint8_t cmd_type = command[0];
     log<level::INFO>(
         fmt::format("OCC{}: Sending 0x{:02X} command (length={}, {})",
                     occInstance, cmd_type, command.size(), devicePath)
@@ -145,16 +139,17 @@
     do
     {
         auto rc = write(fd, command.data(), command.size());
+        const int writeErrno = errno;
         if ((rc < 0) || (rc != (int)command.size()))
         {
-            const int write_errno = errno;
-            log<level::ERR>("OccCommand::send: write failed");
-            // This would log and terminate since its not handled.
-            elog<WriteFailure>(
-                phosphor::logging::org::open_power::OCC::Device::WriteFailure::
-                    CALLOUT_ERRNO(write_errno),
-                phosphor::logging::org::open_power::OCC::Device::WriteFailure::
-                    CALLOUT_DEVICE_PATH(devicePath.c_str()));
+            log<level::ERR>(
+                fmt::format(
+                    "OccCommand::send: write(OCC{}, command:0x{:02X}) failed with errno={} (retries={})",
+                    occInstance, cmd_type, writeErrno, retries)
+                    .c_str());
+            status = CmdStatus::COMM_FAILURE;
+            // retry if available
+            continue;
         }
         else
         {
@@ -166,12 +161,12 @@
         {
             uint8_t data{};
             auto len = read(fd, &data, sizeof(data));
-            const int read_errno = errno;
+            const int readErrno = errno;
             if (len > 0)
             {
                 response.emplace_back(data);
             }
-            else if (len < 0 && read_errno == EAGAIN)
+            else if (len < 0 && readErrno == EAGAIN)
             {
                 // We may have data coming still.
                 // This driver does not need a sleep for a retry.
@@ -186,15 +181,21 @@
             }
             else
             {
-                log<level::ERR>("OccCommand::send: read failed");
-                // This would log and terminate since its not handled.
-                elog<ReadFailure>(
-                    phosphor::logging::org::open_power::OCC::Device::
-                        ReadFailure::CALLOUT_ERRNO(read_errno),
-                    phosphor::logging::org::open_power::OCC::Device::
-                        ReadFailure::CALLOUT_DEVICE_PATH(devicePath.c_str()));
+                log<level::ERR>(
+                    fmt::format(
+                        "OccCommand::send: read(OCC{}, command:0x{:02X}) failed with errno={} (rspSize={}, retries={})",
+                        occInstance, cmd_type, readErrno, response.size(),
+                        retries)
+                        .c_str());
+                status = CmdStatus::COMM_FAILURE;
+                break;
             }
         }
+        if (status != CmdStatus::SUCCESS)
+        {
+            // retry if available
+            continue;
+        }
 
         if (response.size() > 2)
         {
@@ -228,7 +229,7 @@
                                 occInstance, rspChecksum, calcChecksum)
                         .c_str());
                 dump_hex(response);
-                status = CmdStatus::INVALID_CHECKSUM;
+                status = CmdStatus::COMM_FAILURE;
             }
             else
             {
@@ -271,7 +272,6 @@
             log<level::ERR>("OccCommand::send: Command will be retried");
             response.clear();
         }
-
     } while (retries-- > 0);
 
     closeDevice();
diff --git a/occ_command.hpp b/occ_command.hpp
index 9dde698..d632656 100644
--- a/occ_command.hpp
+++ b/occ_command.hpp
@@ -78,10 +78,9 @@
 
 enum class CmdStatus
 {
-    SUCCESS,
-    OPEN_FAILURE,
-    FAILURE,
-    INVALID_CHECKSUM
+    SUCCESS = 0x00,
+    FAILURE = 0x02,
+    COMM_FAILURE = 0x03
 };
 
 /** @brief Trace block of data in hex with log<level:INFO>
diff --git a/occ_pass_through.cpp b/occ_pass_through.cpp
index dd4e773..c826983 100644
--- a/occ_pass_through.cpp
+++ b/occ_pass_through.cpp
@@ -97,14 +97,11 @@
     }
     else
     {
-        if (status == CmdStatus::OPEN_FAILURE)
-        {
-            log<level::WARNING>("PassThrough::send() - OCC not active yet");
-        }
-        else
-        {
-            log<level::ERR>("PassThrough::send() - OCC command failed!");
-        }
+        log<level::ERR>(
+            fmt::format(
+                "PassThrough::send(): OCC command failed with status {}",
+                uint32_t(status))
+                .c_str());
     }
 
     return response;
diff --git a/occ_status.cpp b/occ_status.cpp
index 9e65155..b79fff4 100644
--- a/occ_status.cpp
+++ b/occ_status.cpp
@@ -212,6 +212,9 @@
                 "Status::occsWentActive: OCC mode change failed with status {}",
                 status)
                 .c_str());
+
+        // Disable and reset to try recovering
+        deviceError();
     }
 
     status = pmode->sendIpsData();
@@ -222,6 +225,12 @@
                 "Status::occsWentActive: Sending Idle Power Save Config data failed with status {}",
                 status)
                 .c_str());
+
+        if (status == CmdStatus::COMM_FAILURE)
+        {
+            // Disable and reset to try recovering
+            deviceError();
+        }
     }
 }
 
@@ -269,7 +278,7 @@
             {
                 log<level::ERR>(
                     fmt::format(
-                        "sendAmbient: SEND_AMBIENT failed with status 0x{:02X}",
+                        "sendAmbient: SEND_AMBIENT failed with rspStatus 0x{:02X}",
                         rsp[2])
                         .c_str());
                 dump_hex(rsp);
@@ -278,21 +287,27 @@
         }
         else
         {
-            log<level::ERR>("sendAmbient: INVALID SEND_AMBIENT response");
+            log<level::ERR>(
+                fmt::format(
+                    "sendAmbient: INVALID SEND_AMBIENT response length:{}",
+                    rsp.size())
+                    .c_str());
             dump_hex(rsp);
             status = CmdStatus::FAILURE;
         }
     }
     else
     {
-        if (status == CmdStatus::OPEN_FAILURE)
+        log<level::ERR>(
+            fmt::format(
+                "sendAmbient: SEND_AMBIENT FAILED! with status 0x{:02X}",
+                status)
+                .c_str());
+
+        if (status == CmdStatus::COMM_FAILURE)
         {
-            // OCC not active yet
-            status = CmdStatus::SUCCESS;
-        }
-        else
-        {
-            log<level::ERR>("sendAmbient: SEND_AMBIENT FAILED!");
+            // Disable and reset to try recovering
+            deviceError();
         }
     }
 
diff --git a/powermode.cpp b/powermode.cpp
index 7db3565..ed17096 100644
--- a/powermode.cpp
+++ b/powermode.cpp
@@ -328,15 +328,11 @@
         }
         else
         {
-            if (status == CmdStatus::OPEN_FAILURE)
-            {
-                // OCC not active yet
-                status = CmdStatus::SUCCESS;
-            }
-            else
-            {
-                log<level::ERR>("PowerMode::sendModeChange: SET_MODE FAILED!");
-            }
+            log<level::ERR>(
+                fmt::format(
+                    "PowerMode::sendModeChange: SET_MODE FAILED with status={}",
+                    status)
+                    .c_str());
         }
     }
     else
@@ -573,16 +569,11 @@
     }
     else
     {
-        if (status == CmdStatus::OPEN_FAILURE)
-        {
-            // OCC not active yet
-            status = CmdStatus::SUCCESS;
-        }
-        else
-        {
-            log<level::ERR>(
-                "PowerMode::sendIpsData: SET_CFG_DATA[IPS] FAILED!");
-        }
+        log<level::ERR>(
+            fmt::format(
+                "PowerMode::sendIpsData: SET_CFG_DATA[IPS] with status={}",
+                status)
+                .c_str());
     }
 
     return status;