Avoid ssif driver entering aborting state

Bmc ssif driver will enter aborting state when current request has no
response for 15 seconds. In this state, driver will not accept any
response from ssifbridge and just wait for host send new request to
start over the new transaction. But somehow, if host fails to send
request, then host ssif driver will fail to enumurate ssif interface.

This commit adds a timer to ssif to resolve this issue.
To avoid this scenario, right before the timeout occurs in ssif driver,
ssifbridge will send a dummy response to make host start over a new
transaction by asking it retry. By doing this, bmc will has time to
recover from a high loading state especially when host is booting up.

Change-Id: Id2bc7aa632cef1ab296ab17d7cb0179c235e8a56
Signed-off-by: Quang Nguyen <quangnguyen@os.amperecomputing.com>
diff --git a/ssifbridged.cpp b/ssifbridged.cpp
index 20fc608..3a07add 100644
--- a/ssifbridged.cpp
+++ b/ssifbridged.cpp
@@ -26,6 +26,8 @@
 #include <phosphor-logging/log.hpp>
 #include <sdbusplus/asio/connection.hpp>
 #include <sdbusplus/asio/object_server.hpp>
+#include <sdbusplus/timer.hpp>
+#include <sdbusplus/asio/sd_event.hpp>
 
 #include <iostream>
 
@@ -46,6 +48,10 @@
     "xyz.openbmc_project.Ipmi.Channel.ipmi_ssif";
 static constexpr const char* ssifObj =
     "/xyz/openbmc_project/Ipmi/Channel/ipmi_ssif";
+/* The timer of driver is set to 15 seconds, need to send
+ * response before timeout occurs
+ */
+static constexpr const unsigned int hostReqTimeout = 14000000;
 
 class SsifChannel
 {
@@ -67,13 +73,14 @@
     void channelAbort(const char* msg, const boost::system::error_code& ec);
     void async_read();
     void processMessage(const boost::system::error_code& ecRd, size_t rlen);
+    int showNumOfReqNotRsp();
+    std::unique_ptr<boost::asio::posix::stream_descriptor> dev = nullptr;
 
   protected:
     std::array<uint8_t, ssifMessageSize> xferBuffer;
     std::shared_ptr<boost::asio::io_context> io;
     std::shared_ptr<sdbusplus::asio::connection> bus;
     std::shared_ptr<sdbusplus::asio::object_server> server;
-    std::unique_ptr<boost::asio::posix::stream_descriptor> dev = nullptr;
     bool verbose;
     /* This variable is always 0 when a request is responsed properly,
      * any value larger than 0 meaning there is/are request(s) which
@@ -82,6 +89,10 @@
     int numberOfReqNotRsp;
 };
 
+std::unique_ptr<phosphor::Timer> rspTimer
+    __attribute__((init_priority(101)));
+std::unique_ptr<SsifChannel> ssifchannel = nullptr;
+
 SsifChannel::SsifChannel(std::shared_ptr<boost::asio::io_context>& io,
                          std::shared_ptr<sdbusplus::asio::connection>& bus,
                          const std::string& device, bool verbose, int numberOfReqNotRsp) :
@@ -138,6 +149,66 @@
     });
 }
 
+int SsifChannel::showNumOfReqNotRsp()
+{
+    return numberOfReqNotRsp;
+}
+
+void rspTimerHandler()
+{
+    std::vector<uint8_t> rsp;
+    constexpr uint8_t ccResponseNotAvailable = 0xce;
+
+    rsp.resize(ssifchannel->sizeofLenField + 
+            sizeof(prev_req_cmd.cmd) + 
+            sizeof(prev_req_cmd.netfn) + 
+            sizeof(ccResponseNotAvailable));
+    std::string msgToLog = "timeout, send response to keep host alive"
+            " netfn=" + std::to_string(prev_req_cmd.netfn) +
+            " lun=" + std::to_string(prev_req_cmd.lun) +
+            " cmd=" + std::to_string(prev_req_cmd.cmd) +
+            " cc=" + std::to_string(ccResponseNotAvailable) +
+            " numberOfReqNotRsp=" + 
+            std::to_string(ssifchannel->showNumOfReqNotRsp());
+    log<level::INFO>(msgToLog.c_str());
+
+    unsigned int *t = (unsigned int *)&rsp[0];
+    *t = 3;
+    rsp[ssifchannel->sizeofLenField] =
+        ((prev_req_cmd.netfn + 1) << ssifchannel->netFnShift) | 
+        (prev_req_cmd.lun & ssifchannel->lunMask);
+    rsp[ssifchannel->sizeofLenField + 1] = prev_req_cmd.cmd;
+    rsp[ssifchannel->sizeofLenField + 2] = ccResponseNotAvailable;
+
+    boost::system::error_code ecWr;
+
+    size_t wlen =
+        boost::asio::write(*(ssifchannel->dev), 
+        boost::asio::buffer(rsp), ecWr);
+    if (ecWr || wlen != rsp.size())
+    {
+        msgToLog = "Failed to send ssif respond message:"
+                " size=" + std::to_string(wlen) +
+                " expect=" + std::to_string(rsp.size()) +
+                " error=" + ecWr.message() +
+                " netfn=" + std::to_string(prev_req_cmd.netfn + 1) +
+                " lun=" + std::to_string(prev_req_cmd.lun) +
+                " cmd=" + 
+                std::to_string(rsp[ssifchannel->sizeofLenField + 1]) +
+                " cc=" + std::to_string(ccResponseNotAvailable);
+        log<level::ERR>(msgToLog.c_str());
+    }
+}
+
+void initTimer()
+{
+    if (!rspTimer)
+    {
+        rspTimer = 
+            std::make_unique<phosphor::Timer>(rspTimerHandler);
+    }
+}
+
 void SsifChannel::processMessage(const boost::system::error_code& ecRd,
                                  size_t rlen)
 {
@@ -161,6 +232,8 @@
 
     /* there is a request coming */
     numberOfReqNotRsp++;
+    /* start response timer */
+    rspTimer->start(std::chrono::microseconds(hostReqTimeout), false);
 
     if (verbose)
     {
@@ -171,7 +244,9 @@
                 " len=" + std::to_string(lenRecv) +
                 " netfn=" + std::to_string(netfn) +
                 " lun=" + std::to_string(lun) +
-                " cmd=" + std::to_string(cmd);
+                " cmd=" + std::to_string(cmd) +
+                " numberOfReqNotRsp=" + 
+                std::to_string(numberOfReqNotRsp);
         log<level::INFO>(msgToLog.c_str());
     }
     // copy out payload
@@ -188,7 +263,10 @@
     static constexpr const char ipmiQueueIntf[] =
         "xyz.openbmc_project.Ipmi.Server";
     static constexpr const char ipmiQueueMethod[] = "execute";
-    static constexpr int dbusTimeout = 40000000;
+    /* now, we do not care dbus timeout, since we already have actions 
+     * before dbus timeout occurs
+     */
+    static constexpr unsigned int dbusTimeout = 60000000;
     bus->async_method_call_timed(
         [this, netfnCap{netfn}, lunCap{lun},
          cmdCap{cmd}](const boost::system::error_code& ec,
@@ -216,11 +294,11 @@
             {
                 if ((prev_req_cmd.netfn != (netfn-1) ||
                         prev_req_cmd.lun != lun ||
-                        prev_req_cmd.cmd != cmd) ||
-                        ((prev_req_cmd.netfn == (netfn-1) ||
-                        prev_req_cmd.lun == lun ||
-                        prev_req_cmd.cmd == cmd) &&
-                        numberOfReqNotRsp > 0))
+                        prev_req_cmd.cmd != cmd) || 
+                        ((prev_req_cmd.netfn == (netfn-1) && 
+                        prev_req_cmd.lun == lun &&
+                        prev_req_cmd.cmd == cmd) && 
+                        numberOfReqNotRsp != 0))
                 {
                     /* Only send response to the last request command to void
                      * duplicated response which makes host driver confused and
@@ -230,6 +308,18 @@
                      * (2) parameters are the same as request but handshake flow
                      * are in dupplicate request state
                      * */
+                    if (verbose)
+                    {
+                        std::string msgToLog = "Drop ssif respond message with"
+                                " len=" + std::to_string(payload.size() + 3) +
+                                " netfn=" + std::to_string(netfn) +
+                                " lun=" + std::to_string(lun) +
+                                " cmd=" + std::to_string(cmd) +
+                                " cc=" + std::to_string(cc) +
+                                " numberOfReqNotRsp=" + 
+                                std::to_string(numberOfReqNotRsp);
+                        log<level::INFO>(msgToLog.c_str());
+                    }
                     return;
                 }
                 rsp.resize(sizeofLenField + sizeof(netfn) + sizeof(cmd) +
@@ -255,7 +345,9 @@
                         " netfn=" + std::to_string(netfn) +
                         " lun=" + std::to_string(lun) +
                         " cmd=" + std::to_string(cmd) +
-                        " cc=" + std::to_string(cc);
+                        " cc=" + std::to_string(cc) +
+                        " numberOfReqNotRsp=" + 
+                        std::to_string(numberOfReqNotRsp);
                 log<level::INFO>(msgToLog.c_str());
             }
             boost::system::error_code ecWr;
@@ -273,6 +365,7 @@
                         " cc=" + std::to_string(cc);
                 log<level::ERR>(msgToLog.c_str());
             }
+            rspTimer->stop();
         },
         ipmiQueueService, ipmiQueuePath, ipmiQueueIntf, ipmiQueueMethod, dbusTimeout,
         netfn, lun, cmd, data, options);
@@ -294,14 +387,23 @@
     auto io = std::make_shared<boost::asio::io_context>();
     sd_bus* dbus;
     sd_bus_default_system(&dbus);
+    
+    /* This might be a phosphor::Timer bug, without timer t2, rspTimer 
+     * will not work 
+     * */
+    phosphor::Timer t2([]() { ; });
+    t2.start(std::chrono::microseconds(500000), true);
     auto bus = std::make_shared<sdbusplus::asio::connection>(*io, dbus);
     bus->request_name(ssifBus);
     // Create the SSIF channel, listening on D-Bus and on the SSIF device
-    SsifChannel ssifchannel(io, bus, device, verbose, numberOfReqNotRsp);
-    if (!ssifchannel.initOK())
+    ssifchannel = make_unique<SsifChannel>
+        (io, bus, device, verbose, numberOfReqNotRsp);
+    if (!ssifchannel->initOK())
     {
         return EXIT_FAILURE;
     }
+    initTimer();
+    sdbusplus::asio::sd_event_wrapper sdEvents(*io);
     io->run();
 
     return 0;