Log a PEL for communication, presence mismatch, and safe state errors
Add code to log a PEL in various error scenarios. Refactor some of the
error handling to get the return code out of the driver.
Signed-off-by: Eddie James <eajames@linux.ibm.com>
Change-Id: Ifd91cfc063718e484ec8886df8357d115c6b41e3
diff --git a/occ_device.cpp b/occ_device.cpp
index df8683e..0ca4fdc 100644
--- a/occ_device.cpp
+++ b/occ_device.cpp
@@ -79,16 +79,34 @@
return v == 1;
}
-void Device::errorCallback(bool error)
+void Device::errorCallback(int error)
{
if (error)
{
- statusObject.deviceError();
+ if (error != -EHOSTDOWN)
+ {
+ fs::path p = devPath;
+ if (fs::is_symlink(p))
+ {
+ p = fs::read_symlink(p);
+ }
+ statusObject.deviceError(Error::Descriptor(
+ "org.open_power.OCC.Device.ReadFailure", error, p.c_str()));
+ }
+ else
+ {
+ statusObject.deviceError(Error::Descriptor(SAFE_ERROR_PATH));
+ }
}
}
+void Device::presenceCallback(int)
+{
+ statusObject.deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
+}
+
#ifdef PLDM
-void Device::timeoutCallback(bool error)
+void Device::timeoutCallback(int error)
{
if (error)
{
@@ -97,17 +115,17 @@
}
#endif
-void Device::throttleProcTempCallback(bool error)
+void Device::throttleProcTempCallback(int error)
{
statusObject.throttleProcTemp(error);
}
-void Device::throttleProcPowerCallback(bool error)
+void Device::throttleProcPowerCallback(int error)
{
statusObject.throttleProcPower(error);
}
-void Device::throttleMemTempCallback(bool error)
+void Device::throttleMemTempCallback(int error)
{
statusObject.throttleMemTemp(error);
}
diff --git a/occ_device.hpp b/occ_device.hpp
index 873ecfc..b853412 100644
--- a/occ_device.hpp
+++ b/occ_device.hpp
@@ -245,37 +245,44 @@
return;
}
- /** @brief callback for OCC error and presence monitoring
+ /** @brief callback for OCC error monitoring
*
- * @param[in] error - True if an error is reported, false otherwise
+ * @param[in] error - Errno stored in the error file, 0 if no error
*/
- void errorCallback(bool error);
+ void errorCallback(int error);
+
+ /** @brief callback for OCC presence monitoring
+ *
+ * @param[in] occsPresent - The number of OCCs indicated in the poll
+ * response
+ */
+ void presenceCallback(int occsPresent);
#ifdef PLDM
/** @brief callback for SBE timeout monitoring
*
* @param[in] error - True if an error is reported, false otherwise
*/
- void timeoutCallback(bool error);
+ void timeoutCallback(int error);
#endif
/** @brief callback for the proc temp throttle event
*
* @param[in] error - True if an error is reported, false otherwise
*/
- void throttleProcTempCallback(bool error);
+ void throttleProcTempCallback(int error);
/** @brief callback for the proc power throttle event
*
* @param[in] error - True if an error is reported, false otherwise
*/
- void throttleProcPowerCallback(bool error);
+ void throttleProcPowerCallback(int error);
/** @brief callback for the proc temp throttle event
*
* @param[in] error - True if an error is reported, false otherwise
*/
- void throttleMemTempCallback(bool error);
+ void throttleMemTempCallback(int error);
/** @brief Get the pathname for a file based on a regular expression
*
diff --git a/occ_errors.cpp b/occ_errors.cpp
index c6f450d..f41173e 100644
--- a/occ_errors.cpp
+++ b/occ_errors.cpp
@@ -17,9 +17,6 @@
namespace occ
{
-// Value in error file indicating success
-constexpr auto NO_ERROR = '0';
-
using namespace phosphor::logging;
using namespace sdbusplus::org::open_power::OCC::Device::Error;
using InternalFailure =
@@ -113,6 +110,7 @@
void Error::analyzeEvent()
{
// Get the number of bytes to read
+ int err = 0;
int len = -1;
auto r = ioctl(fd, FIONREAD, &len);
if (r < 0)
@@ -127,10 +125,11 @@
// A non-zero data indicates an error condition
// Let the caller take appropriate action on this
auto data = readFile(len);
- bool error = !(data.empty() || data.front() == NO_ERROR);
+ if (!data.empty())
+ err = std::stoi(data, nullptr, 0);
if (callBack)
{
- callBack(error);
+ callBack(err);
}
return;
}
diff --git a/occ_errors.hpp b/occ_errors.hpp
index 6fe0983..ba4610e 100644
--- a/occ_errors.hpp
+++ b/occ_errors.hpp
@@ -15,6 +15,10 @@
namespace fs = std::filesystem;
+constexpr auto PRESENCE_ERROR_PATH =
+ "org.open_power.OCC.Firmware.PresenceMismatch";
+constexpr auto SAFE_ERROR_PATH = "org.open_power.OCC.Device.SafeState";
+
/** @class Error
* @brief Monitors for OCC device error condition
*/
@@ -34,7 +38,7 @@
* @param[in] callBack - Optional function callback on error condition
*/
Error(EventPtr& event, const fs::path& file,
- std::function<void(bool)> callBack = nullptr) :
+ std::function<void(int)> callBack = nullptr) :
event(event),
file(file), callBack(callBack)
{
@@ -49,6 +53,38 @@
}
}
+ /** @class Descriptor
+ * @brief Contains data relevant to an error that occurred.
+ */
+ class Descriptor
+ {
+ public:
+ Descriptor(const Descriptor&) = default;
+ Descriptor& operator=(const Descriptor&) = default;
+ Descriptor(Descriptor&&) = default;
+ Descriptor& operator=(Descriptor&&) = default;
+
+ Descriptor() : log(false), err(0), callout(nullptr), path(nullptr)
+ {}
+
+ /** @brief Constructs the Descriptor object
+ *
+ * @param[in] path - the DBus error path
+ * @param[in] err - Optional error return code
+ * @param[in] callout - Optional PEL callout path
+ */
+ Descriptor(const char* path, int err = 0,
+ const char* callout = nullptr) :
+ log(true),
+ err(err), callout(callout), path(path)
+ {}
+
+ bool log;
+ int err;
+ const char* callout;
+ const char* path;
+ };
+
/** @brief Starts to monitor for error conditions
*
* @param[in] poll - Indicates whether or not the error file should
@@ -109,7 +145,7 @@
fs::path file;
/** @brief Optional function to call on error scenario */
- std::function<void(bool)> callBack;
+ std::function<void(int)> callBack;
/** @brief Reads file data
*
diff --git a/occ_ffdc.cpp b/occ_ffdc.cpp
index 0dc8d3c..3ec42be 100644
--- a/occ_ffdc.cpp
+++ b/occ_ffdc.cpp
@@ -5,6 +5,7 @@
#include <errno.h>
#include <fcntl.h>
+#include <fmt/core.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
@@ -24,7 +25,8 @@
static constexpr size_t sbe_status_header_size = 8;
static constexpr auto loggingObjectPath = "/xyz/openbmc_project/logging";
-static constexpr auto loggingInterface = "org.open_power.Logging.PEL";
+static constexpr auto loggingInterface = "xyz.openbmc_project.Logging.Create";
+static constexpr auto opLoggingInterface = "org.open_power.Logging.PEL";
using namespace phosphor::logging;
using namespace sdbusplus::org::open_power::OCC::Device::Error;
@@ -60,10 +62,10 @@
try
{
std::string service =
- utils::getService(loggingObjectPath, loggingInterface);
+ utils::getService(loggingObjectPath, opLoggingInterface);
auto method =
bus.new_method_call(service.c_str(), loggingObjectPath,
- loggingInterface, "CreatePELWithFFDCFiles");
+ opLoggingInterface, "CreatePELWithFFDCFiles");
auto level =
sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage(
sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level::
@@ -83,6 +85,47 @@
return plid;
}
+void FFDC::createOCCResetPEL(unsigned int instance, const char* path, int err,
+ const char* callout)
+{
+ std::map<std::string, std::string> additionalData;
+
+ additionalData.emplace("_PID", std::to_string(getpid()));
+
+ if (err)
+ {
+ additionalData.emplace("CALLOUT_ERRNO", std::to_string(-err));
+ }
+
+ if (callout)
+ {
+ additionalData.emplace("CALLOUT_DEVICE_PATH", std::string(callout));
+ }
+
+ additionalData.emplace("OCC", std::to_string(instance));
+
+ auto& bus = utils::getBus();
+
+ try
+ {
+ std::string service =
+ utils::getService(loggingObjectPath, loggingInterface);
+ auto method = bus.new_method_call(service.c_str(), loggingObjectPath,
+ loggingInterface, "Create");
+ auto level =
+ sdbusplus::xyz::openbmc_project::Logging::server::convertForMessage(
+ sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level::
+ Error);
+ method.append(path, level, additionalData);
+ bus.call(method);
+ }
+ catch (const sdbusplus::exception::exception& e)
+ {
+ log<level::ERR>(
+ fmt::format("Failed to create PEL: {}", e.what()).c_str());
+ }
+}
+
// Reads the FFDC file and create an error log
void FFDC::analyzeEvent()
{
diff --git a/occ_ffdc.hpp b/occ_ffdc.hpp
index a4c882d..a2b1200 100644
--- a/occ_ffdc.hpp
+++ b/occ_ffdc.hpp
@@ -53,6 +53,17 @@
static uint32_t createPEL(const char* path, uint32_t src6, const char* msg,
int fd = -1);
+ /** @brief Helper function to create a PEL for the OCC reset with the
+ * xyz.openbmc_project.Logging.Create DBus interface
+ *
+ * @param[in] instance - the OCC instance id
+ * @param[in] path - the DBus error path
+ * @param[in] err - the error return code, 0 if there is none
+ * @param[in] callout - the PEL callout path, nullptr if there is none
+ */
+ static void createOCCResetPEL(unsigned int instance, const char* path,
+ int err, const char* callout);
+
private:
/** @brief OCC instance number. Ex, 0,1, etc */
unsigned int instance;
diff --git a/occ_manager.cpp b/occ_manager.cpp
index 88b0d26..1efb1e5 100644
--- a/occ_manager.cpp
+++ b/occ_manager.cpp
@@ -1288,7 +1288,7 @@
masterInstance, instance)
.c_str());
// request reset
- obj->deviceError();
+ obj->deviceError(Error::Descriptor(PRESENCE_ERROR_PATH));
}
}
}
@@ -1300,7 +1300,8 @@
statusObjects.size())
.c_str());
// request reset
- statusObjects.front()->deviceError();
+ statusObjects.front()->deviceError(
+ Error::Descriptor(PRESENCE_ERROR_PATH));
}
else
{
diff --git a/occ_presence.cpp b/occ_presence.cpp
index e06ecb8..dd92725 100644
--- a/occ_presence.cpp
+++ b/occ_presence.cpp
@@ -51,7 +51,7 @@
.c_str());
if (callBack)
{
- callBack(true);
+ callBack(occsPresent);
}
}
}
diff --git a/occ_presence.hpp b/occ_presence.hpp
index 9e66acc..ef63c89 100644
--- a/occ_presence.hpp
+++ b/occ_presence.hpp
@@ -28,7 +28,7 @@
* @param[in] callBack - Optional function callback on error condition
*/
Presence(EventPtr& event, const fs::path& file, const Manager& mgr,
- std::function<void(bool)> callBack = nullptr) :
+ std::function<void(int)> callBack = nullptr) :
Error(event, file, callBack),
manager(mgr)
{
diff --git a/occ_status.cpp b/occ_status.cpp
index 6174c91..14981c7 100644
--- a/occ_status.cpp
+++ b/occ_status.cpp
@@ -116,7 +116,7 @@
}
// Callback handler when a device error is reported.
-void Status::deviceError()
+void Status::deviceError(Error::Descriptor d)
{
#ifdef POWER10
if (pmode && device.master())
@@ -126,6 +126,11 @@
}
#endif
+ if (d.log)
+ {
+ FFDC::createOCCResetPEL(instance, d.path, d.err, d.callout);
+ }
+
// This would deem OCC inactive
this->occActive(false);
@@ -327,7 +332,7 @@
instance)
.c_str());
// Disable and reset to try recovering
- deviceError();
+ deviceError(Error::Descriptor(SAFE_ERROR_PATH));
}
}
#endif // POWER10
diff --git a/occ_status.hpp b/occ_status.hpp
index fa9d2d9..224a534 100644
--- a/occ_status.hpp
+++ b/occ_status.hpp
@@ -184,8 +184,11 @@
/** @brief Read OCC state (will trigger kernel to poll the OCC) */
void readOccState();
- /** @brief Called when device errors are detected */
- void deviceError();
+ /** @brief Called when device errors are detected
+ *
+ * @param[in] d - description of the error that occurred
+ */
+ void deviceError(Error::Descriptor d = Error::Descriptor());
#ifdef POWER10
/** @brief Handle additional tasks when the OCCs reach active state */