Find and call out faulted GPUs
Isolate down to the GPU that caused the GPU PGOOD or
overtemp summary fault bit to turn on. On Witherspoon
this involves reading GPIOs on a pca9552 device to find
the GPU signaling the fault.
GPUs are not currently in the inventory, so the code
cannot do the standard callout, which works by adding a
certain metadata field. Instead, the GPU number that failed will
just be added to the error log metadata, and work will be done
with support to make sure that is documented. Also, the
other power fault callouts don't use the standard inventory
callouts either, as they are more complicated than just a single
FRU, so this method is consistent with them.
Note that these faults do not cause the system to
power off automatically like other power faults, though
a future commit will power off the system on a GPU overtemp.
Change-Id: If4053f32a06a335a6612a04a8164d34306530b22
Signed-off-by: Matt Spinler <spinler@us.ibm.com>
diff --git a/power-sequencer/ucd90160.cpp b/power-sequencer/ucd90160.cpp
index 8317837..715dcd1 100644
--- a/power-sequencer/ucd90160.cpp
+++ b/power-sequencer/ucd90160.cpp
@@ -217,6 +217,16 @@
if (gpiStatus == Value::low)
{
+ //There may be some extra analysis we can do to narrow the
+ //error down further. Note that finding an error here won't
+ //prevent us from checking this GPI again.
+ errorCreated = doExtraAnalysis(gpiConfig);
+
+ if (errorCreated)
+ {
+ continue;
+ }
+
auto& gpiName = std::get<ucd90160::gpiNameField>(gpiConfig);
auto status = (gpiStatus == Value::low) ? 0 : 1;
@@ -283,6 +293,98 @@
return gpioDevicePath;
}
+bool UCD90160::doExtraAnalysis(const ucd90160::GPIConfig& config)
+{
+    //The kind of follow-up analysis configured for this GPI, if any
+    const auto analysis = std::get<ucd90160::extraAnalysisField>(config);
+
+    //Nothing more can be learned when no analysis is configured
+    const bool needed = (analysis != ucd90160::extraAnalysisType::none);
+
+    //Checking other GPIOs is the only kind of extra analysis so far,
+    //so every type other than 'none' goes to doGPIOAnalysis().
+    return needed && doGPIOAnalysis(analysis);
+}
+
+//Reads the fault GPIOs configured for this analysis type and creates
+//an error for each newly faulted part. Returns true on any fault found.
+bool UCD90160::doGPIOAnalysis(ucd90160::extraAnalysisType type)
+{
+    bool errorFound = false;
+
+    //Guard the lookup: dereferencing end() is undefined behavior
+    auto deviceEntry = deviceMap.find(getInstance());
+    if (deviceEntry == deviceMap.end())
+    {
+        return errorFound;
+    }
+
+    const auto& analysisConfig =
+        std::get<ucd90160::gpioAnalysisField>(deviceEntry->second);
+
+    auto gpioConfig = analysisConfig.find(type);
+    if (gpioConfig == analysisConfig.end())
+    {
+        return errorFound;
+    }
+    const auto& config = gpioConfig->second;
+
+    //The path used to find the /dev/gpiochipX device
+    const auto& path = std::get<ucd90160::gpioDevicePathField>(config);
+    auto device = findGPIODevice(path);
+
+    //The GPIO value of the fault condition
+    auto polarity = std::get<ucd90160::gpioPolarityField>(config);
+
+    //The error creation function shared by every GPIO of this type
+    const auto& logError = std::get<ucd90160::errorFunctionField>(config);
+
+    //The GPIOs to check
+    const auto& gpios = std::get<ucd90160::gpioDefinitionField>(config);
+    for (const auto& gpioDef : gpios)
+    {
+        gpio::Value value;
+
+        try
+        {
+            GPIO g{device,
+                   std::get<ucd90160::gpioNumField>(gpioDef),
+                   Direction::input};
+            value = g.read();
+        }
+        catch (const std::exception&)
+        {
+            //Only trace the first failure to avoid repeated logs
+            if (!gpioAccessError)
+            {
+                //GPIO only throws InternalErrors - not worth committing.
+                log<level::ERR>(
+                    "GPIO read failed while analyzing a power fault",
+                    entry("CHIP_PATH=%s", path.c_str()));
+                gpioAccessError = true;
+            }
+            continue;
+        }
+
+        if (value == polarity)
+        {
+            errorFound = true;
+
+            const auto& part = std::get<ucd90160::gpioCalloutField>(gpioDef);
+            PartCallout callout{type, part};
+
+            //Create the error unless this part was already called out
+            if (!isPartCalledOut(callout))
+            {
+                logError(*this, part);
+                setPartCallout(callout);
+            }
+        }
+    }
+
+    return errorFound;
+}
+
void UCD90160::gpuPGOODError(const std::string& callout)
{
util::NamesValues nv;
diff --git a/power-sequencer/ucd90160.hpp b/power-sequencer/ucd90160.hpp
index cf4baad..b82711d 100644
--- a/power-sequencer/ucd90160.hpp
+++ b/power-sequencer/ucd90160.hpp
@@ -14,6 +14,10 @@
namespace power
{
+//(Type of analysis that found the fault, callout string for the part)
+using PartCallout =
+ std::tuple<ucd90160::extraAnalysisType, std::string>;
+
/**
* @class UCD90160
*
@@ -133,6 +137,35 @@
uint32_t readMFRStatus();
/**
+ * Does any additional fault analysis based on the
+ * value of the extraAnalysisType field in the GPIConfig
+ * entry.
+ *
+ * Used to get better callouts.
+ *
+ * @param[in] config - the GPIConfig entry to use
+ *
+ * @return bool - true if a HW error was found, false otherwise
+ */
+ bool doExtraAnalysis(const ucd90160::GPIConfig& config);
+
+ /**
+ * Does additional fault analysis using GPIOs to
+ * specifically identify the failing part.
+ *
+ * Used when there are too many PGOOD inputs for
+ * the UCD90160 to handle, so just a summary bit
+ * is wired into the chip, and then the specific
+ * fault GPIOs are off of a different GPIO device,
+ * like an IO expander.
+ *
+ * @param[in] type - the type of analysis to do
+ *
+ * @return bool - true if a HW error was found, false otherwise
+ */
+ bool doGPIOAnalysis(ucd90160::extraAnalysisType type);
+
+ /**
* Says if we've already logged a Vout fault
*
* The policy is only 1 of the same error will
@@ -179,6 +212,22 @@
}
/**
+ * Says if we've already logged a specific fault
+ * against a specific part
+ *
+ * @param[in] callout - error type and name tuple
+ *
+ * @return bool - if we've already logged this fault
+ * against this part
+ */
+    inline bool isPartCalledOut(const PartCallout& callout) const
+    {
+        //A callout counts as logged once it is in the saved list
+        const auto last = callouts.end();
+        return std::find(callouts.begin(), last, callout) != last;
+    }
+
+ /**
* Saves that a PGOOD fault has been logged
*
* @param[in] input - the input the error was logged against
@@ -189,6 +238,16 @@
}
/**
+ * Saves that a specific fault has been logged against a specific part
+ *
+ * @param[in] callout - error type and name tuple
+ */
+    inline void setPartCallout(const PartCallout& callout)
+    {
+        callouts.emplace_back(callout);
+    }
+
+ /**
* List of pages that Vout errors have
* already been logged against
*/
@@ -201,6 +260,11 @@
std::vector<uint32_t> pgoodErrors;
/**
+ * List of callouts that have already been done
+ */
+ std::vector<PartCallout> callouts;
+
+ /**
* The read/write interface to this hardware
*/
pmbus::PMBus interface;
@@ -218,6 +282,13 @@
bool accessError = false;
/**
+ * Keeps track of GPIO access errors when doing the in depth
+ * PGOOD fault analysis to avoid repeatedly logging errors
+ * for bad hardware
+ */
+ bool gpioAccessError = false;
+
+ /**
* The path to the GPIO device used to read
* the GPI (PGOOD) status
*/