psu-ng: Add in detection of fan faults
If the FANS bit in the STATUS_WORD turns on (a fan or airflow fault or
warning has occurred), set a fan fault indicator in the power supply
object. During analysis of the power supplies, if a fan fault has
occurred, prioritize it over a temperature fault and include the
STATUS_TEMPERATURE and STATUS_FANS_1_2 command responses in the error
created. Call out the power supply with the fault.
Tested:
Verified no faults detected or logged on real hardware (Rainier 2S4U).
Simulated fan 1 fault on Rainier 2S2U; 110015FF PEL created.
Change-Id: Ifff5b4d96efe44b081a33caa01d70fdb578e57e3
Signed-off-by: Brandon Wyman <bjwyman@gmail.com>
diff --git a/phosphor-power-supply/power_supply.cpp b/phosphor-power-supply/power_supply.cpp
index 7c884ff..997be79 100644
--- a/phosphor-power-supply/power_supply.cpp
+++ b/phosphor-power-supply/power_supply.cpp
@@ -214,6 +214,7 @@
auto status0Vout = pmbusIntf->insertPageNum(STATUS_VOUT, 0);
statusVout = pmbusIntf->read(status0Vout, Type::Debug);
statusIout = pmbusIntf->read(STATUS_IOUT, Type::Debug);
+ statusFans12 = pmbusIntf->read(STATUS_FANS_1_2, Type::Debug);
statusTemperature =
pmbusIntf->read(STATUS_TEMPERATURE, Type::Debug);
if (statusWord & status_word::CML_FAULT)
@@ -293,6 +294,22 @@
voutUVFault = true;
}
+ if (statusWord & status_word::FAN_FAULT)
+ {
+ if (!fanFault)
+ {
+ log<level::ERR>(
+ fmt::format("FANS fault/warning: "
+ "STATUS_WORD = {:#06x}, " // width counts "0x"
+ "STATUS_MFR_SPECIFIC = {:#04x}, "
+ "STATUS_FANS_1_2 = {:#04x}",
+ statusWord, statusMFR, statusFans12)
+ .c_str());
+ }
+ // Latch the fault so the message above is logged only on first detection.
+ fanFault = true;
+ }
+
if (statusWord & status_word::TEMPERATURE_FAULT_WARN)
{
if (!tempFault)
@@ -365,6 +382,7 @@
voutOVFault = false;
ioutOCFault = false;
voutUVFault = false;
+ fanFault = false;
tempFault = false;
pgoodFault = false;
}
@@ -420,6 +438,7 @@
voutOVFault = false;
ioutOCFault = false;
voutUVFault = false;
+ fanFault = false;
tempFault = false;
pgoodFault = false;
readFail = 0;
diff --git a/phosphor-power-supply/power_supply.hpp b/phosphor-power-supply/power_supply.hpp
index 149d81f..cf4c2bb 100644
--- a/phosphor-power-supply/power_supply.hpp
+++ b/phosphor-power-supply/power_supply.hpp
@@ -185,6 +185,14 @@
}
/**
+ * @brief Returns the last STATUS_FANS_1_2 value read (0 if never read).
+ */
+ uint64_t getStatusFans12() const
+ {
+ return statusFans12;
+ }
+
+ /**
* @brief Returns the last value read from STATUS_TEMPERATURE.
*/
uint64_t getStatusTemperature() const
@@ -198,8 +206,8 @@
bool isFaulted() const
{
return (hasCommFault() || vinUVFault || inputFault || voutOVFault ||
- ioutOCFault || voutUVFault || tempFault || pgoodFault ||
- mfrFault);
+ ioutOCFault || voutUVFault || fanFault || tempFault ||
+ pgoodFault || mfrFault);
}
/**
@@ -267,6 +275,14 @@
}
/**
+ * @brief Returns true if a FANS fault/warning was detected in STATUS_WORD.
+ */
+ bool hasFanFault() const
+ {
+ return fanFault;
+ }
+
+ /**
* @brief Returns true if TEMPERATURE fault occurred.
*/
bool hasTempFault() const
@@ -362,6 +378,10 @@
uint64_t statusIout = 0;
/** @brief Will be updated to the latest/last value read from
+ * STATUS_FANS_1_2. */
+ uint64_t statusFans12 = 0;
+
+ /** @brief Will be updated to the latest/last value read from
* STATUS_TEMPERATURE.*/
uint64_t statusTemperature = 0;
@@ -390,6 +410,9 @@
* of low byte is off. */
bool voutUVFault = false;
+ /** @brief True if FANS fault/warn bit on in STATUS_WORD. */
+ bool fanFault = false;
+
/** @brief True if bit 2 of STATUS_WORD low byte is on. */
bool tempFault = false;
diff --git a/phosphor-power-supply/psu_manager.cpp b/phosphor-power-supply/psu_manager.cpp
index 1c9378b..5a4af4b 100644
--- a/phosphor-power-supply/psu_manager.cpp
+++ b/phosphor-power-supply/psu_manager.cpp
@@ -509,6 +509,25 @@
psu->setFaultLogged();
}
+ // A fan fault should have priority over a temperature fault,
+ // since a failed fan may lead to a temperature problem.
+ else if (psu->hasFanFault())
+ {
+ // Include STATUS_TEMPERATURE and STATUS_FANS_1_2 as 0xNN.
+ additionalData["STATUS_TEMPERATURE"] =
+ fmt::format("{:#04x}", psu->getStatusTemperature());
+ additionalData["STATUS_FANS_1_2"] =
+ fmt::format("{:#04x}", psu->getStatusFans12());
+
+ additionalData["CALLOUT_INVENTORY_PATH"] =
+ psu->getInventoryPath();
+
+ createError(
+ "xyz.openbmc_project.Power.PowerSupply.Error.FanFault",
+ additionalData);
+
+ psu->setFaultLogged();
+ }
else if (psu->hasTempFault())
{
// Include STATUS_TEMPERATURE for temperature faults.
diff --git a/phosphor-power-supply/test/power_supply_tests.cpp b/phosphor-power-supply/test/power_supply_tests.cpp
index b340449..d42d359 100644
--- a/phosphor-power-supply/test/power_supply_tests.cpp
+++ b/phosphor-power-supply/test/power_supply_tests.cpp
@@ -30,6 +30,7 @@
uint8_t statusCMLValue{0x00};
uint8_t statusVOUTValue{0x00};
uint8_t statusIOUTValue{0x00};
+ uint8_t statusFans12Value{0x00};
uint8_t statusTempValue{0x00};
};
@@ -65,6 +66,9 @@
EXPECT_CALL(mockPMBus, read(STATUS_IOUT, _))
.Times(1)
.WillOnce(Return(expectations.statusIOUTValue));
+ EXPECT_CALL(mockPMBus, read(STATUS_FANS_1_2, _))
+ .Times(1)
+ .WillOnce(Return(expectations.statusFans12Value));
EXPECT_CALL(mockPMBus, read(STATUS_TEMPERATURE, _))
.Times(1)
.WillOnce(Return(expectations.statusTempValue));
@@ -152,6 +156,7 @@
EXPECT_EQ(psu->hasVoutOVFault(), false);
EXPECT_EQ(psu->hasIoutOCFault(), false);
EXPECT_EQ(psu->hasVoutUVFault(), false);
+ EXPECT_EQ(psu->hasFanFault(), false);
EXPECT_EQ(psu->hasTempFault(), false);
EXPECT_EQ(psu->hasPgoodFault(), false);
}
@@ -201,6 +206,7 @@
EXPECT_EQ(psu.hasVoutOVFault(), false);
EXPECT_EQ(psu.hasIoutOCFault(), false);
EXPECT_EQ(psu.hasVoutUVFault(), false);
+ EXPECT_EQ(psu.hasFanFault(), false);
EXPECT_EQ(psu.hasTempFault(), false);
EXPECT_EQ(psu.hasPgoodFault(), false);
}
@@ -237,6 +243,7 @@
EXPECT_EQ(psu2.hasVoutOVFault(), false);
EXPECT_EQ(psu2.hasIoutOCFault(), false);
EXPECT_EQ(psu2.hasVoutUVFault(), false);
+ EXPECT_EQ(psu2.hasFanFault(), false);
EXPECT_EQ(psu2.hasTempFault(), false);
EXPECT_EQ(psu2.hasPgoodFault(), false);
@@ -255,6 +262,7 @@
EXPECT_EQ(psu2.hasVoutOVFault(), false);
EXPECT_EQ(psu2.hasIoutOCFault(), false);
EXPECT_EQ(psu2.hasVoutUVFault(), false);
+ EXPECT_EQ(psu2.hasFanFault(), false);
EXPECT_EQ(psu2.hasTempFault(), false);
EXPECT_EQ(psu2.hasPgoodFault(), false);
}
@@ -281,6 +289,7 @@
EXPECT_EQ(psu2.hasVoutOVFault(), false);
EXPECT_EQ(psu2.hasIoutOCFault(), false);
EXPECT_EQ(psu2.hasVoutUVFault(), false);
+ EXPECT_EQ(psu2.hasFanFault(), false);
EXPECT_EQ(psu2.hasTempFault(), false);
EXPECT_EQ(psu2.hasPgoodFault(), false);
}
@@ -306,6 +315,7 @@
EXPECT_EQ(psu2.hasVoutOVFault(), false);
EXPECT_EQ(psu2.hasIoutOCFault(), false);
EXPECT_EQ(psu2.hasVoutUVFault(), false);
+ EXPECT_EQ(psu2.hasFanFault(), false);
EXPECT_EQ(psu2.hasTempFault(), false);
EXPECT_EQ(psu2.hasPgoodFault(), false);
}
@@ -331,6 +341,7 @@
EXPECT_EQ(psu2.hasVoutOVFault(), false);
EXPECT_EQ(psu2.hasIoutOCFault(), false);
EXPECT_EQ(psu2.hasVoutUVFault(), false);
+ EXPECT_EQ(psu2.hasFanFault(), false);
EXPECT_EQ(psu2.hasTempFault(), true);
EXPECT_EQ(psu2.hasPgoodFault(), false);
}
@@ -356,6 +367,7 @@
EXPECT_EQ(psu2.hasVoutOVFault(), false);
EXPECT_EQ(psu2.hasIoutOCFault(), false);
EXPECT_EQ(psu2.hasVoutUVFault(), false);
+ EXPECT_EQ(psu2.hasFanFault(), false);
EXPECT_EQ(psu2.hasTempFault(), false);
EXPECT_EQ(psu2.hasPgoodFault(), false);
}
@@ -383,6 +395,7 @@
EXPECT_EQ(psu2.hasVoutOVFault(), true);
EXPECT_EQ(psu2.hasVoutUVFault(), false);
EXPECT_EQ(psu2.hasIoutOCFault(), false);
+ EXPECT_EQ(psu2.hasFanFault(), false);
EXPECT_EQ(psu2.hasTempFault(), false);
EXPECT_EQ(psu2.hasPgoodFault(), false);
}
@@ -408,6 +421,7 @@
EXPECT_EQ(psu2.hasVoutOVFault(), false);
EXPECT_EQ(psu2.hasIoutOCFault(), true);
EXPECT_EQ(psu2.hasVoutUVFault(), false);
+ EXPECT_EQ(psu2.hasFanFault(), false);
EXPECT_EQ(psu2.hasTempFault(), false);
EXPECT_EQ(psu2.hasPgoodFault(), false);
}
@@ -433,21 +447,24 @@
EXPECT_EQ(psu2.hasVoutOVFault(), false);
EXPECT_EQ(psu2.hasIoutOCFault(), false);
EXPECT_EQ(psu2.hasVoutUVFault(), true);
+ EXPECT_EQ(psu2.hasFanFault(), false);
EXPECT_EQ(psu2.hasTempFault(), false);
EXPECT_EQ(psu2.hasPgoodFault(), false);
}
- // Ignore fan fault
+ // Fan fault
{
// First STATUS_WORD with no bits set, then with fan fault.
PMBusExpectations expectations;
setPMBusExpectations(mockPMBus, expectations);
psu2.analyze();
expectations.statusWordValue = (status_word::FAN_FAULT);
+ // STATUS_FANS_1_2 with fan 1 warning & fault bits on.
+ expectations.statusFans12Value = 0xA0;
setPMBusExpectations(mockPMBus, expectations);
psu2.analyze();
EXPECT_EQ(psu2.isPresent(), true);
- EXPECT_EQ(psu2.isFaulted(), false);
+ EXPECT_EQ(psu2.isFaulted(), true);
EXPECT_EQ(psu2.hasInputFault(), false);
EXPECT_EQ(psu2.hasMFRFault(), false);
EXPECT_EQ(psu2.hasVINUVFault(), false);
@@ -455,6 +472,7 @@
EXPECT_EQ(psu2.hasVoutOVFault(), false);
EXPECT_EQ(psu2.hasIoutOCFault(), false);
EXPECT_EQ(psu2.hasVoutUVFault(), false);
+ EXPECT_EQ(psu2.hasFanFault(), true);
EXPECT_EQ(psu2.hasTempFault(), false);
EXPECT_EQ(psu2.hasPgoodFault(), false);
}
@@ -482,6 +500,7 @@
EXPECT_EQ(psu2.hasVoutOVFault(), false);
EXPECT_EQ(psu2.hasVoutUVFault(), false);
EXPECT_EQ(psu2.hasIoutOCFault(), false);
+ EXPECT_EQ(psu2.hasFanFault(), false);
EXPECT_EQ(psu2.hasTempFault(), false);
EXPECT_EQ(psu2.hasPgoodFault(), true);
}
@@ -559,6 +578,7 @@
EXPECT_EQ(psu.hasVoutOVFault(), false);
EXPECT_EQ(psu.hasIoutOCFault(), false);
EXPECT_EQ(psu.hasVoutUVFault(), false);
+ EXPECT_EQ(psu.hasFanFault(), false);
EXPECT_EQ(psu.hasTempFault(), false);
EXPECT_EQ(psu.hasPgoodFault(), false);
@@ -574,6 +594,8 @@
expectations.statusVOUTValue = 0xFF;
// STATUS_IOUT with bits on.
expectations.statusIOUTValue = 0xFF;
+ // STATUS_FANS_1_2 with bits on.
+ expectations.statusFans12Value = 0xFF;
// STATUS_TEMPERATURE with bits on.
expectations.statusTempValue = 0xFF;
setPMBusExpectations(mockPMBus, expectations);
@@ -589,6 +611,7 @@
// Cannot have VOUT_OV_FAULT and VOUT_UV_FAULT.
// Rely on HasVoutUVFault() to verify this sets and clears.
EXPECT_EQ(psu.hasVoutUVFault(), false);
+ EXPECT_EQ(psu.hasFanFault(), true);
EXPECT_EQ(psu.hasTempFault(), true);
EXPECT_EQ(psu.hasPgoodFault(), true);
@@ -605,6 +628,7 @@
EXPECT_EQ(psu.hasVoutOVFault(), false);
EXPECT_EQ(psu.hasIoutOCFault(), false);
EXPECT_EQ(psu.hasVoutUVFault(), false);
+ EXPECT_EQ(psu.hasFanFault(), false);
EXPECT_EQ(psu.hasTempFault(), false);
EXPECT_EQ(psu.hasPgoodFault(), false);
@@ -698,6 +722,8 @@
expectations.statusVOUTValue = 0xFF;
// STATUS_IOUT with fault bits on.
expectations.statusIOUTValue = 0xFF;
+ // STATUS_FANS_1_2 with bits on.
+ expectations.statusFans12Value = 0xFF;
// STATUS_TEMPERATURE with fault bits on.
expectations.statusTempValue = 0xFF;
setPMBusExpectations(mockPMBus, expectations);
@@ -893,6 +919,37 @@
EXPECT_EQ(psu.hasVoutUVFault(), false);
}
+TEST_F(PowerSupplyTests, HasFanFault)
+{
+ auto bus = sdbusplus::bus::new_default();
+
+ PowerSupply psu{bus, PSUInventoryPath, 3, 0x6d, PSUGPIOLineName};
+ MockedGPIOInterface* mockPresenceGPIO =
+ static_cast<MockedGPIOInterface*>(psu.getPresenceGPIO());
+ // Mocked presence GPIO always reads 1, i.e. the PSU is present.
+ EXPECT_CALL(*mockPresenceGPIO, read()).WillRepeatedly(Return(1));
+ psu.analyze();
+ MockedPMBus& mockPMBus = static_cast<MockedPMBus&>(psu.getPMBus());
+ EXPECT_EQ(psu.hasFanFault(), false);
+ // Default expectations: STATUS_WORD 0x0000 is powered on, no faults.
+ PMBusExpectations expectations;
+ setPMBusExpectations(mockPMBus, expectations);
+ psu.analyze();
+ EXPECT_EQ(psu.hasFanFault(), false);
+ // Turn the FANS fault/warning bit on in STATUS_WORD.
+ expectations.statusWordValue = (status_word::FAN_FAULT);
+ // STATUS_FANS_1_2 fault bit on (Fan 1 Fault)
+ expectations.statusFans12Value = 0x80;
+ setPMBusExpectations(mockPMBus, expectations);
+ psu.analyze();
+ EXPECT_EQ(psu.hasFanFault(), true);
+ // Back to no fault bits on in STATUS_WORD (statusFans12Value stays 0x80).
+ expectations.statusWordValue = 0;
+ setPMBusExpectations(mockPMBus, expectations);
+ psu.analyze();
+ EXPECT_EQ(psu.hasFanFault(), false);
+}
+
TEST_F(PowerSupplyTests, HasTempFault)
{
auto bus = sdbusplus::bus::new_default();