Add in support for power supply over-temperature

If a power supply encounters an over-temperature condition, the
TEMPERATURE FAULT OR WARNING bit of the STATUS_WORD (low byte) command
response should be turned on. Ideally, when this is encountered, both
power supplies would be called out, since one cause of this condition
is the other supply putting out less current. However, each power
supply monitoring application only knows its own information, so just
the power supply indicating the condition will be called out, along
with metadata containing potentially relevant command response data.

Change-Id: I8f96828f85161050f73bb080392e1e8fef4a179b
Signed-off-by: Brandon Wyman <bjwyman@gmail.com>
diff --git a/elog-errors.hpp b/elog-errors.hpp
index 3f3e073..d0c5500 100644
--- a/elog-errors.hpp
+++ b/elog-errors.hpp
@@ -75,6 +75,26 @@
 {
 namespace openbmc_project
 {
+namespace Power
+{
+namespace Fault
+{
+namespace Error
+{
+    struct PowerSupplyTemperatureFault;
+} // namespace Error
+} // namespace Fault
+} // namespace Power
+} // namespace openbmc_project
+} // namespace xyz
+} // namespace sdbusplus
+
+namespace sdbusplus
+{
+namespace xyz
+{
+namespace openbmc_project
+{
 namespace Common
 {
 namespace Callout
@@ -927,6 +947,54 @@
 {
 namespace Fault
 {
+namespace _PowerSupplyTemperatureFault
+{
+
+struct RAW_STATUS
+{
+    static constexpr auto str = "RAW_STATUS=%s";
+    static constexpr auto str_short = "RAW_STATUS";
+    using type = std::tuple<std::decay_t<decltype(str)>,const char*>;
+    explicit constexpr RAW_STATUS(const char* a) : _entry(entry(str, a)) {};
+    type _entry;
+};
+
+}  // namespace _PowerSupplyTemperatureFault
+
+struct PowerSupplyTemperatureFault
+{
+    static constexpr auto L = level::ERR;
+    using RAW_STATUS = _PowerSupplyTemperatureFault::RAW_STATUS;
+    using CALLOUT_INVENTORY_PATH = xyz::openbmc_project::Common::Callout::Inventory::CALLOUT_INVENTORY_PATH;
+    using metadata_types = std::tuple<RAW_STATUS, CALLOUT_INVENTORY_PATH>;
+
+};
+
+} // namespace Fault
+} // namespace Power
+} // namespace openbmc_project
+} // namespace xyz
+
+
+namespace details
+{
+
+template <>
+struct map_exception_type<sdbusplus::xyz::openbmc_project::Power::Fault::Error::PowerSupplyTemperatureFault>
+{
+    using type = xyz::openbmc_project::Power::Fault::PowerSupplyTemperatureFault;
+};
+
+}
+
+namespace xyz
+{
+namespace openbmc_project
+{
+namespace Power
+{
+namespace Fault
+{
 namespace _Shutdown
 {
 
diff --git a/pmbus.hpp b/pmbus.hpp
index a2572f7..fc9a1cb 100644
--- a/pmbus.hpp
+++ b/pmbus.hpp
@@ -69,6 +69,16 @@
 // out if that fault is on.
 constexpr auto VIN_UV_FAULT = 0x0008;
 
+// The bit mask representing the TEMPERATURE FAULT or WARNING bit of the
+// STATUS_WORD. Bit 2 of the low byte (STATUS_BYTE).
+constexpr auto TEMPERATURE_FAULT_WARN = 0x0004;
+
+}
+
+namespace status_temperature
+{
+// Overtemperature fault: bit 7 (mask 0x80) of the STATUS_TEMPERATURE byte.
+constexpr auto OT_FAULT = 0x80;
 }
 
 /**
diff --git a/power-supply/power_supply.cpp b/power-supply/power_supply.cpp
index d9e870c..0bdbbfa 100644
--- a/power-supply/power_supply.cpp
+++ b/power-supply/power_supply.cpp
@@ -110,6 +110,7 @@
                 checkCurrentOutOverCurrentFault(statusWord);
                 checkOutputOvervoltageFault(statusWord);
                 checkFanFault(statusWord);
+                checkTemperatureFault(statusWord);
             }
         }
     }
@@ -145,6 +146,7 @@
             outputOCFault = false;
             outputOVFault = false;
             fanFault = false;
+            temperatureFault = false;
         }
     }
 
@@ -197,6 +199,7 @@
             outputOCFault = false;
             outputOVFault = false;
             fanFault = false;
+            temperatureFault = false;
             powerOnTimer.start(powerOnInterval, Timer::TimerType::oneshot);
         }
         else
@@ -398,7 +401,8 @@
         !outputOVFault)
     {
         statusInput = pmbusIntf.read(STATUS_INPUT, Type::Debug);
-        statusVout = pmbusIntf.read(STATUS_VOUT, Type::Debug);
+        auto status0Vout = pmbusIntf.insertPageNum(STATUS_VOUT, 0);
+        statusVout = pmbusIntf.read(status0Vout, Type::Debug);
         statusIout = pmbusIntf.read(STATUS_IOUT, Type::Debug);
         statusMFR = pmbusIntf.read(STATUS_MFR, Type::Debug);
 
@@ -429,7 +433,7 @@
     std::uint8_t statusTemperature = 0;
     std::uint8_t statusFans12 = 0;
 
-    // Check for an output overcurrent fault.
+    // Check for a fan fault or warning condition
     if ((statusWord & status_word::FAN_FAULT) &&
         !fanFault)
     {
@@ -454,6 +458,55 @@
     }
 }
 
+void PowerSupply::checkTemperatureFault(const uint16_t statusWord)
+{
+    using namespace witherspoon::pmbus;
+
+    // The PMBus core device driver sends a clear faults command, so the
+    // TEMPERATURE FAULT OR WARNING bit in STATUS_WORD will likely already be
+    // cleared by the time it is examined here. Therefore STATUS_TEMPERATURE
+    // is always read as well, and the over-temperature condition is logged
+    // if either the STATUS_WORD bit or the STATUS_TEMPERATURE OT fault is on.
+    std::uint8_t statusTemperature = 0;
+    statusTemperature = pmbusIntf.read(STATUS_TEMPERATURE, Type::Debug);
+    if (((statusWord & status_word::TEMPERATURE_FAULT_WARN) ||
+         (statusTemperature & status_temperature::OT_FAULT)) &&
+        !temperatureFault)
+    {
+        // The power supply has had an over-temperature condition.
+        // If it lasts only a short time this may not result in a shutdown,
+        // but it should not occur under normal conditions.
+        // Possible causes: the power supply itself is faulty, or the paired
+        // supply is putting out less current.
+        // Capture the command responses that may hold relevant information
+        // and call out the power supply reporting the condition; only this
+        // supply's own status registers are read here.
+        std::uint8_t statusMFR = 0;
+        std::uint8_t statusIout = 0;
+        std::uint8_t statusFans12 = 0;
+
+        statusMFR = pmbusIntf.read(STATUS_MFR, Type::Debug);
+        statusIout = pmbusIntf.read(STATUS_IOUT, Type::Debug);
+        statusFans12 = pmbusIntf.read(STATUS_FANS_1_2, Type::Debug);
+
+        util::NamesValues nv;
+        nv.add("STATUS_WORD", statusWord);
+        nv.add("MFR_SPECIFIC", statusMFR);
+        nv.add("STATUS_IOUT", statusIout);
+        nv.add("STATUS_TEMPERATURE", statusTemperature);
+        nv.add("STATUS_FANS_1_2", statusFans12);
+
+        using metadata = xyz::openbmc_project::Power::Fault::
+                PowerSupplyTemperatureFault;
+
+        report<PowerSupplyTemperatureFault>(
+                metadata::RAW_STATUS(nv.get().c_str()),
+                metadata::CALLOUT_INVENTORY_PATH(inventoryPath.c_str()));
+
+        temperatureFault = true; // suppress repeat logging
+    }
+}
+
 void PowerSupply::clearFaults()
 {
     //TODO - Clear faults at pre-poweron. openbmc/openbmc#1736
diff --git a/power-supply/power_supply.hpp b/power-supply/power_supply.hpp
index a3cea2d..379a937 100644
--- a/power-supply/power_supply.hpp
+++ b/power-supply/power_supply.hpp
@@ -90,7 +90,7 @@
         /** @brief True if the power supply is present. */
         bool present = false;
 
-        /** @brief Used to subscribe to D-Bus property changes to Present **/
+        /** @brief Used to subscribe to D-Bus property changes for Present */
         std::unique_ptr<sdbusplus::bus::match_t> presentMatch;
 
         /** @brief True if the power is on. */
@@ -119,7 +119,7 @@
          */
         Timer powerOnTimer;
 
-        /** @brief Used to subscribe to D-Bus power on state changes **/
+        /** @brief Used to subscribe to D-Bus power on state changes */
         std::unique_ptr<sdbusplus::bus::match_t> powerOnMatch;
 
         /** @brief Has a PMBus read failure already been logged? */
@@ -160,6 +160,11 @@
         bool fanFault = false;
 
         /**
+         * @brief Set to true during a temperature fault or warn condition.
+         */
+        bool temperatureFault = false;
+
+        /**
          * @brief Callback for inventory property changes
          *
          * Process change of Present property for power supply.
@@ -181,7 +186,7 @@
         /**
          * @brief Updates the poweredOn status by querying D-Bus
          *
-         * The D-Bus property for the sytem power state will be read to
+         * The D-Bus property for the system power state will be read to
          * determine if the system is powered on or not.
          */
         void updatePowerState();
@@ -240,6 +245,17 @@
          */
         void checkFanFault(const uint16_t statusWord);
 
+        /**
+         * @brief Checks for a temperature fault or warning condition.
+         *
+         * STATUS_WORD's "TEMPERATURE FAULT OR WARNING" bit and the OT fault
+         * bit of STATUS_TEMPERATURE are checked. If either is on, log an
+         * error calling out the power supply indicating the condition.
+         *
+         * @param[in] statusWord - 2 byte STATUS_WORD value read from sysfs
+         */
+        void checkTemperatureFault(const uint16_t statusWord);
+
 };
 
 }
diff --git a/xyz/openbmc_project/Power/Fault.errors.yaml b/xyz/openbmc_project/Power/Fault.errors.yaml
index 9f2e770..451f0fa 100644
--- a/xyz/openbmc_project/Power/Fault.errors.yaml
+++ b/xyz/openbmc_project/Power/Fault.errors.yaml
@@ -10,6 +10,8 @@
   description: The power supply detected an output overvoltage fault condition.
 - name: PowerSupplyFanFault
   description: The power supply detected bad fan operation.
+- name: PowerSupplyTemperatureFault
+  description: The power supply has had an over temperature condition.
 - name: Shutdown
   description: A power off was issued because a power fault was detected
 
diff --git a/xyz/openbmc_project/Power/Fault.metadata.yaml b/xyz/openbmc_project/Power/Fault.metadata.yaml
index b92da35..07c96eb 100644
--- a/xyz/openbmc_project/Power/Fault.metadata.yaml
+++ b/xyz/openbmc_project/Power/Fault.metadata.yaml
@@ -36,6 +36,13 @@
       type: string
   inherits:
     - xyz.openbmc_project.Common.Callout.Inventory
+- name: PowerSupplyTemperatureFault
+  level: ERR
+  meta:
+    - str: "RAW_STATUS=%s"
+      type: string
+  inherits:
+    - xyz.openbmc_project.Common.Callout.Inventory
 - name: Shutdown
   level: ERR