revert "revert log debug information for sensor threshold assert events"
Add back the original submit 9bf6804c2d76b92005ad9851cb052d407ea3117f
and include the fix for the bug that crashed ipmbsensor.
log debug information for sensor threshold assert events
There are sightings that the TCPUx_P12_PVCCIO_VS_Temp sensor
reports a reading of zero and trips the low critical threshold.
Add debug prints to gather data.
Also log the raw value in the sensor base class to help
debug threshold assert events for other sensor types.
Tested:
Verified that log messages show up as expected for threshold
assert events. There are no unwanted log messages on systems that
do not have bad sensor readings.
Verified system stayed up for 30 minutes without crashing.
Signed-off-by: Zhikui Ren <zhikui.ren@intel.com>
Change-Id: I73e00e24bbae463dbe0f34e2308ee934588028d1
diff --git a/include/sensor.hpp b/include/sensor.hpp
index a8321fd..7fa9300 100644
--- a/include/sensor.hpp
+++ b/include/sensor.hpp
@@ -48,6 +48,7 @@
std::shared_ptr<sdbusplus::asio::dbus_interface> availableInterface;
std::shared_ptr<sdbusplus::asio::dbus_interface> operationalInterface;
double value = std::numeric_limits<double>::quiet_NaN();
+ double rawValue = std::numeric_limits<double>::quiet_NaN();
bool overriddenState = false;
bool internalSet = false;
double hysteresisTrigger;
diff --git a/src/ADCSensor.cpp b/src/ADCSensor.cpp
index c3980e0..aa7b50e 100644
--- a/src/ADCSensor.cpp
+++ b/src/ADCSensor.cpp
@@ -79,6 +79,7 @@
}
association = objectServer.add_interface(
"/xyz/openbmc_project/sensors/voltage/" + name, association::interface);
+
setInitialProperties(conn);
}
@@ -167,11 +168,9 @@
// todo read scaling factors from configuration
try
{
- double nvalue = std::stod(response);
-
- nvalue = (nvalue / sensorScaleFactor) / scaleFactor;
+ rawValue = std::stod(response);
+ double nvalue = (rawValue / sensorScaleFactor) / scaleFactor;
nvalue = std::round(nvalue * roundFactor) / roundFactor;
-
updateValue(nvalue);
}
catch (std::invalid_argument&)
@@ -193,6 +192,7 @@
int fd = open(path.c_str(), O_RDONLY);
if (fd < 0)
{
+ std::cerr << "adcsensor " << name << " failed to open " << path << "\n";
return; // we're no longer valid
}
inputDev.assign(fd);
@@ -201,6 +201,14 @@
std::shared_ptr<ADCSensor> self = weakRef.lock();
if (ec == boost::asio::error::operation_aborted)
{
+ if (self)
+ {
+ std::cerr << "adcsensor " << self->name << " read cancelled\n";
+ }
+ else
+ {
+ std::cerr << "adcsensor read cancelled no self\n";
+ }
return; // we're being canceled
}
@@ -208,6 +216,10 @@
{
self->setupRead();
}
+ else
+ {
+ std::cerr << "adcsensor weakref no self\n";
+ }
});
}
diff --git a/src/CPUSensor.cpp b/src/CPUSensor.cpp
index d68e980..4377bf9 100644
--- a/src/CPUSensor.cpp
+++ b/src/CPUSensor.cpp
@@ -224,9 +224,9 @@
try
{
std::getline(responseStream, response);
- double nvalue = std::stod(response);
+ rawValue = std::stod(response);
responseStream.clear();
- nvalue /= CPUSensor::sensorScaleFactor;
+ double nvalue = rawValue / CPUSensor::sensorScaleFactor;
if (show)
{
diff --git a/src/HwmonTempSensor.cpp b/src/HwmonTempSensor.cpp
index 649ca68..35dc5f3 100644
--- a/src/HwmonTempSensor.cpp
+++ b/src/HwmonTempSensor.cpp
@@ -106,6 +106,8 @@
if ((err == boost::system::errc::bad_file_descriptor) ||
(err == boost::asio::error::misc_errors::not_found))
{
+ std::cerr << "Hwmon temp sensor " << name << " removed " << path
+ << "\n";
return; // we're being destroyed
}
std::istream responseStream(&readBuf);
@@ -115,8 +117,8 @@
std::getline(responseStream, response);
try
{
- double nvalue = std::stod(response);
- nvalue /= sensorScaleFactor;
+ rawValue = std::stod(response);
+ double nvalue = rawValue / sensorScaleFactor;
updateValue(nvalue);
}
catch (const std::invalid_argument&)
@@ -134,6 +136,8 @@
int fd = open(path.c_str(), O_RDONLY);
if (fd < 0)
{
+ std::cerr << "Hwmon temp sensor " << name << " not valid " << path
+ << "\n";
return; // we're no longer valid
}
inputDev.assign(fd);
@@ -143,6 +147,15 @@
std::shared_ptr<HwmonTempSensor> self = weakRef.lock();
if (ec == boost::asio::error::operation_aborted)
{
+ if (self)
+ {
+ std::cerr << "Hwmon temp sensor " << self->name
+ << " read cancelled " << self->path << "\n";
+ }
+ else
+ {
+ std::cerr << "Hwmon sensor read cancelled, no self\n";
+ }
return; // we're being canceled
}
if (self)
diff --git a/src/IpmbSensor.cpp b/src/IpmbSensor.cpp
index bc9d842..a7c1e09 100644
--- a/src/IpmbSensor.cpp
+++ b/src/IpmbSensor.cpp
@@ -241,7 +241,6 @@
return false;
}
resp = data[0];
-
return true;
}
case (ReadingFormat::byte3):
@@ -348,6 +347,18 @@
read();
return;
}
+ else
+ {
+ // rawValue only used in debug logging
+ // up to 5th byte in data are used to derive value
+ size_t end = std::min(sizeof(uint64_t), data.size());
+ uint64_t rawData = 0;
+ for (size_t i = 0; i < end; i++)
+ {
+ reinterpret_cast<uint8_t*>(&rawData)[i] = data[i];
+ }
+ rawValue = static_cast<double>(rawData);
+ }
/* Adjust value as per scale and offset */
value = (value * scaleVal) + offsetVal;
diff --git a/src/PSUSensor.cpp b/src/PSUSensor.cpp
index 6b27207..f93846d 100644
--- a/src/PSUSensor.cpp
+++ b/src/PSUSensor.cpp
@@ -143,9 +143,9 @@
try
{
std::getline(responseStream, response);
- double nvalue = std::stod(response);
+ rawValue = std::stod(response);
responseStream.clear();
- nvalue /= sensorFactor;
+ double nvalue = rawValue / sensorFactor;
updateValue(nvalue);
}
diff --git a/src/TachSensor.cpp b/src/TachSensor.cpp
index ba3b0a1..acfe659 100644
--- a/src/TachSensor.cpp
+++ b/src/TachSensor.cpp
@@ -149,9 +149,9 @@
try
{
std::getline(responseStream, response);
- double nvalue = std::stod(response);
+ rawValue = std::stod(response);
responseStream.clear();
- updateValue(nvalue);
+ updateValue(rawValue);
}
catch (const std::invalid_argument&)
{
diff --git a/src/Thresholds.cpp b/src/Thresholds.cpp
index 986d183..6aa077c 100644
--- a/src/Thresholds.cpp
+++ b/src/Thresholds.cpp
@@ -244,6 +244,7 @@
static int cLoFalse = 0;
static int cLoMidstate = 0;
static int cDebugThrottle = 0;
+static constexpr int assertLogCount = 10;
struct ChangeParam
{
@@ -276,7 +277,12 @@
if (value >= threshold.value)
{
thresholdChanges.emplace_back(threshold, true, value);
- ++cHiTrue;
+ if (++cHiTrue < assertLogCount)
+ {
+ std::cerr << "Sensor " << sensor->name << " high threshold "
+ << threshold.value << " assert: value " << value
+ << " raw data " << sensor->rawValue << "\n";
+ }
}
else if (value < (threshold.value - sensor->hysteresisTrigger))
{
@@ -293,7 +299,13 @@
if (value <= threshold.value)
{
thresholdChanges.emplace_back(threshold, true, value);
- ++cLoTrue;
+ if (++cLoTrue < assertLogCount)
+ {
+ std::cerr << "Sensor " << sensor->name << " low threshold "
+ << threshold.value << " assert: value "
+ << sensor->value << " raw data "
+ << sensor->rawValue << "\n";
+ }
}
else if (value > (threshold.value + sensor->hysteresisTrigger))
{