Add MissingIsAcceptable feature to avoid failsafe
This is a partial implementation of the ideas here:
https://github.com/openbmc/phosphor-pid-control/issues/31
A new configuration item is supported in the PID object, named
"MissingIsAcceptable" (for D-Bus) or "missingIsAcceptable" (for the old
config.json). The value is an array of strings. If these strings match
sensor names, those sensors will be flagged as "missing is acceptable",
that is, they can go missing and the zone will not be thrown into
failsafe mode as a result.
This can be handy for sensors that are not always available on your
particular machine. It is independent of the existing Availability
interface, because the decision to go into failsafe mode or not is a
property of the PID loop, not of the sensor itself.
If a PID loop consists of all sensors that are missing, the output
will be deemed to be the setpoint, thus essentially making the PID
loop a no-op. Now initializing sensor values to NaN, not zero, as zero
is not a good default if PID loop is margin, undoing a bug I made:
https://gerrit.openbmc.org/c/openbmc/phosphor-pid-control/+/38228
Tested: It worked for me. Also, added a unit test case.
Change-Id: Idc7978ab06fcc9ed8c6c9df9483101376e5df4d1
Signed-off-by: Josh Lehan <krellan@google.com>
diff --git a/conf.hpp b/conf.hpp
index 0e26f55..265da09 100644
--- a/conf.hpp
+++ b/conf.hpp
@@ -34,13 +34,17 @@
/*
* Structure for decorating an input sensor's name with additional
- * information, to help out with TempToMargin conversion.
+ * information, such as TempToMargin and MissingIsAcceptable.
+ * This information comes from the PID loop configuration,
+ * not from SensorConfig, in order for it to be able to work
+ * with dynamic sensors from entity-manager.
*/
struct SensorInput
{
std::string name;
double convertMarginZero = std::numeric_limits<double>::quiet_NaN();
bool convertTempToMargin = false;
+ bool missingIsAcceptable = false;
};
/*
diff --git a/dbus/dbusconfiguration.cpp b/dbus/dbusconfiguration.cpp
index 2473252..7b78280 100644
--- a/dbus/dbusconfiguration.cpp
+++ b/dbus/dbusconfiguration.cpp
@@ -670,6 +670,15 @@
std::vector<std::string> inputSensorNames(
std::get<std::vector<std::string>>(base.at("Inputs")));
std::vector<std::string> outputSensorNames;
+ std::vector<std::string> missingAcceptableSensorNames;
+
+ auto findMissingAcceptable = base.find("MissingIsAcceptable");
+ if (findMissingAcceptable != base.end())
+ {
+ missingAcceptableSensorNames =
+ std::get<std::vector<std::string>>(
+ findMissingAcceptable->second);
+ }
// assumption: all fan pids must have at least one output
if (pidClass == "fan")
@@ -689,6 +698,9 @@
std::vector<SensorInterfaceType> inputSensorInterfaces;
std::vector<SensorInterfaceType> outputSensorInterfaces;
+ std::vector<SensorInterfaceType>
+ missingAcceptableSensorInterfaces;
+
/* populate an interface list for different sensor direction
* types (input,output)
*/
@@ -707,6 +719,12 @@
findSensors(sensors, sensorNameToDbusName(sensorName),
outputSensorInterfaces);
}
+ for (const std::string& sensorName :
+ missingAcceptableSensorNames)
+ {
+ findSensors(sensors, sensorNameToDbusName(sensorName),
+ missingAcceptableSensorInterfaces);
+ }
inputSensorNames.clear();
for (const SensorInterfaceType& inputSensorInterface :
@@ -752,6 +770,35 @@
}
}
+ // MissingIsAcceptable same postprocessing as Inputs
+ missingAcceptableSensorNames.clear();
+ for (const SensorInterfaceType&
+ missingAcceptableSensorInterface :
+ missingAcceptableSensorInterfaces)
+ {
+ const std::string& dbusInterface =
+ missingAcceptableSensorInterface.second;
+ const std::string& missingAcceptableSensorPath =
+ missingAcceptableSensorInterface.first;
+
+ std::string missingAcceptableSensorName =
+ getSensorNameFromPath(missingAcceptableSensorPath);
+ missingAcceptableSensorNames.push_back(
+ missingAcceptableSensorName);
+
+ if (dbusInterface != sensorInterface)
+ {
+ /* MissingIsAcceptable same error checking as Inputs
+ */
+ throw std::runtime_error(
+ "sensor at dbus path [" +
+ missingAcceptableSensorPath +
+ "] has an interface [" + dbusInterface +
+ "] that does not match the expected interface of " +
+ sensorInterface);
+ }
+ }
+
/* fan pids need to pair up tach sensors with their pwm
* counterparts
*/
@@ -864,7 +911,8 @@
}
std::vector<pid_control::conf::SensorInput> sensorInputs =
- spliceInputs(inputSensorNames, inputTempToMargin);
+ spliceInputs(inputSensorNames, inputTempToMargin,
+ missingAcceptableSensorNames);
if (offsetType.empty())
{
@@ -903,9 +951,19 @@
conf::PIDConf& conf = zoneConfig[index];
std::vector<std::string> inputs;
+ std::vector<std::string> missingAcceptableSensors;
+ std::vector<std::string> missingAcceptableSensorNames;
std::vector<std::string> sensorNames =
std::get<std::vector<std::string>>(base.at("Inputs"));
+ auto findMissingAcceptable = base.find("MissingIsAcceptable");
+ if (findMissingAcceptable != base.end())
+ {
+ missingAcceptableSensorNames =
+ std::get<std::vector<std::string>>(
+ findMissingAcceptable->second);
+ }
+
bool unavailableAsFailed = true;
auto findUnavailableAsFailed =
base.find("InputUnavailableAsFailed");
@@ -928,10 +986,8 @@
for (const auto& sensorPathIfacePair : sensorPathIfacePairs)
{
- size_t idx =
- sensorPathIfacePair.first.find_last_of("/") + 1;
std::string shortName =
- sensorPathIfacePair.first.substr(idx);
+ getSensorNameFromPath(sensorPathIfacePair.first);
inputs.push_back(shortName);
auto& config = sensorConfig[shortName];
@@ -950,6 +1006,30 @@
{
continue;
}
+
+ // MissingIsAcceptable same postprocessing as Inputs
+ for (const std::string& missingAcceptableSensorName :
+ missingAcceptableSensorNames)
+ {
+ std::vector<std::pair<std::string, std::string>>
+ sensorPathIfacePairs;
+ if (!findSensors(
+ sensors,
+ sensorNameToDbusName(missingAcceptableSensorName),
+ sensorPathIfacePairs))
+ {
+ break;
+ }
+
+ for (const auto& sensorPathIfacePair : sensorPathIfacePairs)
+ {
+ std::string shortName =
+ getSensorNameFromPath(sensorPathIfacePair.first);
+
+ missingAcceptableSensors.push_back(shortName);
+ }
+ }
+
conf::ControllerInfo& info = conf[pidName];
std::vector<double> inputTempToMargin;
@@ -961,7 +1041,8 @@
std::get<std::vector<double>>(findTempToMargin->second);
}
- info.inputs = spliceInputs(inputs, inputTempToMargin);
+ info.inputs = spliceInputs(inputs, inputTempToMargin,
+ missingAcceptableSensors);
info.type = "stepwise";
info.stepwiseInfo.ts = 1.0; // currently unused
diff --git a/pid/builder.cpp b/pid/builder.cpp
index bae0e81..39d0076 100644
--- a/pid/builder.cpp
+++ b/pid/builder.cpp
@@ -95,7 +95,7 @@
for (const auto& i : info.inputs)
{
inputs.push_back(i);
- zone->addFanInput(i.name);
+ zone->addFanInput(i.name, i.missingIsAcceptable);
}
auto pid = FanController::createFanPid(
@@ -108,7 +108,7 @@
for (const auto& i : info.inputs)
{
inputs.push_back(i);
- zone->addThermalInput(i.name);
+ zone->addThermalInput(i.name, i.missingIsAcceptable);
}
auto pid = ThermalController::createThermalPid(
@@ -126,7 +126,7 @@
for (const auto& i : info.inputs)
{
inputs.push_back(i);
- zone->addThermalInput(i.name);
+ zone->addThermalInput(i.name, i.missingIsAcceptable);
}
auto stepwise = StepwiseController::createStepwiseController(
zone.get(), name, splitNames(inputs), info.stepwiseInfo);
@@ -145,6 +145,10 @@
{
std::cerr << "[" << i.convertMarginZero << "]";
}
+ if (i.missingIsAcceptable)
+ {
+ std::cerr << "?";
+ }
std::cerr << ", ";
}
std::cerr << "\n";
diff --git a/pid/buildjson.cpp b/pid/buildjson.cpp
index 8396338..dd7d2a3 100644
--- a/pid/buildjson.cpp
+++ b/pid/buildjson.cpp
@@ -37,6 +37,7 @@
void from_json(const json& j, conf::ControllerInfo& c)
{
std::vector<std::string> inputNames;
+ std::vector<std::string> missingAcceptableNames;
j.at("type").get_to(c.type);
j.at("inputs").get_to(inputNames);
@@ -50,7 +51,14 @@
findTempToMargin->get_to(inputTempToMargin);
}
- c.inputs = spliceInputs(inputNames, inputTempToMargin);
+ auto findMissingAcceptable = j.find("missingIsAcceptable");
+ if (findMissingAcceptable != j.end())
+ {
+ findMissingAcceptable->get_to(missingAcceptableNames);
+ }
+
+ c.inputs = spliceInputs(inputNames, inputTempToMargin,
+ missingAcceptableNames);
/* TODO: We need to handle parsing other PID controller configurations.
* We can do that by checking for different keys and making the decision
diff --git a/pid/fancontroller.cpp b/pid/fancontroller.cpp
index 44852a0..ddfea20 100644
--- a/pid/fancontroller.cpp
+++ b/pid/fancontroller.cpp
@@ -175,7 +175,7 @@
{
auto sensor = _owner->getSensor(it);
auto redundantWrite = _owner->getRedundantWrite();
- int64_t rawWritten;
+ int64_t rawWritten = -1;
sensor->write(percent, redundantWrite, &rawWritten);
// The outputCache will be used later,
diff --git a/pid/zone.cpp b/pid/zone.cpp
index 0b46841..e5eddca 100644
--- a/pid/zone.cpp
+++ b/pid/zone.cpp
@@ -96,6 +96,17 @@
return !_failSafeSensors.empty();
}
+void DbusPidZone::markSensorMissing(const std::string& name)
+{
+ if (_missingAcceptable.find(name) != _missingAcceptable.end())
+ {
+ // Disallow sensors in MissingIsAcceptable list from causing failsafe
+ return;
+ }
+
+ _failSafeSensors.emplace(name);
+}
+
int64_t DbusPidZone::getZoneID(void) const
{
return _zoneId;
@@ -184,14 +195,25 @@
_cachedFanOutputs[std::string{name}] = values;
}
-void DbusPidZone::addFanInput(const std::string& fan)
+void DbusPidZone::addFanInput(const std::string& fan, bool missingAcceptable)
{
_fanInputs.push_back(fan);
+
+ if (missingAcceptable)
+ {
+ _missingAcceptable.emplace(fan);
+ }
}
-void DbusPidZone::addThermalInput(const std::string& therm)
+void DbusPidZone::addThermalInput(const std::string& therm,
+ bool missingAcceptable)
{
_thermalInputs.push_back(therm);
+
+ if (missingAcceptable)
+ {
+ _missingAcceptable.emplace(therm);
+ }
}
// Updates desired RPM setpoint from optional text file
@@ -389,21 +411,23 @@
void DbusPidZone::initializeCache(void)
{
+ auto nan = std::numeric_limits<double>::quiet_NaN();
+
for (const auto& f : _fanInputs)
{
- _cachedValuesByName[f] = {0, 0};
- _cachedFanOutputs[f] = {0, 0};
+ _cachedValuesByName[f] = {nan, nan};
+ _cachedFanOutputs[f] = {nan, nan};
// Start all fans in fail-safe mode.
- _failSafeSensors.insert(f);
+ markSensorMissing(f);
}
for (const auto& t : _thermalInputs)
{
- _cachedValuesByName[t] = {0, 0};
+ _cachedValuesByName[t] = {nan, nan};
// Start all sensors in fail-safe mode.
- _failSafeSensors.insert(t);
+ markSensorMissing(t);
}
// Initialize Pid FailSafePercent
initPidFailSafePercent();
diff --git a/pid/zone.hpp b/pid/zone.hpp
index 2854997..464e672 100644
--- a/pid/zone.hpp
+++ b/pid/zone.hpp
@@ -71,6 +71,7 @@
bool getRedundantWrite(void) const override;
void setManualMode(bool mode);
bool getFailSafeMode(void) const override;
+ void markSensorMissing(const std::string& name);
int64_t getZoneID(void) const override;
void addSetPoint(double setPoint, const std::string& name) override;
@@ -99,8 +100,8 @@
double getCachedValue(const std::string& name) override;
ValueCacheEntry getCachedValues(const std::string& name) override;
- void addFanInput(const std::string& fan);
- void addThermalInput(const std::string& therm);
+ void addFanInput(const std::string& fan, bool missingAcceptable);
+ void addThermalInput(const std::string& therm, bool missingAcceptable);
void initializeLog(void) override;
void writeLog(const std::string& value) override;
@@ -166,7 +167,8 @@
// check if fan fail.
if (sensor->getFailed())
{
- _failSafeSensors.insert(sensorInput);
+ markSensorMissing(sensorInput);
+
if (debugEnabled)
{
std::cerr << sensorInput << " sensor get failed\n";
@@ -174,7 +176,8 @@
}
else if (timeout != 0 && duration >= period)
{
- _failSafeSensors.insert(sensorInput);
+ markSensorMissing(sensorInput);
+
if (debugEnabled)
{
std::cerr << sensorInput << " sensor timeout\n";
@@ -191,6 +194,7 @@
std::cerr << sensorInput
<< " is erased from failsafe sensor set\n";
}
+
_failSafeSensors.erase(kt);
}
}
@@ -213,6 +217,7 @@
const conf::CycleTime _cycleTime;
std::set<std::string> _failSafeSensors;
+ std::set<std::string> _missingAcceptable;
std::vector<double> _SetPoints;
std::vector<double> _RPMCeilings;
diff --git a/test/pid_zone_unittest.cpp b/test/pid_zone_unittest.cpp
index f0b96b1..6c570cc 100644
--- a/test/pid_zone_unittest.cpp
+++ b/test/pid_zone_unittest.cpp
@@ -376,8 +376,8 @@
EXPECT_EQ(mgr.getSensor(name2), sensor_ptr2);
// Now that the sensors exist, add them to the zone.
- zone->addThermalInput(name1);
- zone->addThermalInput(name2);
+ zone->addThermalInput(name1, false);
+ zone->addThermalInput(name2, false);
// Initialize Zone
zone->initializeCache();
@@ -428,8 +428,8 @@
EXPECT_EQ(mgr.getSensor(name2), sensor_ptr2);
// Now that the sensors exist, add them to the zone.
- zone->addFanInput(name1);
- zone->addFanInput(name2);
+ zone->addFanInput(name1, false);
+ zone->addFanInput(name2, false);
// Initialize Zone
zone->initializeCache();
@@ -475,8 +475,8 @@
mgr.addSensor(type, name2, std::move(sensor2));
EXPECT_EQ(mgr.getSensor(name2), sensor_ptr2);
- zone->addThermalInput(name1);
- zone->addThermalInput(name2);
+ zone->addThermalInput(name1, false);
+ zone->addThermalInput(name2, false);
// Initialize Zone
zone->initializeCache();
@@ -512,6 +512,105 @@
EXPECT_TRUE(zone->getFailSafeMode());
}
+TEST_F(PidZoneTest, ThermalInput_MissingIsAcceptableNoFailSafe)
+{
+ // This is similar to the above test, but because missingIsAcceptable
+ // is set for sensor1, the zone should not enter failsafe mode when
+ // only sensor1 goes missing.
+ // However, sensor2 going missing should still trigger failsafe mode.
+
+ int64_t timeout = 1;
+
+ std::string name1 = "temp1";
+ std::unique_ptr<Sensor> sensor1 = std::make_unique<SensorMock>(name1,
+ timeout);
+ SensorMock* sensor_ptr1 = reinterpret_cast<SensorMock*>(sensor1.get());
+
+ std::string name2 = "temp2";
+ std::unique_ptr<Sensor> sensor2 = std::make_unique<SensorMock>(name2,
+ timeout);
+ SensorMock* sensor_ptr2 = reinterpret_cast<SensorMock*>(sensor2.get());
+
+ std::string type = "unchecked";
+ mgr.addSensor(type, name1, std::move(sensor1));
+ EXPECT_EQ(mgr.getSensor(name1), sensor_ptr1);
+ mgr.addSensor(type, name2, std::move(sensor2));
+ EXPECT_EQ(mgr.getSensor(name2), sensor_ptr2);
+
+ // Only sensor1 has MissingIsAcceptable enabled for it
+ zone->addThermalInput(name1, true);
+ zone->addThermalInput(name2, false);
+
+ // Initialize Zone
+ zone->initializeCache();
+
+ // As sensors are not initialized, zone should be in failsafe mode
+ EXPECT_TRUE(zone->getFailSafeMode());
+
+ // r1 not populated here, intentionally, to simulate a sensor that
+ // is not available yet, perhaps takes a long time to start up.
+ ReadReturn r1;
+ EXPECT_CALL(*sensor_ptr1, read()).WillOnce(Return(r1));
+
+ ReadReturn r2;
+ r2.value = 11.0;
+ r2.updated = std::chrono::high_resolution_clock::now();
+ EXPECT_CALL(*sensor_ptr2, read()).WillOnce(Return(r2));
+
+ zone->updateSensors();
+
+ // Only sensor2 has been initialized here. Failsafe should be false,
+ // because sensor1 MissingIsAcceptable so it is OK for it to go missing.
+ EXPECT_FALSE(zone->getFailSafeMode());
+
+ r1.value = 10.0;
+ r1.updated = std::chrono::high_resolution_clock::now();
+
+ EXPECT_CALL(*sensor_ptr1, read()).WillOnce(Return(r1));
+ EXPECT_CALL(*sensor_ptr2, read()).WillOnce(Return(r2));
+ zone->updateSensors();
+
+ // Both sensors are now properly initialized
+ EXPECT_FALSE(zone->getFailSafeMode());
+
+ // Ok, so we're not in failsafe mode, so let's set updated to the past.
+ // sensor1 will have an updated field older than its timeout value, but
+ // sensor2 will be fine. :D
+ r1.updated -= std::chrono::seconds(3);
+ r2.updated = std::chrono::high_resolution_clock::now();
+
+ EXPECT_CALL(*sensor_ptr1, read()).WillOnce(Return(r1));
+ EXPECT_CALL(*sensor_ptr2, read()).WillOnce(Return(r2));
+ zone->updateSensors();
+
+ // MissingIsAcceptable is true for sensor1, so the zone should not be
+ // thrown into failsafe mode.
+ EXPECT_FALSE(zone->getFailSafeMode());
+
+ // Do the same thing, but for the opposite sensors: r1 is good,
+ // but r2 is set to some time in the past.
+ r1.updated = std::chrono::high_resolution_clock::now();
+ r2.updated -= std::chrono::seconds(3);
+
+ EXPECT_CALL(*sensor_ptr1, read()).WillOnce(Return(r1));
+ EXPECT_CALL(*sensor_ptr2, read()).WillOnce(Return(r2));
+ zone->updateSensors();
+
+ // Now, the zone should be in failsafe mode, because sensor2 does not
+ // have MissingIsAcceptable set true, it is still subject to failsafe.
+ EXPECT_TRUE(zone->getFailSafeMode());
+
+ r1.updated = std::chrono::high_resolution_clock::now();
+ r2.updated = std::chrono::high_resolution_clock::now();
+
+ EXPECT_CALL(*sensor_ptr1, read()).WillOnce(Return(r1));
+ EXPECT_CALL(*sensor_ptr2, read()).WillOnce(Return(r2));
+ zone->updateSensors();
+
+ // The failsafe mode should cease, as both sensors are good again.
+ EXPECT_FALSE(zone->getFailSafeMode());
+}
+
TEST_F(PidZoneTest, FanInputTest_FailsafeToValid_ReadsSensors)
{
// This will add a couple fan inputs, and verify the values are cached.
@@ -535,8 +634,8 @@
EXPECT_EQ(mgr.getSensor(name2), sensor_ptr2);
// Now that the sensors exist, add them to the zone.
- zone->addFanInput(name1);
- zone->addFanInput(name2);
+ zone->addFanInput(name1, false);
+ zone->addFanInput(name2, false);
// Initialize Zone
zone->initializeCache();
@@ -588,8 +687,8 @@
EXPECT_EQ(mgr.getSensor(name2), sensor_ptr2);
// Now that the sensors exist, add them to the zone.
- zone->addFanInput(name1);
- zone->addFanInput(name2);
+ zone->addFanInput(name1, false);
+ zone->addFanInput(name2, false);
// Initialize Zone
zone->initializeCache();
@@ -639,7 +738,7 @@
mgr.addSensor(type, name1, std::move(sensor1));
EXPECT_EQ(mgr.getSensor(name1), sensor_ptr1);
- zone->addThermalInput(name1);
+ zone->addThermalInput(name1, false);
// Verify method under test returns the pointer we expect.
EXPECT_EQ(mgr.getSensor(name1), zone->getSensor(name1));
diff --git a/test/zone_mock.hpp b/test/zone_mock.hpp
index 8ba9c85..a3d6be9 100644
--- a/test/zone_mock.hpp
+++ b/test/zone_mock.hpp
@@ -23,7 +23,7 @@
ValueCacheEntry getCachedValues(const std::string& s)
{
auto v = getCachedValue(s);
- return ValueCacheEntry(v, v);
+ return {v, v};
}
MOCK_CONST_METHOD0(getRedundantWrite, bool(void));
diff --git a/util.cpp b/util.cpp
index 87c4deb..b83ed43 100644
--- a/util.cpp
+++ b/util.cpp
@@ -103,15 +103,16 @@
std::vector<conf::SensorInput>
spliceInputs(const std::vector<std::string>& inputNames,
- const std::vector<double>& inputTempToMargin)
+ const std::vector<double>& inputTempToMargin,
+ const std::vector<std::string>& missingAcceptableNames)
{
std::vector<conf::SensorInput> results;
- // Default to the TempToMargin feature disabled
+ // Default to TempToMargin and MissingIsAcceptable disabled
for (const auto& inputName : inputNames)
{
conf::SensorInput newInput{
- inputName, std::numeric_limits<double>::quiet_NaN(), false};
+ inputName, std::numeric_limits<double>::quiet_NaN(), false, false};
results.emplace_back(newInput);
}
@@ -132,6 +133,23 @@
results[index].convertTempToMargin = true;
}
+ std::set<std::string> acceptableSet;
+
+ // Copy vector to set, to avoid O(n^2) runtime below
+ for (const auto& name : missingAcceptableNames)
+ {
+ acceptableSet.emplace(name);
+ }
+
+ // Flag missingIsAcceptable true if name found in that set
+ for (auto& result : results)
+ {
+ if (acceptableSet.find(result.name) != acceptableSet.end())
+ {
+ result.missingIsAcceptable = true;
+ }
+ }
+
return results;
}
diff --git a/util.hpp b/util.hpp
index 0588934..7617562 100644
--- a/util.hpp
+++ b/util.hpp
@@ -46,7 +46,8 @@
*/
std::vector<conf::SensorInput>
spliceInputs(const std::vector<std::string>& inputNames,
- const std::vector<double>& inputTempToMargin);
+ const std::vector<double>& inputTempToMargin,
+ const std::vector<std::string>& missingAcceptableNames);
/*
* Recovers the original "Inputs" vector from spliceInputs().