| #include "config.h" |
| |
| #include "log_manager.hpp" |
| |
| #include "elog_entry.hpp" |
| #include "elog_meta.hpp" |
| #include "elog_serialize.hpp" |
| #include "extensions.hpp" |
| #include "util.hpp" |
| |
| #include <poll.h> |
| #include <sys/inotify.h> |
| #include <systemd/sd-bus.h> |
| #include <systemd/sd-journal.h> |
| #include <unistd.h> |
| |
| #include <cassert> |
| #include <chrono> |
| #include <cstdio> |
| #include <cstring> |
| #include <fstream> |
| #include <functional> |
| #include <future> |
| #include <iostream> |
| #include <map> |
| #include <phosphor-logging/lg2.hpp> |
| #include <sdbusplus/vtable.hpp> |
| #include <set> |
| #include <string> |
| #include <string_view> |
| #include <vector> |
| #include <xyz/openbmc_project/State/Host/server.hpp> |
| |
| using namespace std::chrono; |
| extern const std::map< |
| phosphor::logging::metadata::Metadata, |
| std::function<phosphor::logging::metadata::associations::Type>> |
| meta; |
| |
| namespace phosphor |
| { |
| namespace logging |
| { |
| namespace internal |
| { |
| inline auto getLevel(const std::string& errMsg) |
| { |
| auto reqLevel = Entry::Level::Error; // Default to Error |
| |
| auto levelmap = g_errLevelMap.find(errMsg); |
| if (levelmap != g_errLevelMap.end()) |
| { |
| reqLevel = static_cast<Entry::Level>(levelmap->second); |
| } |
| |
| return reqLevel; |
| } |
| |
| int Manager::getRealErrSize() |
| { |
| return realErrors.size(); |
| } |
| |
| int Manager::getInfoErrSize() |
| { |
| return infoErrors.size(); |
| } |
| |
| uint32_t Manager::commit(uint64_t transactionId, std::string errMsg) |
| { |
| auto level = getLevel(errMsg); |
| _commit(transactionId, std::move(errMsg), level); |
| return entryId; |
| } |
| |
| uint32_t Manager::commitWithLvl(uint64_t transactionId, std::string errMsg, |
| uint32_t errLvl) |
| { |
| _commit(transactionId, std::move(errMsg), |
| static_cast<Entry::Level>(errLvl)); |
| return entryId; |
| } |
| |
| void Manager::_commit(uint64_t transactionId [[maybe_unused]], |
| std::string&& errMsg, Entry::Level errLvl) |
| { |
| std::vector<std::string> additionalData{}; |
| |
| // When running as a test-case, the system may have a LOT of journal |
| // data and we may not have permissions to do some of the journal sync |
| // operations. Just skip over them. |
| if (!IS_UNIT_TEST) |
| { |
| static constexpr auto transactionIdVar = |
| std::string_view{"TRANSACTION_ID"}; |
| // Length of 'TRANSACTION_ID' string. |
| static constexpr auto transactionIdVarSize = transactionIdVar.size(); |
| // Length of 'TRANSACTION_ID=' string. |
| static constexpr auto transactionIdVarOffset = transactionIdVarSize + 1; |
| |
| // Flush all the pending log messages into the journal |
| journalSync(); |
| |
| sd_journal* j = nullptr; |
| int rc = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY); |
| if (rc < 0) |
| { |
| lg2::error("Failed to open journal: {ERROR}", "ERROR", |
| strerror(-rc)); |
| return; |
| } |
| |
| std::string transactionIdStr = std::to_string(transactionId); |
| std::set<std::string> metalist; |
| auto metamap = g_errMetaMap.find(errMsg); |
| if (metamap != g_errMetaMap.end()) |
| { |
| metalist.insert(metamap->second.begin(), metamap->second.end()); |
| } |
| |
| // Add _PID field information in AdditionalData. |
| metalist.insert("_PID"); |
| |
| // Read the journal from the end to get the most recent entry first. |
| // The result from the sd_journal_get_data() is of the form |
| // VARIABLE=value. |
| SD_JOURNAL_FOREACH_BACKWARDS(j) |
| { |
| const char* data = nullptr; |
| size_t length = 0; |
| |
| // Look for the transaction id metadata variable |
| rc = sd_journal_get_data(j, transactionIdVar.data(), |
| (const void**)&data, &length); |
| if (rc < 0) |
| { |
| // This journal entry does not have the TRANSACTION_ID |
| // metadata variable. |
| continue; |
| } |
| |
| // journald does not guarantee that sd_journal_get_data() returns |
| // NULL terminated strings, so need to specify the size to use to |
| // compare, use the returned length instead of anything that relies |
| // on NULL terminators like strlen(). The data variable is in the |
| // form of 'TRANSACTION_ID=1234'. Remove the TRANSACTION_ID |
| // characters plus the (=) sign to do the comparison. 'data + |
| // transactionIdVarOffset' will be in the form of '1234'. 'length - |
| // transactionIdVarOffset' will be the length of '1234'. |
| if ((length <= (transactionIdVarOffset)) || |
| (transactionIdStr.compare( |
| 0, transactionIdStr.size(), data + transactionIdVarOffset, |
| length - transactionIdVarOffset) != 0)) |
| { |
| // The value of the TRANSACTION_ID metadata is not the requested |
| // transaction id number. |
| continue; |
| } |
| |
| // Search for all metadata variables in the current journal entry. |
| for (auto i = metalist.cbegin(); i != metalist.cend();) |
| { |
| rc = sd_journal_get_data(j, (*i).c_str(), (const void**)&data, |
| &length); |
| if (rc < 0) |
| { |
| // Metadata variable not found, check next metadata |
| // variable. |
| i++; |
| continue; |
| } |
| |
| // Metadata variable found, save it and remove it from the set. |
| additionalData.emplace_back(data, length); |
| i = metalist.erase(i); |
| } |
| if (metalist.empty()) |
| { |
| // All metadata variables found, break out of journal loop. |
| break; |
| } |
| } |
| if (!metalist.empty()) |
| { |
| // Not all the metadata variables were found in the journal. |
| for (auto& metaVarStr : metalist) |
| { |
| lg2::info("Failed to find metadata: {META_FIELD}", "META_FIELD", |
| metaVarStr); |
| } |
| } |
| |
| sd_journal_close(j); |
| } |
| createEntry(errMsg, errLvl, additionalData); |
| } |
| |
| void Manager::createEntry(std::string errMsg, Entry::Level errLvl, |
| std::vector<std::string> additionalData, |
| const FFDCEntries& ffdc) |
| { |
| if (!Extensions::disableDefaultLogCaps()) |
| { |
| if (errLvl < Entry::sevLowerLimit) |
| { |
| if (realErrors.size() >= ERROR_CAP) |
| { |
| erase(realErrors.front()); |
| } |
| } |
| else |
| { |
| if (infoErrors.size() >= ERROR_INFO_CAP) |
| { |
| erase(infoErrors.front()); |
| } |
| } |
| } |
| |
| entryId++; |
| if (errLvl >= Entry::sevLowerLimit) |
| { |
| infoErrors.push_back(entryId); |
| } |
| else |
| { |
| realErrors.push_back(entryId); |
| } |
| auto ms = std::chrono::duration_cast<std::chrono::milliseconds>( |
| std::chrono::system_clock::now().time_since_epoch()) |
| .count(); |
| auto objPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); |
| |
| AssociationList objects{}; |
| processMetadata(errMsg, additionalData, objects); |
| |
| auto e = std::make_unique<Entry>( |
| busLog, objPath, entryId, |
| ms, // Milliseconds since 1970 |
| errLvl, std::move(errMsg), std::move(additionalData), |
| std::move(objects), fwVersion, getEntrySerializePath(entryId), *this); |
| |
| serialize(*e); |
| |
| if (isQuiesceOnErrorEnabled() && (errLvl < Entry::sevLowerLimit) && |
| isCalloutPresent(*e)) |
| { |
| quiesceOnError(entryId); |
| } |
| |
| // Add entry before calling the extensions so that they have access to it |
| entries.insert(std::make_pair(entryId, std::move(e))); |
| |
| doExtensionLogCreate(*entries.find(entryId)->second, ffdc); |
| |
| // Note: No need to close the file descriptors in the FFDC. |
| } |
| |
| bool Manager::isQuiesceOnErrorEnabled() |
| { |
| // When running under tests, the Logging.Settings service will not be |
| // present. Assume false. |
| if (IS_UNIT_TEST) |
| { |
| return false; |
| } |
| |
| std::variant<bool> property; |
| |
| auto method = this->busLog.new_method_call( |
| "xyz.openbmc_project.Settings", "/xyz/openbmc_project/logging/settings", |
| "org.freedesktop.DBus.Properties", "Get"); |
| |
| method.append("xyz.openbmc_project.Logging.Settings", "QuiesceOnHwError"); |
| |
| try |
| { |
| auto reply = this->busLog.call(method); |
| reply.read(property); |
| } |
| catch (const sdbusplus::exception_t& e) |
| { |
| lg2::error("Error reading QuiesceOnHwError property: {ERROR}", "ERROR", |
| e); |
| throw; |
| } |
| |
| return std::get<bool>(property); |
| } |
| |
| bool Manager::isCalloutPresent(const Entry& entry) |
| { |
| for (const auto& c : entry.additionalData()) |
| { |
| if (c.find("CALLOUT_") != std::string::npos) |
| { |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| void Manager::findAndRemoveResolvedBlocks() |
| { |
| for (auto& entry : entries) |
| { |
| if (entry.second->resolved()) |
| { |
| checkAndRemoveBlockingError(entry.first); |
| } |
| } |
| } |
| |
| void Manager::onEntryResolve(sdbusplus::message_t& msg) |
| { |
| using Interface = std::string; |
| using Property = std::string; |
| using Value = std::string; |
| using Properties = std::map<Property, std::variant<Value>>; |
| |
| Interface interface; |
| Properties properties; |
| |
| msg.read(interface, properties); |
| |
| for (const auto& p : properties) |
| { |
| if (p.first == "Resolved") |
| { |
| findAndRemoveResolvedBlocks(); |
| return; |
| } |
| } |
| } |
| |
| void Manager::checkAndQuiesceHost() |
| { |
| using Host = sdbusplus::xyz::openbmc_project::State::server::Host; |
| |
| // First check host state |
| std::variant<Host::HostState> property; |
| |
| auto method = this->busLog.new_method_call( |
| "xyz.openbmc_project.State.Host", "/xyz/openbmc_project/state/host0", |
| "org.freedesktop.DBus.Properties", "Get"); |
| |
| method.append("xyz.openbmc_project.State.Host", "CurrentHostState"); |
| |
| try |
| { |
| auto reply = this->busLog.call(method); |
| reply.read(property); |
| } |
| catch (const sdbusplus::exception_t& e) |
| { |
| // Quiescing the host is a "best effort" type function. If unable to |
| // read the host state or it comes back empty, just return. |
| // The boot block object will still be created and the associations to |
| // find the log will be present. Don't want a dependency with |
| // phosphor-state-manager service |
| lg2::info("Error reading QuiesceOnHwError property: {ERROR}", "ERROR", |
| e); |
| return; |
| } |
| |
| auto hostState = std::get<Host::HostState>(property); |
| if (hostState != Host::HostState::Running) |
| { |
| return; |
| } |
| |
| auto quiesce = this->busLog.new_method_call( |
| "org.freedesktop.systemd1", "/org/freedesktop/systemd1", |
| "org.freedesktop.systemd1.Manager", "StartUnit"); |
| |
| quiesce.append("obmc-host-graceful-quiesce@0.target"); |
| quiesce.append("replace"); |
| |
| this->busLog.call_noreply(quiesce); |
| } |
| |
| void Manager::quiesceOnError(const uint32_t entryId) |
| { |
| // Verify we don't already have this entry blocking |
| auto it = find_if(this->blockingErrors.begin(), this->blockingErrors.end(), |
| [&](const std::unique_ptr<Block>& obj) { |
| return obj->entryId == entryId; |
| }); |
| if (it != this->blockingErrors.end()) |
| { |
| // Already recorded so just return |
| lg2::debug( |
| "QuiesceOnError set and callout present but entry already logged"); |
| return; |
| } |
| |
| lg2::info("QuiesceOnError set and callout present"); |
| |
| auto blockPath = |
| std::string(OBJ_LOGGING) + "/block" + std::to_string(entryId); |
| auto blockObj = std::make_unique<Block>(this->busLog, blockPath, entryId); |
| this->blockingErrors.push_back(std::move(blockObj)); |
| |
| // Register call back if log is resolved |
| using namespace sdbusplus::bus::match::rules; |
| auto entryPath = std::string(OBJ_ENTRY) + '/' + std::to_string(entryId); |
| auto callback = std::make_unique<sdbusplus::bus::match_t>( |
| this->busLog, |
| propertiesChanged(entryPath, "xyz.openbmc_project.Logging.Entry"), |
| std::bind(std::mem_fn(&Manager::onEntryResolve), this, |
| std::placeholders::_1)); |
| |
| propChangedEntryCallback.insert( |
| std::make_pair(entryId, std::move(callback))); |
| |
| checkAndQuiesceHost(); |
| } |
| |
| void Manager::doExtensionLogCreate(const Entry& entry, const FFDCEntries& ffdc) |
| { |
| // Make the association <endpointpath>/<endpointtype> paths |
| std::vector<std::string> assocs; |
| for (const auto& [forwardType, reverseType, endpoint] : |
| entry.associations()) |
| { |
| std::string e{endpoint}; |
| e += '/' + reverseType; |
| assocs.push_back(e); |
| } |
| |
| for (auto& create : Extensions::getCreateFunctions()) |
| { |
| try |
| { |
| create(entry.message(), entry.id(), entry.timestamp(), |
| entry.severity(), entry.additionalData(), assocs, ffdc); |
| } |
| catch (const std::exception& e) |
| { |
| lg2::error( |
| "An extension's create function threw an exception: {ERROR}", |
| "ERROR", e); |
| } |
| } |
| } |
| |
| void Manager::processMetadata(const std::string& /*errorName*/, |
| const std::vector<std::string>& additionalData, |
| AssociationList& objects) const |
| { |
| // additionalData is a list of "metadata=value" |
| constexpr auto separator = '='; |
| for (const auto& entryItem : additionalData) |
| { |
| auto found = entryItem.find(separator); |
| if (std::string::npos != found) |
| { |
| auto metadata = entryItem.substr(0, found); |
| auto iter = meta.find(metadata); |
| if (meta.end() != iter) |
| { |
| (iter->second)(metadata, additionalData, objects); |
| } |
| } |
| } |
| } |
| |
| void Manager::checkAndRemoveBlockingError(uint32_t entryId) |
| { |
| // First look for blocking object and remove |
| auto it = find_if(blockingErrors.begin(), blockingErrors.end(), |
| [&](const std::unique_ptr<Block>& obj) { |
| return obj->entryId == entryId; |
| }); |
| if (it != blockingErrors.end()) |
| { |
| blockingErrors.erase(it); |
| } |
| |
| // Now remove the callback looking for the error to be resolved |
| auto resolveFind = propChangedEntryCallback.find(entryId); |
| if (resolveFind != propChangedEntryCallback.end()) |
| { |
| propChangedEntryCallback.erase(resolveFind); |
| } |
| |
| return; |
| } |
| |
| void Manager::erase(uint32_t entryId) |
| { |
| auto entryFound = entries.find(entryId); |
| if (entries.end() != entryFound) |
| { |
| for (auto& func : Extensions::getDeleteProhibitedFunctions()) |
| { |
| try |
| { |
| bool prohibited = false; |
| func(entryId, prohibited); |
| if (prohibited) |
| { |
| // Future work remains to throw an error here. |
| return; |
| } |
| } |
| catch (const std::exception& e) |
| { |
| lg2::error("An extension's deleteProhibited function threw an " |
| "exception: {ERROR}", |
| "ERROR", e); |
| } |
| } |
| |
| // Delete the persistent representation of this error. |
| fs::path errorPath(ERRLOG_PERSIST_PATH); |
| errorPath /= std::to_string(entryId); |
| fs::remove(errorPath); |
| |
| auto removeId = [](std::list<uint32_t>& ids, uint32_t id) { |
| auto it = std::find(ids.begin(), ids.end(), id); |
| if (it != ids.end()) |
| { |
| ids.erase(it); |
| } |
| }; |
| if (entryFound->second->severity() >= Entry::sevLowerLimit) |
| { |
| removeId(infoErrors, entryId); |
| } |
| else |
| { |
| removeId(realErrors, entryId); |
| } |
| entries.erase(entryFound); |
| |
| checkAndRemoveBlockingError(entryId); |
| |
| for (auto& remove : Extensions::getDeleteFunctions()) |
| { |
| try |
| { |
| remove(entryId); |
| } |
| catch (const std::exception& e) |
| { |
| lg2::error("An extension's delete function threw an exception: " |
| "{ERROR}", |
| "ERROR", e); |
| } |
| } |
| } |
| else |
| { |
| lg2::error("Invalid entry ID ({ID}) to delete", "ID", entryId); |
| } |
| } |
| |
| void Manager::restore() |
| { |
| auto sanity = [](const auto& id, const auto& restoredId) { |
| return id == restoredId; |
| }; |
| |
| fs::path dir(ERRLOG_PERSIST_PATH); |
| if (!fs::exists(dir) || fs::is_empty(dir)) |
| { |
| return; |
| } |
| |
| for (auto& file : fs::directory_iterator(dir)) |
| { |
| auto id = file.path().filename().c_str(); |
| auto idNum = std::stol(id); |
| auto e = std::make_unique<Entry>( |
| busLog, std::string(OBJ_ENTRY) + '/' + id, idNum, *this); |
| if (deserialize(file.path(), *e)) |
| { |
| // validate the restored error entry id |
| if (sanity(static_cast<uint32_t>(idNum), e->id())) |
| { |
| e->path(file.path(), true); |
| if (e->severity() >= Entry::sevLowerLimit) |
| { |
| infoErrors.push_back(idNum); |
| } |
| else |
| { |
| realErrors.push_back(idNum); |
| } |
| |
| entries.insert(std::make_pair(idNum, std::move(e))); |
| } |
| else |
| { |
| lg2::error( |
| "Failed in sanity check while restoring error entry. " |
| "Ignoring error entry {ID_NUM}/{ENTRY_ID}.", |
| "ID_NUM", idNum, "ENTRY_ID", e->id()); |
| } |
| } |
| } |
| |
| if (!entries.empty()) |
| { |
| entryId = entries.rbegin()->first; |
| } |
| } |
| |
| void Manager::journalSync() |
| { |
| bool syncRequested = false; |
| auto fd = -1; |
| auto rc = -1; |
| auto wd = -1; |
| auto bus = sdbusplus::bus::new_default(); |
| |
| auto start = |
| duration_cast<microseconds>(steady_clock::now().time_since_epoch()) |
| .count(); |
| |
| // Each time an error log is committed, a request to sync the journal |
| // must occur and block that error log commit until it completes. A 5sec |
| // block is done to allow sufficient time for the journal to be synced. |
| // |
| // Number of loop iterations = 3 for the following reasons: |
| // Iteration #1: Requests a journal sync by killing the journald service. |
| // Iteration #2: Setup an inotify watch to monitor the synced file that |
| // journald updates with the timestamp the last time the |
| // journal was flushed. |
| // Iteration #3: Poll to wait until inotify reports an event which blocks |
| // the error log from being commited until the sync completes. |
| constexpr auto maxRetry = 3; |
| for (int i = 0; i < maxRetry; i++) |
| { |
| // Read timestamp from synced file |
| constexpr auto syncedPath = "/run/systemd/journal/synced"; |
| std::ifstream syncedFile(syncedPath); |
| if (syncedFile.fail()) |
| { |
| // If the synced file doesn't exist, a sync request will create it. |
| if (errno != ENOENT) |
| { |
| lg2::error( |
| "Failed to open journal synced file {FILENAME}: {ERROR}", |
| "FILENAME", syncedPath, "ERROR", strerror(errno)); |
| return; |
| } |
| } |
| else |
| { |
| // Only read the synced file if it exists. |
| // See if a sync happened by now |
| std::string timestampStr; |
| std::getline(syncedFile, timestampStr); |
| auto timestamp = std::stoll(timestampStr); |
| if (timestamp >= start) |
| { |
| break; |
| } |
| } |
| |
| // Let's ask for a sync, but only once |
| if (!syncRequested) |
| { |
| syncRequested = true; |
| |
| constexpr auto JOURNAL_UNIT = "systemd-journald.service"; |
| auto signal = SIGRTMIN + 1; |
| |
| auto method = bus.new_method_call(SYSTEMD_BUSNAME, SYSTEMD_PATH, |
| SYSTEMD_INTERFACE, "KillUnit"); |
| method.append(JOURNAL_UNIT, "main", signal); |
| bus.call(method); |
| if (method.is_method_error()) |
| { |
| lg2::error("Failed to kill journal service"); |
| break; |
| } |
| |
| continue; |
| } |
| |
| // Let's install the inotify watch, if we didn't do that yet. This watch |
| // monitors the syncedFile for when journald updates it with a newer |
| // timestamp. This means the journal has been flushed. |
| if (fd < 0) |
| { |
| fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC); |
| if (fd < 0) |
| { |
| lg2::error("Failed to create inotify watch: {ERROR}", "ERROR", |
| strerror(errno)); |
| return; |
| } |
| |
| constexpr auto JOURNAL_RUN_PATH = "/run/systemd/journal"; |
| wd = inotify_add_watch(fd, JOURNAL_RUN_PATH, |
| IN_MOVED_TO | IN_DONT_FOLLOW | IN_ONLYDIR); |
| if (wd < 0) |
| { |
| lg2::error("Failed to watch journal directory: {PATH}: {ERROR}", |
| "PATH", JOURNAL_RUN_PATH, "ERROR", strerror(errno)); |
| close(fd); |
| return; |
| } |
| continue; |
| } |
| |
| // Let's wait until inotify reports an event |
| struct pollfd fds = { |
| fd, |
| POLLIN, |
| 0, |
| }; |
| constexpr auto pollTimeout = 5; // 5 seconds |
| rc = poll(&fds, 1, pollTimeout * 1000); |
| if (rc < 0) |
| { |
| lg2::error("Failed to add event: {ERROR}", "ERROR", |
| strerror(errno)); |
| inotify_rm_watch(fd, wd); |
| close(fd); |
| return; |
| } |
| else if (rc == 0) |
| { |
| lg2::info("Poll timeout ({TIMEOUT}), no new journal synced data", |
| "TIMEOUT", pollTimeout); |
| break; |
| } |
| |
| // Read from the specified file descriptor until there is no new data, |
| // throwing away everything read since the timestamp will be read at the |
| // beginning of the loop. |
| constexpr auto maxBytes = 64; |
| uint8_t buffer[maxBytes]; |
| while (read(fd, buffer, maxBytes) > 0) |
| ; |
| } |
| |
| if (fd != -1) |
| { |
| if (wd != -1) |
| { |
| inotify_rm_watch(fd, wd); |
| } |
| close(fd); |
| } |
| |
| return; |
| } |
| |
| std::string Manager::readFWVersion() |
| { |
| auto version = util::getOSReleaseValue("VERSION_ID"); |
| |
| if (!version) |
| { |
| lg2::error("Unable to read BMC firmware version"); |
| } |
| |
| return version.value_or(""); |
| } |
| |
| void Manager::create(const std::string& message, Entry::Level severity, |
| const std::map<std::string, std::string>& additionalData) |
| { |
| // Convert the map into a vector of "key=value" strings |
| std::vector<std::string> ad; |
| metadata::associations::combine(additionalData, ad); |
| |
| createEntry(message, severity, ad); |
| } |
| |
| void Manager::createWithFFDC( |
| const std::string& message, Entry::Level severity, |
| const std::map<std::string, std::string>& additionalData, |
| const FFDCEntries& ffdc) |
| { |
| // Convert the map into a vector of "key=value" strings |
| std::vector<std::string> ad; |
| metadata::associations::combine(additionalData, ad); |
| |
| createEntry(message, severity, ad, ffdc); |
| } |
| |
| } // namespace internal |
| } // namespace logging |
| } // namespace phosphor |