nvl32: get through enumerating all devices
Adds support for enumerating devices through
entity-manager
Tested by:
putting this change on nvl32-obmc and booting the BMC; all devices
are enumerated:
```
root@nvl32-obmc:~# busctl tree xyz.openbmc_project.EntityManager
`- /xyz
`- /xyz/openbmc_project
|- /xyz/openbmc_project/EntityManager
`- /xyz/openbmc_project/inventory
`- /xyz/openbmc_project/inventory/system
`- /xyz/openbmc_project/inventory/system/board
|- /xyz/openbmc_project/inventory/system/board/NVIDIA_Alon_cx8_Fru
| `- /xyz/openbmc_project/inventory/system/board/NVIDIA_Alon_cx8_Fru/NVIDIA_Alon_cx8_Fru
|- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_50
| |- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_50/GPU_0
| |- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_50/GPU_0SMA
| `- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_50/NVIDIA_RTXPro6000_50
|- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_51
| |- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_51/GPU_1
| |- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_51/GPU_1SMA
| `- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_51/NVIDIA_RTXPro6000_51
|- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_54
| |- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_54/GPU_2
| |- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_54/GPU_2SMA
| `- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_54/NVIDIA_RTXPro6000_54
|- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_55
| |- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_55/GPU_3
| |- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_55/GPU_3SMA
| `- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_55/NVIDIA_RTXPro6000_55
|- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_58
| |- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_58/GPU_4
| |- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_58/GPU_4SMA
| `- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_58/NVIDIA_RTXPro6000_58
|- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_59
| |- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_59/GPU_5
| |- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_59/GPU_5SMA
| `- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_59/NVIDIA_RTXPro6000_59
|- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_62
| |- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_62/GPU_6
| |- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_62/GPU_6SMA
| `- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_62/NVIDIA_RTXPro6000_62
`- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_63
|- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_63/GPU_7
|- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_63/GPU_7SMA
`- /xyz/openbmc_project/inventory/system/board/Nvidia_RTX6000_GPU_63/NVIDIA_RTXPro6000_63
```
Change-Id: Ie524d8444be2cc19061227455ba71b4f61ee6683
Signed-off-by: Marc Olberding <molberding@nvidia.com>
diff --git a/meson.build b/meson.build
index 63b8773..2b5d5a3 100644
--- a/meson.build
+++ b/meson.build
@@ -9,6 +9,8 @@
gpiodcxx_dep = dependency('libgpiodcxx', default_options: ['bindings=cxx'])
systemd_dep = dependency('systemd')
libsystemd_dep = dependency('libsystemd')
+sdbusplus = dependency('sdbusplus', include_type: 'system')
+
cli11_dep = dependency('CLI11', required: true, include_type: 'system')
i2c_dep = meson.get_compiler('cpp').find_library('i2c')
@@ -19,7 +21,7 @@
exe = executable(
'platform',
['platform.cpp'] + platform_srcs + util_srcs,
- dependencies: [gpiodcxx_dep, libsystemd_dep, cli11_dep, i2c_dep],
+ dependencies: [gpiodcxx_dep, libsystemd_dep, cli11_dep, i2c_dep, sdbusplus],
include_directories: ['.', 'nvidia'],
install: true,
install_dir: get_option('libexecdir'),
diff --git a/nvidia/nvl32.cpp b/nvidia/nvl32.cpp
index 0dde0d8..12f0a94 100644
--- a/nvidia/nvl32.cpp
+++ b/nvidia/nvl32.cpp
@@ -4,12 +4,19 @@
#include <systemd/sd-daemon.h>
+#include <sdbusplus/asio/connection.hpp>
+
#include <chrono>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <thread>
+#include <unordered_map>
+using JsonVariantType =
+ std::variant<uint8_t, std::vector<std::string>, std::vector<double>,
+ std::string, int64_t, uint64_t, double, int32_t, uint32_t,
+ int16_t, uint16_t, bool>;
namespace nvidia
{
@@ -75,9 +82,11 @@
void probe_dev(size_t bus, uint8_t address, std::string_view dev_type)
{
- std::filesystem::path path =
+ std::string path =
std::format("/sys/bus/i2c/devices/i2c-{}/new_device", bus);
+ wait_for_path_to_exist(path, std::chrono::milliseconds{1000});
+
std::ofstream f{path};
if (!f.good())
{
@@ -97,7 +106,7 @@
{
probe_dev(bus, address, dev_type);
- std::filesystem::path idle =
+ std::string idle =
std::format("/sys/bus/i2c/devices/{}-{:04x}/idle_state", bus, address);
std::ofstream idle_f{idle};
if (!idle_f.good())
@@ -196,22 +205,297 @@
bringup_gpus_on_mcio(bus);
}
-void enumerate_mctp(int dev_num)
-{
- // TODO: Make this a proper dbus client
- std::string preamble = "busctl call au.com.codeconstruct.MCTP1";
- std::string postamble =
- "au.com.codeconstruct.MCTP1.BusOwner1 AssignEndpoint ay 0";
+const char* mctpd_service = "au.com.codeconstruct.MCTP1";
+const char* mctp_obj = "/au/com/codeconstruct/mctp1/";
+const char* mctp_busowner = "au.com.codeconstruct.MCTP.BusOwner1";
+const char* mctp_bridge = "au.com.codeconstruct.MCTP.Bridge1";
- std::string cmd =
- std::format("{} /au/com/codeconstruct/mctp1/interfaces/mctpusb{} {}",
- preamble, dev_num, postamble);
- logged_system(cmd);
+template <typename PropertyType>
+PropertyType get_property(const char* service, const char* object,
+ const char* interface, const char* property_name)
+{
+ auto b = sdbusplus::bus::new_default_system();
+ auto m = b.new_method_call(service, object,
+ "org.freedesktop.DBus.Properties", "Get");
+ m.append(interface, property_name);
+
+ std::variant<PropertyType> t;
+ auto reply = b.call(m);
+
+ reply.read(t);
+ return std::get<PropertyType>(t);
}
-void wait_for_usb_to_probe()
+// given a device index
+// enumerate the mctp interface
+// and give back the eid
+uint8_t enumerate_mctp(uint8_t device_idx)
{
- std::this_thread::sleep_for(std::chrono::seconds{20});
+ std::vector<uint8_t> address = {};
+ std::string obj = std::format(
+ "/au/com/codeconstruct/mctp1/interfaces/mctpusb{}", device_idx);
+
+ std::cerr << "calling " << obj << std::endl;
+
+ auto b = sdbusplus::bus::new_default_system();
+ auto m = b.new_method_call(mctpd_service, obj.c_str(), mctp_busowner,
+ "AssignEndpoint");
+ m.append(address);
+
+ auto reply = b.call(m);
+
+ uint8_t eid;
+ int32_t net;
+ std::string intf;
+ bool probed;
+ reply.read(eid, net, intf, probed);
+
+ return eid;
+}
+
+// We need to get the pool start and size
+std::tuple<uint8_t, uint8_t> get_pool_start_and_size(uint8_t eid)
+{
+ std::string obj =
+ std::format("/au/com/codeconstruct/mctp1/networks/1/endpoints/{}", eid);
+ std::cerr << "calling " << obj << std::endl;
+
+ uint8_t poolstart = get_property<uint8_t>(mctpd_service, obj.c_str(),
+ mctp_bridge, "PoolStart");
+ uint8_t poolend = get_property<uint8_t>(mctpd_service, obj.c_str(),
+ mctp_bridge, "PoolEnd");
+
+ uint8_t poolsize = poolend - poolstart + 1;
+
+ std::cerr << std::format("eid {} has pool start {} and size {}", eid,
+ poolstart, poolsize)
+ << std::endl;
+ return {poolstart, poolsize};
+}
+
+int get_device_from_port_string(std::string_view port_string)
+{
+ std::filesystem::path path = port_string;
+ path /= "net";
+ int dev_index = -1;
+ auto p = path.native();
+ wait_for_path_to_exist(p, std::chrono::milliseconds{20000});
+
+ for (const auto& dir : std::filesystem::directory_iterator(path))
+ {
+ // this looks something like:
+ // /sys/devices/platform/ahb/1e6a3000.usb/usb1/1-1/1-1.2/1-1.2.3/1-1.2.3:1.0/net/mctpusb7
+ // we want to extract the final "7"
+ std::cerr << "Looking at " << dir.path().native() << std::endl;
+
+ auto f_name = dir.path().filename().native();
+ if (f_name.starts_with("mctpusb"))
+ {
+ std::from_chars(f_name.data() + 7, f_name.data() + f_name.size(),
+ dev_index);
+ break;
+ }
+ }
+
+ if (dev_index == -1)
+ {
+ std::cerr << std::format("Unable to find an mctpusb net device at {}\n",
+ path.native());
+ }
+
+ std::cerr << "found mctp device index " << dev_index << std::endl;
+ return dev_index;
+}
+
+bool is_populated(std::string board, std::string name)
+{
+ std::string obj = std::format(
+ "/xyz/openbmc_project/inventory/system/board/{}/{}", board, name);
+ std::cerr << "inspecting " << obj << std::endl;
+ try
+ {
+ uint8_t eid = get_property<uint8_t>(
+ "xyz.openbmc_project.EntityManager", obj.c_str(),
+ "xyz.openbmc_project.Configuration.NvidiaMctpVdm", "StaticEid");
+ (void)eid;
+ return true;
+ }
+ catch (...)
+ {
+ return false;
+ }
+}
+
+void force_rescan()
+{
+ auto b = sdbusplus::bus::new_default_system();
+ auto m = b.new_method_call("xyz.openbmc_project.EntityManager",
+ "/xyz/openbmc_project/EntityManager",
+ "xyz.openbmc_project.EntityManager", "ReScan");
+ b.call(m);
+}
+
+void populate_gpu(std::string board, uint8_t eid, std::string name)
+{
+ if (is_populated(board, name))
+ {
+ std::cerr << name << " already exists" << std::endl;
+ return;
+ }
+
+ std::string obj =
+ std::format("/xyz/openbmc_project/inventory/system/board/{}", board);
+
+ std::cerr << "calling with " << obj << std::endl;
+
+ std::chrono::steady_clock::time_point start =
+ std::chrono::steady_clock::now();
+ std::chrono::steady_clock::time_point end = start + std::chrono::minutes{3};
+ auto b = sdbusplus::bus::new_default_system();
+ auto m = b.new_method_call("xyz.openbmc_project.EntityManager", obj.c_str(),
+ "xyz.openbmc_project.AddObject", "AddObject");
+ std::unordered_map<std::string, JsonVariantType> param;
+ param["Name"] = name;
+ param["StaticEid"] = eid;
+ param["Type"] = "NvidiaMctpVdm";
+
+ m.append(param);
+
+ do
+ {
+ auto now = std::chrono::steady_clock::now();
+ if (now >= end)
+ {
+ std::cerr << "Timeout: Failed to add " << obj << std::endl;
+ return;
+ }
+ try
+ {
+ b.call(m);
+ return;
+ }
+ catch (...)
+ {
+ std::cerr << "Failed to find " << obj << " trying again"
+ << std::endl;
+ std::this_thread::sleep_for(std::chrono::seconds{10});
+ continue;
+ }
+ } while (true);
+}
+
+struct bridge_device
+{
+ std::string usb_path;
+ std::string name;
+ std::string board_name;
+};
+
+void bringup_devices()
+{
+ // There's a lot of hackery going on here.
+ // This is for handling (as of today) unsupported bridged endpoints.
+ // The MCUs on this platform act as MCTP bridges.
+ // We know their absolute USB path through the platform hub, and that's
+ // symlinked to an mctp net device, so we will start there. We also know
+ // that each device the USB device is bridging to will always have the same
+ // relative ordering
+ // inside of a given pool. This is not a generally true assumption but it
+ // is true for our MCUs.
+ // So we can put each bridge and its downstream devices through enumeration
+ // with mctpd. When we get the response, we know the bridge's eid; we can
+ // then ask mctpd what the pool size and start eid are for the bridge pool.
+ // From there we can infer the eid of each bridged device behind it and call
+ // AddObject on EntityManager for each board to bring up the requisite nodes
+ // beneath it, which will allow the rest of the system to start behaving as
+ // expected. Once we have real support for bridged eids, we can and should
+ // delete this mess.
+ static constexpr const char* usb_prefix =
+ "/sys/devices/platform/ahb/1e6a3000.usb/usb1/1-1/";
+ const std::array<bridge_device, 10> device_name_map = {
+ {{.usb_path = "1-1.2/1-1.2.1/1-1.2.1:1.0",
+ .name = "GPU_0",
+ .board_name = "Nvidia_RTX_PRO_6000_Blackwell_1"},
+ {.usb_path = "1-1.1/1-1.1.2/1-1.1.2.1/1-1.1.2.1:1.0",
+ .name = "GPU_1",
+ .board_name = "Nvidia_RTX_PRO_6000_Blackwell_2"},
+ {.usb_path = "1-1.4/1-1.4.1/1-1.4.1:1.0",
+ .name = "GPU_2",
+ .board_name = "Nvidia_RTX_PRO_6000_Blackwell_3"},
+ {.usb_path = "1-1.2/1-1.2.2/1-1.2.2:1.0",
+ .name = "GPU_3",
+ .board_name = "Nvidia_RTX_PRO_6000_Blackwell_4"},
+ {.usb_path = "1-1.1/1-1.1.4/1-1.1.4.1/1-1.1.4.1:1.0",
+ .name = "GPU_4",
+ .board_name = "Nvidia_RTX_PRO_6000_Blackwell_5"},
+ {.usb_path = "1-1.1/1-1.1.2/1-1.1.2.2/1-1.1.2.2:1.0",
+ .name = "GPU_5",
+ .board_name = "Nvidia_RTX_PRO_6000_Blackwell_6"},
+ {.usb_path = "1-1.4/1-1.4.2/1-1.4.2:1.0",
+ .name = "GPU_6",
+ .board_name = "Nvidia_RTX_PRO_6000_Blackwell_7"},
+ {.usb_path = "1-1.2/1-1.2.3/1-1.2.3:1.0",
+ .name = "CX8_0",
+ .board_name = "NVIDIA_Alon_cx8_Fru"},
+ {.usb_path = "1-1.1/1-1.1.4/1-1.1.4.2/1-1.1.4.2:1.0",
+ .name = "GPU_7",
+ .board_name = "Nvidia_RTX_PRO_6000_Blackwell_8"},
+ {.usb_path = "1-1.1/1-1.1.2/1-1.1.2.3/1-1.1.2.3:1.0",
+ .name = "CX8_1",
+ .board_name = "NVIDIA_Alon_cx8_Fru"}}};
+
+ for (const auto& [usb_path, name, board_name] : device_name_map)
+ {
+ std::cerr << "looking at device " << name << std::endl;
+ std::string path = std::format("{}/{}", usb_prefix, usb_path);
+ int dev_index = get_device_from_port_string(path);
+ if (dev_index < 0)
+ {
+ std::cerr << std::format(
+ "Unable to bring up {} because it doesn't seem to exist\n",
+ name);
+ continue;
+ }
+
+ // enumerate the bridge device
+ uint8_t bridge_eid = enumerate_mctp(dev_index);
+
+ auto [pool_start, pool_size] = get_pool_start_and_size(bridge_eid);
+
+ std::this_thread::sleep_for(std::chrono::milliseconds{500});
+
+ // yes this sucks, no I don't like it but we know we'll only have two
+ // types of bridged endpoints on this platform and it's 9PM the night
+ // before it needs to work so we're going to do it *to* it
+ if (name.starts_with("GPU"))
+ {
+ // each GPU has an SMA, as well as a GPU, they both talk over vdm
+ // so add both as separate nodes
+ std::cerr << "Adding SMA\n";
+ populate_gpu(board_name, bridge_eid, name + "SMA");
+ std::cerr << "Adding GPU\n";
+ populate_gpu(board_name, pool_start, name);
+ }
+ else if (name.starts_with("CX8"))
+ {
+ // TODO: deal with this
+ std::cerr << "Skipping CX8's for now\n";
+ }
+ else
+ {
+ std::cerr << std::format(
+ "Something awful happened with path: {}, name {}\n", path,
+ name);
+ }
+ }
+}
+
+void wait_for_frus_to_probe()
+{
+ std::string path = "/sys/bus/i2c/devices/17-0056";
+ wait_for_path_to_exist(path, std::chrono::milliseconds{30 * 1000});
+
+ std::this_thread::sleep_for(std::chrono::seconds{30});
}
int init_nvl32()
@@ -221,6 +505,9 @@
sd_notify(0, "READY=1");
wait_for_i2c_ready();
+ // we suspect that the CPLD tells us we're ready before
+ // we actually are. This sleep stabilizes this discrepancy
+ std::this_thread::sleep_for(std::chrono::seconds{1});
create_i2c_mux(5, 0x70, "pca9548");
create_i2c_mux(5, 0x71, "pca9548");
@@ -232,11 +519,18 @@
bringup_cx8_mcio(0x73, 3, true);
bringup_cx8_mcio(0x73, 7, false);
- wait_for_usb_to_probe();
- for (int ctr = 0; ctr < 10; ++ctr)
- {
- enumerate_mctp(ctr);
- }
+ // there's a weird bug in EntityManager
+ // where FRU devices don't probe automatically.
+ // We'll wait for the drivers to be probed
+ // and then force a rescan;
+ // we'll follow up with a proper fix
+ wait_for_frus_to_probe();
+
+ force_rescan();
+ // allow for things to settle
+ std::this_thread::sleep_for(std::chrono::seconds{1});
+
+ bringup_devices();
std::cerr << "platform init complete\n";
pause();
std::cerr << "Releasing platform\n";