blob: 12f0a94a5a0cf39835c3ec21c1083e4f56a2d2cd [file] [log] [blame]
Marc Olberding5d50e522025-09-03 18:23:32 -07001#include "gpio.hpp"
2#include "i2c.hpp"
3#include "utilities.hpp"
4
5#include <systemd/sd-daemon.h>
6
Marc Olberdingc9c86122025-09-08 17:45:21 -07007#include <sdbusplus/asio/connection.hpp>
8
Marc Olberding5d50e522025-09-03 18:23:32 -07009#include <chrono>
10#include <filesystem>
11#include <fstream>
12#include <iostream>
13#include <thread>
Marc Olberdingc9c86122025-09-08 17:45:21 -070014#include <unordered_map>
Marc Olberding5d50e522025-09-03 18:23:32 -070015
// Value type of the string-keyed parameter map that populate_gpu() appends to
// its AddObject method call. The order of the alternatives is part of the
// type; do not reorder them.
Marc Olberdingc9c86122025-09-08 17:45:21 -070016using JsonVariantType =
17 std::variant<uint8_t, std::vector<std::string>, std::vector<double>,
18 std::string, int64_t, uint64_t, double, int32_t, uint32_t,
19 int16_t, uint16_t, bool>;
Marc Olberding5d50e522025-09-03 18:23:32 -070020namespace nvidia
21{
22
23using steady_clock = std::chrono::steady_clock;
24using namespace std::chrono_literals;
25
// Logs a shell command to stderr and then runs it through std::system().
//
// cmd: the command line to execute. It does not need to be NUL-terminated.
//
// A non-zero status from std::system() is logged but otherwise ignored, so
// callers keep their existing best-effort behaviour.
void logged_system(std::string_view cmd)
{
    std::cerr << std::format("calling {} \n", cmd);

    // string_view::data() is not guaranteed to be NUL-terminated, so hand
    // std::system() an owned, terminated copy instead.
    const std::string command{cmd};
    const int rc = std::system(command.c_str());
    if (rc != 0)
    {
        std::cerr << std::format("'{}' returned status {}\n", command, rc);
    }
}
32
33void setup_devmem()
34{
35 logged_system("mknod /dev/mem c 1 1");
36}
37
// Writes the 32-bit register at 0x1e6e24bc using the `devmem` tool.
//
// enable: when true the value 0x3f000000 is written, otherwise 0.
void handle_passthrough_registers(bool enable)
{
    static constexpr uint32_t passthrough_reg = 0x1e6e24bc;

    const std::string_view value = enable ? "0x3f000000" : "0";
    logged_system(std::format("devmem 0x{:x} 32 {}", passthrough_reg, value));
}
52
// Polls the HPM CPLD (bus 4, address 0x17) until register 0xf2 reads 1.
//
// The register is sampled every 10 seconds for up to 20 minutes.
//
// Throws std::runtime_error if a read fails (non-zero return code from
// read_byte) or if the register never reads 1 before the deadline.
void wait_for_i2c_ready()
{
    static constexpr uint8_t i2c_ready = 0xf2;

    // hpm cpld is at bus 4, address 0x17
    i2c::RawDevice cpld{4, 0x17};

    const auto deadline =
        std::chrono::steady_clock::now() + std::chrono::minutes{20};
    while (std::chrono::steady_clock::now() < deadline)
    {
        // Initialised so that a read_byte() that reports success without
        // storing a value can never make us compare an indeterminate byte.
        uint8_t result = 0;
        const int rc = cpld.read_byte(i2c_ready, result);
        if (rc)
        {
            std::string err =
                std::format("Unable to communicate with cpld. rc: {}\n", rc);
            std::cerr << err;
            throw std::runtime_error(err);
        }

        if (result == 1)
        {
            return;
        }

        std::this_thread::sleep_for(std::chrono::seconds{10});
    }

    throw std::runtime_error("Waiting for host timed out!\n");
}
82
// Instantiates an i2c device by writing "<dev_type> 0x<address>" to
// /sys/bus/i2c/devices/i2c-<bus>/new_device, then waits for
// /sys/bus/i2c/devices/<bus>-<address> via wait_for_path_to_exist().
//
// bus:      i2c bus number the device sits on.
// address:  7-bit device address, written as two hex digits.
// dev_type: device type string handed to the kernel.
//
// Exits the process with EXIT_FAILURE if new_device cannot be opened.
// A failed write is only logged, so an already-instantiated device does not
// abort platform bring-up.
void probe_dev(size_t bus, uint8_t address, std::string_view dev_type)
{
    std::string path =
        std::format("/sys/bus/i2c/devices/i2c-{}/new_device", bus);

    wait_for_path_to_exist(path, std::chrono::milliseconds{1000});

    std::ofstream f{path};
    if (!f.good())
    {
        std::cerr << std::format("Unable to open {}\n", path.c_str());
        std::exit(EXIT_FAILURE);
    }

    f << std::format("{} 0x{:02x}", dev_type, address);
    // close() flushes the buffered write; sysfs reports errors at that point
    f.close();
    if (f.fail())
    {
        std::cerr << std::format("Failed to write '{} 0x{:02x}' to {}\n",
                                 dev_type, address, path);
    }

    std::string created_path =
        std::format("/sys/bus/i2c/devices/{}-{:04x}", bus, address);
    wait_for_path_to_exist(created_path, std::chrono::milliseconds{10});
}
104
// Instantiates an i2c mux through probe_dev() and then writes -2 to its
// idle_state attribute.
//
// bus:      i2c bus number the mux sits on.
// address:  mux device address.
// dev_type: device type string handed to probe_dev().
//
// Throws std::runtime_error if idle_state cannot be opened. A failed write
// is only logged.
void create_i2c_mux(size_t bus, uint8_t address, std::string_view dev_type)
{
    probe_dev(bus, address, dev_type);

    std::string idle =
        std::format("/sys/bus/i2c/devices/{}-{:04x}/idle_state", bus, address);
    std::ofstream idle_f{idle};
    if (!idle_f.good())
    {
        std::string err = std::format("Unable to open {}\n", idle.c_str());
        std::cerr << err;
        throw std::runtime_error(err);
    }

    // -2 is idle-mux-disconnect
    idle_f << -2;
    // close() flushes the buffered write; sysfs reports errors at that point
    idle_f.close();
    if (idle_f.fail())
    {
        std::cerr << std::format("Failed to write idle state to {}\n", idle);
    }
}
123
// Resolves the i2c bus number behind one channel of a mux.
//
// Lists /sys/bus/i2c/devices/<parent_bus>-<address>/channel-<channel>/i2c-dev/
// and parses the number out of the "i2c-<bus>" entries found there. If
// several entries exist the last one listed wins.
//
// parent_bus: bus the mux sits on.
// address:    mux device address.
// channel:    mux channel index.
//
// Returns the parsed bus number.
// Throws std::runtime_error if an entry is not of the form "i2c-<number>",
// if the directory cannot be listed, or if it holds no entry.
size_t get_bus_from_channel(size_t parent_bus, uint8_t address, size_t channel)
{
    static constexpr std::string_view prefix = "i2c-";

    const std::filesystem::path path =
        std::format("/sys/bus/i2c/devices/{}-{:04x}/channel-{}/i2c-dev/",
                    parent_bus, address, channel);
    int bus = -1;
    std::error_code ec{};
    for (const auto& entry : std::filesystem::directory_iterator(path, ec))
    {
        // we expect to see i2c-<bus>, trim and parse everything after the dash
        const std::string name = entry.path().filename().string();
        std::cerr << "Reading from " << name << "\n";

        int parsed = -1;
        // Check the prefix first so we never form a pointer past the end of
        // a short name.
        bool ok = name.starts_with(prefix);
        if (ok)
        {
            const char* first = name.data() + prefix.size();
            const char* last = name.data() + name.size();
            const auto [ptr, err] = std::from_chars(first, last, parsed);
            ok = (err == std::errc{}) && (ptr == last) && (parsed >= 0);
        }
        if (!ok)
        {
            std::string err_s = std::format("Failed to parse {}\n", name);
            std::cerr << err_s;
            throw std::runtime_error(err_s);
        }
        bus = parsed;
    }
    if (bus == -1 || ec)
    {
        std::string err_s =
            std::format("Failed to find a channel at {}\n", path.string());
        std::cerr << err_s;
        throw std::runtime_error(err_s);
    }
    return static_cast<size_t>(bus);
}
153
// Probes the pca9555 at address 0x26 on the given bus and drives pin 14 of
// that gpio chip to 1.
//
// bus: i2c bus the expander sits on.
//
// Exits the process with EXIT_FAILURE if no gpio chip is found for the device.
void bringup_cx8_mcu(size_t bus)
{
    static constexpr uint8_t expander_addr = 0x26;

    probe_dev(bus, expander_addr, "pca9555");

    std::string expander_dir =
        std::format("/sys/bus/i2c/devices/{}-{:04x}/", bus, expander_addr);
    const int chip = gpio::find_chip_idx_from_dir(expander_dir);
    if (chip < 0)
    {
        std::cerr << std::format("Failed to find cx8 gpio at {}\n",
                                 expander_dir);
        std::exit(EXIT_FAILURE);
    }

    // Pin 14 is the MCU reset line; it is active low, so writing 1 releases
    // the MCU from reset.
    gpio::set_raw(chip, 14, 1);
}
170
// Probes the pca6408 at address 0x20 behind one channel of the 0x72 mux and
// drives pins 5 and 4 of that gpio chip to 1, in that order.
//
// bus:     bus the 0x72 mux sits on.
// channel: mux channel leading to the expander.
//
// Exits the process with EXIT_FAILURE if no gpio chip is found for the device.
void gringup_gpu_sma(size_t bus, size_t channel)
{
    static constexpr uint8_t mux_addr = 0x72;
    static constexpr uint8_t expander_addr = 0x20;

    const size_t channel_bus = get_bus_from_channel(bus, mux_addr, channel);
    probe_dev(channel_bus, expander_addr, "pca6408");

    std::string expander_dir = std::format("/sys/bus/i2c/devices/{}-{:04x}/",
                                           channel_bus, expander_addr);
    const int chip = gpio::find_chip_idx_from_dir(expander_dir);
    if (chip < 0)
    {
        std::cerr << std::format("Failed to find gpu gpio {}\n", expander_dir);
        std::exit(EXIT_FAILURE);
    }

    // Pin 5 engages the telemetry path from the SMA.
    gpio::set_raw(chip, 5, 1);
    // Pin 4 is the reset line, active low.
    gpio::set_raw(chip, 4, 1);
}
189
190void bringup_gpus_on_mcio(size_t bus)
191{
192 create_i2c_mux(bus, 0x72, "pca9546");
193
194 gringup_gpu_sma(bus, 2);
195 gringup_gpu_sma(bus, 3);
196}
197
198void bringup_cx8_mcio(size_t mux_addr, size_t channel, bool has_cx8)
199{
200 size_t bus = get_bus_from_channel(5, mux_addr, channel);
201 if (has_cx8)
202 {
203 bringup_cx8_mcu(bus);
204 }
205 bringup_gpus_on_mcio(bus);
206}
207
// D-Bus names used when talking to mctpd. Declared constexpr so the pointers
// themselves cannot be reassigned at runtime.
inline constexpr const char* mctpd_service = "au.com.codeconstruct.MCTP1";
inline constexpr const char* mctp_obj = "/au/com/codeconstruct/mctp1/";
inline constexpr const char* mctp_busowner =
    "au.com.codeconstruct.MCTP.BusOwner1";
inline constexpr const char* mctp_bridge = "au.com.codeconstruct.MCTP.Bridge1";
Marc Olberding5d50e522025-09-03 18:23:32 -0700212
Marc Olberdingc9c86122025-09-08 17:45:21 -0700213template <typename PropertyType>
214PropertyType get_property(const char* service, const char* object,
215 const char* interface, const char* property_name)
216{
217 auto b = sdbusplus::bus::new_default_system();
218 auto m = b.new_method_call(service, object,
219 "org.freedesktop.DBus.Properties", "Get");
220 m.append(interface, property_name);
221
222 std::variant<PropertyType> t;
223 auto reply = b.call(m);
224
225 reply.read(t);
226 return std::get<PropertyType>(t);
Marc Olberding5d50e522025-09-03 18:23:32 -0700227}
228
// Calls AssignEndpoint on the mctpusb<device_idx> interface object of mctpd,
// passing an empty address, and returns the eid from the reply.
//
// device_idx: numeric suffix of the mctpusb interface.
//
// The reply is read as (eid, net, interface name, probed flag); only the
// eid is handed back to the caller.
uint8_t enumerate_mctp(uint8_t device_idx)
{
    const std::string iface_path = std::format(
        "/au/com/codeconstruct/mctp1/interfaces/mctpusb{}", device_idx);
    std::cerr << "calling " << iface_path << std::endl;

    auto bus = sdbusplus::bus::new_default_system();
    auto request = bus.new_method_call(mctpd_service, iface_path.c_str(),
                                       mctp_busowner, "AssignEndpoint");
    std::vector<uint8_t> phys_addr = {};
    request.append(phys_addr);

    auto response = bus.call(request);

    uint8_t assigned_eid;
    int32_t network;
    std::string iface_name;
    bool was_probed;
    response.read(assigned_eid, network, iface_name, was_probed);

    return assigned_eid;
}
255
// Reads the PoolStart and PoolEnd properties of the endpoint object for
// `eid` on network 1 and derives the pool size from them.
//
// eid: endpoint id of the endpoint to query.
//
// Returns {pool start, pool size}, with size = PoolEnd - PoolStart + 1
// truncated to 8 bits.
std::tuple<uint8_t, uint8_t> get_pool_start_and_size(uint8_t eid)
{
    const std::string endpoint_path =
        std::format("/au/com/codeconstruct/mctp1/networks/1/endpoints/{}", eid);
    std::cerr << "calling " << endpoint_path << std::endl;

    const auto read_bridge_u8 = [&endpoint_path](const char* property) {
        return get_property<uint8_t>(mctpd_service, endpoint_path.c_str(),
                                     mctp_bridge, property);
    };

    const uint8_t first_eid = read_bridge_u8("PoolStart");
    const uint8_t last_eid = read_bridge_u8("PoolEnd");
    const uint8_t eid_count = static_cast<uint8_t>(last_eid - first_eid + 1);

    std::cerr << std::format("eid {} has pool start {} and size {}", eid,
                             first_eid, eid_count)
              << std::endl;
    return std::make_tuple(first_eid, eid_count);
}
275
// Finds the index of the mctpusb net device that belongs to a USB port.
//
// Waits for <port_string>/net via wait_for_path_to_exist(), then looks for a
// directory entry named "mctpusb<N>" and returns N.
//
// port_string: sysfs path of the USB interface, e.g.
//   /sys/devices/platform/ahb/1e6a3000.usb/usb1/1-1/1-1.2/1-1.2.3/1-1.2.3:1.0
//
// Returns the device index, or -1 if the directory cannot be listed or no
// entry with a well-formed numeric suffix is present.
int get_device_from_port_string(std::string_view port_string)
{
    static constexpr std::string_view prefix = "mctpusb";

    std::filesystem::path path = port_string;
    path /= "net";
    std::string net_dir = path.native();
    wait_for_path_to_exist(net_dir, std::chrono::milliseconds{20000});

    int dev_index = -1;
    // Use the non-throwing overload: a missing directory must yield -1,
    // which the caller already handles, rather than a filesystem_error.
    std::error_code ec{};
    for (const auto& dir : std::filesystem::directory_iterator(path, ec))
    {
        // this looks something like:
        // /sys/devices/platform/ahb/1e6a3000.usb/usb1/1-1/1-1.2/1-1.2.3/1-1.2.3:1.0/net/mctpusb7
        // we want to extract the final "7"
        std::cerr << "Looking at " << dir.path().native() << std::endl;

        const std::string f_name = dir.path().filename().native();
        if (!f_name.starts_with(prefix))
        {
            continue;
        }

        int parsed = -1;
        const char* first = f_name.data() + prefix.size();
        const char* last = f_name.data() + f_name.size();
        const auto [ptr, err] = std::from_chars(first, last, parsed);
        if (err == std::errc{} && ptr == last && parsed >= 0)
        {
            dev_index = parsed;
            break;
        }
        std::cerr << std::format("Ignoring malformed net device name {}\n",
                                 f_name);
    }

    if (ec)
    {
        std::cerr << std::format("Unable to list {}: {}\n", path.native(),
                                 ec.message());
    }

    if (dev_index == -1)
    {
        std::cerr << std::format("Unable to find an mctpusb net device at {}\n",
                                 path.native());
        return dev_index;
    }

    std::cerr << "found mctp device index " << dev_index << std::endl;
    return dev_index;
}
309
// Reports whether EntityManager exposes a StaticEid property on the
// NvidiaMctpVdm configuration interface of inventory object <board>/<name>.
//
// board: board name component of the inventory path.
// name:  node name component of the inventory path.
//
// Returns true if the property could be read, false if the lookup threw
// anything.
bool is_populated(std::string board, std::string name)
{
    const std::string inventory_path = std::format(
        "/xyz/openbmc_project/inventory/system/board/{}/{}", board, name);
    std::cerr << "inspecting " << inventory_path << std::endl;

    try
    {
        // Only the success of the read matters; the value is discarded.
        static_cast<void>(get_property<uint8_t>(
            "xyz.openbmc_project.EntityManager", inventory_path.c_str(),
            "xyz.openbmc_project.Configuration.NvidiaMctpVdm", "StaticEid"));
    }
    catch (...)
    {
        return false;
    }
    return true;
}
328
329void force_rescan()
330{
331 auto b = sdbusplus::bus::new_default_system();
332 auto m = b.new_method_call("xyz.openbmc_project.EntityManager",
333 "/xyz/openbmc_project/EntityManager",
334 "xyz.openbmc_project.EntityManager", "ReScan");
335 b.call(m);
336}
337
// Adds an NvidiaMctpVdm node named `name` with static eid `eid` underneath
// the inventory board `board` by calling AddObject on EntityManager.
//
// board: board name component of the inventory path.
// eid:   value stored under the "StaticEid" key.
// name:  value stored under the "Name" key.
//
// Does nothing if is_populated() already reports the node. Otherwise the
// call is retried every 10 seconds for up to 3 minutes; every failure is
// logged and, once the deadline has passed, the function gives up and
// returns.
void populate_gpu(std::string board, uint8_t eid, std::string name)
{
    if (is_populated(board, name))
    {
        std::cerr << name << " already exists" << std::endl;
        return;
    }

    const std::string board_path =
        std::format("/xyz/openbmc_project/inventory/system/board/{}", board);
    std::cerr << "calling with " << board_path << std::endl;

    const auto deadline =
        std::chrono::steady_clock::now() + std::chrono::minutes{3};

    auto bus = sdbusplus::bus::new_default_system();
    auto request = bus.new_method_call(
        "xyz.openbmc_project.EntityManager", board_path.c_str(),
        "xyz.openbmc_project.AddObject", "AddObject");

    std::unordered_map<std::string, JsonVariantType> node;
    node["Name"] = name;
    node["StaticEid"] = eid;
    node["Type"] = "NvidiaMctpVdm";
    request.append(node);

    while (std::chrono::steady_clock::now() < deadline)
    {
        try
        {
            bus.call(request);
            return;
        }
        catch (...)
        {
            std::cerr << "Failed to find " << board_path << " trying again"
                      << std::endl;
            std::this_thread::sleep_for(std::chrono::seconds{10});
        }
    }

    std::cerr << "Timeout: Failed to add " << board_path << std::endl;
}
386
// One row of the device table in bringup_devices(). The member order matters:
// the table is unpacked with structured bindings and filled with designated
// initializers.
387struct bridge_device
388{
// USB path of the device, appended to the usb_prefix in bringup_devices()
389 std::string usb_path;
// Node name; its "GPU"/"CX8" prefix selects how bringup_devices() handles it
390 std::string name;
// Board name passed to populate_gpu() as the inventory board
391 std::string board_name;
392};
393
// Enumerates every known MCTP-over-USB bridge and registers the endpoints
// behind the GPU bridges with EntityManager.
//
// This is a workaround for bridged endpoints, which are unsupported as of
// today. The MCUs on this platform act as MCTP bridges. Their absolute USB
// path through the platform hub is known and is symlinked to an mctp net
// device, so we start from there. The devices a bridge forwards to always
// keep the same relative order inside its pool; that is not true in general
// but it holds for our MCUs.
//
// For each table entry we therefore:
//   1. resolve the mctpusb device index from the USB path,
//   2. put the bridge through enumeration with mctpd to learn its eid,
//   3. ask mctpd for the start and size of the bridge's pool,
//   4. infer the eids of the bridged devices and call AddObject on
//      EntityManager so the rest of the system sees the expected nodes.
//
// Entries whose net device cannot be found are skipped. Once bridged eids
// are properly supported this function can and should be deleted.
void bringup_devices()
{
    static constexpr const char* usb_prefix =
        "/sys/devices/platform/ahb/1e6a3000.usb/usb1/1-1/";

    const std::array<bridge_device, 10> bridges = {
        {{.usb_path = "1-1.2/1-1.2.1/1-1.2.1:1.0",
          .name = "GPU_0",
          .board_name = "Nvidia_RTX_PRO_6000_Blackwell_1"},
         {.usb_path = "1-1.1/1-1.1.2/1-1.1.2.1/1-1.1.2.1:1.0",
          .name = "GPU_1",
          .board_name = "Nvidia_RTX_PRO_6000_Blackwell_2"},
         {.usb_path = "1-1.4/1-1.4.1/1-1.4.1:1.0",
          .name = "GPU_2",
          .board_name = "Nvidia_RTX_PRO_6000_Blackwell_3"},
         {.usb_path = "1-1.2/1-1.2.2/1-1.2.2:1.0",
          .name = "GPU_3",
          .board_name = "Nvidia_RTX_PRO_6000_Blackwell_4"},
         {.usb_path = "1-1.1/1-1.1.4/1-1.1.4.1/1-1.1.4.1:1.0",
          .name = "GPU_4",
          .board_name = "Nvidia_RTX_PRO_6000_Blackwell_5"},
         {.usb_path = "1-1.1/1-1.1.2/1-1.1.2.2/1-1.1.2.2:1.0",
          .name = "GPU_5",
          .board_name = "Nvidia_RTX_PRO_6000_Blackwell_6"},
         {.usb_path = "1-1.4/1-1.4.2/1-1.4.2:1.0",
          .name = "GPU_6",
          .board_name = "Nvidia_RTX_PRO_6000_Blackwell_7"},
         {.usb_path = "1-1.2/1-1.2.3/1-1.2.3:1.0",
          .name = "CX8_0",
          .board_name = "NVIDIA_Alon_cx8_Fru"},
         {.usb_path = "1-1.1/1-1.1.4/1-1.1.4.2/1-1.1.4.2:1.0",
          .name = "GPU_7",
          .board_name = "Nvidia_RTX_PRO_6000_Blackwell_8"},
         {.usb_path = "1-1.1/1-1.1.2/1-1.1.2.3/1-1.1.2.3:1.0",
          .name = "CX8_1",
          .board_name = "NVIDIA_Alon_cx8_Fru"}}};

    for (const bridge_device& bridge : bridges)
    {
        std::cerr << "looking at device " << bridge.name << std::endl;

        const std::string port_path =
            std::format("{}/{}", usb_prefix, bridge.usb_path);
        const int usb_index = get_device_from_port_string(port_path);
        if (usb_index < 0)
        {
            std::cerr << std::format(
                "Unable to bring up {} because it doesn't seem to exist\n",
                bridge.name);
            continue;
        }

        // Enumerate the bridge itself, then look up the pool it hands out.
        const uint8_t bridge_eid = enumerate_mctp(usb_index);
        const auto [pool_start, pool_size] =
            get_pool_start_and_size(bridge_eid);

        std::this_thread::sleep_for(std::chrono::milliseconds{500});

        // Only two kinds of bridged endpoints exist on this platform, so the
        // kind is taken from the name prefix.
        const bool is_gpu = bridge.name.starts_with("GPU");
        const bool is_cx8 = !is_gpu && bridge.name.starts_with("CX8");

        if (is_gpu)
        {
            // Each GPU bridge carries an SMA as well as the GPU. Both talk
            // over vdm, so they are added as separate nodes: the SMA at the
            // bridge eid, the GPU at the first eid of the pool.
            std::cerr << "Adding SMA\n";
            populate_gpu(bridge.board_name, bridge_eid, bridge.name + "SMA");
            std::cerr << "Adding GPU\n";
            populate_gpu(bridge.board_name, pool_start, bridge.name);
        }
        else if (is_cx8)
        {
            // TODO: deal with this
            std::cerr << "Skipping CX8's for now\n";
        }
        else
        {
            std::cerr << std::format(
                "Something awful happened with path: {}, name {}\n", port_path,
                bridge.name);
        }
    }
}
492
493void wait_for_frus_to_probe()
494{
495 std::string path = "/sys/bus/i2c/devices/17-0056";
496 wait_for_path_to_exist(path, std::chrono::milliseconds{30 * 1000});
497
498 std::this_thread::sleep_for(std::chrono::seconds{30});
Marc Olberding5d50e522025-09-03 18:23:32 -0700499}
500
501int init_nvl32()
502{
503 setup_devmem();
504 handle_passthrough_registers(false);
505 sd_notify(0, "READY=1");
506
507 wait_for_i2c_ready();
Marc Olberdingc9c86122025-09-08 17:45:21 -0700508 // we suspect that the CPLD tells us we're ready before
509 // we actually are. This sleep stabilizes this discrepency
510 std::this_thread::sleep_for(std::chrono::seconds{1});
Marc Olberding5d50e522025-09-03 18:23:32 -0700511
512 create_i2c_mux(5, 0x70, "pca9548");
513 create_i2c_mux(5, 0x71, "pca9548");
514 create_i2c_mux(5, 0x73, "pca9548");
515 create_i2c_mux(5, 0x75, "pca9548");
516
517 bringup_cx8_mcio(0x70, 1, true);
518 bringup_cx8_mcio(0x70, 5, false);
519 bringup_cx8_mcio(0x73, 3, true);
520 bringup_cx8_mcio(0x73, 7, false);
521
Marc Olberdingc9c86122025-09-08 17:45:21 -0700522 // there's a weird bug in EntityManager
523 // Where Fru devices don't probe automatically
524 // We'll wait for the drivers to be probed
525 // and then force a rescan
526 // we'll follow up with a proper fix
527 wait_for_frus_to_probe();
528
529 force_rescan();
530 // allow for things to settle
531 std::this_thread::sleep_for(std::chrono::seconds{1});
532
533 bringup_devices();
Marc Olberding5d50e522025-09-03 18:23:32 -0700534 std::cerr << "platform init complete\n";
535 pause();
536 std::cerr << "Releasing platform\n";
537
538 return EXIT_SUCCESS;
539}
540
541} // namespace nvidia