/*
 * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
6
7#pragma once
8
9#include <OcpMctpVdm.hpp>
10
Rohit PAI86786b62025-06-10 09:46:33 +053011#include <array>
12#include <cstddef>
Harshit Aghera560e6af2025-04-21 20:04:56 +053013#include <cstdint>
14#include <span>
Rohit PAI86786b62025-06-10 09:46:33 +053015#include <string>
16#include <variant>
17#include <vector>
Harshit Aghera560e6af2025-04-21 20:04:56 +053018
19namespace gpu
20{
21
/// Value of an inventory property: either a string or a raw byte vector.
using InventoryValue = std::variant<std::string, std::vector<uint8_t>>;

/// Number of bytes in the data array of GetInventoryInformationResponse.
/// `inline` gives the header-defined constant a single definition program-wide.
inline constexpr std::size_t maxInventoryDataSize = 256;

/// 16-bit PCI vendor ID constant (0x10de), named for NVIDIA.
inline constexpr uint16_t nvidiaPciVendorId = 0x10de;
26
/**
 * @brief Message type codes, stored as a single byte.
 *
 * Only the two codes listed here are defined; the numbering is not
 * contiguous (0 and 3).
 */
enum class MessageType : uint8_t
{
    DEVICE_CAPABILITY_DISCOVERY = 0,
    PLATFORM_ENVIRONMENTAL = 3
};
32
/**
 * @brief Command codes, stored as a single byte.
 *
 * NOTE(review): by name these belong to
 * MessageType::DEVICE_CAPABILITY_DISCOVERY - confirm against the encoder.
 */
enum class DeviceCapabilityDiscoveryCommands : uint8_t
{
    QUERY_DEVICE_IDENTIFICATION = 0x09,
};
37
/**
 * @brief Command codes, stored as a single byte.
 *
 * The codes are sparse (0x00, 0x02, 0x03, 0x06, 0x0C, 0x0F); do not assume
 * they can be iterated as a dense range.
 *
 * NOTE(review): by name these belong to MessageType::PLATFORM_ENVIRONMENTAL -
 * confirm against the encoders.
 */
enum class PlatformEnvironmentalCommands : uint8_t
{
    GET_TEMPERATURE_READING = 0x00,
    READ_THERMAL_PARAMETERS = 0x02,
    GET_CURRENT_POWER_DRAW = 0x03,
    GET_CURRENT_ENERGY_COUNTER = 0x06,
    GET_INVENTORY_INFORMATION = 0x0C,
    GET_VOLTAGE = 0x0F,
};
47
/**
 * @brief Device identification codes, stored as a single byte.
 *
 * NOTE(review): presumably matched against
 * QueryDeviceIdentificationResponse::device_identification - confirm in the
 * decoder.
 */
enum class DeviceIdentification : uint8_t
{
    DEVICE_GPU = 0,
    DEVICE_SMA = 5
};
53
/**
 * @brief Inventory property identifiers, stored as a single byte.
 *
 * The identifiers are numbered densely from 0 (BOARD_PART_NUMBER) to
 * 36 (NVLINK_PEER_TYPE). Every value is written out explicitly so that
 * inserting or reordering entries cannot silently renumber the others.
 */
enum class InventoryPropertyId : uint8_t
{
    BOARD_PART_NUMBER = 0,
    SERIAL_NUMBER = 1,
    MARKETING_NAME = 2,
    DEVICE_PART_NUMBER = 3,
    FRU_PART_NUMBER = 4,
    MEMORY_VENDOR = 5,
    MEMORY_PART_NUMBER = 6,
    MAX_MEMORY_CAPACITY = 7,
    BUILD_DATE = 8,
    FIRMWARE_VERSION = 9,
    DEVICE_GUID = 10,
    INFOROM_VERSION = 11,
    PRODUCT_LENGTH = 12,
    PRODUCT_WIDTH = 13,
    PRODUCT_HEIGHT = 14,
    RATED_DEVICE_POWER_LIMIT = 15,
    MIN_DEVICE_POWER_LIMIT = 16,
    MAX_DEVICE_POWER_LIMIT = 17,
    MAX_MODULE_POWER_LIMIT = 18,
    MIN_MODULE_POWER_LIMIT = 19,
    RATED_MODULE_POWER_LIMIT = 20,
    DEFAULT_BOOST_CLOCKS = 21,
    DEFAULT_BASE_CLOCKS = 22,
    DEFAULT_EDPP_SCALING = 23,
    MIN_EDPP_SCALING = 24,
    MAX_EDPP_SCALING = 25,
    MIN_GRAPHICS_CLOCK = 26,
    MAX_GRAPHICS_CLOCK = 27,
    MIN_MEMORY_CLOCK = 28,
    MAX_MEMORY_CLOCK = 29,
    INFINIBAND_GUID = 30,
    RACK_GUID = 31,
    RACK_SLOT_NUMBER = 32,
    COMPUTE_SLOT_INDEX = 33,
    NODE_INDEX = 34,
    GPU_NODE_ID = 35,
    NVLINK_PEER_TYPE = 36
};
94
Harshit Aghera560e6af2025-04-21 20:04:56 +053095struct QueryDeviceIdentificationRequest
96{
97 ocp::accelerator_management::CommonRequest hdr;
98} __attribute__((packed));
99
100struct QueryDeviceIdentificationResponse
101{
102 ocp::accelerator_management::CommonResponse hdr;
103 uint8_t device_identification;
104 uint8_t instance_id;
105} __attribute__((packed));
106
107struct GetNumericSensorReadingRequest
108{
109 ocp::accelerator_management::CommonRequest hdr;
110 uint8_t sensor_id;
111} __attribute__((packed));
112
113using GetTemperatureReadingRequest = GetNumericSensorReadingRequest;
114
Harshit Aghera5e7decc2025-05-07 16:20:16 +0530115using ReadThermalParametersRequest = GetNumericSensorReadingRequest;
116
Harshit Aghera902c6492025-05-08 15:57:42 +0530117struct GetCurrentPowerDrawRequest
118{
119 ocp::accelerator_management::CommonRequest hdr;
120 uint8_t sensorId;
121 uint8_t averagingInterval;
122} __attribute__((packed));
123
Harshit Aghera775199d2025-05-27 14:20:24 +0530124using GetCurrentEnergyCounterRequest = GetNumericSensorReadingRequest;
125
Harshit Agherabef4d412025-05-27 14:53:56 +0530126using GetVoltageRequest = GetNumericSensorReadingRequest;
127
Harshit Aghera560e6af2025-04-21 20:04:56 +0530128struct GetTemperatureReadingResponse
129{
130 ocp::accelerator_management::CommonResponse hdr;
131 int32_t reading;
132} __attribute__((packed));
133
Harshit Aghera5e7decc2025-05-07 16:20:16 +0530134struct ReadThermalParametersResponse
135{
136 ocp::accelerator_management::CommonResponse hdr;
137 int32_t threshold;
138} __attribute__((packed));
139
Harshit Aghera902c6492025-05-08 15:57:42 +0530140struct GetCurrentPowerDrawResponse
141{
142 ocp::accelerator_management::CommonResponse hdr;
143 uint32_t power;
144} __attribute__((packed));
145
Harshit Aghera775199d2025-05-27 14:20:24 +0530146struct GetCurrentEnergyCounterResponse
147{
148 ocp::accelerator_management::CommonResponse hdr;
149 uint64_t energy;
150} __attribute__((packed));
151
Harshit Agherabef4d412025-05-27 14:53:56 +0530152struct GetVoltageResponse
153{
154 ocp::accelerator_management::CommonResponse hdr;
155 uint32_t voltage;
156} __attribute__((packed));
157
Rohit PAI86786b62025-06-10 09:46:33 +0530158struct GetInventoryInformationRequest
159{
160 ocp::accelerator_management::CommonRequest hdr;
161 uint8_t property_id;
162} __attribute__((packed));
163
164struct GetInventoryInformationResponse
165{
166 ocp::accelerator_management::CommonResponse hdr;
167 std::array<uint8_t, maxInventoryDataSize> data;
168} __attribute__((packed));
169
Harshit Aghera560e6af2025-04-21 20:04:56 +0530170int packHeader(const ocp::accelerator_management::BindingPciVidInfo& hdr,
171 ocp::accelerator_management::BindingPciVid& msg);
172
173int encodeQueryDeviceIdentificationRequest(uint8_t instanceId,
174 std::span<uint8_t> buf);
175
176int decodeQueryDeviceIdentificationResponse(
177 std::span<const uint8_t> buf,
178 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
179 uint8_t& deviceIdentification, uint8_t& deviceInstance);
180
181int encodeGetTemperatureReadingRequest(uint8_t instanceId, uint8_t sensorId,
182 std::span<uint8_t> buf);
183
184int decodeGetTemperatureReadingResponse(
185 std::span<const uint8_t> buf,
186 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
187 double& temperatureReading);
188
Harshit Aghera5e7decc2025-05-07 16:20:16 +0530189int encodeReadThermalParametersRequest(uint8_t instanceId, uint8_t sensorId,
190 std::span<uint8_t> buf);
191
192int decodeReadThermalParametersResponse(
193 std::span<const uint8_t> buf,
194 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
195 int32_t& threshold);
196
Harshit Aghera902c6492025-05-08 15:57:42 +0530197int encodeGetCurrentPowerDrawRequest(uint8_t instanceId, uint8_t sensorId,
198 uint8_t averagingInterval,
199 std::span<uint8_t> buf);
200
201int decodeGetCurrentPowerDrawResponse(
202 std::span<const uint8_t> buf,
203 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
204 uint32_t& power);
Harshit Aghera775199d2025-05-27 14:20:24 +0530205
206int encodeGetCurrentEnergyCounterRequest(uint8_t instanceId, uint8_t sensorId,
207 std::span<uint8_t> buf);
208
209int decodeGetCurrentEnergyCounterResponse(
210 std::span<const uint8_t> buf,
211 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
212 uint64_t& energy);
Harshit Agherabef4d412025-05-27 14:53:56 +0530213
214int encodeGetVoltageRequest(uint8_t instanceId, uint8_t sensorId,
215 std::span<uint8_t> buf);
216
217int decodeGetVoltageResponse(std::span<const uint8_t> buf,
218 ocp::accelerator_management::CompletionCode& cc,
219 uint16_t& reasonCode, uint32_t& voltage);
Rohit PAI86786b62025-06-10 09:46:33 +0530220
221int encodeGetInventoryInformationRequest(uint8_t instanceId, uint8_t propertyId,
222 std::span<uint8_t> buf);
223
224int decodeGetInventoryInformationResponse(
225 std::span<const uint8_t> buf,
226 ocp::accelerator_management::CompletionCode& cc, uint16_t& reasonCode,
227 InventoryPropertyId propertyId, InventoryValue& value);
228
Harshit Aghera560e6af2025-04-21 20:04:56 +0530229} // namespace gpu