clusterforge/amd-gpu-operator/ConfigMap_gpu-config.yaml
2025-10-06 09:34:03 +00:00

128 lines
3.8 KiB
YAML

---
# ConfigMap "gpu-config" in namespace "kube-amd-gpu".
#
# It carries one entry, config.json, whose value is a JSON document with a
# single top-level object "GPUConfig" containing:
#   Fields         - list of metric field names
#   Labels         - list of label names
#   ExtraPodLabels - map of label name -> pod label key (airm.silogen.ai/*)
#   CustomLabels   - map of label name -> fixed string value
#
# Presumably consumed by the AMD GPU metrics exporter deployed with the
# amd-gpu-operator - TODO confirm against the consuming workload.
#
# The JSON lives inside a literal block scalar, so it cannot hold comments;
# anything placed between the braces becomes part of the payload.
#
# NOTE(review): "PCIE_NAC_RECEIVED_COUNT" is spelled without the "K", unlike
# "PCIE_NACK_SENT_COUNT" - confirm the name the consumer accepts before
# changing it.
# NOTE(review): the CustomLabels values ("demo", "demo-cluster") look like
# placeholders - confirm they are intended for this cluster.
apiVersion: v1
data:
  config.json: |
    {
      "GPUConfig": {
        "Fields": [
          "GPU_NODES_TOTAL",
          "GPU_PACKAGE_POWER",
          "GPU_AVERAGE_PACKAGE_POWER",
          "GPU_EDGE_TEMPERATURE",
          "GPU_JUNCTION_TEMPERATURE",
          "GPU_MEMORY_TEMPERATURE",
          "GPU_HBM_TEMPERATURE",
          "GPU_GFX_ACTIVITY",
          "GPU_UMC_ACTIVITY",
          "GPU_MMA_ACTIVITY",
          "GPU_VCN_ACTIVITY",
          "GPU_JPEG_ACTIVITY",
          "GPU_VOLTAGE",
          "GPU_GFX_VOLTAGE",
          "GPU_MEMORY_VOLTAGE",
          "PCIE_SPEED",
          "PCIE_MAX_SPEED",
          "PCIE_BANDWIDTH",
          "GPU_ENERGY_CONSUMED",
          "PCIE_REPLAY_COUNT",
          "PCIE_RECOVERY_COUNT",
          "PCIE_REPLAY_ROLLOVER_COUNT",
          "PCIE_NACK_SENT_COUNT",
          "PCIE_NAC_RECEIVED_COUNT",
          "GPU_CLOCK",
          "GPU_POWER_USAGE",
          "GPU_TOTAL_VRAM",
          "GPU_ECC_CORRECT_TOTAL",
          "GPU_ECC_UNCORRECT_TOTAL",
          "GPU_ECC_CORRECT_SDMA",
          "GPU_ECC_UNCORRECT_SDMA",
          "GPU_ECC_CORRECT_GFX",
          "GPU_ECC_UNCORRECT_GFX",
          "GPU_ECC_CORRECT_MMHUB",
          "GPU_ECC_UNCORRECT_MMHUB",
          "GPU_ECC_CORRECT_ATHUB",
          "GPU_ECC_UNCORRECT_ATHUB",
          "GPU_ECC_CORRECT_BIF",
          "GPU_ECC_UNCORRECT_BIF",
          "GPU_ECC_CORRECT_HDP",
          "GPU_ECC_UNCORRECT_HDP",
          "GPU_ECC_CORRECT_XGMI_WAFL",
          "GPU_ECC_UNCORRECT_XGMI_WAFL",
          "GPU_ECC_CORRECT_DF",
          "GPU_ECC_UNCORRECT_DF",
          "GPU_ECC_CORRECT_SMN",
          "GPU_ECC_UNCORRECT_SMN",
          "GPU_ECC_CORRECT_SEM",
          "GPU_ECC_UNCORRECT_SEM",
          "GPU_ECC_CORRECT_MP0",
          "GPU_ECC_UNCORRECT_MP0",
          "GPU_ECC_CORRECT_MP1",
          "GPU_ECC_UNCORRECT_MP1",
          "GPU_ECC_CORRECT_FUSE",
          "GPU_ECC_UNCORRECT_FUSE",
          "GPU_ECC_CORRECT_UMC",
          "GPU_ECC_UNCORRECT_UMC",
          "GPU_XGMI_NBR_0_NOP_TX",
          "GPU_XGMI_NBR_0_REQ_TX",
          "GPU_XGMI_NBR_0_RESP_TX",
          "GPU_XGMI_NBR_0_BEATS_TX",
          "GPU_XGMI_NBR_1_NOP_TX",
          "GPU_XGMI_NBR_1_REQ_TX",
          "GPU_XGMI_NBR_1_RESP_TX",
          "GPU_XGMI_NBR_1_BEATS_TX",
          "GPU_XGMI_NBR_0_TX_THRPUT",
          "GPU_XGMI_NBR_1_TX_THRPUT",
          "GPU_XGMI_NBR_2_TX_THRPUT",
          "GPU_XGMI_NBR_3_TX_THRPUT",
          "GPU_XGMI_NBR_4_TX_THRPUT",
          "GPU_XGMI_NBR_5_TX_THRPUT",
          "GPU_USED_VRAM",
          "GPU_FREE_VRAM",
          "GPU_TOTAL_VISIBLE_VRAM",
          "GPU_USED_VISIBLE_VRAM",
          "GPU_FREE_VISIBLE_VRAM",
          "GPU_TOTAL_GTT",
          "GPU_USED_GTT",
          "GPU_FREE_GTT",
          "GPU_ECC_CORRECT_MCA",
          "GPU_ECC_UNCORRECT_MCA",
          "GPU_ECC_CORRECT_VCN",
          "GPU_ECC_UNCORRECT_VCN",
          "GPU_ECC_CORRECT_JPEG",
          "GPU_ECC_UNCORRECT_JPEG",
          "GPU_ECC_CORRECT_IH",
          "GPU_ECC_UNCORRECT_IH",
          "GPU_ECC_CORRECT_MPIO",
          "GPU_ECC_UNCORRECT_MPIO"
        ],
        "Labels": [
          "GPU_UUID",
          "SERIAL_NUMBER",
          "GPU_ID",
          "POD",
          "NAMESPACE",
          "CONTAINER",
          "CLUSTER_NAME",
          "CARD_SERIES",
          "CARD_MODEL",
          "CARD_VENDOR",
          "DRIVER_VERSION",
          "VBIOS_VERSION",
          "HOSTNAME"
        ],
        "ExtraPodLabels" : {
          "WORKLOAD_ID" : "airm.silogen.ai/workload-id",
          "USERGROUP_ID" : "airm.silogen.ai/usergroup-id",
          "PROJECT_ID" : "airm.silogen.ai/project-id"
        },
        "CustomLabels" : {
          "ORG_NAME" : "demo",
          "KUBE_CLUSTER_NAME" : "demo-cluster"
        }
      }
    }
kind: ConfigMap
metadata:
  name: gpu-config
  namespace: kube-amd-gpu