---
# ConfigMap "lgtm-amd-gpu-dashboard" (namespace "otel-lgtm-stack") embedding the
# Grafana dashboard "AMD GPU Metrics" as the JSON document amd-gpu-dashboard.json.
#
# Dashboard layout (every panel targets the Prometheus datasource uid "prometheus"):
#   - y=0..5 : text panel with GPU model / serial number, stat panels for power and
#              VRAM usage, gauges for memory and GPU utilization, stat for temperature
#   - y=6..12: time series for GPU utilization, memory utilization and temperature
#   - y=13.. : "Raw data" table of every series matching the selected GPU and job
#
# Template variables: scope (job label), node, gpu (per node, offers "All"), and the
# hidden lookups gpu_family / gpu_model / gpu_serial_number that are derived from $gpu.
#
# Notes for maintainers (JSON cannot carry comments, so they live here):
#   - Memory utilization is the ratio gpu_used_vram / gpu_total_vram shown with unit
#     "percentunit"; its thresholds are therefore fractions (0.8 == 80 %).
#   - Every lookup keyed on $gpu uses the regex matcher (=~) because $gpu may be "All".
#   - NOTE(review): the "GPU Utilization %" gauge also uses "percentunit" with a
#     threshold of 80; whether gpu_gfx_activity is reported as 0-1 or 0-100 is not
#     visible here -- confirm against the exporter and adjust unit/threshold.
#   - NOTE(review): the grafana_dashboard label and grafana_folder annotation are
#     presumably what a Grafana sidecar uses to discover and place this dashboard --
#     confirm against the sidecar configuration.
apiVersion: v1
data:
  amd-gpu-dashboard.json: |-
    {
      "annotations": {
        "list": [
          {
            "builtIn": 1,
            "datasource": { "type": "grafana", "uid": "-- Grafana --" },
            "enable": true,
            "hide": true,
            "iconColor": "rgba(0, 211, 255, 1)",
            "name": "Annotations & Alerts",
            "type": "dashboard"
          }
        ]
      },
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 0,
      "id": 1,
      "links": [],
      "panels": [
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "gridPos": { "h": 6, "w": 4, "x": 0, "y": 0 },
          "id": 10,
          "options": {
            "code": { "language": "plaintext", "showLineNumbers": false, "showMiniMap": false },
            "content": "**GPU MODEL**\n## ${gpu_model}\n\n**GPU S/N**\n## ${gpu_serial_number}",
            "mode": "markdown"
          },
          "pluginVersion": "11.1.4",
          "type": "text"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "red", "value": 80 }
                ]
              },
              "unit": "watt"
            },
            "overrides": []
          },
          "gridPos": { "h": 3, "w": 5, "x": 4, "y": 0 },
          "id": 6,
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "auto",
            "orientation": "auto",
            "percentChangeColorMode": "standard",
            "reduceOptions": {
              "calcs": [ "lastNotNull" ],
              "fields": "/^gpu_power_usage$/",
              "values": false
            },
            "showPercentChange": false,
            "textMode": "auto",
            "wideLayout": true
          },
          "pluginVersion": "11.1.4",
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "disableTextWrap": false,
              "editorMode": "builder",
              "exemplar": false,
              "expr": "gpu_power_usage{gpu_uuid=~\"$gpu\", job=\"$scope\"}",
              "format": "time_series",
              "fullMetaSearch": false,
              "hide": false,
              "includeNullMetadata": true,
              "instant": true,
              "legendFormat": "gpu_power_usage",
              "range": false,
              "refId": "gpu_power_usage",
              "useBackend": false
            }
          ],
          "title": "Power Usage (W)",
          "transformations": [
            { "id": "merge", "options": {} },
            {
              "id": "reduce",
              "options": {
                "includeTimeField": false,
                "mode": "reduceFields",
                "reducers": [ "mean" ]
              }
            }
          ],
          "type": "stat"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "red", "value": 0.8 }
                ]
              },
              "unit": "percentunit"
            },
            "overrides": []
          },
          "gridPos": { "h": 6, "w": 5, "x": 9, "y": 0 },
          "id": 4,
          "options": {
            "minVizHeight": 75,
            "minVizWidth": 75,
            "orientation": "auto",
            "reduceOptions": {
              "calcs": [ "lastNotNull" ],
              "fields": "",
              "values": false
            },
            "showThresholdLabels": false,
            "showThresholdMarkers": true,
            "sizing": "auto"
          },
          "pluginVersion": "11.1.4",
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "disableTextWrap": false,
              "editorMode": "builder",
              "exemplar": false,
              "expr": "gpu_used_vram{gpu_uuid=~\"$gpu\", job=\"$scope\"} / on(gpu_uuid) gpu_total_vram{gpu_uuid=~\"$gpu\", job=\"$scope\"}",
              "format": "time_series",
              "fullMetaSearch": false,
              "hide": false,
              "includeNullMetadata": true,
              "instant": true,
              "legendFormat": "gpu_vram_utilization",
              "range": false,
              "refId": "gpu_used_vram",
              "useBackend": false
            }
          ],
          "title": "Memory Utilization %",
          "transformations": [
            {
              "id": "joinByField",
              "options": { "byField": "Time", "mode": "outer" }
            },
            {
              "id": "reduce",
              "options": { "reducers": [ "mean" ] }
            }
          ],
          "type": "gauge"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "red", "value": 80 }
                ]
              },
              "unit": "percentunit"
            },
            "overrides": []
          },
          "gridPos": { "h": 6, "w": 5, "x": 14, "y": 0 },
          "id": 12,
          "options": {
            "minVizHeight": 75,
            "minVizWidth": 75,
            "orientation": "auto",
            "reduceOptions": {
              "calcs": [ "lastNotNull" ],
              "fields": "/^gpu_gfx_activity$/",
              "values": false
            },
            "showThresholdLabels": false,
            "showThresholdMarkers": true,
            "sizing": "auto"
          },
          "pluginVersion": "11.1.4",
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "disableTextWrap": false,
              "editorMode": "builder",
              "exemplar": false,
              "expr": "gpu_gfx_activity{gpu_uuid=~\"$gpu\", job=\"$scope\"}",
              "format": "time_series",
              "fullMetaSearch": false,
              "hide": false,
              "includeNullMetadata": true,
              "instant": true,
              "legendFormat": "gpu_gfx_activity",
              "range": false,
              "refId": "gpu_gfx_activity",
              "useBackend": false
            }
          ],
          "title": "GPU Utilization %",
          "transformations": [
            { "id": "merge", "options": {} },
            {
              "id": "reduce",
              "options": {
                "includeTimeField": false,
                "mode": "reduceFields",
                "reducers": [ "mean" ]
              }
            }
          ],
          "type": "gauge"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "red", "value": 80 }
                ]
              },
              "unit": "celsius"
            },
            "overrides": []
          },
          "gridPos": { "h": 6, "w": 5, "x": 19, "y": 0 },
          "id": 9,
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "auto",
            "orientation": "auto",
            "percentChangeColorMode": "standard",
            "reduceOptions": {
              "calcs": [ "lastNotNull" ],
              "fields": "",
              "values": false
            },
            "showPercentChange": false,
            "textMode": "auto",
            "wideLayout": true
          },
          "pluginVersion": "11.1.4",
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "disableTextWrap": false,
              "editorMode": "builder",
              "exemplar": false,
              "expr": "gpu_edge_temperature{gpu_uuid=~\"$gpu\", job=\"$scope\"}",
              "format": "time_series",
              "fullMetaSearch": false,
              "hide": false,
              "includeNullMetadata": true,
              "instant": true,
              "legendFormat": "gpu_edge_temperature",
              "range": false,
              "refId": "gpu_edge_temperature",
              "useBackend": false
            }
          ],
          "title": "GPU Temperature °C",
          "transformations": [
            { "id": "merge", "options": {} },
            {
              "id": "reduce",
              "options": {
                "includeTimeField": false,
                "labelsToFields": false,
                "mode": "seriesToRows",
                "reducers": [ "mean" ]
              }
            }
          ],
          "type": "stat"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "red", "value": 80 }
                ]
              }
            },
            "overrides": []
          },
          "gridPos": { "h": 3, "w": 5, "x": 4, "y": 3 },
          "id": 13,
          "options": {
            "colorMode": "value",
            "graphMode": "area",
            "justifyMode": "auto",
            "orientation": "auto",
            "percentChangeColorMode": "standard",
            "reduceOptions": {
              "calcs": [ "lastNotNull" ],
              "fields": "/^gpu_used_vram$/",
              "values": false
            },
            "showPercentChange": false,
            "textMode": "auto",
            "wideLayout": true
          },
          "pluginVersion": "11.1.4",
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "disableTextWrap": false,
              "editorMode": "builder",
              "exemplar": false,
              "expr": "gpu_used_vram{gpu_uuid=~\"$gpu\", job=\"$scope\"}",
              "format": "time_series",
              "fullMetaSearch": false,
              "hide": false,
              "includeNullMetadata": true,
              "instant": true,
              "legendFormat": "gpu_used_vram",
              "range": false,
              "refId": "gpu_used_vram",
              "useBackend": false
            }
          ],
          "title": "VRAM Usage (MB)",
          "transformations": [
            { "id": "merge", "options": {} },
            {
              "id": "reduce",
              "options": {
                "includeTimeField": false,
                "mode": "reduceFields",
                "reducers": [ "mean" ]
              }
            }
          ],
          "type": "stat"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "palette-classic" },
              "custom": {
                "axisBorderShow": false,
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 0,
                "gradientMode": "none",
                "hideFrom": { "legend": false, "tooltip": false, "viz": false },
                "insertNulls": false,
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": { "type": "linear" },
                "showPoints": "auto",
                "spanNulls": false,
                "stacking": { "group": "A", "mode": "none" },
                "thresholdsStyle": { "mode": "off" }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "red", "value": 80 }
                ]
              }
            },
            "overrides": []
          },
          "gridPos": { "h": 7, "w": 8, "x": 0, "y": 6 },
          "id": 3,
          "options": {
            "legend": {
              "calcs": [],
              "displayMode": "list",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": { "mode": "single", "sort": "none" }
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "disableTextWrap": false,
              "editorMode": "builder",
              "exemplar": false,
              "expr": "gpu_gfx_activity{gpu_uuid=~\"$gpu\", job=\"$scope\"}",
              "format": "time_series",
              "fullMetaSearch": false,
              "includeNullMetadata": true,
              "instant": false,
              "legendFormat": "{{gpu_uuid}}",
              "range": true,
              "refId": "gpu_gfx_activity",
              "useBackend": false
            }
          ],
          "title": "GPU Utilization %",
          "type": "timeseries"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "palette-classic" },
              "custom": {
                "axisBorderShow": false,
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "",
                "axisPlacement": "auto",
                "axisSoftMax": 1,
                "axisSoftMin": 0,
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 0,
                "gradientMode": "none",
                "hideFrom": { "legend": false, "tooltip": false, "viz": false },
                "insertNulls": false,
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": { "type": "linear" },
                "showPoints": "auto",
                "spanNulls": false,
                "stacking": { "group": "A", "mode": "none" },
                "thresholdsStyle": { "mode": "off" }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "red", "value": 0.8 }
                ]
              },
              "unit": "percentunit"
            },
            "overrides": []
          },
          "gridPos": { "h": 7, "w": 8, "x": 8, "y": 6 },
          "id": 2,
          "options": {
            "legend": {
              "calcs": [],
              "displayMode": "list",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": { "mode": "single", "sort": "none" }
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "disableTextWrap": false,
              "editorMode": "builder",
              "exemplar": false,
              "expr": "gpu_used_vram{gpu_uuid=~\"$gpu\", job=\"$scope\"} / on(gpu_uuid) gpu_total_vram{gpu_uuid=~\"$gpu\", job=\"$scope\"}",
              "format": "time_series",
              "fullMetaSearch": false,
              "includeNullMetadata": true,
              "instant": false,
              "legendFormat": "__auto",
              "range": true,
              "refId": "gpu_used_vram",
              "useBackend": false
            }
          ],
          "title": "Memory Utilization %",
          "type": "timeseries"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "palette-classic" },
              "custom": {
                "axisBorderShow": false,
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 0,
                "gradientMode": "none",
                "hideFrom": { "legend": false, "tooltip": false, "viz": false },
                "insertNulls": false,
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": { "type": "linear" },
                "showPoints": "auto",
                "spanNulls": false,
                "stacking": { "group": "A", "mode": "none" },
                "thresholdsStyle": { "mode": "off" }
              },
              "mappings": [],
              "min": 0,
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "red", "value": 80 }
                ]
              },
              "unit": "celsius"
            },
            "overrides": []
          },
          "gridPos": { "h": 7, "w": 8, "x": 16, "y": 6 },
          "id": 11,
          "options": {
            "legend": {
              "calcs": [],
              "displayMode": "list",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": { "mode": "single", "sort": "none" }
          },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "disableTextWrap": false,
              "editorMode": "builder",
              "exemplar": false,
              "expr": "gpu_edge_temperature{gpu_uuid=~\"$gpu\", job=\"$scope\"}",
              "format": "time_series",
              "fullMetaSearch": false,
              "includeNullMetadata": true,
              "instant": false,
              "legendFormat": "{{gpu_uuid}}",
              "range": true,
              "refId": "gpu_edge_temperature",
              "useBackend": false
            }
          ],
          "title": "GPU Temperature °C",
          "type": "timeseries"
        },
        {
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "custom": {
                "align": "auto",
                "cellOptions": { "type": "auto", "wrapText": false },
                "filterable": true,
                "inspect": false
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "red", "value": 80 }
                ]
              }
            },
            "overrides": []
          },
          "gridPos": { "h": 7, "w": 24, "x": 0, "y": 13 },
          "id": 14,
          "options": {
            "cellHeight": "sm",
            "footer": {
              "countRows": false,
              "fields": "",
              "reducer": [ "sum" ],
              "show": false
            },
            "showHeader": true
          },
          "pluginVersion": "11.1.4",
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "disableTextWrap": false,
              "editorMode": "builder",
              "expr": "{gpu_uuid=~\"$gpu\", job=\"$scope\"}",
              "format": "table",
              "fullMetaSearch": false,
              "hide": false,
              "includeNullMetadata": true,
              "instant": false,
              "legendFormat": "__auto",
              "range": true,
              "refId": "A",
              "useBackend": false
            }
          ],
          "title": "Raw data",
          "type": "table"
        }
      ],
      "schemaVersion": 39,
      "tags": [],
      "templating": {
        "list": [
          {
            "current": {
              "selected": false,
              "text": "kubernetes-pods",
              "value": "kubernetes-pods"
            },
            "description": "Filter variable for job scope",
            "hide": 0,
            "includeAll": false,
            "label": "Scope",
            "multi": false,
            "name": "scope",
            "options": [
              {
                "selected": true,
                "text": "kubernetes-pods",
                "value": "kubernetes-pods"
              },
              {
                "selected": false,
                "text": "kubernetes-service-endpoints",
                "value": "kubernetes-service-endpoints"
              }
            ],
            "query": "kubernetes-pods,kubernetes-service-endpoints",
            "queryValue": "",
            "skipUrlSync": false,
            "type": "custom"
          },
          {
            "current": {
              "selected": false,
              "text": "atlpkuc4app06",
              "value": "atlpkuc4app06"
            },
            "datasource": { "type": "prometheus", "uid": "prometheus" },
            "definition": "label_values(node)",
            "hide": 0,
            "includeAll": false,
            "label": "Node",
            "multi": false,
            "name": "node",
            "options": [],
            "query": {
              "qryType": 1,
              "query": "label_values(node)",
              "refId": "PrometheusVariableQueryEditor-VariableQuery"
            },
            "refresh": 1,
            "regex": "",
            "skipUrlSync": false,
            "sort": 0,
            "type": "query"
          },
          {
            "current": {
              "selected": true,
              "text": "All",
              "value": "$__all"
            },
            "datasource": { "type": "prometheus", "uid": "prometheus" },
            "definition": "label_values({node=\"$node\"},gpu_uuid)",
            "hide": 0,
            "includeAll": true,
            "label": "GPU",
            "multi": false,
            "name": "gpu",
            "options": [],
            "query": {
              "qryType": 1,
              "query": "label_values({node=\"$node\"},gpu_uuid)",
              "refId": "PrometheusVariableQueryEditor-VariableQuery"
            },
            "refresh": 1,
            "regex": "",
            "skipUrlSync": false,
            "sort": 0,
            "type": "query"
          },
          {
            "current": {
              "isNone": true,
              "selected": false,
              "text": "None",
              "value": ""
            },
            "datasource": { "type": "prometheus", "uid": "prometheus" },
            "definition": "label_values({gpu_uuid=~\"$gpu\"},amd_com_gpu_family)",
            "hide": 2,
            "includeAll": false,
            "label": "GPU Family",
            "multi": false,
            "name": "gpu_family",
            "options": [],
            "query": {
              "qryType": 1,
              "query": "label_values({gpu_uuid=~\"$gpu\"},amd_com_gpu_family)",
              "refId": "PrometheusVariableQueryEditor-VariableQuery"
            },
            "refresh": 1,
            "regex": "",
            "skipUrlSync": false,
            "sort": 0,
            "type": "query"
          },
          {
            "current": {
              "selected": true,
              "text": [ "102-D65210-00" ],
              "value": [ "102-D65210-00" ]
            },
            "datasource": { "type": "prometheus", "uid": "prometheus" },
            "definition": "label_values({gpu_uuid=~\"$gpu\"},card_model)",
            "hide": 2,
            "includeAll": false,
            "label": "GPU Model",
            "multi": true,
            "name": "gpu_model",
            "options": [],
            "query": {
              "qryType": 1,
              "query": "label_values({gpu_uuid=~\"$gpu\"},card_model)",
              "refId": "PrometheusVariableQueryEditor-VariableQuery"
            },
            "refresh": 1,
            "regex": "",
            "skipUrlSync": false,
            "sort": 0,
            "type": "query"
          },
          {
            "current": {
              "selected": true,
              "text": [ "692324000620" ],
              "value": [ "692324000620" ]
            },
            "datasource": { "type": "prometheus", "uid": "prometheus" },
            "definition": "label_values({gpu_uuid=~\"$gpu\"},serial_number)",
            "hide": 2,
            "includeAll": false,
            "label": "GPU Serial Number",
            "multi": true,
            "name": "gpu_serial_number",
            "options": [],
            "query": {
              "qryType": 1,
              "query": "label_values({gpu_uuid=~\"$gpu\"},serial_number)",
              "refId": "PrometheusVariableQueryEditor-VariableQuery"
            },
            "refresh": 1,
            "regex": "",
            "skipUrlSync": false,
            "sort": 0,
            "type": "query"
          }
        ]
      },
      "time": { "from": "now-1h", "to": "now" },
      "timepicker": {},
      "timezone": "browser",
      "title": "AMD GPU Metrics",
      "uid": "ee6sumbzuzl6oc",
      "version": 25,
      "weekStart": ""
    }
kind: ConfigMap
metadata:
  annotations:
    grafana_folder: common
  labels:
    grafana_dashboard: "1"
  name: lgtm-amd-gpu-dashboard
  namespace: otel-lgtm-stack