DGX Spark Monitoring

sudo apt install netdata -y

Make netdata listen on all IP addresses:

file=/etc/netdata/netdata.conf
enable = no
run as user = netdata
web files owner = root
web files group = root
bind socket to IP = 0.0.0.0

Create a plugin configuration file for nvidia-smi:

file=/etc/netdata/python.d/nvidia_smi.conf
update_every: 2
priority: 60000

local:
  name: 'nvidia'
  binary_path: /usr/bin/nvidia-smi

enable the plugin

file=/etc/netdata/python.d.conf
nvidia_smi: yes

Restart netdata so the changes take effect (requires root privileges, e.g. via sudo):

systemctl restart netdata.service

test nvidia-smi output

nvidia-smi --query-gpu=utilization.gpu,temperature.gpu,memory.used --format=csv

The output should look like this:

utilization.gpu [%], temperature.gpu, memory.used [MiB]
0 %, 40, [N/A]