# -*- coding: utf-8 -*-
"""
@Reference: https://github.com/snap-stanford/GraphGym/blob/master/graphgym/utils/device.py
"""
import os
import torch
import subprocess
import numpy as np
from yacs.config import CfgNode as CN
[docs]def count_parameters(model):
"""Count the parameters' number of the input model.
Note: The unit of return value is millions(M) if exceeds 1,000,000.
Args:
model (torch.nn.Module): The model instance you want to count.
Returns:
float: The number of model parameters, in Million (M).
"""
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
return round(num_params / 1e6, 6)
[docs]def get_gpu_memory_map():
"""Get the current gpu usage.
Returns:
np.ndarray: The current gpu memory usage.
"""
result = subprocess.check_output([
'nvidia-smi', '--query-gpu=memory.used',
'--format=csv,nounits,noheader'
], encoding='utf-8')
gpu_memory = np.array([int(x) for x in result.strip().split('\n')])
return gpu_memory
[docs]def get_current_gpu_usage(gpu_mem, device: str):
"""Get the current GPU memory usage.
Args:
gpu_mem (np.ndarray): The current gpu memory usage.
device (str): The device.
Returns:
int: The current GPU memory usage.
"""
if gpu_mem and device != 'cpu' and torch.cuda.is_available():
result = subprocess.check_output([
'nvidia-smi', '--query-compute-apps=pid,used_memory',
'--format=csv,nounits,noheader'
], encoding='utf-8')
current_pid = os.getpid()
used_memory = 0
for line in result.strip().split('\n'):
line = line.split(', ')
if current_pid == int(line[0]):
used_memory += int(line[1])
return used_memory
else:
return -1
[docs]def auto_select_device(logger,
cfg: CN,
memory_max: int = 8000,
memory_bias: int = 200,
strategy: str = 'random'):
"""Auto select device for the experiment. Useful when having multiple GPUs.
Args:
logger: Logger.
cfg (CN): Config.
memory_max (int, optional): Threshold of existing GPU memory usage. GPUs with
memory usage beyond this threshold will be deprioritized. Defaults to 8000.
memory_bias (int, optional): A bias GPU memory usage added to all the GPUs.
Avoid divided by zero error. Defaults to 200.
strategy (str, optional): 'random' (random select GPU) or 'greedy'
(greedily select GPU). Defaults to 'random'.
Returns:
CN: Config.
"""
if cfg.device != 'cpu' and torch.cuda.is_available():
if cfg.device == 'auto':
memory_raw = get_gpu_memory_map()
if strategy == 'greedy' or np.all(memory_raw > memory_max):
cuda = np.argmin(memory_raw)
logger.info('GPU Mem: {}'.format(memory_raw))
logger.info(
'Greedy select GPU, select GPU {} with mem: {}'.format(
cuda, memory_raw[cuda]))
else:
memory = 1 / (memory_raw + memory_bias)
memory[memory_raw > memory_max] = 0
gpu_prob = memory / memory.sum()
cuda = np.random.choice(len(gpu_prob), p=gpu_prob)
logger.info('GPU Mem: {}'.format(memory_raw))
logger.info('GPU Prob: {}'.format(gpu_prob.round(2)))
logger.info(
'Random select GPU, select GPU {} with mem: {}'.format(
cuda, memory_raw[cuda]))
cfg.device = 'cuda:{}'.format(cuda)
else:
cfg.device = 'cpu'
return cfg