Source code for pydgc.utils.device

# -*- coding: utf-8 -*-
"""
@Reference: https://github.com/snap-stanford/GraphGym/blob/master/graphgym/utils/device.py
"""
import os
import torch
import subprocess
import numpy as np


from yacs.config import CfgNode as CN


def count_parameters(model):
    """Return the number of trainable parameters of a model, in millions.

    Only parameters whose ``requires_grad`` flag is set are counted. The
    total is always divided by 1e6 and rounded to six decimal places, so a
    model with fewer than one million parameters yields a value below 1.

    Args:
        model (torch.nn.Module): The model whose parameters are counted.

    Returns:
        float: Trainable parameter count expressed in millions (M).
    """
    trainable = 0
    for param in model.parameters():
        # Frozen parameters are excluded from the count.
        if param.requires_grad:
            trainable += param.numel()
    in_millions = trainable / 1e6
    return round(in_millions, 6)


def get_gpu_memory_map():
    """Query ``nvidia-smi`` for the memory currently used on every GPU.

    Returns:
        np.ndarray: One integer per output line of ``nvidia-smi``, i.e. the
            ``memory.used`` value of each GPU in the order the tool lists
            them (unit as reported by ``nvidia-smi`` with ``nounits``,
            presumably MiB).
    """
    command = [
        'nvidia-smi', '--query-gpu=memory.used',
        '--format=csv,nounits,noheader'
    ]
    raw_output = subprocess.check_output(command, encoding='utf-8')
    # One line per GPU; each line holds a single integer.
    rows = raw_output.strip().split('\n')
    return np.array(list(map(int, rows)))


def get_current_gpu_usage(gpu_mem, device: str):
    """Get the GPU memory currently used by this process.

    The measurement is only taken when ``gpu_mem`` is enabled, ``device`` is
    not ``'cpu'`` and CUDA is available; otherwise ``-1`` is returned.

    Args:
        gpu_mem (bool or np.ndarray): Switch enabling the measurement. Any
            falsy value disables it. An array without a single truth value
            (e.g. a multi-GPU memory map) enables it when it is non-empty.
        device (str): The device the experiment runs on, e.g. ``'cpu'`` or
            ``'cuda:0'``.

    Returns:
        int: Sum of the ``used_memory`` values that ``nvidia-smi`` reports
            for the current process ID (0 if the process is not listed), or
            ``-1`` if the measurement was skipped.

    Raises:
        subprocess.CalledProcessError: If ``nvidia-smi`` exits with a
            non-zero status.
    """
    try:
        enabled = bool(gpu_mem)
    except ValueError:
        # NumPy refuses to collapse a multi-element (or, on recent versions,
        # empty) array into one boolean; fall back to "non-empty means on".
        enabled = np.size(gpu_mem) > 0

    if not (enabled and device != 'cpu' and torch.cuda.is_available()):
        return -1

    result = subprocess.check_output([
        'nvidia-smi', '--query-compute-apps=pid,used_memory',
        '--format=csv,nounits,noheader'
    ], encoding='utf-8')
    current_pid = os.getpid()
    used_memory = 0
    for row in result.splitlines():
        fields = [field.strip() for field in row.split(',')]
        if len(fields) < 2:
            # Blank output (no compute apps running) or a malformed row.
            continue
        try:
            pid, memory = int(fields[0]), int(fields[1])
        except ValueError:
            # Non-numeric values such as "[N/A]" cannot be accounted for.
            continue
        if pid == current_pid:
            used_memory += memory
    return used_memory


def auto_select_device(logger,
                       cfg: CN,
                       memory_max: int = 8000,
                       memory_bias: int = 200,
                       strategy: str = 'random'):
    """Resolve ``cfg.device`` to a concrete device, picking a GPU if asked.

    Behaviour, in order:

    * If ``cfg.device`` is ``'cpu'`` or CUDA is unavailable, ``cfg.device``
      is set to ``'cpu'``.
    * If ``cfg.device`` is anything other than ``'auto'``, it is left as is.
    * Otherwise a GPU index is chosen from the current memory usage and
      ``cfg.device`` becomes ``'cuda:<index>'``.

    Args:
        logger: Logger used to report the memory map and the selection.
        cfg (CN): Config; its ``device`` attribute is read and updated in
            place.
        memory_max (int, optional): Usage threshold. GPUs above it get zero
            probability under random selection; if every GPU is above it,
            the least-used GPU is taken instead. Defaults to 8000.
        memory_bias (int, optional): Constant added to every GPU's usage
            before inverting it, which avoids division by zero for idle
            GPUs. Defaults to 200.
        strategy (str, optional): ``'greedy'`` always takes the least-used
            GPU; any other value samples a GPU with probability
            proportional to ``1 / (usage + memory_bias)``. Defaults to
            ``'random'``.

    Returns:
        CN: The same config object that was passed in.
    """
    # De Morgan form of the original guard; evaluation order is unchanged.
    if cfg.device == 'cpu' or not torch.cuda.is_available():
        cfg.device = 'cpu'
        return cfg

    if cfg.device != 'auto':
        # An explicit device was requested; nothing to select.
        return cfg

    usage = get_gpu_memory_map()
    over_limit = usage > memory_max

    if strategy == 'greedy' or np.all(over_limit):
        chosen = np.argmin(usage)
        logger.info('GPU Mem: {}'.format(usage))
        greedy_msg = 'Greedy select GPU, select GPU {} with mem: {}'.format(
            chosen, usage[chosen])
        logger.info(greedy_msg)
    else:
        # Less-used GPUs get larger weights; overloaded ones are excluded.
        weights = 1 / (usage + memory_bias)
        weights[over_limit] = 0
        gpu_prob = weights / weights.sum()
        chosen = np.random.choice(len(gpu_prob), p=gpu_prob)
        logger.info('GPU Mem: {}'.format(usage))
        logger.info('GPU Prob: {}'.format(gpu_prob.round(2)))
        random_msg = 'Random select GPU, select GPU {} with mem: {}'.format(
            chosen, usage[chosen])
        logger.info(random_msg)

    cfg.device = 'cuda:{}'.format(chosen)
    return cfg