抢占GPU的脚本

时间:2023-08-26
本文章向大家介绍抢占GPU的脚本,主要内容包括其使用实例、应用技巧、基本知识点总结和需要注意事项,具有一定的参考价值,需要的朋友可以参考一下。

我主要利用Python多进程编程,通过占用GPU内存,从而达到占用GPU的目的。关于代码的解释见我的个人博客,以下主要介绍如何使用该脚本。

我的Python版本为3.11,执行命令如下

python grab_gpu.py --n 3 --otime 30 --spath ./train.sh

其中n表示需要占用的GPU个数,otime表示占用时间,spath表示一旦释放GPU后,我们需要执行的脚本。

运行结果如下:

完整代码如下

import os
import subprocess
import time
import argparse
from multiprocessing import Process, Value, Lock, Array

def get_gpu_mem(gpu_id):
    gpu_query = subprocess.check_output(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,nounits,noheader'])
    gpu_memory = [int(x) for x in gpu_query.decode('utf-8').split('\n')[:-1]]
    return gpu_memory[gpu_id]

def get_free_gpus()->list:
    gpu_query = subprocess.check_output(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,nounits,noheader'])
    gpu_memory = [int(x) for x in gpu_query.decode('utf-8').split('\n')[:-1]]
    free_gpus = [i for i, mem in enumerate(gpu_memory) if mem < 100]
    return free_gpus

def occupy_gpu(gpu_id:int, n, occupy_num, ocpy_gpus, lock, a_dim=140000):
    with lock:
        if get_gpu_mem(gpu_id) < 100 and occupy_num.value < n:
            import torch
            a = torch.ones((a_dim,a_dim)).cuda(gpu_id)
            ocpy_gpus[occupy_num.value]= gpu_id
            occupy_num.value += 1
            print(f"Occupying GPU {gpu_id}, Total Occupied: {occupy_num.value}")
    while True:
        time.sleep(10)

def occupy_all_gpus(n:int, occupy_num, ocpy_gpus, interval=10):
    print("Launching process to occupy GPU ...")
    lock = Lock()
    processes = [] #List to store the processes
    while occupy_num.value < n:
        free_gpus = get_free_gpus()
        will_occupy_num = min(n, max(0,len(free_gpus)))
        for i in range(will_occupy_num):
            if occupy_num.value < n:
                p = Process(target=occupy_gpu, args=(free_gpus[i], n, occupy_num, ocpy_gpus, lock))
                p.start()
                processes.append(p)
        time.sleep(interval) # enough time to occupy gpus and update nvidia-smi
    return processes, ocpy_gpus

def run_my_program(n, desired_script, processes, ocpy_gpus, occupy_num):
    for p in processes:
        p.terminate()
    ocpy_gpus_list = list(ocpy_gpus[:occupy_num.value])
    cuda_visible_devices = ",".join(map(str, ocpy_gpus_list))
    os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices
    subprocess.run([desired_script, str(n)])
    
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Arguments for Occupy GPUs")
    parser.add_argument(
        "--n", type=int, default=2, help="Number of GPUs to occupy"
    )
    parser.add_argument(
        "--otime", type=int, default=10, help="Time of occupying gpu" 
    )
    parser.add_argument(
        "--spath", type=str, default='./train.sh', help="the execute script path"
    )
    args = parser.parse_args()
    n = args.n
    occupy_time = args.otime
    desired_script = args.spath
    occupy_num = Value('i', 0) # Shared variable to count occupied GPUs
    ocpy_gpus =  Array('i', [-1 for _ in range(8)])# Shared array to store occupied gpu
    processes,ocpy_gpus = occupy_all_gpus(n, occupy_num, ocpy_gpus)
    time.sleep(occupy_time)
    run_my_program(n, desired_script, processes, ocpy_gpus, occupy_num)

原文地址:https://www.cnblogs.com/littletreee/p/17658006.html