|
20 | 20 | import traceback
|
21 | 21 | from typing import List, Union
|
22 | 22 |
|
23 |
| -from openrl.supports.opengpu.gpu_info import get_local_GPU_info, get_remote_GPU_info |
24 |
| - |
25 |
| - |
26 |
| -class RemoteGPUManager: |
27 |
| - def __init__(self, pytorch_config=None, check: bool = False): |
28 |
| - self.gpu_info_dict = get_remote_GPU_info() |
29 |
| - self.pytorch_config = pytorch_config |
30 |
| - self.server_list = [] |
31 |
| - if self.pytorch_config is not None: |
32 |
| - for server_address in self.pytorch_config.GPU_usage_dict: |
33 |
| - self.server_list.append(server_address) |
34 |
| - |
35 |
| - if check: |
36 |
| - self.check_gpus() |
37 |
| - |
38 |
| - self.cal_learner_number() |
39 |
| - |
40 |
| - def check_gpus(self): |
41 |
| - assert self.pytorch_config is not None |
42 |
| - assert len(self.server_list) > 0 |
43 |
| - |
44 |
| - bad_gpus = [] |
45 |
| - for server_address in self.server_list: |
46 |
| - assert ( |
47 |
| - server_address in self.gpu_info_dict |
48 |
| - ), "can not get gpu info from {}".format(server_address) |
49 |
| - assert len(self.gpu_info_dict[server_address]["gpu_infos"]) > 0 |
50 |
| - |
51 |
| - for gpu_info in self.gpu_info_dict[server_address]["gpu_infos"]: |
52 |
| - if ( |
53 |
| - self.pytorch_config.GPU_usage_dict[server_address]["gpus"] == "all" |
54 |
| - or gpu_info["gpu"] |
55 |
| - in self.pytorch_config.GPU_usage_dict[server_address]["gpus"] |
56 |
| - ): |
57 |
| - if ( |
58 |
| - gpu_info["memory"]["total"] - gpu_info["memory"]["used"] |
59 |
| - < self.pytorch_config.min_memory_per_gpu |
60 |
| - ): |
61 |
| - bad_gpus.append( |
62 |
| - { |
63 |
| - "server": server_address, |
64 |
| - "gpu": gpu_info["gpu"], |
65 |
| - "free": ( |
66 |
| - gpu_info["memory"]["total"] |
67 |
| - - gpu_info["memory"]["used"] |
68 |
| - ), |
69 |
| - } |
70 |
| - ) |
71 |
| - if len(bad_gpus) > 0: |
72 |
| - for bad_gpu in bad_gpus: |
73 |
| - print( |
74 |
| - "server:{} GPU:{}, minimal memory {}GB, but only get {}GB free" |
75 |
| - " memory.".format( |
76 |
| - bad_gpu["server"], |
77 |
| - bad_gpu["gpu"], |
78 |
| - self.pytorch_config.min_memory_per_gpu, |
79 |
| - bad_gpu["free"], |
80 |
| - ) |
81 |
| - ) |
82 |
| - assert False, "GPUs not satisfy." |
83 |
| - |
84 |
| - def cal_learner_number(self): |
85 |
| - self.server_gpu_mapping = {} |
86 |
| - gpu_num = 0 |
87 |
| - for server_address in self.server_list: |
88 |
| - gpu_mapping = {} |
89 |
| - for gpu_info in self.gpu_info_dict[server_address]["gpu_infos"]: |
90 |
| - if ( |
91 |
| - self.pytorch_config.GPU_usage_dict[server_address]["gpus"] == "all" |
92 |
| - or gpu_info["gpu"] |
93 |
| - in self.pytorch_config.GPU_usage_dict[server_address]["gpus"] |
94 |
| - ): |
95 |
| - gpu_mapping[gpu_info["gpu"]] = gpu_num |
96 |
| - gpu_num += 1 |
97 |
| - self.server_gpu_mapping[server_address] = gpu_mapping |
98 |
| - self.learner_num = gpu_num |
99 |
| - |
100 |
| - def get_gpu_info(self, server_list: list): |
101 |
| - gpu_infos = {} |
102 |
| - for server_address in server_list: |
103 |
| - if server_address in self.gpu_info_dict: |
104 |
| - gpu_infos[server_address] = self.gpu_info_dict[server_address] |
105 |
| - return gpu_infos |
| 23 | +from openrl.supports.opengpu.gpu_info import get_local_GPU_info |
| 24 | + |
| 25 | +# from openrl.supports.opengpu.gpu_info import get_remote_GPU_info |
| 26 | + |
| 27 | + |
| 28 | +# class RemoteGPUManager: |
| 29 | +# def __init__(self, pytorch_config=None, check: bool = False): |
| 30 | +# self.gpu_info_dict = get_remote_GPU_info() |
| 31 | +# self.pytorch_config = pytorch_config |
| 32 | +# self.server_list = [] |
| 33 | +# if self.pytorch_config is not None: |
| 34 | +# for server_address in self.pytorch_config.GPU_usage_dict: |
| 35 | +# self.server_list.append(server_address) |
| 36 | +# |
| 37 | +# if check: |
| 38 | +# self.check_gpus() |
| 39 | +# |
| 40 | +# self.cal_learner_number() |
| 41 | +# |
| 42 | +# def check_gpus(self): |
| 43 | +# assert self.pytorch_config is not None |
| 44 | +# assert len(self.server_list) > 0 |
| 45 | +# |
| 46 | +# bad_gpus = [] |
| 47 | +# for server_address in self.server_list: |
| 48 | +# assert ( |
| 49 | +# server_address in self.gpu_info_dict |
| 50 | +# ), "can not get gpu info from {}".format(server_address) |
| 51 | +# assert len(self.gpu_info_dict[server_address]["gpu_infos"]) > 0 |
| 52 | +# |
| 53 | +# for gpu_info in self.gpu_info_dict[server_address]["gpu_infos"]: |
| 54 | +# if ( |
| 55 | +# self.pytorch_config.GPU_usage_dict[server_address]["gpus"] == "all" |
| 56 | +# or gpu_info["gpu"] |
| 57 | +# in self.pytorch_config.GPU_usage_dict[server_address]["gpus"] |
| 58 | +# ): |
| 59 | +# if ( |
| 60 | +# gpu_info["memory"]["total"] - gpu_info["memory"]["used"] |
| 61 | +# < self.pytorch_config.min_memory_per_gpu |
| 62 | +# ): |
| 63 | +# bad_gpus.append( |
| 64 | +# { |
| 65 | +# "server": server_address, |
| 66 | +# "gpu": gpu_info["gpu"], |
| 67 | +# "free": ( |
| 68 | +# gpu_info["memory"]["total"] |
| 69 | +# - gpu_info["memory"]["used"] |
| 70 | +# ), |
| 71 | +# } |
| 72 | +# ) |
| 73 | +# if len(bad_gpus) > 0: |
| 74 | +# for bad_gpu in bad_gpus: |
| 75 | +# print( |
| 76 | +# "server:{} GPU:{}, minimal memory {}GB, but only get {}GB free" |
| 77 | +# " memory.".format( |
| 78 | +# bad_gpu["server"], |
| 79 | +# bad_gpu["gpu"], |
| 80 | +# self.pytorch_config.min_memory_per_gpu, |
| 81 | +# bad_gpu["free"], |
| 82 | +# ) |
| 83 | +# ) |
| 84 | +# assert False, "GPUs not satisfy." |
| 85 | +# |
| 86 | +# def cal_learner_number(self): |
| 87 | +# self.server_gpu_mapping = {} |
| 88 | +# gpu_num = 0 |
| 89 | +# for server_address in self.server_list: |
| 90 | +# gpu_mapping = {} |
| 91 | +# for gpu_info in self.gpu_info_dict[server_address]["gpu_infos"]: |
| 92 | +# if ( |
| 93 | +# self.pytorch_config.GPU_usage_dict[server_address]["gpus"] == "all" |
| 94 | +# or gpu_info["gpu"] |
| 95 | +# in self.pytorch_config.GPU_usage_dict[server_address]["gpus"] |
| 96 | +# ): |
| 97 | +# gpu_mapping[gpu_info["gpu"]] = gpu_num |
| 98 | +# gpu_num += 1 |
| 99 | +# self.server_gpu_mapping[server_address] = gpu_mapping |
| 100 | +# self.learner_num = gpu_num |
| 101 | +# |
| 102 | +# def get_gpu_info(self, server_list: list): |
| 103 | +# gpu_infos = {} |
| 104 | +# for server_address in server_list: |
| 105 | +# if server_address in self.gpu_info_dict: |
| 106 | +# gpu_infos[server_address] = self.gpu_info_dict[server_address] |
| 107 | +# return gpu_infos |
106 | 108 |
|
107 | 109 |
|
108 | 110 | class LocalGPUManager:
|
|
0 commit comments