utils.py
"""Utility helpers for querying GPU memory and utilization via nvidia-smi and for
shuttling those readings between processes during training."""
import torch
import subprocess
import time
import os
# import gc  # only needed if the explicit garbage collection below is re-enabled

def print_gpu_info():
    """Query nvidia-smi and print per-GPU memory usage and utilization."""
    cmd = 'nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu --format=csv'
    result = subprocess.check_output(cmd, shell=True).decode('utf-8').strip()
    # Skip the CSV header row; each remaining row is "index, used MiB, total MiB, util %".
    gpu_info = [line.split(', ') for line in result.split('\n')[1:]]
    for info in gpu_info:
        gpu_index, memory_used, memory_total, gpu_utilization = info
        device_name = torch.cuda.get_device_name(int(gpu_index))
        # Drop the unit suffix ("MiB", "%") and keep only the numeric value.
        memory_used = int(memory_used.split()[0])
        memory_total = int(memory_total.split()[0])
        gpu_utilization = int(gpu_utilization.split()[0])
        print(f"GPU {gpu_index}: Device {device_name} Memory Used: {memory_used:.2f} MiB, "
              f"Memory Total: {memory_total:.2f} MiB, GPU Utilization: {gpu_utilization} %")

def return_gpu_info():
    """Return (index, name, memory used, memory total, utilization) for the first GPU.

    Note: the return statement sits inside the loop, so only the first row
    reported by nvidia-smi is returned.
    """
    cmd = 'nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu --format=csv'
    result = subprocess.check_output(cmd, shell=True).decode('utf-8').strip()
    gpu_info = [line.split(', ') for line in result.split('\n')[1:]]
    for info in gpu_info:
        gpu_index, memory_used, memory_total, gpu_utilization = info
        device_name = torch.cuda.get_device_name(int(gpu_index))
        memory_used = int(memory_used.split()[0])
        memory_total = int(memory_total.split()[0])
        gpu_utilization = int(gpu_utilization.split()[0])
        return (gpu_index, device_name, memory_used, memory_total, gpu_utilization)

def get_info(event_start_read_GPU_info, queue_gpu_info):
    """Background worker: wait for the trigger event, push one GPU reading, repeat."""
    while True:
        event_start_read_GPU_info.wait()
        queue_gpu_info.put(return_gpu_info())
        event_start_read_GPU_info.clear()
        # print('GPU event is cleared')

def clear_gpu_mem_util(model, images, labels):
    """Release cached GPU memory; the commented lines hint at dropping references (and gc) first."""
    # model = None
    # images = None
    # labels = None
    # gc.collect()
    torch.cuda.empty_cache()

def check_gpu_info_queue(queue_gpu_info):
    """Drain the queue so that no stale GPU readings remain."""
    while not queue_gpu_info.empty():
        _ = queue_gpu_info.get()

def trig_GPU_read(queue_gpu_info, event_start_read_GPU_info):
    """Ask the background worker for a fresh reading if none is pending."""
    if queue_gpu_info.empty() and (not event_start_read_GPU_info.is_set()):
        event_start_read_GPU_info.set()
        # print("GPU read event is set")

def get_info_from_GPU_queue(queue_gpu_info, event_start_read_GPU_info):
    """Return (name, total memory, used memory, utilization) if a reading is ready, else None."""
    if not queue_gpu_info.empty() and (not event_start_read_GPU_info.is_set()):
        # print("GPU info is read")
        queue_gpu_info_taken = queue_gpu_info.get()
        device_name = queue_gpu_info_taken[1]
        device_mem_cap = queue_gpu_info_taken[3]
        gpu_mem_usage = int(queue_gpu_info_taken[2])
        gpu_util = int(queue_gpu_info_taken[4])
        return (device_name, device_mem_cap, gpu_mem_usage, gpu_util)
    else:
        return None
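

# --- Illustrative usage sketch (not part of the original module) --------------
# The event/queue helpers above assume a producer/consumer setup: a background
# process runs get_info(), blocking until the training loop requests a reading,
# and publishes one (index, name, used, total, util) tuple per request. The
# hypothetical example_monitor_usage() below is a hedged sketch of that wiring;
# the daemon process, the 0.5 s wait, and the 10-step loop are assumptions for
# illustration, not part of the original code.
def example_monitor_usage():
    import multiprocessing as mp
    event_start_read_GPU_info = mp.Event()
    queue_gpu_info = mp.Queue()
    # Caveat: if CUDA is already initialized in the parent, a forked worker may
    # fail inside torch.cuda.get_device_name(); the 'spawn' start method is the
    # safer assumption in that case.
    reader = mp.Process(target=get_info,
                        args=(event_start_read_GPU_info, queue_gpu_info),
                        daemon=True)
    reader.start()
    check_gpu_info_queue(queue_gpu_info)  # drain any stale readings
    for _ in range(10):                   # stand-in for a training loop
        trig_GPU_read(queue_gpu_info, event_start_read_GPU_info)
        time.sleep(0.5)                   # give nvidia-smi time to answer
        info = get_info_from_GPU_queue(queue_gpu_info, event_start_read_GPU_info)
        if info is not None:
            device_name, device_mem_cap, gpu_mem_usage, gpu_util = info
            print(f"{device_name}: {gpu_mem_usage}/{device_mem_cap} MiB, {gpu_util} % util")
    reader.terminate()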

def save_results_as_checkpoint(output_path, device_name, training_params, optimizer_types, train_gpu_mem_usage):
    """Save the collected results under <output_path>/<device_name>/checkpoint.pth."""
    folder_path = os.path.join(output_path, device_name)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    torch.save({'training_params': training_params,
                'optimizer_types': optimizer_types,
                'train_gpu_mem_usage': train_gpu_mem_usage,
                }, os.path.join(folder_path, 'checkpoint.pth'))
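

# Hedged companion sketch (not in the original file): reading back the checkpoint
# written by save_results_as_checkpoint(). The name load_results_from_checkpoint
# is hypothetical. Depending on the PyTorch version and the stored types,
# torch.load() may need weights_only=False.
def load_results_from_checkpoint(output_path, device_name):
    ckpt_path = os.path.join(output_path, device_name, 'checkpoint.pth')
    checkpoint = torch.load(ckpt_path)
    return (checkpoint['training_params'],
            checkpoint['optimizer_types'],
            checkpoint['train_gpu_mem_usage'])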

if __name__ == '__main__':
    # Standalone mode: print GPU stats once per second.
    while True:
        print_gpu_info()
        time.sleep(1)