diff --git a/docs/en/reference/utils/torch_utils.md b/docs/en/reference/utils/torch_utils.md
index 8ec53d82691..8242b70aba0 100644
--- a/docs/en/reference/utils/torch_utils.md
+++ b/docs/en/reference/utils/torch_utils.md
@@ -127,6 +127,10 @@ keywords: Ultralytics, torch utils, model optimization, device selection, infere
 
 <br><br><hr><br>
 
+## ::: ultralytics.utils.torch_utils.cuda_memory_usage
+
+<br><br><hr><br>
+
 ## ::: ultralytics.utils.torch_utils.profile
 
diff --git a/ultralytics/__init__.py b/ultralytics/__init__.py
index c91c4cc584f..177afda2c32 100644
--- a/ultralytics/__init__.py
+++ b/ultralytics/__init__.py
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
-__version__ = "8.3.51"
+__version__ = "8.3.52"
 
 import os
 
diff --git a/ultralytics/utils/torch_utils.py b/ultralytics/utils/torch_utils.py
index a6e7447629b..fc8dc71e15e 100644
--- a/ultralytics/utils/torch_utils.py
+++ b/ultralytics/utils/torch_utils.py
@@ -617,6 +617,32 @@ def convert_optimizer_state_dict_to_fp16(state_dict):
     return state_dict
 
 
+@contextmanager
+def cuda_memory_usage(device=None):
+    """
+    Monitor and manage CUDA memory usage.
+
+    This function checks if CUDA is available and, if so, empties the CUDA cache to free up unused memory.
+    It then yields a dictionary containing memory usage information, which can be updated by the caller.
+    Finally, it updates the dictionary with the amount of memory reserved by CUDA on the specified device.
+
+    Args:
+        device (torch.device, optional): The CUDA device to query memory usage for. Defaults to None.
+
+    Yields:
+        (dict): A dictionary with a key 'memory' initialized to 0, which will be updated with the reserved memory.
+    """
+    cuda_info = dict(memory=0)
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        try:
+            yield cuda_info
+        finally:
+            cuda_info["memory"] = torch.cuda.memory_reserved(device)
+    else:
+        yield cuda_info
+
+
 def profile(input, ops, n=10, device=None, max_num_obj=0):
     """
     Ultralytics speed, memory and FLOPs profiler.
@@ -653,27 +679,31 @@ def profile(input, ops, n=10, device=None, max_num_obj=0):
                 flops = 0
 
             try:
+                mem = 0
                 for _ in range(n):
-                    t[0] = time_sync()
-                    y = m(x)
-                    t[1] = time_sync()
-                    try:
-                        (sum(yi.sum() for yi in y) if isinstance(y, list) else y).sum().backward()
-                        t[2] = time_sync()
-                    except Exception:  # no backward method
-                        # print(e)  # for debug
-                        t[2] = float("nan")
+                    with cuda_memory_usage(device) as cuda_info:
+                        t[0] = time_sync()
+                        y = m(x)
+                        t[1] = time_sync()
+                        try:
+                            (sum(yi.sum() for yi in y) if isinstance(y, list) else y).sum().backward()
+                            t[2] = time_sync()
+                        except Exception:  # no backward method
+                            # print(e)  # for debug
+                            t[2] = float("nan")
+                    mem += cuda_info["memory"] / 1e9  # (GB)
                     tf += (t[1] - t[0]) * 1000 / n  # ms per op forward
                     tb += (t[2] - t[1]) * 1000 / n  # ms per op backward
                     if max_num_obj:  # simulate training with predictions per image grid (for AutoBatch)
-                        torch.randn(
-                            x.shape[0],
-                            max_num_obj,
-                            int(sum((x.shape[-1] / s) * (x.shape[-2] / s) for s in m.stride.tolist())),
-                            device=device,
-                            dtype=torch.float32,
-                        )
-                mem = torch.cuda.memory_reserved() / 1e9 if torch.cuda.is_available() else 0  # (GB)
+                        with cuda_memory_usage(device) as cuda_info:
+                            torch.randn(
+                                x.shape[0],
+                                max_num_obj,
+                                int(sum((x.shape[-1] / s) * (x.shape[-2] / s) for s in m.stride.tolist())),
+                                device=device,
+                                dtype=torch.float32,
+                            )
+                        mem += cuda_info["memory"] / 1e9  # (GB)
             s_in, s_out = (tuple(x.shape) if isinstance(x, torch.Tensor) else "list" for x in (x, y))  # shapes
             p = sum(x.numel() for x in m.parameters()) if isinstance(m, nn.Module) else 0  # parameters
             LOGGER.info(f"{p:12}{flops:12.4g}{mem:>14.3f}{tf:14.4g}{tb:14.4g}{str(s_in):>24s}{str(s_out):>24s}")