From 46e9b763fd5da96e8b24df4a63bee965c0edfb16 Mon Sep 17 00:00:00 2001
From: Tim Grant <tgrant@nvidia.com>
Date: Wed, 6 Nov 2024 11:43:46 -0700
Subject: [PATCH] Wrap the cudaMalloc and cudaMemcpy calls.

Signed-off-by: Tim Grant <tgrant@nvidia.com>
---
 src/testrender/optixraytracer.cpp | 217 ++++++++++--------------------
 src/testrender/optixraytracer.h   |   4 +-
 src/testshade/optixgridrender.cpp |  78 ++++-------
 src/testshade/optixgridrender.h   |   4 +-
 4 files changed, 100 insertions(+), 203 deletions(-)
diff --git a/src/testrender/optixraytracer.cpp b/src/testrender/optixraytracer.cpp
index a17e62937..ab5aff9d1 100644
--- a/src/testrender/optixraytracer.cpp
+++ b/src/testrender/optixraytracer.cpp
@@ -83,6 +83,12 @@ OSL_NAMESPACE_ENTER
         }                                                                  \
     }
 
+
+#define DEVICE_ALLOC(size) reinterpret_cast<CUdeviceptr>(device_alloc(size))
+#define COPY_TO_DEVICE(dst_device, src_host, size) \
+    copy_to_device(reinterpret_cast<void*>(dst_device), src_host, size)
+
+
 static void
 context_log_cb(unsigned int level, const char* tag, const char* message,
                void* /*cbdata */)
@@ -116,10 +122,10 @@ OptixRaytracer::~OptixRaytracer()
 {
     if (m_optix_ctx)
         OPTIX_CHECK(optixDeviceContextDestroy(m_optix_ctx));
-    for (void* ptr : m_ptrs_to_free)
+    for (CUdeviceptr ptr : m_ptrs_to_free)
         cudaFree(reinterpret_cast<void*>(ptr));
-    for (void* ptr : m_arrays_to_free)
-        cudaFreeArray(reinterpret_cast<cudaArray_t>(ptr));
+    for (cudaArray_t arr : m_arrays_to_free)
+        cudaFreeArray(arr);
 }
 
 
@@ -133,6 +139,7 @@ OptixRaytracer::device_alloc(size_t size)
         errhandler().errorfmt("cudaMalloc({}) failed with error: {}\n", size,
                               cudaGetErrorString(res));
     }
+    m_ptrs_to_free.push_back(reinterpret_cast<CUdeviceptr>(ptr));
     return ptr;
 }
 
@@ -237,18 +244,13 @@ OptixRaytracer::synch_attributes()
         const size_t podDataSize = cpuDataSize
                                    - sizeof(ustringhash) * numStrings;
 
-        CUDA_CHECK(
-            cudaMalloc(reinterpret_cast<void**>(&d_color_system),
-                       podDataSize + sizeof(ustringhash_pod) * numStrings));
-        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_color_system), colorSys,
-                              podDataSize, cudaMemcpyHostToDevice));
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_color_system));
+        d_color_system = DEVICE_ALLOC(podDataSize
+                                      + sizeof(ustringhash_pod) * numStrings);
+        COPY_TO_DEVICE(d_color_system, colorSys, podDataSize);
 
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_osl_printf_buffer),
-                              OSL_PRINTF_BUFFER_SIZE));
+        d_osl_printf_buffer = DEVICE_ALLOC(OSL_PRINTF_BUFFER_SIZE);
         CUDA_CHECK(cudaMemset(reinterpret_cast<void*>(d_osl_printf_buffer), 0,
                               OSL_PRINTF_BUFFER_SIZE));
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_osl_printf_buffer));
 
         // then copy the device string to the end, first strings starting at dataPtr - (numStrings)
         // FIXME -- Should probably handle alignment better.
@@ -260,8 +262,7 @@ OptixRaytracer::synch_attributes()
         for (const ustringhash* end = cpuStringHash + numStrings;
              cpuStringHash < end; ++cpuStringHash) {
             ustringhash_pod devStr = cpuStringHash->hash();
-            CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(gpuStrings), &devStr,
-                                  sizeof(devStr), cudaMemcpyHostToDevice));
+            COPY_TO_DEVICE(gpuStrings, &devStr, sizeof(devStr));
             gpuStrings += sizeof(ustringhash_pod);
         }
     }
@@ -556,13 +557,10 @@ OptixRaytracer::create_shaders()
     }
 
     // Upload per-material interactive buffer table
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_interactive_params),
-                          sizeof(void*) * material_interactive_params.size()));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_interactive_params),
-                          material_interactive_params.data(),
-                          sizeof(void*) * material_interactive_params.size(),
-                          cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_interactive_params));
+    d_interactive_params = DEVICE_ALLOC(sizeof(void*)
+                                        * material_interactive_params.size());
+    COPY_TO_DEVICE(d_interactive_params, material_interactive_params.data(),
+                   sizeof(void*) * material_interactive_params.size());
 }
 
 
@@ -650,12 +648,8 @@ OptixRaytracer::create_sbt()
         GenericRecord raygen_record;
         CUdeviceptr d_raygen_record;
         OPTIX_CHECK(optixSbtRecordPackHeader(m_raygen_group, &raygen_record));
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_raygen_record),
-                              sizeof(GenericRecord)));
-        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_raygen_record),
-                              &raygen_record, sizeof(GenericRecord),
-                              cudaMemcpyHostToDevice));
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_raygen_record));
+        d_raygen_record = DEVICE_ALLOC(sizeof(GenericRecord));
+        COPY_TO_DEVICE(d_raygen_record, &raygen_record, sizeof(GenericRecord));
 
         m_optix_sbt.raygenRecord = d_raygen_record;
     }
@@ -665,12 +659,8 @@ OptixRaytracer::create_sbt()
         GenericRecord miss_record;
         CUdeviceptr d_miss_record;
         OPTIX_CHECK(optixSbtRecordPackHeader(m_miss_group, &miss_record));
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_miss_record),
-                              sizeof(GenericRecord)));
-        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_miss_record),
-                              &miss_record, sizeof(GenericRecord),
-                              cudaMemcpyHostToDevice));
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_miss_record));
+        d_miss_record = DEVICE_ALLOC(sizeof(GenericRecord));
+        COPY_TO_DEVICE(d_miss_record, &miss_record, sizeof(GenericRecord));
 
         m_optix_sbt.missRecordBase          = d_miss_record;
         m_optix_sbt.missRecordStrideInBytes = sizeof(GenericRecord);
@@ -685,13 +675,9 @@ OptixRaytracer::create_sbt()
         OPTIX_CHECK(
             optixSbtRecordPackHeader(m_closesthit_group, &hitgroup_records[0]));
 
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_hitgroup_records),
-                              nhitgroups * sizeof(GenericRecord)));
-        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_hitgroup_records),
-                              &hitgroup_records[0],
-                              nhitgroups * sizeof(GenericRecord),
-                              cudaMemcpyHostToDevice));
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_hitgroup_records));
+        d_hitgroup_records = DEVICE_ALLOC(nhitgroups * sizeof(GenericRecord));
+        COPY_TO_DEVICE(d_hitgroup_records, &hitgroup_records[0],
+                       nhitgroups * sizeof(GenericRecord));
 
         m_optix_sbt.hitgroupRecordBase          = d_hitgroup_records;
         m_optix_sbt.hitgroupRecordStrideInBytes = sizeof(GenericRecord);
@@ -709,13 +695,9 @@ OptixRaytracer::create_sbt()
                                                  &callable_records[idx]));
         }
 
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_callable_records),
-                              (nshaders) * sizeof(GenericRecord)));
-        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_callable_records),
-                              callable_records.data(),
-                              (nshaders) * sizeof(GenericRecord),
-                              cudaMemcpyHostToDevice));
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_callable_records));
+        d_callable_records = DEVICE_ALLOC((nshaders) * sizeof(GenericRecord));
+        COPY_TO_DEVICE(d_callable_records, callable_records.data(),
+                       (nshaders) * sizeof(GenericRecord));
 
         m_optix_sbt.callablesRecordBase          = d_callable_records;
         m_optix_sbt.callablesRecordStrideInBytes = sizeof(GenericRecord);
@@ -733,14 +715,9 @@ OptixRaytracer::create_sbt()
         CUdeviceptr d_setglobals_raygen_record;
         OPTIX_CHECK(
             optixSbtRecordPackHeader(m_setglobals_raygen_group, &record));
-        CUDA_CHECK(
-            cudaMalloc(reinterpret_cast<void**>(&d_setglobals_raygen_record),
-                       sizeof(GenericRecord)));
-        CUDA_CHECK(
-            cudaMemcpy(reinterpret_cast<void*>(d_setglobals_raygen_record),
-                       &record, sizeof(GenericRecord), cudaMemcpyHostToDevice));
-        m_ptrs_to_free.push_back(
-            reinterpret_cast<void*>(d_setglobals_raygen_record));
+        d_setglobals_raygen_record = DEVICE_ALLOC(sizeof(GenericRecord));
+        COPY_TO_DEVICE(d_setglobals_raygen_record, &record,
+                       sizeof(GenericRecord));
 
         m_setglobals_optix_sbt.raygenRecord = d_setglobals_raygen_record;
     }
@@ -750,14 +727,9 @@ OptixRaytracer::create_sbt()
         GenericRecord record;
         CUdeviceptr d_setglobals_miss_record;
         OPTIX_CHECK(optixSbtRecordPackHeader(m_setglobals_miss_group, &record));
-        CUDA_CHECK(
-            cudaMalloc(reinterpret_cast<void**>(&d_setglobals_miss_record),
-                       sizeof(GenericRecord)));
-        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_setglobals_miss_record),
-                              &record, sizeof(GenericRecord),
-                              cudaMemcpyHostToDevice));
-        m_ptrs_to_free.push_back(
-            reinterpret_cast<void*>(d_setglobals_miss_record));
+        d_setglobals_miss_record = DEVICE_ALLOC(sizeof(GenericRecord));
+        COPY_TO_DEVICE(d_setglobals_miss_record, &record,
+                       sizeof(GenericRecord));
 
         m_setglobals_optix_sbt.missRecordBase = d_setglobals_miss_record;
         m_setglobals_optix_sbt.missRecordStrideInBytes = sizeof(GenericRecord);
@@ -797,20 +769,12 @@ OptixRaytracer::build_accel()
     accel_options.operation              = OPTIX_BUILD_OPERATION_BUILD;
 
     const size_t vertices_size = sizeof(Vec3) * scene.verts.size();
-    CUDA_CHECK(
-        cudaMalloc(reinterpret_cast<void**>(&d_vertices), vertices_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_vertices),
-                          scene.verts.data(), vertices_size,
-                          cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_vertices));
+    d_vertices                 = DEVICE_ALLOC(vertices_size);
+    COPY_TO_DEVICE(d_vertices, scene.verts.data(), vertices_size);
 
     const size_t indices_size = scene.triangles.size() * sizeof(int32_t) * 3;
-    CUDA_CHECK(
-        cudaMalloc(reinterpret_cast<void**>(&d_vert_indices), indices_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_vert_indices),
-                          scene.triangles.data(), indices_size,
-                          cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_vert_indices));
+    d_vert_indices            = DEVICE_ALLOC(indices_size);
+    COPY_TO_DEVICE(d_vert_indices, scene.triangles.data(), indices_size);
 
     const uint32_t triangle_input_flags[1] = { OPTIX_GEOMETRY_FLAG_NONE };
     OptixBuildInput triangle_input         = {};
@@ -836,9 +800,7 @@ OptixRaytracer::build_accel()
     CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_temp_buffer),
                           gas_buffer_sizes.tempSizeInBytes));
 
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_accel_output_buffer),
-                          gas_buffer_sizes.outputSizeInBytes));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_accel_output_buffer));
+    d_accel_output_buffer = DEVICE_ALLOC(gas_buffer_sizes.outputSizeInBytes);
 
     OPTIX_CHECK(optixAccelBuild(
         m_optix_ctx, 0, &accel_options, &triangle_input, 1, d_temp_buffer,
@@ -855,16 +817,9 @@ OptixRaytracer::prepare_background()
 {
     if (getBackgroundShaderID() >= 0) {
         const int bg_res = std::max<int>(32, getBackgroundResolution());
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_bg_values),
-                              3 * sizeof(float) * bg_res * bg_res));
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_bg_rows),
-                              sizeof(float) * bg_res));
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_bg_cols),
-                              sizeof(float) * bg_res * bg_res));
-
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_bg_values));
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_bg_rows));
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_bg_cols));
+        d_bg_values      = DEVICE_ALLOC(3 * sizeof(float) * bg_res * bg_res);
+        d_bg_rows        = DEVICE_ALLOC(sizeof(float) * bg_res);
+        d_bg_cols        = DEVICE_ALLOC(sizeof(float) * bg_res * bg_res);
     }
 }
 
@@ -875,46 +830,29 @@ OptixRaytracer::upload_mesh_data()
 {
     // Upload the extra geometry data to the device
     const size_t uvs_size = sizeof(Vec2) * scene.uvs.size();
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_uvs), uvs_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_uvs), scene.uvs.data(),
-                          uvs_size, cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_uvs));
+    d_uvs                 = DEVICE_ALLOC(uvs_size);
+    COPY_TO_DEVICE(d_uvs, scene.uvs.data(), uvs_size);
 
     const size_t uv_indices_size = scene.uv_triangles.size() * sizeof(int32_t)
                                    * 3;
-    CUDA_CHECK(
-        cudaMalloc(reinterpret_cast<void**>(&d_uv_indices), uv_indices_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_uv_indices),
-                          scene.uv_triangles.data(), uv_indices_size,
-                          cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_uv_indices));
+    d_uv_indices = DEVICE_ALLOC(uv_indices_size);
+    COPY_TO_DEVICE(d_uv_indices, scene.uv_triangles.data(), uv_indices_size);
 
     const size_t normals_size = sizeof(Vec3) * scene.normals.size();
     if (normals_size > 0) {
-        CUDA_CHECK(
-            cudaMalloc(reinterpret_cast<void**>(&d_normals), normals_size));
-        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_normals),
-                              scene.normals.data(), normals_size,
-                              cudaMemcpyHostToDevice));
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_normals));
+        d_normals = DEVICE_ALLOC(normals_size);
+        COPY_TO_DEVICE(d_normals, scene.normals.data(), normals_size);
     }
 
     const size_t normal_indices_size = scene.n_triangles.size()
                                        * sizeof(int32_t) * 3;
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_normal_indices),
-                          normal_indices_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_normal_indices),
-                          scene.n_triangles.data(), normal_indices_size,
-                          cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_normal_indices));
+    d_normal_indices = DEVICE_ALLOC(normal_indices_size);
+    COPY_TO_DEVICE(d_normal_indices, scene.n_triangles.data(),
+                   normal_indices_size);
 
     const size_t shader_ids_size = scene.shaderids.size() * sizeof(int);
-    CUDA_CHECK(
-        cudaMalloc(reinterpret_cast<void**>(&d_shader_ids), shader_ids_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_shader_ids),
-                          scene.shaderids.data(), shader_ids_size,
-                          cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_shader_ids));
+    d_shader_ids                 = DEVICE_ALLOC(shader_ids_size);
+    COPY_TO_DEVICE(d_shader_ids, scene.shaderids.data(), shader_ids_size);
 
     // TODO: These could be packed, but for now just use ints instead of bools
     std::vector<int32_t> shader_is_light;
@@ -922,22 +860,16 @@ OptixRaytracer::upload_mesh_data()
         shader_is_light.push_back(is_light);
     const size_t shader_is_light_size = shader_is_light.size()
                                         * sizeof(int32_t);
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_shader_is_light),
-                          shader_is_light_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_shader_is_light),
-                          shader_is_light.data(), shader_is_light_size,
-                          cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_shader_is_light));
+    d_shader_is_light = DEVICE_ALLOC(shader_is_light_size);
+    COPY_TO_DEVICE(d_shader_is_light, shader_is_light.data(),
+                   shader_is_light_size);
 
 
     const size_t lightprims_size = OptixRaytracer::lightprims().size()
                                    * sizeof(uint32_t);
-    CUDA_CHECK(
-        cudaMalloc(reinterpret_cast<void**>(&d_lightprims), lightprims_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_lightprims),
-                          OptixRaytracer::lightprims().data(), lightprims_size,
-                          cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_lightprims));
+    d_lightprims = DEVICE_ALLOC(lightprims_size);
+    COPY_TO_DEVICE(d_lightprims, OptixRaytracer::lightprims().data(),
+                   lightprims_size);
 
     // Copy the mesh ID for each triangle to the device
     std::vector<int> mesh_ids;
@@ -948,11 +880,8 @@ OptixRaytracer::upload_mesh_data()
         mesh_ids.push_back(meshid);
     }
     const size_t mesh_ids_size = mesh_ids.size() * sizeof(int32_t);
-    CUDA_CHECK(
-        cudaMalloc(reinterpret_cast<void**>(&d_mesh_ids), mesh_ids_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_mesh_ids), mesh_ids.data(),
-                          mesh_ids_size, cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_mesh_ids));
+    d_mesh_ids                 = DEVICE_ALLOC(mesh_ids_size);
+    COPY_TO_DEVICE(d_mesh_ids, mesh_ids.data(), mesh_ids_size);
 
     // Copy the mesh surface areas to the device
     std::vector<float> mesh_surfacearea;
@@ -971,12 +900,9 @@ OptixRaytracer::upload_mesh_data()
 
     const size_t mesh_surfacearea_size = mesh_surfacearea.size()
                                          * sizeof(float);
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_surfacearea),
-                          mesh_surfacearea_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_surfacearea),
-                          mesh_surfacearea.data(), mesh_surfacearea_size,
-                          cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_surfacearea));
+    d_surfacearea = DEVICE_ALLOC(mesh_surfacearea_size);
+    COPY_TO_DEVICE(d_surfacearea, mesh_surfacearea.data(),
+                   mesh_surfacearea_size);
 }
 
 
@@ -1061,7 +987,7 @@ OptixRaytracer::get_texture_handle(ustring filename,
         CUDA_CHECK(cudaMemcpy2DToArray(pixelArray, 0, 0, level_pixels[0].data(),
                                        pitch, pitch, img_height,
                                        cudaMemcpyHostToDevice));
-        m_arrays_to_free.push_back(reinterpret_cast<void*>(pixelArray));
+        m_arrays_to_free.push_back(pixelArray);
 
         cudaResourceDesc res_desc  = {};
         res_desc.resType           = cudaResourceTypeMipmappedArray;
@@ -1123,12 +1049,8 @@ OptixRaytracer::warmup()
 void
 OptixRaytracer::render(int xres OSL_MAYBE_UNUSED, int yres OSL_MAYBE_UNUSED)
 {
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_output_buffer),
-                          xres * yres * 4 * sizeof(float)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_launch_params),
-                          sizeof(RenderParams)));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_output_buffer));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_launch_params));
+    d_output_buffer = DEVICE_ALLOC(xres * yres * 4 * sizeof(float));
+    d_launch_params = DEVICE_ALLOC(sizeof(RenderParams));
 
     m_xres = xres;
     m_yres = yres;
@@ -1186,8 +1108,7 @@ OptixRaytracer::render(int xres OSL_MAYBE_UNUSED, int yres OSL_MAYBE_UNUSED)
     params.bg_rows   = d_bg_rows;
     params.bg_cols   = d_bg_cols;
 
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_launch_params), &params,
-                          sizeof(RenderParams), cudaMemcpyHostToDevice));
+    COPY_TO_DEVICE(d_launch_params, &params, sizeof(RenderParams));
 
     // Set up global variables
     OPTIX_CHECK(optixLaunch(m_optix_pipeline, m_cuda_stream, d_launch_params,
diff --git a/src/testrender/optixraytracer.h b/src/testrender/optixraytracer.h
index 1763420fd..b15a3147d 100644
--- a/src/testrender/optixraytracer.h
+++ b/src/testrender/optixraytracer.h
@@ -142,8 +142,8 @@ class OptixRaytracer final : public SimpleRaytracer {
     std::unordered_map<ustringhash, optix::TextureSampler> m_samplers;
 
     // CUdeviceptrs that need to be freed after we are done
-    std::vector<void*> m_ptrs_to_free;
-    std::vector<void*> m_arrays_to_free;
+    std::vector<CUdeviceptr> m_ptrs_to_free;
+    std::vector<cudaArray_t> m_arrays_to_free;
 };
 
 
diff --git a/src/testshade/optixgridrender.cpp b/src/testshade/optixgridrender.cpp
index db9a8ac36..877a5691f 100644
--- a/src/testshade/optixgridrender.cpp
+++ b/src/testshade/optixgridrender.cpp
@@ -86,6 +86,11 @@ OSL_NAMESPACE_ENTER
     }
 
 
+#define DEVICE_ALLOC(size) reinterpret_cast<CUdeviceptr>(device_alloc(size))
+#define COPY_TO_DEVICE(dst_device, src_host, size) \
+    copy_to_device(reinterpret_cast<void*>(dst_device), src_host, size)
+
+
 static void
 context_log_cb(unsigned int level, const char* tag, const char* message,
                void* /*cbdata */)
@@ -180,12 +185,12 @@ OptixGridRenderer::load_ptx_file(string_view filename)
 
 OptixGridRenderer::~OptixGridRenderer()
 {
-    for (void* p : m_ptrs_to_free)
-        cudaFree(p);
-    for (void* p : m_arrays_to_free)
-        cudaFreeArray(reinterpret_cast<cudaArray_t>(p));
     if (m_optix_ctx)
         OPTIX_CHECK(optixDeviceContextDestroy(m_optix_ctx));
+    for (CUdeviceptr ptr : m_ptrs_to_free)
+        cudaFree(reinterpret_cast<void*>(ptr));
+    for (cudaArray_t arr : m_arrays_to_free)
+        cudaFreeArray(arr);
 }
 
 
@@ -252,30 +257,24 @@ OptixGridRenderer::synch_attributes()
         const size_t podDataSize = cpuDataSize
                                    - sizeof(ustringhash) * numStrings;
 
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_color_system),
-                              podDataSize + sizeof(uint64_t) * numStrings));
+        d_color_system = DEVICE_ALLOC(podDataSize
+                                      + sizeof(uint64_t) * numStrings);
         CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_color_system), colorSys,
                               podDataSize, cudaMemcpyHostToDevice));
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_osl_printf_buffer),
-                              OSL_PRINTF_BUFFER_SIZE));
+        d_osl_printf_buffer = DEVICE_ALLOC(OSL_PRINTF_BUFFER_SIZE);
         CUDA_CHECK(cudaMemset(reinterpret_cast<void*>(d_osl_printf_buffer), 0,
                               OSL_PRINTF_BUFFER_SIZE));
 
         // Transforms
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_object2common),
-                              sizeof(OSL::Matrix44)));
+        d_object2common = DEVICE_ALLOC(sizeof(OSL::Matrix44));
         CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_object2common),
                               &m_object2common, sizeof(OSL::Matrix44),
                               cudaMemcpyHostToDevice));
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_shader2common),
-                              sizeof(OSL::Matrix44)));
+        d_shader2common = DEVICE_ALLOC(sizeof(OSL::Matrix44));
         CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_shader2common),
                               &m_shader2common, sizeof(OSL::Matrix44),
                               cudaMemcpyHostToDevice));
 
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_color_system));
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_osl_printf_buffer));
-
         // then copy the device string to the end, first strings starting at dataPtr - (numStrings)
         // FIXME -- Should probably handle alignment better.
         const ustringhash* cpuStringHash
@@ -730,26 +729,12 @@ OptixGridRenderer::make_optix_materials()
     setglobals_raygenRecord.data = nullptr;
     setglobals_missRecord.data   = nullptr;
 
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_raygenRecord),
-                          sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_missRecord),
-                          sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_hitgroupRecord),
-                          sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_callablesRecord),
-                          callables * sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_setglobals_raygenRecord),
-                          sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_setglobals_missRecord),
-                          sizeof(GenericRecord)));
-
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_raygenRecord));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_missRecord));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_hitgroupRecord));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_callablesRecord));
-    m_ptrs_to_free.push_back(
-        reinterpret_cast<void*>(d_setglobals_raygenRecord));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_setglobals_missRecord));
+    d_raygenRecord            = DEVICE_ALLOC(sizeof(GenericRecord));
+    d_missRecord              = DEVICE_ALLOC(sizeof(GenericRecord));
+    d_hitgroupRecord          = DEVICE_ALLOC(sizeof(GenericRecord));
+    d_callablesRecord         = DEVICE_ALLOC(callables * sizeof(GenericRecord));
+    d_setglobals_raygenRecord = DEVICE_ALLOC(sizeof(GenericRecord));
+    d_setglobals_missRecord   = DEVICE_ALLOC(sizeof(GenericRecord));
 
     CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_raygenRecord),
                           &raygenRecord, sizeof(GenericRecord),
@@ -882,7 +867,7 @@ OptixGridRenderer::get_texture_handle(ustring filename,
         CUDA_CHECK(cudaMemcpy2DToArray(pixelArray, 0, 0, level_pixels[0].data(),
                                        pitch, pitch, img_height,
                                        cudaMemcpyHostToDevice));
-        m_arrays_to_free.push_back(reinterpret_cast<void*>(pixelArray));
+        m_arrays_to_free.push_back(pixelArray);
 
         cudaResourceDesc res_desc  = {};
         res_desc.resType           = cudaResourceTypeMipmappedArray;
@@ -941,13 +926,8 @@ OptixGridRenderer::warmup()
 void
 OptixGridRenderer::render(int xres OSL_MAYBE_UNUSED, int yres OSL_MAYBE_UNUSED)
 {
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_output_buffer),
-                          xres * yres * 4 * sizeof(float)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_launch_params),
-                          sizeof(RenderParams)));
-
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_output_buffer));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_launch_params));
+    d_output_buffer = DEVICE_ALLOC(xres * yres * 4 * sizeof(float));
+    d_launch_params = DEVICE_ALLOC(sizeof(RenderParams));
 
     m_xres = xres;
     m_yres = yres;
@@ -1153,19 +1133,15 @@ OptixGridRenderer::register_named_transforms()
     }
 
     // Push the names and transforms to the device
-    size_t sz = sizeof(uint64_t) * xform_name_buffer.size();
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_xform_name_buffer), sz));
+    size_t sz           = sizeof(uint64_t) * xform_name_buffer.size();
+    d_xform_name_buffer = DEVICE_ALLOC(sz);
     CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_xform_name_buffer),
                           xform_name_buffer.data(), sz,
                           cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_xform_name_buffer));
-
-    sz = sizeof(OSL::Matrix44) * xform_buffer.size();
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_xform_buffer), sz));
+    sz             = sizeof(OSL::Matrix44) * xform_buffer.size();
+    d_xform_buffer = DEVICE_ALLOC(sz);
     CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_xform_buffer),
                           xform_buffer.data(), sz, cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_xform_buffer));
-
     m_num_named_xforms = xform_name_buffer.size();
 }
 
diff --git a/src/testshade/optixgridrender.h b/src/testshade/optixgridrender.h
index f689217d2..9cb2719fd 100644
--- a/src/testshade/optixgridrender.h
+++ b/src/testshade/optixgridrender.h
@@ -104,8 +104,8 @@ class OptixGridRenderer final : public SimpleRenderer {
     OSL::Matrix44 m_object2common;  // "object" space to "common" space matrix
 
     // CUdeviceptrs that need to be freed after we are done
-    std::vector<void*> m_ptrs_to_free;
-    std::vector<void*> m_arrays_to_free;
+    std::vector<CUdeviceptr> m_ptrs_to_free;
+    std::vector<cudaArray_t> m_arrays_to_free;
 };