You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Here is a simple piece of code that computes out = a * input, where out and input are arrays on the GPU (CUDA). The gradient with respect to 'a' should be computed by a reduction via atomicAdd. However, EnzymeAD does not appear to do that, and therefore it gives different results on different runs. The Enzyme Explorer does not work at the moment, so I paste the code here.
#include <stdio.h>
#include <cuda_runtime.h>
// Shorthand for the CUDA device-function qualifier.
#define device_func __device__
// Scalar type under test; the issue reproduces with float, not double.
#define Real float
typedef int dsize_t;
// External Enzyme constants for automatic differentiation.
// These are activity markers consumed by the Enzyme compiler pass; they are
// never defined at link time.
extern device_func int enzyme_dup;       // argument is a primal/shadow pointer pair
extern device_func int enzyme_dupnoneed; // like dup, but the primal result is not needed
extern device_func int enzyme_out;       // argument is an active output
extern device_func int enzyme_const;     // argument is not differentiated
// Variadic intrinsic recognized by Enzyme: first argument is the function to
// differentiate, followed by (activity marker, argument) pairs.
template <typename return_type, typename... T>
device_func return_type __enzyme_autodiff(void*, T...);
// Non-owning view over a contiguous array of T (typically device memory).
// Holds a raw pointer plus an element count; accessors do no bounds checking.
template <class T>
class Buffer
{
public:
    typedef T value_type;

    // View over `buffer` holding `size` elements.
    __host__ __device__ Buffer(T* buffer, size_t size) : data_(buffer), count_(size) {}

    // Empty view: no storage, zero elements.
    __host__ __device__ Buffer() : data_(nullptr), count_(0) {}

    // Mutable element access.
    device_func T& operator()(size_t index) { return data_[index]; }

    // Read-only element access.
    device_func const T& operator()(size_t index) const { return data_[index]; }

private:
    T* data_;      // raw pointer, not owned by this view
    size_t count_; // element count (currently unused by the accessors)
};
using BufferView = Buffer<Real>;
// Per-element functor: for a given column index, scales every row of x by `a`
// into the matching row of out: out[row](idx) = a * x[row](idx).
// Field order is load-bearing: instances are copied to the device with
// cudaMemcpy, so the layout must match on host and device.
struct ScaleKernel {
    __host__ __device__ ScaleKernel() = default;

    // Apply the scale to column `idx` of all nsize rows.
    __device__ void operator()(dsize_t idx) {
        for (int row = 0; row < nsize; ++row) {
            out[row](idx) = a * x[row](idx);
        }
    }

    BufferView* out; // device array of nsize output views
    BufferView* x;   // device array of nsize input views
    Real a;          // scalar multiplier (the differentiation target)
    int nsize;       // number of rows
};
// Device function for one-dimensional kernel execution
// Device-side driver for a 1-D launch: maps the flat global thread index to
// one element and invokes the type-erased operation behind op_ptr on it.
// Threads whose index falls past `size` do nothing (tail guard).
template <class Operation>
__device__ void one_dimensional_kernel_cuda_device(dsize_t size, void* op_ptr) {
    const auto idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < size) {
        (*static_cast<Operation*>(op_ptr))(idx);
    }
}
// Modified kernel: now takes pointers to Operation structures in device memory
template <class Operation>
__global__ void lanch_kernel_reverse(dsize_t size, Operation* operation_ptr, Operation* d_operation_ptr) {
__enzyme_autodiff<void>((void*)one_dimensional_kernel_cuda_device<Operation>,
enzyme_const, size,
enzyme_dup, (void*)operation_ptr, (void*)d_operation_ptr
);
}
// CUDA error checking macro
// CUDA error checking macro: aborts with file/line context if the most recent
// CUDA API call or kernel launch left a sticky error.
// Wrapped in do { } while (0) so the macro expands to a single statement and
// is safe in un-braced if/else chains (the original bare { } block would
// break `if (cond) cudaCheckError(); else ...`). Diagnostics go to stderr so
// they are not lost when stdout is buffered or redirected.
#define cudaCheckError() do { \
cudaError_t e = cudaGetLastError(); \
if (e != cudaSuccess) { \
fprintf(stderr, "CUDA Error %s:%d: %s\n", __FILE__, __LINE__, \
cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Driver: builds the primal buffers and their shadows, launches the
// reverse-mode kernel once, and prints d(sum(out))/da. Since out = a * x and
// the seed d_out is all ones, the expected gradient is sum(x) = 1 + 2 + ... +
// 10000 = 50005000.
int main() {
const dsize_t ARRAY_SIZE = 10000;
const dsize_t NUM_ROWS = 1;
const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(Real);
// Host arrays
Real* h_x[NUM_ROWS];
Real* h_out[NUM_ROWS];
Real* h_d_x[NUM_ROWS];
Real* h_d_out[NUM_ROWS];
Real scale_factor = 2.5f;
// Allocate and initialize host arrays.
// x[j] = j + 1; shadow seeds: d_x = 0 (gradient accumulator) and
// d_out = 1 (i.e. dL/dout for L = sum(out)).
for (int i = 0; i < NUM_ROWS; ++i) {
h_x[i] = (Real*)malloc(ARRAY_BYTES);
h_out[i] = (Real*)malloc(ARRAY_BYTES);
h_d_x[i] = (Real*)malloc(ARRAY_BYTES);
h_d_out[i] = (Real*)malloc(ARRAY_BYTES);
for (dsize_t j = 0; j < ARRAY_SIZE; ++j) {
h_x[i][j] = static_cast<Real>(j + 1);
h_d_x[i][j] = 0.0f;
h_d_out[i][j] = 1.0f;
}
}
// Device pointers for BufferView arrays (arrays of views, one per row)
BufferView* d_x;
BufferView* d_out;
BufferView* d_d_x;
BufferView* d_d_out;
// Allocate device memory for BufferView arrays
cudaMalloc(&d_x, NUM_ROWS * sizeof(BufferView));
cudaCheckError();
cudaMalloc(&d_out, NUM_ROWS * sizeof(BufferView));
cudaCheckError();
cudaMalloc(&d_d_x, NUM_ROWS * sizeof(BufferView));
cudaCheckError();
cudaMalloc(&d_d_out, NUM_ROWS * sizeof(BufferView));
cudaCheckError();
// Allocate memory for each row on the device and copy the host data.
// Note: d_out_rows is left uninitialized — the kernel overwrites it.
Real* d_x_rows[NUM_ROWS];
Real* d_out_rows[NUM_ROWS];
Real* d_d_x_rows[NUM_ROWS];
Real* d_d_out_rows[NUM_ROWS];
for (int i = 0; i < NUM_ROWS; ++i) {
cudaMalloc(&d_x_rows[i], ARRAY_BYTES);
cudaCheckError();
cudaMalloc(&d_out_rows[i], ARRAY_BYTES);
cudaCheckError();
cudaMalloc(&d_d_x_rows[i], ARRAY_BYTES);
cudaCheckError();
cudaMalloc(&d_d_out_rows[i], ARRAY_BYTES);
cudaCheckError();
cudaMemcpy(d_x_rows[i], h_x[i], ARRAY_BYTES, cudaMemcpyHostToDevice);
cudaCheckError();
cudaMemcpy(d_d_x_rows[i], h_d_x[i], ARRAY_BYTES, cudaMemcpyHostToDevice);
cudaCheckError();
cudaMemcpy(d_d_out_rows[i], h_d_out[i], ARRAY_BYTES, cudaMemcpyHostToDevice);
cudaCheckError();
}
// Create BufferView instances on the host (views wrap DEVICE row pointers,
// so they are only safe to dereference on the device).
BufferView h_x_views[NUM_ROWS];
BufferView h_out_views[NUM_ROWS];
BufferView h_d_x_views[NUM_ROWS];
BufferView h_d_out_views[NUM_ROWS];
for (int i = 0; i < NUM_ROWS; ++i) {
h_x_views[i] = BufferView(d_x_rows[i], ARRAY_SIZE);
h_out_views[i] = BufferView(d_out_rows[i], ARRAY_SIZE);
h_d_x_views[i] = BufferView(d_d_x_rows[i], ARRAY_SIZE);
h_d_out_views[i] = BufferView(d_d_out_rows[i], ARRAY_SIZE);
}
// Copy the BufferView arrays to device memory
cudaMemcpy(d_x, h_x_views, NUM_ROWS * sizeof(BufferView), cudaMemcpyHostToDevice);
cudaCheckError();
cudaMemcpy(d_out, h_out_views, NUM_ROWS * sizeof(BufferView), cudaMemcpyHostToDevice);
cudaCheckError();
cudaMemcpy(d_d_x, h_d_x_views, NUM_ROWS * sizeof(BufferView), cudaMemcpyHostToDevice);
cudaCheckError();
cudaMemcpy(d_d_out, h_d_out_views, NUM_ROWS * sizeof(BufferView), cudaMemcpyHostToDevice);
cudaCheckError();
// Setup the ScaleKernel operation (primal functor)
ScaleKernel operation;
operation.x = d_x;
operation.out = d_out;
operation.a = scale_factor;
operation.nsize = NUM_ROWS;
// Shadow functor: Enzyme accumulates gradients into its fields.
// a starts at 0 and should end up holding dL/da.
ScaleKernel d_operation;
d_operation.x = d_d_x;
d_operation.out = d_d_out;
d_operation.a = 0.0f;
d_operation.nsize = NUM_ROWS;
// Allocate device memory for operation and d_operation
ScaleKernel* d_operation_ptr;
ScaleKernel* d_d_operation_ptr;
cudaMalloc(&d_operation_ptr, sizeof(ScaleKernel));
cudaCheckError();
cudaMalloc(&d_d_operation_ptr, sizeof(ScaleKernel));
cudaCheckError();
// Copy the operation structures to device
cudaMemcpy(d_operation_ptr, &operation, sizeof(ScaleKernel), cudaMemcpyHostToDevice);
cudaCheckError();
cudaMemcpy(d_d_operation_ptr, &d_operation, sizeof(ScaleKernel), cudaMemcpyHostToDevice);
cudaCheckError();
// Kernel configuration: 1-D grid, ceil-div so all ARRAY_SIZE elements are covered
const int blockSize = 256;
const int numBlocks = (ARRAY_SIZE + blockSize - 1) / blockSize;
// Execute the reverse (autodiff) kernel with device pointers
lanch_kernel_reverse<<<numBlocks, blockSize>>>(ARRAY_SIZE, d_operation_ptr, d_d_operation_ptr);
cudaCheckError();
// Synchronize the device (kernel launch is asynchronous; also surfaces
// any in-kernel execution error via the following check)
cudaDeviceSynchronize();
cudaCheckError();
// Copy the results and gradients back to host
for (int i = 0; i < NUM_ROWS; ++i) {
cudaMemcpy(h_out[i], d_out_rows[i], ARRAY_BYTES, cudaMemcpyDeviceToHost);
cudaCheckError();
cudaMemcpy(h_d_x[i], d_d_x_rows[i], ARRAY_BYTES, cudaMemcpyDeviceToHost);
cudaCheckError();
cudaMemcpy(h_d_out[i], d_d_out_rows[i], ARRAY_BYTES, cudaMemcpyDeviceToHost);
cudaCheckError();
}
// Now copy back the updated operation structures
// (d_operation.a holds the accumulated gradient dL/da)
cudaMemcpy(&operation, d_operation_ptr, sizeof(ScaleKernel), cudaMemcpyDeviceToHost);
cudaCheckError();
cudaMemcpy(&d_operation, d_d_operation_ptr, sizeof(ScaleKernel), cudaMemcpyDeviceToHost);
cudaCheckError();
// Print gradients of d_operation.a
// Expected: 50005000. NOTE(review): with Real = float this reportedly varies
// run to run — suspected non-atomic accumulation of the shared scalar shadow.
printf("\nGradient of scale factor (d_operation.a): %f\n", d_operation.a);
// Free device memory
for (int i = 0; i < NUM_ROWS; ++i) {
cudaFree(d_x_rows[i]);
cudaFree(d_out_rows[i]);
cudaFree(d_d_x_rows[i]);
cudaFree(d_d_out_rows[i]);
}
cudaFree(d_x);
cudaFree(d_out);
cudaFree(d_d_x);
cudaFree(d_d_out);
cudaFree(d_operation_ptr);
cudaFree(d_d_operation_ptr);
// Free host memory
for (int i = 0; i < NUM_ROWS; ++i) {
free(h_x[i]);
free(h_out[i]);
free(h_d_x[i]);
free(h_d_out[i]);
}
return 0;
}
The right gradient should be 50005000 (the sum 1 + 2 + ... + 10000).
For #define Real double, it gives the correct gradient.
For #define Real float, it gives different results on different runs.
The text was updated successfully, but these errors were encountered:
Here is a simple piece of code that computes out = a * input, where out and input are arrays on the GPU (CUDA). The gradient with respect to 'a' should be computed by a reduction via atomicAdd. However, EnzymeAD does not appear to do that, and therefore it gives different results on different runs. The Enzyme Explorer does not work at the moment, so I paste the code here.
The right gradient should be 50005000 (the sum 1 + 2 + ... + 10000).
For #define Real double, it gives the correct gradient.
For #define Real float, it gives different results on different runs.
The text was updated successfully, but these errors were encountered: