You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Here is a simple piece of code that computes out = a * input, where out and input are arrays on the GPU (CUDA). The gradient with respect to 'a' should be computed by a reduction via atomicAdd. However, EnzymeAD does not appear to do that, and therefore it gives different results on different runs. The Enzyme Explorer does not work at the moment, so I paste the code here.
#include <stdio.h>
#include <cuda_runtime.h>
// Shorthand for the CUDA device-function qualifier.
#define device_func __device__
// Scalar type under test; the issue reproduces with float, not double.
#define Real float
typedef int dsize_t;
// External Enzyme constants for automatic differentiation.
// These are activity markers consumed by the Enzyme compiler pass; they are
// never defined at link time.
extern device_func int enzyme_dup;       // argument is a primal/shadow pointer pair
extern device_func int enzyme_dupnoneed; // like dup, but the primal result is not needed
extern device_func int enzyme_out;       // argument is an active output
extern device_func int enzyme_const;     // argument is not differentiated
// Variadic intrinsic recognized by Enzyme: first argument is the function to
// differentiate, followed by (activity marker, argument) pairs.
template <typename return_type, typename... T>
device_func return_type __enzyme_autodiff(void*, T...);
// Non-owning view over a contiguous array of T (typically device memory).
// Holds a raw pointer plus an element count; accessors do no bounds checking.
template <class T>
class Buffer
{
public:
    typedef T value_type;

    // View over `buffer` holding `size` elements.
    __host__ __device__ Buffer(T* buffer, size_t size) : data_(buffer), count_(size) {}

    // Empty view: no storage, zero elements.
    __host__ __device__ Buffer() : data_(nullptr), count_(0) {}

    // Mutable element access.
    device_func T& operator()(size_t index) { return data_[index]; }

    // Read-only element access.
    device_func const T& operator()(size_t index) const { return data_[index]; }

private:
    T* data_;      // raw pointer, not owned by this view
    size_t count_; // element count (currently unused by the accessors)
};
using BufferView = Buffer<Real>;
// Per-element functor: for a given column index, scales every row of x by `a`
// into the matching row of out: out[row](idx) = a * x[row](idx).
// Field order is load-bearing: instances are copied to the device with
// cudaMemcpy, so the layout must match on host and device.
struct ScaleKernel {
    __host__ __device__ ScaleKernel() = default;

    // Apply the scale to column `idx` of all nsize rows.
    __device__ void operator()(dsize_t idx) {
        for (int row = 0; row < nsize; ++row) {
            out[row](idx) = a * x[row](idx);
        }
    }

    BufferView* out; // device array of nsize output views
    BufferView* x;   // device array of nsize input views
    Real a;          // scalar multiplier (the differentiation target)
    int nsize;       // number of rows
};
// Device function for one-dimensional kernel execution
// Device-side driver for a 1-D launch: maps the flat global thread index to
// one element and invokes the type-erased operation behind op_ptr on it.
// Threads whose index falls past `size` do nothing (tail guard).
template <class Operation>
__device__ void one_dimensional_kernel_cuda_device(dsize_t size, void* op_ptr) {
    const auto idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < size) {
        (*static_cast<Operation*>(op_ptr))(idx);
    }
}
// Modified kernel: now takes pointers to Operation structures in device memory
template <class Operation>
__global__ void lanch_kernel_reverse(dsize_t size, Operation* operation_ptr, Operation* d_operation_ptr) {
__enzyme_autodiff<void>((void*)one_dimensional_kernel_cuda_device<Operation>,
enzyme_const, size,
enzyme_dup, (void*)operation_ptr, (void*)d_operation_ptr
);
}
// CUDA error checking macro
// CUDA error checking macro: aborts with file/line context if the most recent
// CUDA API call or kernel launch left a sticky error.
// Wrapped in do { } while (0) so the macro expands to a single statement and
// is safe in un-braced if/else chains (the original bare { } block would
// break `if (cond) cudaCheckError(); else ...`). Diagnostics go to stderr so
// they are not lost when stdout is buffered or redirected.
#define cudaCheckError() do { \
cudaError_t e = cudaGetLastError(); \
if (e != cudaSuccess) { \
fprintf(stderr, "CUDA Error %s:%d: %s\n", __FILE__, __LINE__, \
cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Driver: builds the primal buffers and their shadows, launches the
// reverse-mode kernel once, and prints d(sum(out))/da. Since out = a * x and
// the seed d_out is all ones, the expected gradient is sum(x) = 1 + 2 + ... +
// 10000 = 50005000.
int main() {
const dsize_t ARRAY_SIZE = 10000;
const dsize_t NUM_ROWS = 1;
const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(Real);
// Host arrays
Real* h_x[NUM_ROWS];
Real* h_out[NUM_ROWS];
Real* h_d_x[NUM_ROWS];
Real* h_d_out[NUM_ROWS];
Real scale_factor = 2.5f;
// Allocate and initialize host arrays.
// x[j] = j + 1; shadow seeds: d_x = 0 (gradient accumulator) and
// d_out = 1 (i.e. dL/dout for L = sum(out)).
for (int i = 0; i < NUM_ROWS; ++i) {
h_x[i] = (Real*)malloc(ARRAY_BYTES);
h_out[i] = (Real*)malloc(ARRAY_BYTES);
h_d_x[i] = (Real*)malloc(ARRAY_BYTES);
h_d_out[i] = (Real*)malloc(ARRAY_BYTES);
for (dsize_t j = 0; j < ARRAY_SIZE; ++j) {
h_x[i][j] = static_cast<Real>(j + 1);
h_d_x[i][j] = 0.0f;
h_d_out[i][j] = 1.0f;
}
}
// Device pointers for BufferView arrays (arrays of views, one per row)
BufferView* d_x;
BufferView* d_out;
BufferView* d_d_x;
BufferView* d_d_out;
// Allocate device memory for BufferView arrays
cudaMalloc(&d_x, NUM_ROWS * sizeof(BufferView));
cudaCheckError();
cudaMalloc(&d_out, NUM_ROWS * sizeof(BufferView));
cudaCheckError();
cudaMalloc(&d_d_x, NUM_ROWS * sizeof(BufferView));
cudaCheckError();
cudaMalloc(&d_d_out, NUM_ROWS * sizeof(BufferView));
cudaCheckError();
// Allocate memory for each row on the device and copy the host data.
// Note: d_out_rows is left uninitialized — the kernel overwrites it.
Real* d_x_rows[NUM_ROWS];
Real* d_out_rows[NUM_ROWS];
Real* d_d_x_rows[NUM_ROWS];
Real* d_d_out_rows[NUM_ROWS];
for (int i = 0; i < NUM_ROWS; ++i) {
cudaMalloc(&d_x_rows[i], ARRAY_BYTES);
cudaCheckError();
cudaMalloc(&d_out_rows[i], ARRAY_BYTES);
cudaCheckError();
cudaMalloc(&d_d_x_rows[i], ARRAY_BYTES);
cudaCheckError();
cudaMalloc(&d_d_out_rows[i], ARRAY_BYTES);
cudaCheckError();
cudaMemcpy(d_x_rows[i], h_x[i], ARRAY_BYTES, cudaMemcpyHostToDevice);
cudaCheckError();
cudaMemcpy(d_d_x_rows[i], h_d_x[i], ARRAY_BYTES, cudaMemcpyHostToDevice);
cudaCheckError();
cudaMemcpy(d_d_out_rows[i], h_d_out[i], ARRAY_BYTES, cudaMemcpyHostToDevice);
cudaCheckError();
}
// Create BufferView instances on the host (views wrap DEVICE row pointers,
// so they are only safe to dereference on the device).
BufferView h_x_views[NUM_ROWS];
BufferView h_out_views[NUM_ROWS];
BufferView h_d_x_views[NUM_ROWS];
BufferView h_d_out_views[NUM_ROWS];
for (int i = 0; i < NUM_ROWS; ++i) {
h_x_views[i] = BufferView(d_x_rows[i], ARRAY_SIZE);
h_out_views[i] = BufferView(d_out_rows[i], ARRAY_SIZE);
h_d_x_views[i] = BufferView(d_d_x_rows[i], ARRAY_SIZE);
h_d_out_views[i] = BufferView(d_d_out_rows[i], ARRAY_SIZE);
}
// Copy the BufferView arrays to device memory
cudaMemcpy(d_x, h_x_views, NUM_ROWS * sizeof(BufferView), cudaMemcpyHostToDevice);
cudaCheckError();
cudaMemcpy(d_out, h_out_views, NUM_ROWS * sizeof(BufferView), cudaMemcpyHostToDevice);
cudaCheckError();
cudaMemcpy(d_d_x, h_d_x_views, NUM_ROWS * sizeof(BufferView), cudaMemcpyHostToDevice);
cudaCheckError();
cudaMemcpy(d_d_out, h_d_out_views, NUM_ROWS * sizeof(BufferView), cudaMemcpyHostToDevice);
cudaCheckError();
// Setup the ScaleKernel operation (primal functor)
ScaleKernel operation;
operation.x = d_x;
operation.out = d_out;
operation.a = scale_factor;
operation.nsize = NUM_ROWS;
// Shadow functor: Enzyme accumulates gradients into its fields.
// a starts at 0 and should end up holding dL/da.
ScaleKernel d_operation;
d_operation.x = d_d_x;
d_operation.out = d_d_out;
d_operation.a = 0.0f;
d_operation.nsize = NUM_ROWS;
// Allocate device memory for operation and d_operation
ScaleKernel* d_operation_ptr;
ScaleKernel* d_d_operation_ptr;
cudaMalloc(&d_operation_ptr, sizeof(ScaleKernel));
cudaCheckError();
cudaMalloc(&d_d_operation_ptr, sizeof(ScaleKernel));
cudaCheckError();
// Copy the operation structures to device
cudaMemcpy(d_operation_ptr, &operation, sizeof(ScaleKernel), cudaMemcpyHostToDevice);
cudaCheckError();
cudaMemcpy(d_d_operation_ptr, &d_operation, sizeof(ScaleKernel), cudaMemcpyHostToDevice);
cudaCheckError();
// Kernel configuration: 1-D grid, ceil-div so all ARRAY_SIZE elements are covered
const int blockSize = 256;
const int numBlocks = (ARRAY_SIZE + blockSize - 1) / blockSize;
// Execute the reverse (autodiff) kernel with device pointers
lanch_kernel_reverse<<<numBlocks, blockSize>>>(ARRAY_SIZE, d_operation_ptr, d_d_operation_ptr);
cudaCheckError();
// Synchronize the device (kernel launch is asynchronous; also surfaces
// any in-kernel execution error via the following check)
cudaDeviceSynchronize();
cudaCheckError();
// Copy the results and gradients back to host
for (int i = 0; i < NUM_ROWS; ++i) {
cudaMemcpy(h_out[i], d_out_rows[i], ARRAY_BYTES, cudaMemcpyDeviceToHost);
cudaCheckError();
cudaMemcpy(h_d_x[i], d_d_x_rows[i], ARRAY_BYTES, cudaMemcpyDeviceToHost);
cudaCheckError();
cudaMemcpy(h_d_out[i], d_d_out_rows[i], ARRAY_BYTES, cudaMemcpyDeviceToHost);
cudaCheckError();
}
// Now copy back the updated operation structures
// (d_operation.a holds the accumulated gradient dL/da)
cudaMemcpy(&operation, d_operation_ptr, sizeof(ScaleKernel), cudaMemcpyDeviceToHost);
cudaCheckError();
cudaMemcpy(&d_operation, d_d_operation_ptr, sizeof(ScaleKernel), cudaMemcpyDeviceToHost);
cudaCheckError();
// Print gradients of d_operation.a
// Expected: 50005000. NOTE(review): with Real = float this reportedly varies
// run to run — suspected non-atomic accumulation of the shared scalar shadow.
printf("\nGradient of scale factor (d_operation.a): %f\n", d_operation.a);
// Free device memory
for (int i = 0; i < NUM_ROWS; ++i) {
cudaFree(d_x_rows[i]);
cudaFree(d_out_rows[i]);
cudaFree(d_d_x_rows[i]);
cudaFree(d_d_out_rows[i]);
}
cudaFree(d_x);
cudaFree(d_out);
cudaFree(d_d_x);
cudaFree(d_d_out);
cudaFree(d_operation_ptr);
cudaFree(d_d_operation_ptr);
// Free host memory
for (int i = 0; i < NUM_ROWS; ++i) {
free(h_x[i]);
free(h_out[i]);
free(h_d_x[i]);
free(h_d_out[i]);
}
return 0;
}
The right gradient should be 50005000 (the sum 1 + 2 + ... + 10000).
For #define Real double, it gives the correct gradient.
For #define Real float, it gives different results on different runs.
The text was updated successfully, but these errors were encountered:
Here is a simple piece of code that computes out = a * input, where out and input are arrays on the GPU (CUDA). The gradient with respect to 'a' should be computed by a reduction via atomicAdd. However, EnzymeAD does not appear to do that, and therefore it gives different results on different runs. The Enzyme Explorer does not work at the moment, so I paste the code here.
The right gradient should be 50005000 (the sum 1 + 2 + ... + 10000).
For #define Real double, it gives the correct gradient.
For #define Real float, it gives different results on different runs.
The text was updated successfully, but these errors were encountered: