Skip to content

Commit

Permalink
Merge pull request #45 from amdadvtech/cckao/feature/gpu_mem
Browse files Browse the repository at this point in the history
Implement GpuMemory to handle device memory operations.
  • Loading branch information
takahiroharada authored Jan 24, 2023
2 parents dc70d30 + 3a86c75 commit 6314b2b
Show file tree
Hide file tree
Showing 6 changed files with 193 additions and 12 deletions.
144 changes: 144 additions & 0 deletions Orochi/GpuMemory.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#pragma once

#include <Orochi/OrochiUtils.h>
#include <cstddef>
#include <utility>
#include <vector>

namespace Oro
{

/// @brief Converts the address of a device pointer into the untyped, mutable form expected in a kernel argument list.
/// @tparam T The element type the device pointer refers to.
/// @param ptr The address of the device pointer (for example the value returned by GpuMemory::address()).
/// @return The very same address, expressed as a @c void*.
template<typename T>
void* arg_cast( T* const* ptr ) noexcept
{
	// Strip the const that protects the pointer itself, then erase the type.
	T** const mutable_address = const_cast<T**>( ptr );
	return static_cast<void*>( mutable_address );
}

/// @brief A move-only owner of a typed block of device memory.
/// The block is allocated through OrochiUtils::malloc and released through OrochiUtils::free in the destructor.
/// Sizes and capacities are expressed in number of elements of type @c T, not in bytes.
/// @tparam T The type of the element stored in the device memory.
template<typename T>
class GpuMemory final
{
  public:
	/// @brief Create an empty object that owns no device memory.
	GpuMemory() = default;

	/// @brief Allocate the device memory with the given size.
	/// @param init_size The initial size which represents the number of elements. If it is zero, no allocation is requested and the object stays empty.
	explicit GpuMemory( const size_t init_size )
	{
		// Nothing to allocate: stay in the empty state rather than issuing a zero-byte allocation request.
		if( init_size == 0ULL )
		{
			return;
		}

		OrochiUtils::malloc( m_data, init_size );

		m_size = init_size;
		m_capacity = init_size;
	}

	// The device memory has a single owner, so copying is disabled.
	GpuMemory( const GpuMemory& ) = delete;
	GpuMemory& operator=( const GpuMemory& other ) = delete;

	/// @brief Take over the device memory owned by @c other and leave @c other empty.
	GpuMemory( GpuMemory&& other ) noexcept : m_data{ std::exchange( other.m_data, nullptr ) }, m_size{ std::exchange( other.m_size, 0ULL ) }, m_capacity{ std::exchange( other.m_capacity, 0ULL ) } {}

	/// @brief Release the currently owned device memory and take over the one owned by @c other, leaving @c other empty.
	/// Assigning an object to itself keeps its memory intact.
	GpuMemory& operator=( GpuMemory&& other ) noexcept
	{
		// Steal from `other` first; after the swap `tmp` holds the previous memory of *this and frees it on scope exit.
		GpuMemory tmp( std::move( other ) );

		swap( *this, tmp );

		return *this;
	}

	/// @brief Free the owned device memory, if any.
	~GpuMemory()
	{
		if( m_data )
		{
			OrochiUtils::free( m_data );
			m_data = nullptr;
		}
		m_size = 0ULL;
		m_capacity = 0ULL;
	}

	/// @brief Get the size of the device memory.
	/// @return The size of the device memory, in number of elements.
	size_t size() const noexcept { return m_size; }

	/// @brief Get the pointer to the device memory.
	/// @return The pointer to the device memory, or @c nullptr if nothing is allocated.
	T* ptr() const noexcept { return m_data; }

	/// @brief Get the address of the pointer to the device memory. Useful for passing arguments to the kernel call.
	/// The returned address points into this object, so it is only valid while this object is alive and not moved from.
	/// @return The address of the pointer to the device memory.
	T* const* address() const noexcept { return &m_data; }

	/// @brief Resize the device memory. Its capacity is unchanged if the new size is not larger than the current capacity.
	/// When a larger block has to be allocated, the old data should be considered invalid after the function is called unless @c copy is set to true.
	/// @param new_size The new memory size (number of elements) after the function is called.
	/// @param copy If true, the function will copy the current elements to the newly created memory space as well.
	void resize( const size_t new_size, const bool copy = false ) noexcept
	{
		// Fits into the existing allocation: only the logical size changes.
		if( new_size <= m_capacity )
		{
			m_size = new_size;
			return;
		}

		GpuMemory tmp( new_size );

		// Skip the copy when there is nothing to preserve (m_data may still be null in that case).
		if( copy && m_size > 0ULL )
		{
			OrochiUtils::copyDtoD( tmp.m_data, m_data, m_size );
		}

		// The move assignment frees the old block.
		*this = std::move( tmp );
	}

	/// @brief Reset the memory space so that all bits inside are cleared to zero. Does nothing if the size is zero.
	void reset() noexcept
	{
		if( m_size == 0ULL )
		{
			return;
		}

		OrochiUtils::memset( m_data, 0, m_size * sizeof( T ) );
	}

	/// @brief Copy the data from host memory to the device memory.
	/// The device memory is resized (without preserving its old content) to hold exactly @c host_data_size elements.
	/// @param host_ptr The host pointer to read from.
	/// @param host_data_size The size of the host memory which represents the number of elements.
	void copyFromHost( const T* host_ptr, const size_t host_data_size ) noexcept
	{
		resize( host_data_size );

		// An empty source leaves nothing to transfer (and m_data may be null).
		if( host_data_size == 0ULL )
		{
			return;
		}

		OrochiUtils::copyHtoD( m_data, host_ptr, host_data_size );
	}

	/// @brief Get the content of the first element stored in the device memory.
	/// The device memory must hold at least one element; this is not checked here.
	/// @return The content of the first element in the device memory.
	T getSingle() const noexcept
	{
		T result{};

		OrochiUtils::copyDtoH( &result, m_data, 1ULL );

		return result;
	}

	/// @brief Get all the data stored in the device memory.
	/// @note The function is declared @c noexcept, so a failed host allocation terminates the program.
	/// @return A vector which contains all the data stored in the device memory; empty if the size is zero.
	std::vector<T> getData() const noexcept
	{
		std::vector<T> result( m_size );

		if( m_size > 0ULL )
		{
			OrochiUtils::copyDtoH( result.data(), m_data, m_size );
		}

		return result;
	}

  private:
	/// @brief Exchange the pointer, size and capacity of two objects.
	static void swap( GpuMemory& lhs, GpuMemory& rhs ) noexcept
	{
		std::swap( lhs.m_data, rhs.m_data );
		std::swap( lhs.m_size, rhs.m_size );
		std::swap( lhs.m_capacity, rhs.m_capacity );
	}

	T* m_data{ nullptr };	   ///< Pointer to the owned device memory; nullptr when nothing is allocated.
	size_t m_size{ 0ULL };	   ///< Number of elements currently in use.
	size_t m_capacity{ 0ULL }; ///< Number of elements the current allocation can hold.
};

} // namespace Oro
2 changes: 1 addition & 1 deletion Orochi/OrochiUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ class OrochiUtils
}

template<typename T>
static void copyHtoD( T* dst, T* src, size_t n )
static void copyHtoD( T* dst, const T* src, size_t n )
{
oroError e = oroMemcpyHtoD( (oroDeviceptr)dst, (void*)src, sizeof( T ) * n );
OROASSERT( e == oroSuccess, 0 );
Expand Down
6 changes: 2 additions & 4 deletions ParallelPrimitives/RadixSort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ RadixSort::RadixSort()
{
if( selectedScanAlgo == ScanAlgo::SCAN_GPU_PARALLEL )
{
OrochiUtils::malloc( m_partialSum, m_nWGsToExecute );
m_partialSum.resize( m_nWGsToExecute );
OrochiUtils::malloc( m_isReady, m_nWGsToExecute );
OrochiUtils::memset( m_isReady, false, m_nWGsToExecute * sizeof( bool ) );
}
Expand All @@ -52,7 +52,6 @@ RadixSort::~RadixSort()
{
if( selectedScanAlgo == ScanAlgo::SCAN_GPU_PARALLEL )
{
OrochiUtils::free( m_partialSum );
OrochiUtils::free( m_isReady );
}
}
Expand Down Expand Up @@ -168,9 +167,8 @@ RadixSort::u32 RadixSort::configure( oroDevice device, OrochiUtils& oroutils, co

if( newWGsToExecute != m_nWGsToExecute && selectedScanAlgo == ScanAlgo::SCAN_GPU_PARALLEL )
{
OrochiUtils::free( m_partialSum );
m_partialSum.resize( newWGsToExecute );
OrochiUtils::free( m_isReady );
OrochiUtils::malloc( m_partialSum, newWGsToExecute );
OrochiUtils::malloc( m_isReady, newWGsToExecute );
OrochiUtils::memsetAsync( m_isReady, false, newWGsToExecute * sizeof( bool ), stream );
}
Expand Down
13 changes: 7 additions & 6 deletions ParallelPrimitives/RadixSort.h
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
#pragma once

#include <Orochi/GpuMemory.h>
#include <Orochi/Orochi.h>
#include <Orochi/OrochiUtils.h>
#include <ParallelPrimitives/RadixSortConfigs.h>
#include <Test/Stopwatch.h>
#include <cmath>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <cmath>
#include <ParallelPrimitives/RadixSortConfigs.h>
#include <Test/Stopwatch.h>
#include <Orochi/OrochiUtils.h>

//#define PROFILE 1
// #define PROFILE 1

namespace Oro
{
Expand Down Expand Up @@ -101,7 +102,7 @@ class RadixSort

constexpr static auto selectedScanAlgo{ ScanAlgo::SCAN_GPU_PARALLEL };

int* m_partialSum{ nullptr };
GpuMemory<int> m_partialSum;
bool* m_isReady{ nullptr };
};

Expand Down
2 changes: 1 addition & 1 deletion ParallelPrimitives/RadixSort.inl
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ void RadixSort::sort1pass( const T src, const T dst, int n, int startBit, int en

case ScanAlgo::SCAN_GPU_PARALLEL:
{
const void* args[] = { &temps, &temps, &m_partialSum, &m_isReady };
const void* args[] = { &temps, &temps, arg_cast( m_partialSum.address() ), &m_isReady };
OrochiUtils::launch1D( oroFunctions[Kernel::SCAN_PARALLEL], SCAN_WG_SIZE * m_nWGsToExecute, args, SCAN_WG_SIZE, 0, stream );
}
break;
Expand Down
38 changes: 38 additions & 0 deletions UnitTest/main.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <gtest/gtest.h>
#include <Orochi/Orochi.h>
#include <Orochi/OrochiUtils.h>
#include <Orochi/GpuMemory.h>
#include <fstream>

#if defined( OROASSERT )
Expand Down Expand Up @@ -72,6 +73,43 @@ TEST_F( OroTestBase, kernelExec )
OROCHECK( oroFree( (oroDeviceptr)a_device ) );
}

// Exercises Oro::GpuMemory: resize, reset, use as a kernel argument, single/bulk read-back and upload from host.
TEST_F( OroTestBase, GpuMemoryTest )
{
	OrochiUtils utils;

	// Start from an empty object and grow it to a single element.
	Oro::GpuMemory<int> buffer;
	buffer.resize( 1 );
	OROASSERT( buffer.size() == 1ULL );

	// Clear the element before the kernel writes to it.
	buffer.reset();

	auto function = utils.getFunctionFromFile( m_device, "../UnitTest/testKernel.h", "testKernel", 0 );
	const void* kernel_args[] = { Oro::arg_cast( buffer.address() ) };

	OrochiUtils::launch1D( function, 64, kernel_args, 64 );
	OrochiUtils::waitForCompletion();

	// NOTE(review): 2016 equals 0 + 1 + ... + 63, presumably the sum of the 64 thread indices - confirm against testKernel.h.
	const auto single = buffer.getSingle();
	OROASSERT( single == 2016 );

	const auto snapshot = buffer.getData();
	OROASSERT( snapshot.size() == 1ULL );
	OROASSERT( snapshot[0] == 2016 );

	// Upload three identical values; the buffer has to grow to hold them.
	const auto fill_value = 123;
	const std::vector<int> host_values = { fill_value, fill_value, fill_value };
	buffer.copyFromHost( host_values.data(), host_values.size() );

	OROASSERT( buffer.size() == host_values.size() );

	// Every element read back must match what was uploaded.
	const auto round_trip = buffer.getData();

	for( size_t i = 0; i < round_trip.size(); ++i )
	{
		OROASSERT( round_trip[i] == fill_value );
	}
}

TEST_F( OroTestBase, Event )
{
OrochiUtils o;
Expand Down

0 comments on commit 6314b2b

Please sign in to comment.