Skip to content

Commit

Permalink
Merge pull request #123 from neil-lindquist/neil/svd-tile-life
Browse files Browse the repository at this point in the history
Remove tile life from ge2tb and he2hb
  • Loading branch information
neil-lindquist authored Nov 16, 2023
2 parents 86bb586 + adbd340 commit 75c3ba1
Show file tree
Hide file tree
Showing 25 changed files with 966 additions and 690 deletions.
2 changes: 1 addition & 1 deletion include/slate/enums.hh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ enum class TileReleaseStrategy : char {
None = 'N', ///< tiles are not released at all
Internal = 'I', ///< tiles are released by routines in slate::internal namespace
Slate = 'S', ///< tiles are released by routines directly in slate namespace
All = 'A', ///< tiles are released by rotines in all namespaces
All = 'A', ///< tiles are released by routines in all namespaces
};

namespace internal {
Expand Down
179 changes: 93 additions & 86 deletions src/ge2tb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "auxiliary/Debug.hh"
#include "slate/Matrix.hh"
#include "internal/internal.hh"
#include "internal/internal_util.hh"

namespace slate {

Expand Down Expand Up @@ -35,13 +36,20 @@ void ge2tb(
// Assumes column major
const Layout layout = Layout::ColMajor;
const int queue_0 = 0;
const int priority_0 = 0;

// Options
int64_t ib = get_option<int64_t>( opts, Option::InnerBlocking, 16 );
int64_t max_panel_threads = std::max(omp_get_max_threads()/2, 1);
max_panel_threads = get_option<int64_t>( opts, Option::MaxPanelThreads,
max_panel_threads );

// Use only TileReleaseStrategy::Slate for the internal routines
// called here. With this setting the internal routines won't
// release any tiles; this routine cleans up the tiles itself.
Options opts2 = opts;
opts2[ Option::TileReleaseStrategy ] = TileReleaseStrategy::Slate;

int64_t A_mt = A.mt();
int64_t A_nt = A.nt();
int64_t A_min_mtnt = std::min(A_mt, A_nt);
Expand Down Expand Up @@ -170,8 +178,6 @@ void ge2tb(

// Workspace for transposed panels needs one column of tiles.
auto AT = A.emptyLike(0, 0, Op::ConjTrans);
// todo: we really only want to insert 1 column's worth at a time.
AT.insertLocalTiles();

// No lookahead is possible, so no need to track dependencies --
// just execute tasks in order. Also, priority isn't needed.
Expand All @@ -189,24 +195,11 @@ void ge2tb(
auto TUl_panel = TUlocal.sub(k, A_mt-1, k, k);
auto TUr_panel = TUreduce.sub(k, A_mt-1, k, k);

// Find ranks in this column.
std::set<int> ranks_set;
U_panel.getRanks(&ranks_set);
assert(ranks_set.size() > 0);

// Find each rank's first (top-most) row in this panel,
// where the triangular tile resulting from local geqrf panel
// will reside.
std::vector< int64_t > first_indices;
first_indices.reserve(ranks_set.size());
for (int r: ranks_set) {
for (int64_t i = 0; i < U_panel.mt(); ++i) {
if (U_panel.tileRank(i, 0) == r) {
first_indices.push_back(i+k);
break;
}
}
}
std::vector< int64_t > first_indices
= internal::geqrf_compute_first_indices(U_panel, k);

//--------------------
// QR of U panel
Expand All @@ -215,36 +208,27 @@ void ge2tb(
std::move(U_panel),
std::move(TUl_panel),
dwork_array, work_size,
ib, max_panel_threads);
ib, max_panel_threads );

// triangle-triangle reductions
// ttqrt handles tile transfers internally
internal::ttqrt<Target::HostTask>(
std::move(U_panel),
std::move(TUr_panel));
std::move(TUr_panel), opts2 );

// if a trailing matrix exists
if (k < A_nt-1) {
//--------------------
// QR update trailing submatrix.
if (k+1 < A_nt) {

// bcast V across row for trailing matrix update
if (k < A_mt) {
BcastList bcast_list_V_first;
BcastList bcast_list_V;
for (int64_t i = k; i < A_mt; ++i) {
// send A(i, k) across row A(i, k+1:nt-1)
// Vs in first_indices (except main diagonal one)
// need three lives.
if ((std::find(first_indices.begin(), first_indices.end(), i) != first_indices.end()) && (i > k)) {
bcast_list_V_first.push_back(
{i, k, {A.sub(i, i, k+1, A_nt-1)}});
}
else {
bcast_list_V.push_back(
{i, k, {A.sub(i, i, k+1, A_nt-1)}});
}
bcast_list_V.push_back(
{i, k, {A.sub(i, i, k+1, A_nt-1)}});
}
A.template listBcast(bcast_list_V_first, layout, 0, 3);
A.template listBcast(bcast_list_V, layout, 0, 2);
A.template listBcast(bcast_list_V, layout, 0);
}

// bcast TUlocal across row for trailing matrix update
Expand All @@ -270,11 +254,7 @@ void ge2tb(
}
TUreduce.template listBcast(bcast_list_T, layout);
}
}

//--------------------
// QR update trailing submatrix.
if (k+1 < A_nt) {
int64_t j = k+1;
auto A_trail_j = A.sub(k, A_mt-1, j, A_nt-1);

Expand All @@ -284,7 +264,8 @@ void ge2tb(
std::move(U_panel),
std::move(TUl_panel),
std::move(A_trail_j),
W.sub(k, A_mt-1, j, A_nt-1));
W.sub(k, A_mt-1, j, A_nt-1),
priority_0, queue_0, opts2 );

// Apply triangle-triangle reduction reflectors
// ttmqr handles the tile broadcasting internally
Expand All @@ -293,7 +274,34 @@ void ge2tb(
std::move(U_panel),
std::move(TUr_panel),
std::move(A_trail_j),
j);
j, opts2 );
}

// Can release tiles parallel to the main execution
#pragma omp task
{
// Ensure the origin is up to date, then remove the panel's workspace
U_panel.tileUpdateAllOrigin();
U_panel.releaseLocalWorkspace();
U_panel.releaseRemoteWorkspace();

for (int64_t i : first_indices) {
if (TUlocal.tileIsLocal( i, k )) {
// TUlocal and TUreduce have the same process distribution
TUlocal.tileUpdateOrigin( i, k );
TUlocal.releaseLocalWorkspaceTile( i, k );
if (i != k) {
// i == k is the root of the reduction tree
// TUreduce( k, k ) isn't allocated
TUreduce.tileUpdateOrigin( i, k );
TUreduce.releaseLocalWorkspaceTile( i, k );
}
}
else {
TUlocal.releaseRemoteWorkspaceTile( i, k );
TUreduce.releaseRemoteWorkspaceTile( i, k );
}
}
}

//----------------------------------------
Expand All @@ -306,24 +314,7 @@ void ge2tb(
auto VT_panel = AT.sub(k+1, A_nt-1, k, k);
auto TVlT_panel = TVlocalT.sub(k+1, A_nt-1, k, k);

// Find ranks in this row.
ranks_set.clear();
V_panel.getRanks(&ranks_set);
assert(ranks_set.size() > 0);

// Find each rank's first (left-most) col in this panel,
// where the triangular tile resulting from local gelqf panel
// will reside.
first_indices.clear();
first_indices.reserve(ranks_set.size());
for (int r: ranks_set) {
for (int64_t j = 0; j < V_panel.nt(); ++j) {
if (V_panel.tileRank(0, j) == r) {
first_indices.push_back(k+1+j);
break;
}
}
}
first_indices = internal::gelqf_compute_first_indices(V_panel, k+1);

//--------------------
// LQ of V panel
Expand All @@ -334,7 +325,8 @@ void ge2tb(
for (int64_t j = 0; j < V_panel.nt(); ++j) {
if (V_panel.tileIsLocal(0, j)) {
V_panel.tileGetForReading( 0, j, HostNum, LayoutConvert(layout) );
VT_panel.tileGetForWriting( j, 0, HostNum, LayoutConvert(layout) );
VT_panel.tileInsert( j, 0 );
VT_panel.tileModified( j, 0, HostNum );
tile::deepConjTranspose( V_panel(0, j), VT_panel(j, 0) );
}
}
Expand All @@ -344,15 +336,15 @@ void ge2tb(
std::move(VT_panel),
std::move(TVlT_panel),
dwork_array, work_size,
ib, max_panel_threads);
ib, max_panel_threads );

// Find first local tile, which is triangular factor
// (T in I - V T^H V^H), and copy it to TVlocal.
for (int64_t i = 0; i < TVlT_panel.mt(); ++i) {
if (TVl_panel.tileIsLocal(0, i)) {
TVl_panel.tileInsert(0, i);
TVlT_panel.tileGetForReading( i, 0, HostNum, LayoutConvert(layout) );
TVl_panel.tileGetForWriting( 0, i, HostNum, LayoutConvert(layout) );
TVl_panel.tileInsert(0, i);
TVl_panel.tileModified( 0, i, HostNum );
tile::gecopy( TVlT_panel(i, 0), TVl_panel(0, i) );
break;
}
Expand All @@ -364,39 +356,30 @@ void ge2tb(
VT_panel.tileGetForReading( j, 0, HostNum, LayoutConvert(layout) );
V_panel.tileGetForWriting( 0, j, HostNum, LayoutConvert(layout) );
tile::deepConjTranspose( VT_panel(j, 0), V_panel(0, j) );
VT_panel.tileErase(j, 0, AllDevices);
}
}
// todo: VT_panel.clear();
//----------

// triangle-triangle reductions
// ttlqt handles tile transfers internally
internal::ttlqt<Target::HostTask>(
std::move(V_panel),
std::move(TVr_panel));
std::move(TVr_panel),
opts2 );

// if a trailing matrix exists
if (k < A_mt-1) {
//--------------------
// LQ update trailing submatrix
if (k+1 < A_mt) {

// bcast V down col for trailing matrix update
if (k+1 < A_nt) {
BcastList bcast_list_V_first;
BcastList bcast_list_V;
for (int64_t j = k+1; j < A_nt; ++j) {
// send A(k, j) down col A(k+1:mt-1, j)
// Vs in first_indices (except main diagonal one)
// need three lives.
if ((std::find(first_indices.begin(), first_indices.end(), j) != first_indices.end()) && (j > k+1)) {
bcast_list_V_first.push_back(
{k, j, {A.sub(k+1, A_mt-1, j, j)}});
}
else {
bcast_list_V.push_back(
{k, j, {A.sub(k+1, A_mt-1, j, j)}});
}
bcast_list_V.push_back(
{k, j, {A.sub(k+1, A_mt-1, j, j)}});
}
A.template listBcast(bcast_list_V_first, layout, 0, 3);
A.template listBcast(bcast_list_V, layout, 0, 2);
A.template listBcast(bcast_list_V, layout, 0);
}

// bcast TVlocal down col for trailing matrix update
Expand All @@ -422,11 +405,7 @@ void ge2tb(
}
TVreduce.template listBcast(bcast_list_T, layout);
}
}

//--------------------
// LQ update trailing submatrix
if (k+1 < A_mt) {
int64_t i = k+1;
auto A_trail_i = A.sub(i, A_mt-1, k+1, A_nt-1);

Expand All @@ -436,7 +415,8 @@ void ge2tb(
std::move(V_panel),
std::move(TVl_panel),
std::move(A_trail_i),
W.sub(i, A_mt-1, k+1, A_nt-1));
W.sub(i, A_mt-1, k+1, A_nt-1),
priority_0, queue_0, opts2 );

// Apply triangle-triangle reduction reflectors
// ttmlq handles the tile broadcasting internally
Expand All @@ -445,7 +425,34 @@ void ge2tb(
std::move(V_panel),
std::move(TVr_panel),
std::move(A_trail_i),
i);
i, opts2 );
}

// Can release tiles parallel to the main execution
#pragma omp task
{
// Ensure the origin is up to date, then remove the panel's workspace
V_panel.tileUpdateAllOrigin();
V_panel.releaseLocalWorkspace();
V_panel.releaseRemoteWorkspace();

for (int64_t j : first_indices) {
if (TVlocal.tileIsLocal( k, j )) {
// TVlocal and TVreduce have the same process distribution
TVlocal.tileUpdateOrigin( k, j );
TVlocal.releaseLocalWorkspaceTile( k, j );
if (j != k+1) {
// j == k+1 is the root of the reduction tree
// TVreduce( k, k+1 ) isn't allocated
TVreduce.tileUpdateOrigin( k, j );
TVreduce.releaseLocalWorkspaceTile( k, j );
}
}
else {
TVlocal.releaseRemoteWorkspaceTile( k, j );
TVreduce.releaseRemoteWorkspaceTile( k, j );
}
}
}
}
}
Expand Down
Loading

0 comments on commit 75c3ba1

Please sign in to comment.