Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove tile life from ge2tb and he2hb #123

Merged
merged 10 commits into from
Nov 16, 2023
2 changes: 1 addition & 1 deletion include/slate/enums.hh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ enum class TileReleaseStrategy : char {
None = 'N', ///< tiles are not released at all
Internal = 'I', ///< tiles are released by routines in slate::internal namespace
Slate = 'S', ///< tiles are released by routines directly in slate namespace
All = 'A', ///< tiles are released by rotines in all namespaces
All = 'A', ///< tiles are released by routines in all namespaces
};

namespace internal {
Expand Down
190 changes: 103 additions & 87 deletions src/ge2tb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "auxiliary/Debug.hh"
#include "slate/Matrix.hh"
#include "internal/internal.hh"
#include "internal/internal_util.hh"

namespace slate {

Expand Down Expand Up @@ -35,13 +36,20 @@ void ge2tb(
// Assumes column major
const Layout layout = Layout::ColMajor;
const int queue_0 = 0;
const int priority_0 = 0;

// Options
int64_t ib = get_option<int64_t>( opts, Option::InnerBlocking, 16 );
int64_t max_panel_threads = std::max(omp_get_max_threads()/2, 1);
max_panel_threads = get_option<int64_t>( opts, Option::MaxPanelThreads,
max_panel_threads );

// Use only TileReleaseStrategy::Slate for gemm.
// Internal gemm routine called here won't release
// any tiles. This routine will clean up tiles.
Options opts2 = opts;
opts2[ Option::TileReleaseStrategy ] = TileReleaseStrategy::Slate;

int64_t A_mt = A.mt();
int64_t A_nt = A.nt();
int64_t A_min_mtnt = std::min(A_mt, A_nt);
Expand Down Expand Up @@ -170,8 +178,6 @@ void ge2tb(

// Workspace for transposed panels needs one column of tiles.
auto AT = A.emptyLike(0, 0, Op::ConjTrans);
// todo: we really only want to insert 1 column's worth at a time.
AT.insertLocalTiles();

// No lookahead is possible, so no need to track dependencies --
// just execute tasks in order. Also, priority isn't needed.
Expand All @@ -189,24 +195,11 @@ void ge2tb(
auto TUl_panel = TUlocal.sub(k, A_mt-1, k, k);
auto TUr_panel = TUreduce.sub(k, A_mt-1, k, k);

// Find ranks in this column.
std::set<int> ranks_set;
U_panel.getRanks(&ranks_set);
assert(ranks_set.size() > 0);

// Find each rank's first (top-most) row in this panel,
// where the triangular tile resulting from local geqrf panel
// will reside.
std::vector< int64_t > first_indices;
first_indices.reserve(ranks_set.size());
for (int r: ranks_set) {
for (int64_t i = 0; i < U_panel.mt(); ++i) {
if (U_panel.tileRank(i, 0) == r) {
first_indices.push_back(i+k);
break;
}
}
}
std::vector< int64_t > first_indices
= internal::geqrf_compute_first_indices(U_panel, k);

//--------------------
// QR of U panel
Expand All @@ -215,36 +208,26 @@ void ge2tb(
std::move(U_panel),
std::move(TUl_panel),
dwork_array, work_size,
ib, max_panel_threads);
ib, max_panel_threads );

// triangle-triangle reductions
// ttqrt handles tile transfers internally
internal::ttqrt<Target::HostTask>(
std::move(U_panel),
std::move(TUr_panel));
std::move(TUr_panel), opts2 );

// if a trailing matrix exists
if (k < A_nt-1) {
//--------------------
// QR update trailing submatrix.
if (k+1 < A_nt) {

// bcast V across row for trailing matrix update
if (k < A_mt) {
BcastList bcast_list_V_first;
BcastList bcast_list_V;
for (int64_t i = k; i < A_mt; ++i) {
// send A(i, k) across row A(i, k+1:nt-1)
neil-lindquist marked this conversation as resolved.
Show resolved Hide resolved
// Vs in first_indices (except main diagonal one)
// need three lives.
if ((std::find(first_indices.begin(), first_indices.end(), i) != first_indices.end()) && (i > k)) {
bcast_list_V_first.push_back(
{i, k, {A.sub(i, i, k+1, A_nt-1)}});
}
else {
bcast_list_V.push_back(
{i, k, {A.sub(i, i, k+1, A_nt-1)}});
}
bcast_list_V.push_back(
{i, k, {A.sub(i, i, k+1, A_nt-1)}});
}
A.template listBcast(bcast_list_V_first, layout, 0, 3);
A.template listBcast(bcast_list_V, layout, 0, 2);
A.template listBcast(bcast_list_V, layout, 0);
}

// bcast TUlocal across row for trailing matrix update
Expand All @@ -270,11 +253,7 @@ void ge2tb(
}
TUreduce.template listBcast(bcast_list_T, layout);
}
}

//--------------------
// QR update trailing submatrix.
if (k+1 < A_nt) {
int64_t j = k+1;
auto A_trail_j = A.sub(k, A_mt-1, j, A_nt-1);

Expand All @@ -284,7 +263,8 @@ void ge2tb(
std::move(U_panel),
std::move(TUl_panel),
std::move(A_trail_j),
W.sub(k, A_mt-1, j, A_nt-1));
W.sub(k, A_mt-1, j, A_nt-1),
priority_0, queue_0, opts2 );

// Apply triangle-triangle reduction reflectors
// ttmqr handles the tile broadcasting internally
Expand All @@ -293,7 +273,39 @@ void ge2tb(
std::move(U_panel),
std::move(TUr_panel),
std::move(A_trail_j),
j);
j, opts2 );
}

// Tiles can be released in parallel with the main execution
#pragma omp task
{
for (int64_t i = k; i < A_mt; ++i) {
if (A.tileIsLocal(i, k)) {
A.tileUpdateOrigin(i, k);
A.releaseLocalWorkspaceTile(i, k);
}
else {
A.releaseRemoteWorkspaceTile(i, k);
}
}

for (int64_t i : first_indices) {
if (TUlocal.tileIsLocal( i, k )) {
// TUlocal and TUreduce have the same process distribution
TUlocal.tileUpdateOrigin( i, k );
TUlocal.releaseLocalWorkspaceTile( i, k );
if (i != k) {
// i == k is the root of the reduction tree
// TUreduce( k, k ) isn't allocated
TUreduce.tileUpdateOrigin( i, k );
TUreduce.releaseLocalWorkspaceTile( i, k );
}
}
else {
TUlocal.releaseRemoteWorkspaceTile( i, k );
TUreduce.releaseRemoteWorkspaceTile( i, k );
}
}
neil-lindquist marked this conversation as resolved.
Show resolved Hide resolved
}

//----------------------------------------
Expand All @@ -306,24 +318,7 @@ void ge2tb(
auto VT_panel = AT.sub(k+1, A_nt-1, k, k);
auto TVlT_panel = TVlocalT.sub(k+1, A_nt-1, k, k);

// Find ranks in this row.
ranks_set.clear();
V_panel.getRanks(&ranks_set);
assert(ranks_set.size() > 0);

// Find each rank's first (left-most) col in this panel,
// where the triangular tile resulting from local gelqf panel
// will reside.
first_indices.clear();
first_indices.reserve(ranks_set.size());
for (int r: ranks_set) {
for (int64_t j = 0; j < V_panel.nt(); ++j) {
if (V_panel.tileRank(0, j) == r) {
first_indices.push_back(k+1+j);
break;
}
}
}
first_indices = internal::gelqf_compute_first_indices(V_panel, k+1);

//--------------------
// LQ of V panel
Expand All @@ -334,7 +329,8 @@ void ge2tb(
for (int64_t j = 0; j < V_panel.nt(); ++j) {
if (V_panel.tileIsLocal(0, j)) {
V_panel.tileGetForReading( 0, j, HostNum, LayoutConvert(layout) );
VT_panel.tileGetForWriting( j, 0, HostNum, LayoutConvert(layout) );
VT_panel.tileInsert( j, 0 );
VT_panel.tileModified( j, 0, HostNum );
tile::deepConjTranspose( V_panel(0, j), VT_panel(j, 0) );
}
}
Expand All @@ -344,15 +340,15 @@ void ge2tb(
std::move(VT_panel),
std::move(TVlT_panel),
dwork_array, work_size,
ib, max_panel_threads);
ib, max_panel_threads );

// Find first local tile, which is triangular factor
// (T in I - V T^H V^H), and copy it to TVlocal.
for (int64_t i = 0; i < TVlT_panel.mt(); ++i) {
if (TVl_panel.tileIsLocal(0, i)) {
TVl_panel.tileInsert(0, i);
TVlT_panel.tileGetForReading( i, 0, HostNum, LayoutConvert(layout) );
TVl_panel.tileGetForWriting( 0, i, HostNum, LayoutConvert(layout) );
TVl_panel.tileInsert(0, i);
TVl_panel.tileModified( 0, i, HostNum );
tile::gecopy( TVlT_panel(i, 0), TVl_panel(0, i) );
break;
}
Expand All @@ -364,39 +360,30 @@ void ge2tb(
VT_panel.tileGetForReading( j, 0, HostNum, LayoutConvert(layout) );
V_panel.tileGetForWriting( 0, j, HostNum, LayoutConvert(layout) );
tile::deepConjTranspose( VT_panel(j, 0), V_panel(0, j) );
VT_panel.tileErase(j, 0, AllDevices);
}
}
// todo: VT_panel.clear();
//----------

// triangle-triangle reductions
// ttlqt handles tile transfers internally
internal::ttlqt<Target::HostTask>(
std::move(V_panel),
std::move(TVr_panel));
std::move(TVr_panel),
opts2 );

// if a trailing matrix exists
if (k < A_mt-1) {
//--------------------
// LQ update trailing submatrix
if (k+1 < A_mt) {

// bcast V down col for trailing matrix update
if (k+1 < A_nt) {
BcastList bcast_list_V_first;
BcastList bcast_list_V;
for (int64_t j = k+1; j < A_nt; ++j) {
// send A(k, j) down col A(k+1:mt-1, j)
// Vs in first_indices (except main diagonal one)
// need three lives.
if ((std::find(first_indices.begin(), first_indices.end(), j) != first_indices.end()) && (j > k+1)) {
bcast_list_V_first.push_back(
{k, j, {A.sub(k+1, A_mt-1, j, j)}});
}
else {
bcast_list_V.push_back(
{k, j, {A.sub(k+1, A_mt-1, j, j)}});
}
bcast_list_V.push_back(
{k, j, {A.sub(k+1, A_mt-1, j, j)}});
}
A.template listBcast(bcast_list_V_first, layout, 0, 3);
A.template listBcast(bcast_list_V, layout, 0, 2);
A.template listBcast(bcast_list_V, layout, 0);
}

// bcast TVlocal down col for trailing matrix update
Expand All @@ -422,11 +409,7 @@ void ge2tb(
}
TVreduce.template listBcast(bcast_list_T, layout);
}
}

//--------------------
// LQ update trailing submatrix
if (k+1 < A_mt) {
int64_t i = k+1;
auto A_trail_i = A.sub(i, A_mt-1, k+1, A_nt-1);

Expand All @@ -436,7 +419,8 @@ void ge2tb(
std::move(V_panel),
std::move(TVl_panel),
std::move(A_trail_i),
W.sub(i, A_mt-1, k+1, A_nt-1));
W.sub(i, A_mt-1, k+1, A_nt-1),
priority_0, queue_0, opts2 );

// Apply triangle-triangle reduction reflectors
// ttmlq handles the tile broadcasting internally
Expand All @@ -445,7 +429,39 @@ void ge2tb(
std::move(V_panel),
std::move(TVr_panel),
std::move(A_trail_i),
i);
i, opts2 );
}

// Tiles can be released in parallel with the main execution
#pragma omp task
{
for (int64_t j = k+1; j < A_nt; ++j) {
if (A.tileIsLocal(k, j)) {
A.tileUpdateOrigin(k, j);
A.releaseLocalWorkspaceTile(k, j);
}
else {
A.releaseRemoteWorkspaceTile(k, j);
}
}

for (int64_t j : first_indices) {
if (TVlocal.tileIsLocal( k, j )) {
// TVlocal and TVreduce have the same process distribution
TVlocal.tileUpdateOrigin( k, j );
TVlocal.releaseLocalWorkspaceTile( k, j );
if (j != k+1) {
// j == k+1 is the root of the reduction tree
// TVreduce( k, k+1 ) isn't allocated
TVreduce.tileUpdateOrigin( k, j );
TVreduce.releaseLocalWorkspaceTile( k, j );
}
}
else {
TVlocal.releaseRemoteWorkspaceTile( k, j );
TVreduce.releaseRemoteWorkspaceTile( k, j );
}
}
neil-lindquist marked this conversation as resolved.
Show resolved Hide resolved
}
}
}
Expand Down
Loading