diff --git a/include/slate/simplified_api.hh b/include/slate/simplified_api.hh index f16c0b9c1..b92f563f0 100644 --- a/include/slate/simplified_api.hh +++ b/include/slate/simplified_api.hh @@ -390,24 +390,24 @@ void chol_solve( // posv template -void chol_solve( +int64_t chol_solve( HermitianMatrix& A, Matrix& B, Options const& opts = Options()) { - posv(A, B, opts); + return posv( A, B, opts ); } // forward real-symmetric matrices to posv; // disabled for complex template -void chol_solve( +int64_t chol_solve( SymmetricMatrix& A, Matrix& B, Options const& opts = Options(), enable_if_t< ! is_complex::value >* = nullptr) { - posv(A, B, opts); + return posv( A, B, opts ); } //----------------------------------------- @@ -415,31 +415,31 @@ void chol_solve( // pbtrf template -void chol_factor( +int64_t chol_factor( HermitianBandMatrix& A, Options const& opts = Options()) { - pbtrf(A, opts); + return pbtrf( A, opts ); } // potrf template -void chol_factor( +int64_t chol_factor( HermitianMatrix& A, Options const& opts = Options()) { - potrf(A, opts); + return potrf( A, opts ); } // forward real-symmetric matrices to potrf; // disabled for complex template -void chol_factor( +int64_t chol_factor( SymmetricMatrix& A, Options const& opts = Options(), enable_if_t< ! is_complex::value >* = nullptr) { - potrf(A, opts); + return potrf( A, opts ); } //----------------------------------------- diff --git a/include/slate/slate.hh b/include/slate/slate.hh index d0785b5bd..2229958cd 100644 --- a/include/slate/slate.hh +++ b/include/slate/slate.hh @@ -650,7 +650,7 @@ void getri( //----------------------------------------- // pbsv() template -void pbsv( +int64_t pbsv( HermitianBandMatrix& A, Matrix& B, Options const& opts = Options()); @@ -658,7 +658,7 @@ void pbsv( //----------------------------------------- // posv() template -void posv( +int64_t posv( HermitianMatrix& A, Matrix& B, Options const& opts = Options()); @@ -666,20 +666,20 @@ void posv( // forward real-symmetric matrices to potrf; // disabled for complex template -void posv( +int64_t posv( SymmetricMatrix& A, Matrix& B, Options const& opts = Options(), enable_if_t< ! is_complex::value >* = nullptr) { HermitianMatrix AH(A); - posv(AH, B, opts); + return posv( AH, B, opts ); } //----------------------------------------- // posv_mixed() template -void posv_mixed( +int64_t posv_mixed( HermitianMatrix& A, Matrix& B, Matrix& X, @@ -687,7 +687,7 @@ void posv_mixed( Options const& opts = Options()); template -void posv_mixed( +int64_t posv_mixed( HermitianMatrix& A, Matrix& B, Matrix& X, @@ -698,32 +698,32 @@ void posv_mixed( template [[deprecated( "Use posv_mixed instead. Will be removed 2024-02." )]] -void posvMixed( +int64_t posvMixed( HermitianMatrix& A, Matrix& B, Matrix& X, int& iter, Options const& opts = Options()) { - posv_mixed( A, B, X, iter, opts ); + return posv_mixed( A, B, X, iter, opts ); } template [[deprecated( "Use posv_mixed instead. Will be removed 2024-02." )]] -void posvMixed( +int64_t posvMixed( HermitianMatrix& A, Matrix& B, Matrix& X, int& iter, Options const& opts = Options()) { - posv_mixed( A, B, X, iter, opts ); + return posv_mixed( A, B, X, iter, opts ); } //----------------------------------------- // posv_mixed_gmres() template -void posv_mixed_gmres( +int64_t posv_mixed_gmres( HermitianMatrix& A, Matrix& B, Matrix& X, @@ -731,7 +731,7 @@ void posv_mixed_gmres( Options const& opts = Options()); template -void posv_mixed_gmres( +int64_t posv_mixed_gmres( HermitianMatrix& A, Matrix& B, Matrix& X, @@ -743,27 +743,27 @@ void posv_mixed_gmres( //----------------------------------------- // pbtrf() template -void pbtrf( +int64_t pbtrf( HermitianBandMatrix& A, Options const& opts = Options()); //----------------------------------------- // potrf() template -void potrf( +int64_t potrf( HermitianMatrix& A, Options const& opts = Options()); // forward real-symmetric matrices to potrf; // disabled for complex template -void potrf( +int64_t potrf( SymmetricMatrix& A, Options const& opts = Options(), enable_if_t< ! is_complex::value >* = nullptr) { HermitianMatrix AH(A); - potrf(AH, opts); + return potrf( AH, opts ); } //----------------------------------------- diff --git a/src/gesv_mixed.cc b/src/gesv_mixed.cc index b8c494ad3..dbcff779b 100644 --- a/src/gesv_mixed.cc +++ b/src/gesv_mixed.cc @@ -111,21 +111,23 @@ int64_t gesv_mixed( int& iter, Options const& opts) { - Timer t_gesv_mixed; + using real_hi = blas::real_type; - Target target = get_option( opts, Option::Target, Target::HostTask ); + Timer t_gesv_mixed; + // Constants // Assumes column major const Layout layout = Layout::ColMajor; - - bool converged = false; - using real_hi = blas::real_type; const real_hi eps = std::numeric_limits::epsilon(); const scalar_hi one_hi = 1.0; + // Options + Target target = get_option( opts, Option::Target, Target::HostTask ); int64_t itermax = get_option( opts, Option::MaxIterations, 30 ); double tol = get_option( opts, Option::Tolerance, eps*std::sqrt(A.m()) ); bool use_fallback = get_option( opts, Option::UseFallbackSolver, true ); + + bool converged = false; iter = 0; assert( B.mt() == A.mt() ); @@ -146,7 +148,6 @@ int64_t gesv_mixed( if (target == Target::Devices) { #pragma omp parallel #pragma omp master - #pragma omp taskgroup { #pragma omp task slate_omp_default_none \ shared( A ) firstprivate( layout ) diff --git a/src/gesv_mixed_gmres.cc b/src/gesv_mixed_gmres.cc index 57843ddd4..36b6bea86 100644 --- a/src/gesv_mixed_gmres.cc +++ b/src/gesv_mixed_gmres.cc @@ -128,22 +128,22 @@ int64_t gesv_mixed_gmres( // Assumes column major const Layout layout = Layout::ColMajor; + // Options Target target = get_option( opts, Option::Target, Target::HostTask ); - - bool converged = false; int64_t itermax = get_option( opts, Option::MaxIterations, 30 ); double tol = get_option( opts, Option::Tolerance, eps*std::sqrt(A.m()) ); bool use_fallback = get_option( opts, Option::UseFallbackSolver, true ); - const int64_t restart = std::min( - std::min( int64_t( 30 ), itermax ), A.tileMb( 0 )-1 ); + int64_t restart = blas::min( 30, itermax, A.tileMb( 0 )-1 ); + + bool converged = false; iter = 0; assert( B.mt() == A.mt() ); - slate_assert( A.tileMb( 0 ) >= restart ); + assert( A.tileMb( 0 ) >= restart ); // TODO: implement block gmres if (B.n() != 1) { - slate_not_implemented( "block-GMRES is not yet supported" ); + slate_not_implemented( "block-GMRES for multiple RHS is not yet supported" ); } // workspace @@ -164,7 +164,7 @@ int64_t gesv_mixed_gmres( // workspace vector for the orthogonalization process auto z = X.template emptyLike(); - z.insertLocalTiles(target); + z.insertLocalTiles( target ); // Hessenberg Matrix. Allocate as a single tile slate::Matrix H( @@ -236,8 +236,7 @@ int64_t gesv_mixed_gmres( timers[ "gesv_mixed_gmres::gemm_hi" ] += t_gemm_hi.stop(); colNorms( Norm::Max, X, colnorms_X.data(), opts ); colNorms( Norm::Max, R, colnorms_R.data(), opts ); - if (internal::iterRefConverged( colnorms_R, colnorms_X, cte )) - { + if (internal::iterRefConverged( colnorms_R, colnorms_X, cte )) { iter = iiter; converged = true; break; @@ -272,7 +271,7 @@ int64_t gesv_mixed_gmres( // excessive restarting or delayed completion. int j = 0; for (; j < restart && iiter < itermax - && !internal::iterRefConverged( + && ! internal::iterRefConverged( arnoldi_residual, colnorms_X, cte ); ++j, ++iiter) { auto Vj1 = V.slice( 0, V.m()-1, j+1, j+1 ); @@ -341,15 +340,15 @@ int64_t gesv_mixed_gmres( auto H_00 = H( 0, 0 ); for (int64_t i = 0; i < j; ++i) { blas::rot( 1, &H_00.at( i, j ), 1, &H_00.at( i+1, j ), 1, - givens_alpha[i], givens_beta[i] ); + givens_alpha[i], givens_beta[i] ); } scalar_hi H_jj = H_00.at( j, j ), H_j1j = H_00.at( j+1, j ); blas::rotg( &H_jj, & H_j1j, &givens_alpha[j], &givens_beta[j] ); blas::rot( 1, &H_00.at( j, j ), 1, &H_00.at( j+1, j ), 1, - givens_alpha[j], givens_beta[j] ); + givens_alpha[j], givens_beta[j] ); auto S_00 = S( 0, 0 ); blas::rot( 1, &S_00.at( j, 0 ), 1, &S_00.at( j+1, 0 ), 1, - givens_alpha[j], givens_beta[j] ); + givens_alpha[j], givens_beta[j] ); arnoldi_residual[0] = cabs1( S_00.at( j+1, 0 ) ); } timers[ "gesv_mixed_gmres::rotations" ] += t_gesv_mixed_gmres_rotations.stop(); @@ -411,7 +410,6 @@ int64_t gesv_mixed_gmres( return info; } - //------------------------------------------------------------------------------ // Explicit instantiations. template <> @@ -422,7 +420,8 @@ int64_t gesv_mixed_gmres( int& iter, Options const& opts) { - return gesv_mixed_gmres( A, pivots, B, X, iter, opts ); + return gesv_mixed_gmres( + A, pivots, B, X, iter, opts ); } template <> @@ -433,8 +432,8 @@ int64_t gesv_mixed_gmres< std::complex >( int& iter, Options const& opts) { - return gesv_mixed_gmres, std::complex>( - A, pivots, B, X, iter, opts ); + return gesv_mixed_gmres< std::complex, std::complex >( + A, pivots, B, X, iter, opts ); } } // namespace slate diff --git a/src/internal/internal.hh b/src/internal/internal.hh index 4f93be848..7b9d8ec06 100644 --- a/src/internal/internal.hh +++ b/src/internal/internal.hh @@ -675,20 +675,10 @@ void unmbr_tb2bd(Side side, Op op, //----------------------------------------- // potrf() template -void potrf(HermitianMatrix&& A, - int priority=0, int64_t queue_index=0, - lapack::device_info_int* device_info=nullptr); - -// forward real-symmetric matrices to potrf; -// disabled for complex -template -void potrf(SymmetricMatrix&& A, - int priority=0, int64_t queue_index=0, - lapack::device_info_int* device_info=nullptr, - enable_if_t< ! is_complex::value >* = nullptr) -{ - potrf(SymmetricMatrix(A), priority); -} +int64_t potrf( + HermitianMatrix&& A, + int priority=0, int64_t queue_index=0, + lapack::device_info_int* device_info=nullptr ); //----------------------------------------- // hegst() diff --git a/src/internal/internal_potrf.cc b/src/internal/internal_potrf.cc index e20918a20..3e49b5dca 100644 --- a/src/internal/internal_potrf.cc +++ b/src/internal/internal_potrf.cc @@ -18,12 +18,13 @@ namespace internal { /// @ingroup posv_internal /// template -void potrf(HermitianMatrix< scalar_t >&& A, - int priority, int64_t queue_index, - lapack::device_info_int* device_info) +int64_t potrf( + HermitianMatrix< scalar_t >&& A, + int priority, int64_t queue_index, + lapack::device_info_int* device_info) { - potrf(internal::TargetType(), A, priority, - queue_index, device_info); + return potrf( internal::TargetType(), A, priority, + queue_index, device_info ); } //------------------------------------------------------------------------------ @@ -31,19 +32,21 @@ void potrf(HermitianMatrix< scalar_t >&& A, /// @ingroup posv_internal /// template -void potrf(internal::TargetType, - HermitianMatrix& A, - int priority, int64_t queue_index, - lapack::device_info_int* device_info) +int64_t potrf( + internal::TargetType, + HermitianMatrix& A, + int priority, int64_t queue_index, + lapack::device_info_int* device_info) { assert(A.mt() == 1); assert(A.nt() == 1); - if (A.tileIsLocal(0, 0)) - { - A.tileGetForWriting(0, 0, LayoutConvert::ColMajor); - potrf(A(0, 0)); - } + int64_t info = 0; + if (A.tileIsLocal( 0, 0 )) { + A.tileGetForWriting( 0, 0, LayoutConvert::ColMajor ); + info = potrf( A( 0, 0 ) ); + } + return info; } //------------------------------------------------------------------------------ @@ -51,14 +54,16 @@ void potrf(internal::TargetType, /// @ingroup posv_internal /// template -void potrf(internal::TargetType, - HermitianMatrix& A, - int priority, int64_t queue_index, - lapack::device_info_int* device_info) +int64_t potrf( + internal::TargetType, + HermitianMatrix& A, + int priority, int64_t queue_index, + lapack::device_info_int* device_info) { assert(A.mt() == 1); assert(A.nt() == 1); + int64_t info = 0; if (A.tileIsLocal(0, 0)) { int device = A.tileDevice( 0, 0 ); A.tileGetForWriting(0, 0, device, LayoutConvert::ColMajor); @@ -67,63 +72,67 @@ void potrf(internal::TargetType, lapack::potrf( A00.uploPhysical(), A00.mb(), A00.data(), A00.stride(), device_info, *queue ); + lapack::device_info_int host_info; + blas::device_memcpy( &host_info, device_info, 1, *queue ); queue->sync(); + info = int64_t( host_info ); } + return info; } //------------------------------------------------------------------------------ // Explicit instantiations. // ---------------------------------------- template -void potrf( +int64_t potrf( HermitianMatrix&& A, int priority, int64_t queue_index, lapack::device_info_int* device_info); // ---------------------------------------- template -void potrf( +int64_t potrf( HermitianMatrix&& A, int priority, int64_t queue_index, lapack::device_info_int* device_info); // ---------------------------------------- template -void potrf< Target::HostTask, std::complex >( +int64_t potrf< Target::HostTask, std::complex >( HermitianMatrix< std::complex >&& A, int priority, int64_t queue_index, lapack::device_info_int* device_info); // ---------------------------------------- template -void potrf< Target::HostTask, std::complex >( +int64_t potrf< Target::HostTask, std::complex >( HermitianMatrix< std::complex >&& A, int priority, int64_t queue_index, lapack::device_info_int* device_info); template -void potrf( +int64_t potrf( HermitianMatrix&& A, int priority, int64_t queue_index, lapack::device_info_int* device_info); // ---------------------------------------- template -void potrf( +int64_t potrf( HermitianMatrix&& A, int priority, int64_t queue_index, lapack::device_info_int* device_info); // ---------------------------------------- template -void potrf< Target::Devices, std::complex >( +int64_t potrf< Target::Devices, std::complex >( HermitianMatrix< std::complex >&& A, int priority, int64_t queue_index, lapack::device_info_int* device_info); // ---------------------------------------- template -void potrf< Target::Devices, std::complex >( +int64_t potrf< Target::Devices, std::complex >( HermitianMatrix< std::complex >&& A, int priority, int64_t queue_index, lapack::device_info_int* device_info); diff --git a/src/pbsv.cc b/src/pbsv.cc index 1b4fef936..fd342d52b 100644 --- a/src/pbsv.cc +++ b/src/pbsv.cc @@ -59,50 +59,51 @@ namespace slate { /// - HostBatch: batched BLAS on CPU host. /// - Devices: batched BLAS on GPU device. /// -/// TODO: return value -/// @retval 0 successful exit -/// @retval >0 for return value = $i$, the leading minor of order $i$ of $A$ is not +/// @return 0: successful exit +/// @return i > 0: the leading minor of order $i$ of $A$ is not /// positive definite, so the factorization could not /// be completed, and the solution has not been computed. /// /// @ingroup pbsv /// template -void pbsv(HermitianBandMatrix& A, - Matrix& B, - Options const& opts) +int64_t pbsv( + HermitianBandMatrix& A, + Matrix& B, + Options const& opts ) { // factorization - pbtrf(A, opts); + int64_t info = pbtrf(A, opts); // solve - pbtrs(A, B, opts); - - // todo: return value for errors? + if (info == 0) { + pbtrs(A, B, opts); + } + return info; } //------------------------------------------------------------------------------ // Explicit instantiations. template -void pbsv( +int64_t pbsv( HermitianBandMatrix& A, Matrix& B, Options const& opts); template -void pbsv( +int64_t pbsv( HermitianBandMatrix& A, Matrix& B, Options const& opts); template -void pbsv< std::complex >( +int64_t pbsv< std::complex >( HermitianBandMatrix< std::complex >& A, Matrix< std::complex >& B, Options const& opts); template -void pbsv< std::complex >( +int64_t pbsv< std::complex >( HermitianBandMatrix< std::complex >& A, Matrix< std::complex >& B, Options const& opts); diff --git a/src/pbtrf.cc b/src/pbtrf.cc index a3d10c8cd..3cdc2b15a 100644 --- a/src/pbtrf.cc +++ b/src/pbtrf.cc @@ -22,7 +22,7 @@ namespace impl { /// Warning: ColMajor layout is assumed /// template -void pbtrf( +int64_t pbtrf( HermitianBandMatrix A, Options const& opts ) { @@ -42,6 +42,7 @@ void pbtrf( if (A.uplo() == Uplo::Upper) A = conj_transpose( A ); + int64_t info = 0; int64_t A_nt = A.nt(); // OpenMP needs pointer types, but vectors are exception safe @@ -56,69 +57,76 @@ void pbtrf( #pragma omp parallel #pragma omp master - for (int64_t k = 0; k < A_nt; ++k) { - - int64_t ij_end = std::min(k + kdt + 1, A_nt); - - // panel, high priority - #pragma omp task depend(inout:column[k]) priority(1) - { - // factor A(k, k) - internal::potrf(A.sub(k, k), 1); - - // send A(k, k) down col A( k+1:ij_end-1, k ) - if (k+1 < ij_end) - A.tileBcast(k, k, A.sub(k+1, ij_end-1, k, k), layout); - - // A(k+1:ij_end-1, k) * A(k, k)^{-H} - if (k+1 < ij_end) { - auto Akk = A.sub(k, k); - auto Tkk = TriangularMatrix< scalar_t >(Diag::NonUnit, Akk); - internal::trsm( - Side::Right, - one, conj_transpose( Tkk ), - A.sub(k+1, ij_end-1, k, k), 1); - } + { + int64_t kk = 0; // column index (not block-column) + for (int64_t k = 0; k < A_nt; ++k) { - BcastList bcast_list_A; - for (int64_t i = k+1; i < ij_end; ++i) { - // send A(i, k) across row A(i, k+1:i) and - // down col A(i:ij_end-1, i). - bcast_list_A.push_back({i, k, {A.sub(i, i, k+1, i), - A.sub(i, ij_end-1, i, i)}}); - } - A.template listBcast(bcast_list_A, layout); - } + int64_t ij_end = std::min(k + kdt + 1, A_nt); - // update trailing submatrix, normal priority - if (k+1+lookahead < ij_end) { - #pragma omp task depend(in:column[k]) \ - depend(inout:column[k+1+lookahead]) \ - depend(inout:column[A_nt-1]) + // panel, high priority + #pragma omp task depend(inout:column[k]) priority(1) \ + shared( info ) { - internal::herk( - -r_one, A.sub(k+1+lookahead, ij_end-1, k, k), - r_one, A.sub(k+1+lookahead, ij_end-1) ); + // factor A(k, k) + int64_t iinfo = internal::potrf( A.sub( k, k ), 1 ); + if (iinfo != 0 && info == 0) + info = kk + iinfo; + + // send A(k, k) down col A( k+1:ij_end-1, k ) + if (k+1 < ij_end) + A.tileBcast(k, k, A.sub(k+1, ij_end-1, k, k), layout); + + // A(k+1:ij_end-1, k) * A(k, k)^{-H} + if (k+1 < ij_end) { + auto Akk = A.sub(k, k); + auto Tkk = TriangularMatrix< scalar_t >(Diag::NonUnit, Akk); + internal::trsm( + Side::Right, + one, conj_transpose( Tkk ), + A.sub(k+1, ij_end-1, k, k), 1); + } + + BcastList bcast_list_A; + for (int64_t i = k+1; i < ij_end; ++i) { + // send A(i, k) across row A(i, k+1:i) and + // down col A(i:ij_end-1, i). + bcast_list_A.push_back({i, k, {A.sub(i, i, k+1, i), + A.sub(i, ij_end-1, i, i)}}); + } + A.template listBcast(bcast_list_A, layout); } - } - // update lookahead column(s), normal priority - for (int64_t j = k+1; j < k+1+lookahead && j < ij_end; ++j) { - #pragma omp task depend(in:column[k]) \ - depend(inout:column[j]) - { - internal::herk( - -r_one, A.sub(j, j, k, k), - r_one, A.sub(j, j) ); - - if (j+1 <= A_nt-1) { - auto Ajk = A.sub(j, j, k, k); - internal::gemm( - -one, A.sub(j+1, ij_end-1, k, k), - conj_transpose( Ajk ), - one, A.sub(j+1, ij_end-1, j, j), layout ); + // update trailing submatrix, normal priority + if (k+1+lookahead < ij_end) { + #pragma omp task depend(in:column[k]) \ + depend(inout:column[k+1+lookahead]) \ + depend(inout:column[A_nt-1]) + { + internal::herk( + -r_one, A.sub(k+1+lookahead, ij_end-1, k, k), + r_one, A.sub(k+1+lookahead, ij_end-1) ); + } + } + + // update lookahead column(s), normal priority + for (int64_t j = k+1; j < k+1+lookahead && j < ij_end; ++j) { + #pragma omp task depend(in:column[k]) \ + depend(inout:column[j]) + { + internal::herk( + -r_one, A.sub(j, j, k, k), + r_one, A.sub(j, j) ); + + if (j+1 <= A_nt-1) { + auto Ajk = A.sub(j, j, k, k); + internal::gemm( + -one, A.sub(j+1, ij_end-1, k, k), + conj_transpose( Ajk ), + one, A.sub(j+1, ij_end-1, j, j), layout ); + } } } + kk += A.tileNb( k ); } } @@ -128,6 +136,9 @@ void pbtrf( A.releaseWorkspace(); // Debug::printTilesMaps(A); + + internal::reduce_info( &info, A.mpiComm() ); + return info; } } // namespace impl @@ -135,7 +146,7 @@ void pbtrf( //------------------------------------------------------------------------------ /// Distributed parallel band Cholesky factorization. /// -/// Computes the Cholesky factorization of a hermitian positive definite band +/// Computes the Cholesky factorization of a Hermitian positive definite band /// matrix $A$. /// /// The factorization has the form @@ -153,26 +164,18 @@ void pbtrf( /// One of float, double, std::complex, std::complex. //------------------------------------------------------------------------------ /// @param[in,out] A -/// On entry, the hermitian band matrix $A$ to be factored. +/// On entry, the Hermitian band matrix $A$ to be factored. /// Tiles outside the bandwidth do not need to exist. /// For tiles that are partially outside the bandwidth, /// data outside the bandwidth should be explicitly set to zero. -/// On exit, the factors $L$ and $U$ from the factorization $A = L U$; -/// the unit diagonal elements of $L$ are not stored. -/// The upper bandwidth is increased to accomodate fill-in of $U$. -/// -/// @param[out] pivots -/// The pivot indices that define the permutations. +/// On exit, the factor $L$ or $U$ from the factorization +/// $A = L L^H$ or $A = U^H U$. /// /// @param[in] opts /// Additional options, as map of name = value pairs. Possible options: /// - Option::Lookahead: /// Number of panels to overlap with matrix updates. /// lookahead >= 0. Default 1. -/// - Option::InnerBlocking: -/// Inner blocking to use for panel. Default 16. -/// - Option::MaxPanelThreads: -/// Number of threads to use for panel. Default omp_get_max_threads()/2. /// - Option::Target: /// Implementation to target. Possible values: /// - HostTask: OpenMP tasks on CPU host [default]. @@ -180,17 +183,15 @@ void pbtrf( /// - HostBatch: batched BLAS on CPU host. /// - Devices: batched BLAS on GPU device. /// -/// TODO: return value -/// @retval 0 successful exit -/// @retval >0 for return value = $i$, $U(i,i)$ is exactly zero. The -/// factorization has been completed, but the factor $U$ is exactly -/// singular, and division by zero will occur if it is used -/// to solve a system of equations. +/// @return 0: successful exit +/// @return i > 0: the leading minor of order $i$ of $A$ is not +/// positive definite, so the factorization could not +/// be completed. /// /// @ingroup pbsv_computational /// template -void pbtrf( +int64_t pbtrf( HermitianBandMatrix& A, Options const& opts ) { @@ -199,43 +200,39 @@ void pbtrf( switch (target) { case Target::Host: case Target::HostTask: - impl::pbtrf( A, opts ); - break; + return impl::pbtrf( A, opts ); case Target::HostNest: - impl::pbtrf( A, opts ); - break; + return impl::pbtrf( A, opts ); case Target::HostBatch: - impl::pbtrf( A, opts ); - break; + return impl::pbtrf( A, opts ); case Target::Devices: - impl::pbtrf( A, opts ); - break; + return impl::pbtrf( A, opts ); } - // todo: return value for errors? + return -2; // shouldn't happen } //------------------------------------------------------------------------------ // Explicit instantiations. template -void pbtrf( +int64_t pbtrf( HermitianBandMatrix& A, Options const& opts); template -void pbtrf( +int64_t pbtrf( HermitianBandMatrix& A, Options const& opts); template -void pbtrf< std::complex >( +int64_t pbtrf< std::complex >( HermitianBandMatrix< std::complex >& A, Options const& opts); template -void pbtrf< std::complex >( +int64_t pbtrf< std::complex >( HermitianBandMatrix< std::complex >& A, Options const& opts); diff --git a/src/posv.cc b/src/posv.cc index b0af6e9fa..71940b2ac 100644 --- a/src/posv.cc +++ b/src/posv.cc @@ -61,18 +61,18 @@ namespace slate { /// - HostBatch: batched BLAS on CPU host. /// - Devices: batched BLAS on GPU device. /// -/// TODO: return value -/// @retval 0 successful exit -/// @retval >0 for return value = $i$, the leading minor of order $i$ of $A$ is not +/// @return 0: successful exit +/// @return i > 0: the leading minor of order $i$ of $A$ is not /// positive definite, so the factorization could not /// be completed, and the solution has not been computed. /// /// @ingroup posv /// template -void posv(HermitianMatrix& A, - Matrix& B, - Options const& opts) +int64_t posv( + HermitianMatrix& A, + Matrix& B, + Options const& opts) { Timer t_posv; @@ -80,41 +80,42 @@ void posv(HermitianMatrix& A, // factorization Timer t_potrf; - potrf(A, opts); + int64_t info = potrf( A, opts ); timers[ "posv::potrf" ] = t_potrf.stop(); // solve Timer t_potrs; - potrs(A, B, opts); + if (info == 0) { + potrs( A, B, opts ); + } timers[ "posv::potrs" ] = t_potrs.stop(); - // todo: return value for errors? - timers[ "posv" ] = t_posv.stop(); + return info; } //------------------------------------------------------------------------------ // Explicit instantiations. template -void posv( +int64_t posv( HermitianMatrix& A, Matrix& B, Options const& opts); template -void posv( +int64_t posv( HermitianMatrix& A, Matrix& B, Options const& opts); template -void posv< std::complex >( +int64_t posv< std::complex >( HermitianMatrix< std::complex >& A, Matrix< std::complex >& B, Options const& opts); template -void posv< std::complex >( +int64_t posv< std::complex >( HermitianMatrix< std::complex >& A, Matrix< std::complex >& B, Options const& opts); diff --git a/src/posv_mixed.cc b/src/posv_mixed.cc index c0fd0432b..d7f1d4676 100644 --- a/src/posv_mixed.cc +++ b/src/posv_mixed.cc @@ -60,7 +60,7 @@ namespace slate { /// (return value = 0 and iter < 0, see description below), then the /// array $A$ contains the factor $U$ or $L$ from the Cholesky /// factorization $A = U^H U$ or $A = L L^H$. -/// If scalar_t is real, $A$ can be a SymmetricMatrix object. +/// If scalar_t is real, $A$ can be a SymmetricMatrix. /// /// @param[in] B /// On entry, the n-by-nrhs right hand side matrix $B$. @@ -69,9 +69,13 @@ namespace slate { /// On exit, if return value = 0, the n-by-nrhs solution matrix $X$. /// /// @param[out] iter -/// The number of the iterations in the iterative refinement -/// process, needed for the convergence. If failed, it is set -/// to be -(1+itermax), where itermax = 30. +/// > 0: The number of the iterations the iterative refinement +/// process needed for convergence. +/// < 0: Iterative refinement failed; it falls back to a double +/// precision factorization and solve. +/// -3: single precision matrix was not positive definite in potrf. +/// -(itermax+1): iterative refinement failed to converge in +/// itermax iterations. /// /// @param[in] opts /// Additional options, as map of name = value pairs. Possible options: @@ -92,39 +96,39 @@ namespace slate { /// If true and iterative refinement fails to convergene, the problem is /// resolved with partial-pivoted LU. Default true /// -/// TODO: return value -/// @retval 0 successful exit -/// @retval >0 for return value = $i$, the leading minor of order $i$ of $A$ is not +/// @return 0: successful exit +/// @return i > 0: the leading minor of order $i$ of $A$ is not /// positive definite, so the factorization could not /// be completed, and the solution has not been computed. /// /// @ingroup posv /// template -void posv_mixed( +int64_t posv_mixed( HermitianMatrix& A, Matrix& B, Matrix& X, int& iter, Options const& opts) { - Timer t_posv_mixed; + using real_hi = blas::real_type; - // XXX This is only used for the memory management and may be inconsistent - // with the routines called in this routine. - Target target = get_option( opts, Option::Target, Target::HostTask ); + Timer t_posv_mixed; + // Constants // Assumes column major const Layout layout = Layout::ColMajor; - - bool converged = false; - using real_hi = blas::real_type; const real_hi eps = std::numeric_limits::epsilon(); - const scalar_hi one_hi = 1.0; + const scalar_hi one_hi = 1.0; + // Options + // XXX target is used only for the memory management and may be inconsistent + // with the routines called in this routine. + Target target = get_option( opts, Option::Target, Target::HostTask ); int64_t itermax = get_option( opts, Option::MaxIterations, 30 ); double tol = get_option( opts, Option::Tolerance, eps*std::sqrt(A.m()) ); bool use_fallback = get_option( opts, Option::UseFallbackSolver, true ); + bool converged = false; iter = 0; assert( B.mt() == A.mt() ); @@ -178,94 +182,101 @@ void posv_mixed( // Compute the Cholesky factorization of A_lo. Timer t_potrf_lo; - potrf( A_lo, opts ); + int64_t info = potrf( A_lo, opts ); timers[ "posv_mixed::potrf_lo" ] = t_potrf_lo.stop(); - - // Solve the system A_lo * X_lo = B_lo. - Timer t_potrs_lo; - potrs( A_lo, X_lo, opts ); - timers[ "posv_mixed::potrs_lo" ] = t_potrs_lo.stop(); - - // Convert X_lo to high precision. - copy( X_lo, X, opts ); - - // Compute R = B - A * X. - slate::copy( B, R, opts ); - Timer t_hemm_hi; - hemm( - Side::Left, - -one_hi, A, - X, - one_hi, R, opts ); - timers[ "posv_mixed::hemm_hi" ] = t_hemm_hi.stop(); - - // Check whether the nrhs normwise backward error satisfies the - // stopping criterion. If yes, set iter=0 and return. - colNorms( Norm::Max, X, colnorms_X.data(), opts ); - colNorms( Norm::Max, R, colnorms_R.data(), opts ); - - if (internal::iterRefConverged( colnorms_R, colnorms_X, cte) ) { - iter = 0; - converged = true; + if (info != 0) { + iter = -3; } - - // iterative refinement - timers[ "posv_mixed::add_hi" ] = 0; - for (int iiter = 0; iiter < itermax && ! converged; ++iiter) { - // Convert R from high to low precision, store result in X_lo. - copy( R, X_lo, opts ); - - // Solve the system A_lo * X_lo = R_lo. - t_potrs_lo.start(); + else { + // Solve the system A_lo * X_lo = B_lo. + Timer t_potrs_lo; potrs( A_lo, X_lo, opts ); - timers[ "posv_mixed::potrs_lo" ] += t_potrs_lo.stop(); + timers[ "posv_mixed::potrs_lo" ] = t_potrs_lo.stop(); - // Convert X_lo back to double precision and update the current iterate. - copy( X_lo, R, opts ); - Timer t_add_hi; - add( - one_hi, R, - one_hi, X, opts ); - timers[ "posv_mixed::add_hi" ] += t_add_hi.stop(); + // Convert X_lo to high precision. + copy( X_lo, X, opts ); // Compute R = B - A * X. slate::copy( B, R, opts ); - t_hemm_hi.start(); + Timer t_hemm_hi; hemm( Side::Left, -one_hi, A, X, one_hi, R, opts ); - timers[ "posv_mixed::hemm_hi" ] += t_hemm_hi.stop(); + timers[ "posv_mixed::hemm_hi" ] = t_hemm_hi.stop(); - // Check whether nrhs normwise backward error satisfies the - // stopping criterion. If yes, set iter = iiter > 0 and return. + // Check whether the nrhs normwise backward error satisfies the + // stopping criterion. If yes, set iter=0 and return. colNorms( Norm::Max, X, colnorms_X.data(), opts ); colNorms( Norm::Max, R, colnorms_R.data(), opts ); if (internal::iterRefConverged( colnorms_R, colnorms_X, cte )) { - iter = iiter+1; + iter = 0; converged = true; } + + // iterative refinement + timers[ "posv_mixed::add_hi" ] = 0; + for (int iiter = 0; iiter < itermax && ! converged; ++iiter) { + // Convert R from high to low precision, store result in X_lo. + copy( R, X_lo, opts ); + + // Solve the system A_lo * X_lo = R_lo. + t_potrs_lo.start(); + potrs( A_lo, X_lo, opts ); + timers[ "posv_mixed::potrs_lo" ] += t_potrs_lo.stop(); + + // Convert X_lo back to double precision and update the current iterate. + copy( X_lo, R, opts ); + Timer t_add_hi; + add( + one_hi, R, + one_hi, X, opts ); + timers[ "posv_mixed::add_hi" ] += t_add_hi.stop(); + + // Compute R = B - A * X. + slate::copy( B, R, opts ); + t_hemm_hi.start(); + hemm( + Side::Left, + -one_hi, A, + X, + one_hi, R, opts ); + timers[ "posv_mixed::hemm_hi" ] += t_hemm_hi.stop(); + + // Check whether nrhs normwise backward error satisfies the + // stopping criterion. If yes, set iter = iiter > 0 and return. + colNorms( Norm::Max, X, colnorms_X.data(), opts ); + colNorms( Norm::Max, R, colnorms_R.data(), opts ); + + if (internal::iterRefConverged( colnorms_R, colnorms_X, cte )) { + iter = iiter+1; + converged = true; + } + } } if (! converged) { - // If we are at this place of the code, this is because we have performed - // iter = itermax iterations and never satisfied the stopping criterion, - // set up the iter flag accordingly and follow up with double precision - // routine. - iter = -itermax - 1; + if (info == 0) { + // If we performed iter = itermax iterations and never satisfied + // the stopping criterion, set up the iter flag accordingly. + iter = -itermax - 1; + } if (use_fallback) { + // Fall back to double precision factor and solve. // Compute the Cholesky factorization of A. Timer t_potrf_hi; - potrf( A, opts ); + info = potrf( A, opts ); timers[ "posv_mixed::potrf_hi" ] = t_potrf_hi.stop(); // Solve the system A * X = B. - slate::copy( B, X, opts ); Timer t_potrs_hi; - potrs( A, X, opts ); + if (info == 0) { + slate::copy( B, X, opts ); + potrs( A, X, opts ); + } timers[ "posv_mixed::potrs_hi" ] = t_potrs_hi.stop(); } } @@ -278,31 +289,33 @@ void posv_mixed( } timers[ "posv_mixed" ] = t_posv_mixed.stop(); - // todo: return value for errors? + return info; } //------------------------------------------------------------------------------ // Explicit instantiations. template <> -void posv_mixed( +int64_t posv_mixed( HermitianMatrix& A, Matrix& B, Matrix& X, int& iter, Options const& opts) { - posv_mixed( A, B, X, iter, opts ); + return posv_mixed( + A, B, X, iter, opts ); } template <> -void posv_mixed< std::complex >( +int64_t posv_mixed< std::complex >( HermitianMatrix< std::complex >& A, Matrix< std::complex >& B, Matrix< std::complex >& X, int& iter, Options const& opts) { - posv_mixed, std::complex>( A, B, X, iter, opts ); + return posv_mixed< std::complex, std::complex >( + A, B, X, iter, opts ); } } // namespace slate diff --git a/src/posv_mixed_gmres.cc b/src/posv_mixed_gmres.cc index 59140b348..ac6e1defb 100644 --- a/src/posv_mixed_gmres.cc +++ b/src/posv_mixed_gmres.cc @@ -68,9 +68,14 @@ namespace slate { /// On exit, if return value = 0, the n-by-nrhs solution matrix $X$. /// /// @param[out] iter -/// The number of the iterations in the iterative refinement -/// process, needed for the convergence. If failed, it is set -/// to be -(1+itermax), where itermax is controlled by the opts argument. +/// @param[out] iter +/// > 0: The number of the iterations the iterative refinement +/// process needed for convergence. +/// < 0: Iterative refinement failed; it falls back to a double +/// precision factorization and solve. +/// -3: single precision matrix was exactly singular in getrf. +/// -(itermax+1): iterative refinement failed to converge in +/// itermax iterations. /// /// @param[in] opts /// Additional options, as map of name = value pairs. Possible options: @@ -95,16 +100,15 @@ namespace slate { /// If true and iterative refinement fails to convergene, the problem is /// resolved with partial-pivoted LU. Default true /// -/// TODO: return value -/// @retval 0 successful exit -/// @retval >0 for return value = $i$, the computed $U(i,i)$ is exactly zero. -/// The factorization has been completed, but the factor U is exactly -/// singular, so the solution could not be computed. +/// @return 0: successful exit +/// @return i > 0: the leading minor of order $i$ of $A$ is not +/// positive definite, so the factorization could not +/// be completed, and the solution has not been computed. /// /// @ingroup posv /// template -void posv_mixed_gmres( +int64_t posv_mixed_gmres( HermitianMatrix& A, Matrix& B, Matrix& X, @@ -118,56 +122,58 @@ void posv_mixed_gmres( // Constants const real_hi eps = std::numeric_limits::epsilon(); const int64_t mpi_rank = A.mpiRank(); + const scalar_hi zero = 0.0; + const scalar_hi one = 1.0; // Assumes column major const Layout layout = Layout::ColMajor; // Options Target target = get_option( opts, Option::Target, Target::HostTask ); - - bool converged = false; int64_t itermax = get_option( opts, Option::MaxIterations, 30 ); double tol = get_option( opts, Option::Tolerance, eps*std::sqrt(A.m()) ); bool use_fallback = get_option( opts, Option::UseFallbackSolver, true ); - const int64_t restart = std::min( - std::min( int64_t( 30 ), itermax ), A.tileMb( 0 )-1 ); + int64_t restart = blas::min( 30, itermax, A.tileMb( 0 )-1 ); + bool converged = false; iter = 0; - assert(B.mt() == A.mt()); + assert( B.mt() == A.mt() ); + assert( A.tileMb( 0 ) >= restart ); // TODO: implement block gmres if (B.n() != 1) { - slate_not_implemented("block-GMRES is not yet supported"); + slate_not_implemented( "block-GMRES for multiple RHS is not yet supported" ); } // workspace auto R = B.emptyLike(); - R. insertLocalTiles(target); + R.insertLocalTiles( target ); auto A_lo = A.template emptyLike(); - A_lo.insertLocalTiles(target); + A_lo.insertLocalTiles( target ); auto X_lo = X.template emptyLike(); - X_lo.insertLocalTiles(target); + X_lo.insertLocalTiles( target ); - std::vector colnorms_X(X.n()); - std::vector colnorms_R(R.n()); + std::vector colnorms_X( X.n() ); + std::vector colnorms_R( R.n() ); // test basis. First column corresponds to the residual - auto V = internal::alloc_basis(A, restart+1, target); - // solution basis. Columns correspond to those in V. First column is unused - auto W = internal::alloc_basis(A, restart+1, target); + auto V = internal::alloc_basis( A, restart+1, target ); + // solution basis. Columns correspond to those in V. First column is unused + auto W = internal::alloc_basis( A, restart+1, target ); // workspace vector for the orthogonalization process auto z = X.template emptyLike(); - z.insertLocalTiles(target); - - // Hessenberg Matrix. Allocate as a single tile - slate::Matrix H(restart+1, restart+1, restart+1, 1, 1, A.mpiComm()); - H.insertLocalTiles(Target::Host); - // least squares RHS. Allocate as a single tile - slate::Matrix S(restart+1, 1, restart+1, 1, 1, A.mpiComm()); - S.insertLocalTiles(Target::Host); + z.insertLocalTiles( target ); + + // Hessenberg Matrix. Allocate as a single tile + slate::Matrix H( + restart+1, restart+1, restart+1, 1, 1, A.mpiComm() ); + H.insertLocalTiles( Target::Host ); + // least squares RHS. Allocate as a single tile + slate::Matrix S( restart+1, 1, restart+1, 1, 1, A.mpiComm() ); + S.insertLocalTiles( Target::Host ); // Rotations - std::vector givens_alpha(restart); - std::vector givens_beta (restart); + std::vector givens_alpha( restart ); + std::vector givens_beta ( restart ); if (target == Target::Devices) { @@ -176,211 +182,217 @@ void posv_mixed_gmres( { #pragma omp task default(shared) { - A.tileGetAndHoldAllOnDevices(LayoutConvert(layout)); + A.tileGetAndHoldAllOnDevices( LayoutConvert( layout ) ); } #pragma omp task default(shared) { - B.tileGetAndHoldAllOnDevices(LayoutConvert(layout)); + B.tileGetAndHoldAllOnDevices( LayoutConvert( layout ) ); } #pragma omp task default(shared) { - X.tileGetAndHoldAllOnDevices(LayoutConvert(layout)); + X.tileGetAndHoldAllOnDevices( LayoutConvert( layout ) ); } } } // norm of A - real_hi Anorm = norm(Norm::Inf, A, opts); + real_hi Anorm = norm( Norm::Inf, A, opts ); // stopping criteria real_hi cte = Anorm * tol; // Compute the Cholesky factorization of A in single-precision. - slate::copy(A, A_lo, opts); + slate::copy( A, A_lo, opts ); Timer t_potrf_lo; - potrf(A_lo, opts); + int64_t info = potrf( A_lo, opts ); timers[ "posv_mixed_gmres::potrf_lo" ] = t_potrf_lo.stop(); - - - // Solve the system A * X = B in low precision. - slate::copy(B, X_lo, opts); - Timer t_potrs_lo; - potrs(A_lo, X_lo, opts); - timers[ "posv_mixed_gmres::potrs_lo" ] = t_potrs_lo.stop(); - slate::copy(X_lo, X, opts); - - - // IR - int iiter = 0; - timers[ "posv_mixed_gmres::add_hi" ] = 0; - while (iiter < itermax) { - - // Check for convergence - slate::copy(B, R, opts); - Timer t_hemm_hi; - hemm( - Side::Left, - scalar_hi(-1.0), A, - X, - scalar_hi(1.0), R, - opts); - timers[ "posv_mixed_gmres::hemm_hi" ] = t_hemm_hi.stop(); - colNorms( Norm::Max, X, colnorms_X.data(), opts ); - colNorms( Norm::Max, R, colnorms_R.data(), opts ); - if (internal::iterRefConverged(colnorms_R, colnorms_X, cte)) { - iter = iiter; - converged = true; - break; - } - - // GMRES - - // Compute initial vector - auto v0 = V.slice(0, V.m()-1, 0, 0); - slate::copy(R, v0, opts); - - std::vector arnoldi_residual = {norm(Norm::Fro, v0, opts)}; - if (arnoldi_residual[0] == 0) { - // Solver broke down, but residual is not small enough yet. - iter = iiter; - converged = false; - break; - } - scale(1.0, arnoldi_residual[0], v0, opts); - if (S.tileRank(0, 0) == mpi_rank) { - S.tileGetForWriting(0, 0, LayoutConvert::ColMajor); - auto S_00 = S(0, 0); - S_00.at(0, 0) = arnoldi_residual[0]; - for (int i = 1; i < S_00.mb(); ++i) { - S_00.at(i, 0) = 0.0; + if (info != 0) { + iter = -3; + } + else { + // Solve the system A * X = B in low precision. + slate::copy( B, X_lo, opts ); + Timer t_potrs_lo; + potrs( A_lo, X_lo, opts ); + timers[ "posv_mixed_gmres::potrs_lo" ] = t_potrs_lo.stop(); + slate::copy( X_lo, X, opts ); + + // IR + int iiter = 0; + timers[ "posv_mixed_gmres::add_hi" ] = 0; + while (iiter < itermax) { + + // Check for convergence + slate::copy( B, R, opts ); + Timer t_hemm_hi; + hemm( + Side::Left, + -one, A, + X, + one, R, + opts); + timers[ "posv_mixed_gmres::hemm_hi" ] = t_hemm_hi.stop(); + colNorms( Norm::Max, X, colnorms_X.data(), opts ); + colNorms( Norm::Max, R, colnorms_R.data(), opts ); + if (internal::iterRefConverged( colnorms_R, colnorms_X, cte )) { + iter = iiter; + converged = true; + break; } - } + // GMRES - // N.B. convergence is detected using norm(X) at the beginning of the - // outer iteration. Thus, changes in the magnitude of X may lead to - // excessive restarting or delayed completion. - int j = 0; - for (; j < restart && iiter < itermax - && !internal::iterRefConverged(arnoldi_residual, colnorms_X, cte); - j++, iiter++) { - auto Vj1 = V.slice(0, V.m()-1, j+1, j+1); - auto Wj1 = W.slice(0, W.m()-1, j+1, j+1); + // Compute initial vector + auto v0 = V.slice( 0, V.m()-1, 0, 0 ); + slate::copy( R, v0, opts ); - auto Vj = V.slice(0, V.m()-1, j, j); - - // Wj1 = M^-1 A Vj - slate::copy(Vj, X_lo, opts); - t_potrs_lo.start(); - potrs(A_lo, X_lo, opts); - timers[ "posv_mixed_gmres::potrs_lo" ] += t_potrs_lo.stop(); - slate::copy(X_lo, Wj1, opts); + std::vector arnoldi_residual = { norm( Norm::Fro, v0, opts ) }; + if (arnoldi_residual[0] == 0) { + // Solver broke down, but residual is not small enough yet. + iter = iiter; + converged = false; + break; + } + scale( 1.0, arnoldi_residual[0], v0, opts ); + if (S.tileRank( 0, 0 ) == mpi_rank) { + S.tileGetForWriting( 0, 0, LayoutConvert::ColMajor ); + auto S_00 = S( 0, 0 ); + S_00.at( 0, 0 ) = arnoldi_residual[0]; + for (int i = 1; i < S_00.mb(); ++i) { + S_00.at( i, 0 ) = 0.0; + } + } - t_hemm_hi.start(); - hemm( - Side::Left, - scalar_hi(1.0), A, - Wj1, - scalar_hi(0.0), Vj1, - opts); - timers[ "posv_mixed_gmres::hemm_hi" ] += t_hemm_hi.stop(); + // N.B. convergence is detected using norm(X) at the beginning of the + // outer iteration. Thus, changes in the magnitude of X may lead to + // excessive restarting or delayed completion. + int j = 0; + for (; j < restart && iiter < itermax + && ! internal::iterRefConverged( + arnoldi_residual, colnorms_X, cte ); + ++j, ++iiter) { + auto Vj1 = V.slice( 0, V.m()-1, j+1, j+1 ); + auto Wj1 = W.slice( 0, W.m()-1, j+1, j+1 ); + + auto Vj = V.slice( 0, V.m()-1, j, j ); + + // Wj1 = M^-1 A Vj + slate::copy( Vj, X_lo, opts ); + t_potrs_lo.start(); + potrs( A_lo, X_lo, opts ); + timers[ "posv_mixed_gmres::potrs_lo" ] += t_potrs_lo.stop(); + slate::copy( X_lo, Wj1, opts ); + + t_hemm_hi.start(); + hemm( + Side::Left, + one, A, + Wj1, + zero, Vj1, + opts ); + timers[ "posv_mixed_gmres::hemm_hi" ] += t_hemm_hi.stop(); + + // orthogonalize w/ CGS2 + auto V0j = V.slice( 0, V.m()-1, 0, j ); + auto V0jT = conj_transpose( V0j ); + auto Hj = H.slice( 0, j, j, j ); + Timer t_gemm_hi; + gemm( + one, V0jT, + Vj1, + zero, Hj, + opts ); + gemm( + -one, V0j, + Hj, + one, Vj1, + opts ); + timers[ "posv_mixed_gmres::gemm_hi" ] = t_gemm_hi.stop(); + auto zj = z.slice( 0, j, 0, 0 ); + t_gemm_hi.start(); + gemm( + one, V0jT, + Vj1, + zero, zj, + opts ); + gemm( + -one, V0j, + zj, + one, Vj1, + opts ); + timers[ "posv_mixed_gmres::gemm_hi" ] += t_gemm_hi.stop(); + Timer t_add_hi; + add( one, zj, one, Hj, opts ); + timers[ "posv_mixed_gmres::add_hi" ] += t_add_hi.stop(); + auto Vj1_norm = norm( Norm::Fro, Vj1, opts ); + scale( 1.0, Vj1_norm, Vj1, opts ); + if (H.tileRank( 0, 0 ) == mpi_rank) { + H.tileGetForWriting( 0, 0, LayoutConvert::ColMajor ); + auto H_00 = H( 0, 0 ); + H_00.at( j+1, j ) = Vj1_norm; + } - // orthogonalize w/ CGS2 - auto V0j = V.slice(0, V.m()-1, 0, j); - auto V0jT = conj_transpose(V0j); - auto Hj = H.slice(0, j, j, j); + // apply givens rotations + Timer t_posv_mixed_gmres_rotations; + if (H.tileRank( 0, 0 ) == mpi_rank) { + auto H_00 = H( 0, 0 ); + for (int64_t i = 0; i < j; ++i) { + blas::rot( 1, &H_00.at( i, j ), 1, &H_00.at( i+1, j ), 1, + givens_alpha[i], givens_beta[i] ); + } + scalar_hi H_jj = H_00.at( j, j ), H_j1j = H_00.at( j+1, j ); + blas::rotg( &H_jj, & H_j1j, &givens_alpha[j], &givens_beta[j] ); + blas::rot( 1, &H_00.at( j, j ), 1, &H_00.at( j+1, j ), 1, + givens_alpha[j], givens_beta[j] ); + auto S_00 = S( 0, 0 ); + blas::rot( 1, &S_00.at( j, 0 ), 1, &S_00.at( j+1, 0 ), 1, + givens_alpha[j], givens_beta[j] ); + arnoldi_residual[0] = cabs1( S_00.at( j+1, 0 ) ); + } + timers[ "posv_mixed_gmres::rotations" ] = t_posv_mixed_gmres_rotations.stop(); + MPI_Bcast( arnoldi_residual.data(), arnoldi_residual.size(), + mpi_type::value, S.tileRank( 0, 0 ), + A.mpiComm() ); + } + // update X + auto H_j = H.slice( 0, j-1, 0, j-1 ); + auto S_j = S.slice( 0, j-1, 0, 0 ); + auto H_tri = TriangularMatrix( + Uplo::Upper, Diag::NonUnit, H_j ); + Timer t_trsm_hi; + trsm( Side::Left, one, H_tri, S_j, opts ); + timers[ "posv_mixed_gmres::trsm_hi" ] = t_trsm_hi.stop(); + auto W_0j = W.slice( 0, W.m()-1, 1, j ); // first column of W is unused Timer t_gemm_hi; gemm( - scalar_hi(1.0), V0jT, - Vj1, - scalar_hi(0.0), Hj, - opts); - gemm( - scalar_hi(-1.0), V0j, - Hj, - scalar_hi(1.0), Vj1, - opts); - timers[ "posv_mixed_gmres::gemm_hi" ] = t_gemm_hi.stop(); - auto zj = z.slice(0, j, 0, 0); - t_gemm_hi.start(); - gemm( - scalar_hi(1.0), V0jT, - Vj1, - scalar_hi(0.0), zj, - opts); - gemm( - scalar_hi(-1.0), V0j, - zj, - scalar_hi(1.0), Vj1, - opts); + one, W_0j, + S_j, + one, X, + opts ); timers[ "posv_mixed_gmres::gemm_hi" ] += t_gemm_hi.stop(); - Timer t_add_hi; - add(scalar_hi(1.0), zj, scalar_hi(1.0), Hj, - opts); - timers[ "posv_mixed_gmres::add_hi" ] += t_add_hi.stop(); - auto Vj1_norm = norm(Norm::Fro, Vj1, opts); - scale(1.0, Vj1_norm, Vj1, opts); - if (H.tileRank(0, 0) == mpi_rank) { - H.tileGetForWriting(0, 0, LayoutConvert::ColMajor); - auto H_00 = H(0, 0); - H_00.at(j+1, j) = Vj1_norm; - } - - // apply givens rotations - Timer t_posv_mixed_gmres_rotations; - if (H.tileRank(0, 0) == mpi_rank) { - auto H_00 = H(0, 0); - for (int64_t i = 0; i < j; ++i) { - blas::rot(1, &H_00.at(i, j), 1, &H_00.at(i+1, j), 1, - givens_alpha[i], givens_beta[i]); - } - scalar_hi H_jj = H_00.at(j, j), H_j1j = H_00.at(j+1, j); - blas::rotg(&H_jj, & H_j1j, &givens_alpha[j], &givens_beta[j]); - blas::rot(1, &H_00.at(j, j), 1, &H_00.at(j+1, j), 1, - givens_alpha[j], givens_beta[j]); - auto S_00 = S(0, 0); - blas::rot(1, &S_00.at(j, 0), 1, &S_00.at(j+1, 0), 1, - givens_alpha[j], givens_beta[j]); - arnoldi_residual[0] = cabs1(S_00.at(j+1, 0)); - } - timers[ "posv_mixed_gmres::rotations" ] = t_posv_mixed_gmres_rotations.stop(); - MPI_Bcast(arnoldi_residual.data(), arnoldi_residual.size(), - mpi_type::value, S.tileRank(0, 0), A.mpiComm()); } - // update X - auto H_j = H.slice(0, j-1, 0, j-1); - auto S_j = S.slice(0, j-1, 0, 0); - auto H_tri = TriangularMatrix(Uplo::Upper, Diag::NonUnit, H_j); - Timer t_trsm_hi; - trsm(Side::Left, scalar_hi(1.0), H_tri, S_j, opts); - timers[ "posv_mixed_gmres::trsm_hi" ] = t_trsm_hi.stop(); - auto W_0j = W.slice(0, W.m()-1, 1, j); // first column of W is unused - Timer t_gemm_hi; - gemm( - scalar_hi(1.0), W_0j, - S_j, - scalar_hi(1.0), X, - opts); - timers[ "posv_mixed_gmres::gemm_hi" ] += t_gemm_hi.stop(); } if (! converged) { - // If we are at this place of the code, this is because we have performed - // iter = itermax iterations and never satisfied the stopping criterion, - // set up the iter flag accordingly and follow up with double precision - // routine. - iter = -iiter-1; + if (info == 0) { + // If we performed iter = itermax iterations and never satisfied + // the stopping criterion, set up the iter flag accordingly. + iter = -itermax - 1; + } if (use_fallback) { + // Fall back to double precision factor and solve. // Compute the Cholesky factorization of A. Timer t_potrf_hi; - potrf( A, opts ); + info = potrf( A, opts ); timers[ "posv_mixed_gmres::potrf_hi" ] = t_potrf_hi.stop(); // Solve the system A * X = B. - slate::copy( B, X, opts ); Timer t_potrs_hi; - potrs( A, X, opts ); + if (info == 0) { + slate::copy( B, X, opts ); + potrs( A, X, opts ); + } timers[ "posv_mixed_gmres::potrs_hi" ] = t_potrs_hi.stop(); } } @@ -392,31 +404,34 @@ void posv_mixed_gmres( X.clearWorkspace(); } timers[ "posv_mixed_gmres" ] = t_posv_mixed_gmres.stop(); -} + return info; +} //------------------------------------------------------------------------------ // Explicit instantiations. template <> -void posv_mixed_gmres( +int64_t posv_mixed_gmres( HermitianMatrix& A, Matrix& B, Matrix& X, int& iter, Options const& opts) { - posv_mixed_gmres(A, B, X, iter, opts); + return posv_mixed_gmres( + A, B, X, iter, opts ); } template <> -void posv_mixed_gmres< std::complex >( +int64_t posv_mixed_gmres< std::complex >( HermitianMatrix< std::complex >& A, Matrix< std::complex >& B, Matrix< std::complex >& X, int& iter, Options const& opts) { - posv_mixed_gmres, std::complex>(A, B, X, iter, opts); + return posv_mixed_gmres< std::complex, std::complex >( + A, B, X, iter, opts ); } } // namespace slate diff --git a/src/potrf.cc b/src/potrf.cc index f0bd68cf4..9eed34846 100644 --- a/src/potrf.cc +++ b/src/potrf.cc @@ -20,7 +20,7 @@ namespace impl { /// @ingroup posv_impl /// template -void potrf( +int64_t potrf( slate::internal::TargetType, HermitianMatrix A, Options const& opts ) @@ -54,6 +54,8 @@ void potrf( if (A.uplo() == Uplo::Upper) { A = conj_transpose( A ); } + + int64_t info = 0; int64_t A_nt = A.nt(); // OpenMP needs pointer types, but vectors are exception safe @@ -91,20 +93,25 @@ void potrf( #pragma omp parallel #pragma omp master { + int64_t kk = 0; // column index (not block-column) for (int64_t k = 0; k < A_nt; ++k) { // Panel, normal priority - #pragma omp task depend(inout:column[k]) priority( priority_0 ) + #pragma omp task depend(inout:column[k]) priority( priority_0 ) \ + shared( info ) { // factor A(k, k) + int64_t iinfo; if (target == Target::Devices) { - internal::potrf( + iinfo = internal::potrf( A.sub(k, k), priority_0, queue_2, device_info_array[ A.tileDevice( k, k ) ] ); } else { - internal::potrf( + iinfo = internal::potrf( A.sub(k, k), priority_0, queue_2 ); } + if (iinfo != 0 && info == 0) + info = kk + iinfo; // send A(k, k) down col A(k+1:nt-1, k) if (k+1 <= A_nt-1) @@ -192,6 +199,7 @@ void potrf( // Erase local workspace on devices. panel.releaseLocalWorkspace(); } + kk += A.tileNb( k ); } } A.tileUpdateAllOrigin(); @@ -205,6 +213,9 @@ void potrf( blas::device_free( device_info_array[dev], *queue ); } } + + internal::reduce_info( &info, A.mpiComm() ); + return info; } } // namespace impl @@ -249,16 +260,15 @@ void potrf( /// - HostBatch: batched BLAS on CPU host. /// - Devices: batched BLAS on GPU device. /// -/// TODO: return value -/// @retval 0 successful exit -/// @retval >0 for return value = $i$, the leading minor of order $i$ of $A$ is not +/// @return 0: successful exit +/// @return i > 0: the leading minor of order $i$ of $A$ is not /// positive definite, so the factorization could not -/// be completed, and the solution has not been computed. +/// be completed. /// /// @ingroup posv_computational /// template -void potrf( +int64_t potrf( HermitianMatrix& A, Options const& opts) { @@ -271,35 +281,33 @@ void potrf( case Target::HostNest: case Target::HostBatch: case Target::HostTask: - impl::potrf( TargetType(), A, opts ); - break; + return impl::potrf( TargetType(), A, opts ); case Target::Devices: - impl::potrf( TargetType(), A, opts ); - break; + return impl::potrf( TargetType(), A, opts ); } - // todo: return value for errors? + return -2; // shouldn't happen } //------------------------------------------------------------------------------ // Explicit instantiations. template -void potrf( +int64_t potrf( HermitianMatrix& A, Options const& opts); template -void potrf( +int64_t potrf( HermitianMatrix& A, Options const& opts); template -void potrf< std::complex >( +int64_t potrf< std::complex >( HermitianMatrix< std::complex >& A, Options const& opts); template -void potrf< std::complex >( +int64_t potrf< std::complex >( HermitianMatrix< std::complex >& A, Options const& opts); diff --git a/test/test_posv.cc b/test/test_posv.cc index ecd8795df..997622c26 100644 --- a/test/test_posv.cc +++ b/test/test_posv.cc @@ -158,6 +158,8 @@ void test_posv_work(Params& params, bool run) return; } + int64_t info = 0; + // Matrix A: figure out local size. int64_t mlocA = num_local_rows_cols(n, nb, myrow, p); int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); @@ -276,26 +278,26 @@ void test_posv_work(Params& params, bool run) if (params.routine == "potrf" || params.routine == "potrs") { // Factor matrix A. - slate::chol_factor(A, opts); + info = slate::chol_factor( A, opts ); // Using traditional BLAS/LAPACK name // slate::potrf(A, opts); } else if (params.routine == "posv") { - slate::chol_solve(A, B, opts); + info = slate::chol_solve( A, B, opts ); // Using traditional BLAS/LAPACK name // slate::posv(A, B, opts); } else if (params.routine == "posv_mixed") { if constexpr (std::is_same::value) { int iters = 0; - slate::posv_mixed( A, B, X, iters, opts ); + info = slate::posv_mixed( A, B, X, iters, opts ); params.iters() = iters; } } else if (params.routine == "posv_mixed_gmres") { if constexpr (std::is_same::value) { int iters = 0; - slate::posv_mixed_gmres(A, B, X, iters, opts); + info = slate::posv_mixed_gmres(A, B, X, iters, opts); params.iters() = iters; } } @@ -332,7 +334,7 @@ void test_posv_work(Params& params, bool run) // Run SLATE test: potrs // potrs: Solve AX = B, after factoring A above. //================================================== - if (do_potrs) { + if (do_potrs && info == 0) { double time2 = barrier_get_wtime(MPI_COMM_WORLD); if ((check && params.routine == "potrf") @@ -352,9 +354,22 @@ void test_posv_work(Params& params, bool run) } if (trace) slate::trace::Trace::finish(); + + if (info != 0) { + char buf[ 80 ]; + snprintf( buf, sizeof(buf), "info = %lld, cond = %.2e", + llong( info ), params.matrix.cond_actual() ); + params.msg() = buf; + } } + print_matrix( "X_out", X, params ); - if (check) { + if (info != 0 || std::isinf( params.matrix.cond_actual() )) { + // info != 0 if and only if cond == inf (singular matrix). + // Matrices with unknown cond (nan) that are singular are marked failed. + params.okay() = info != 0 && std::isinf( params.matrix.cond_actual() ); + } + else if (check) { //================================================== // Test results by checking the residual // @@ -418,7 +433,6 @@ void test_posv_work(Params& params, bool run) slate_assert( myrow == myrow_ ); slate_assert( mycol == mycol_ ); - int64_t info; scalapack_descinit(A_desc, n, n, nb, nb, 0, 0, ictxt, mlocA, &info); slate_assert(info == 0);