diff --git a/include/slate/simplified_api.hh b/include/slate/simplified_api.hh
index f16c0b9c1..b92f563f0 100644
--- a/include/slate/simplified_api.hh
+++ b/include/slate/simplified_api.hh
@@ -390,24 +390,24 @@ void chol_solve(
 
 // posv
 template <typename scalar_t>
-void chol_solve(
+int64_t chol_solve(
     HermitianMatrix<scalar_t>& A,
              Matrix<scalar_t>& B,
     Options const& opts = Options())
 {
-    posv(A, B, opts);
+    return posv( A, B, opts );
 }
 
 // forward real-symmetric matrices to posv;
 // disabled for complex
 template <typename scalar_t>
-void chol_solve(
+int64_t chol_solve(
     SymmetricMatrix<scalar_t>& A,
              Matrix<scalar_t>& B,
     Options const& opts = Options(),
     enable_if_t< ! is_complex<scalar_t>::value >* = nullptr)
 {
-    posv(A, B, opts);
+    return posv( A, B, opts );
 }
 
 //-----------------------------------------
@@ -415,31 +415,31 @@ void chol_solve(
 
 // pbtrf
 template <typename scalar_t>
-void chol_factor(
+int64_t chol_factor(
     HermitianBandMatrix<scalar_t>& A,
     Options const& opts = Options())
 {
-    pbtrf(A, opts);
+    return pbtrf( A, opts );
 }
 
 // potrf
 template <typename scalar_t>
-void chol_factor(
+int64_t chol_factor(
     HermitianMatrix<scalar_t>& A,
     Options const& opts = Options())
 {
-    potrf(A, opts);
+    return potrf( A, opts );
 }
 
 // forward real-symmetric matrices to potrf;
 // disabled for complex
 template <typename scalar_t>
-void chol_factor(
+int64_t chol_factor(
     SymmetricMatrix<scalar_t>& A,
     Options const& opts = Options(),
     enable_if_t< ! is_complex<scalar_t>::value >* = nullptr)
 {
-    potrf(A, opts);
+    return potrf( A, opts );
 }
 
 //-----------------------------------------
diff --git a/include/slate/slate.hh b/include/slate/slate.hh
index d0785b5bd..2229958cd 100644
--- a/include/slate/slate.hh
+++ b/include/slate/slate.hh
@@ -650,7 +650,7 @@ void getri(
 //-----------------------------------------
 // pbsv()
 template <typename scalar_t>
-void pbsv(
+int64_t pbsv(
     HermitianBandMatrix<scalar_t>& A,
                  Matrix<scalar_t>& B,
     Options const& opts = Options());
@@ -658,7 +658,7 @@ void pbsv(
 //-----------------------------------------
 // posv()
 template <typename scalar_t>
-void posv(
+int64_t posv(
     HermitianMatrix<scalar_t>& A,
              Matrix<scalar_t>& B,
     Options const& opts = Options());
@@ -666,20 +666,20 @@ void posv(
 // forward real-symmetric matrices to potrf;
 // disabled for complex
 template <typename scalar_t>
-void posv(
+int64_t posv(
     SymmetricMatrix<scalar_t>& A,
              Matrix<scalar_t>& B,
     Options const& opts = Options(),
     enable_if_t< ! is_complex<scalar_t>::value >* = nullptr)
 {
     HermitianMatrix<scalar_t> AH(A);
-    posv(AH, B, opts);
+    return posv( AH, B, opts );
 }
 
 //-----------------------------------------
 // posv_mixed()
 template <typename scalar_t>
-void posv_mixed(
+int64_t posv_mixed(
     HermitianMatrix<scalar_t>& A,
              Matrix<scalar_t>& B,
              Matrix<scalar_t>& X,
@@ -687,7 +687,7 @@ void posv_mixed(
     Options const& opts = Options());
 
 template <typename scalar_hi, typename scalar_lo>
-void posv_mixed(
+int64_t posv_mixed(
     HermitianMatrix<scalar_hi>& A,
              Matrix<scalar_hi>& B,
              Matrix<scalar_hi>& X,
@@ -698,32 +698,32 @@ void posv_mixed(
 
 template <typename scalar_t>
 [[deprecated( "Use posv_mixed instead. Will be removed 2024-02." )]]
-void posvMixed(
+int64_t posvMixed(
     HermitianMatrix<scalar_t>& A,
              Matrix<scalar_t>& B,
              Matrix<scalar_t>& X,
     int& iter,
     Options const& opts = Options())
 {
-    posv_mixed( A, B, X, iter, opts );
+    return posv_mixed( A, B, X, iter, opts );
 }
 
 template <typename scalar_hi, typename scalar_lo>
 [[deprecated( "Use posv_mixed instead. Will be removed 2024-02." )]]
-void posvMixed(
+int64_t posvMixed(
     HermitianMatrix<scalar_hi>& A,
              Matrix<scalar_hi>& B,
              Matrix<scalar_hi>& X,
     int& iter,
     Options const& opts = Options())
 {
-    posv_mixed( A, B, X, iter, opts );
+    return posv_mixed( A, B, X, iter, opts );
 }
 
 //-----------------------------------------
 // posv_mixed_gmres()
 template <typename scalar_t>
-void posv_mixed_gmres(
+int64_t posv_mixed_gmres(
     HermitianMatrix<scalar_t>& A,
              Matrix<scalar_t>& B,
              Matrix<scalar_t>& X,
@@ -731,7 +731,7 @@ void posv_mixed_gmres(
     Options const& opts = Options());
 
 template <typename scalar_hi, typename scalar_lo>
-void posv_mixed_gmres(
+int64_t posv_mixed_gmres(
     HermitianMatrix<scalar_hi>& A,
              Matrix<scalar_hi>& B,
              Matrix<scalar_hi>& X,
@@ -743,27 +743,27 @@ void posv_mixed_gmres(
 //-----------------------------------------
 // pbtrf()
 template <typename scalar_t>
-void pbtrf(
+int64_t pbtrf(
     HermitianBandMatrix<scalar_t>& A,
     Options const& opts = Options());
 
 //-----------------------------------------
 // potrf()
 template <typename scalar_t>
-void potrf(
+int64_t potrf(
     HermitianMatrix<scalar_t>& A,
     Options const& opts = Options());
 
 // forward real-symmetric matrices to potrf;
 // disabled for complex
 template <typename scalar_t>
-void potrf(
+int64_t potrf(
     SymmetricMatrix<scalar_t>& A,
     Options const& opts = Options(),
     enable_if_t< ! is_complex<scalar_t>::value >* = nullptr)
 {
     HermitianMatrix<scalar_t> AH(A);
-    potrf(AH, opts);
+    return potrf( AH, opts );
 }
 
 //-----------------------------------------
diff --git a/src/gesv_mixed.cc b/src/gesv_mixed.cc
index b8c494ad3..dbcff779b 100644
--- a/src/gesv_mixed.cc
+++ b/src/gesv_mixed.cc
@@ -111,21 +111,23 @@ int64_t gesv_mixed(
     int& iter,
     Options const& opts)
 {
-    Timer t_gesv_mixed;
+    using real_hi = blas::real_type<scalar_hi>;
 
-    Target target = get_option( opts, Option::Target, Target::HostTask );
+    Timer t_gesv_mixed;
 
+    // Constants
     // Assumes column major
     const Layout layout = Layout::ColMajor;
-
-    bool converged = false;
-    using real_hi = blas::real_type<scalar_hi>;
     const real_hi eps = std::numeric_limits<real_hi>::epsilon();
     const scalar_hi one_hi = 1.0;
 
+    // Options
+    Target target = get_option( opts, Option::Target, Target::HostTask );
     int64_t itermax = get_option<int64_t>( opts, Option::MaxIterations, 30 );
     double tol = get_option<double>( opts, Option::Tolerance, eps*std::sqrt(A.m()) );
     bool use_fallback = get_option<int64_t>( opts, Option::UseFallbackSolver, true );
+
+    bool converged = false;
     iter = 0;
 
     assert( B.mt() == A.mt() );
@@ -146,7 +148,6 @@ int64_t gesv_mixed(
     if (target == Target::Devices) {
         #pragma omp parallel
         #pragma omp master
-        #pragma omp taskgroup
         {
             #pragma omp task slate_omp_default_none \
                 shared( A ) firstprivate( layout )
diff --git a/src/gesv_mixed_gmres.cc b/src/gesv_mixed_gmres.cc
index 57843ddd4..36b6bea86 100644
--- a/src/gesv_mixed_gmres.cc
+++ b/src/gesv_mixed_gmres.cc
@@ -128,22 +128,22 @@ int64_t gesv_mixed_gmres(
     // Assumes column major
     const Layout layout = Layout::ColMajor;
 
+    // Options
     Target target = get_option( opts, Option::Target, Target::HostTask );
-
-    bool converged = false;
     int64_t itermax = get_option<int64_t>( opts, Option::MaxIterations, 30 );
     double tol = get_option<double>( opts, Option::Tolerance, eps*std::sqrt(A.m()) );
     bool use_fallback = get_option<int64_t>( opts, Option::UseFallbackSolver, true );
-    const int64_t restart = std::min(
-            std::min( int64_t( 30 ), itermax ), A.tileMb( 0 )-1 );
+    int64_t restart = blas::min( 30, itermax, A.tileMb( 0 )-1 );
+
+    bool converged = false;
     iter = 0;
 
     assert( B.mt() == A.mt() );
-    slate_assert( A.tileMb( 0 ) >= restart );
+    assert( A.tileMb( 0 ) >= restart );
 
     // TODO: implement block gmres
     if (B.n() != 1) {
-        slate_not_implemented( "block-GMRES is not yet supported" );
+        slate_not_implemented( "block-GMRES for multiple RHS is not yet supported" );
     }
 
     // workspace
@@ -164,7 +164,7 @@ int64_t gesv_mixed_gmres(
 
     // workspace vector for the orthogonalization process
     auto z = X.template emptyLike<scalar_hi>();
-    z.insertLocalTiles(target);
+    z.insertLocalTiles( target );
 
     // Hessenberg Matrix. Allocate as a single tile
     slate::Matrix<scalar_hi> H(
@@ -236,8 +236,7 @@ int64_t gesv_mixed_gmres(
             timers[ "gesv_mixed_gmres::gemm_hi" ] += t_gemm_hi.stop();
             colNorms( Norm::Max, X, colnorms_X.data(), opts );
             colNorms( Norm::Max, R, colnorms_R.data(), opts );
-            if (internal::iterRefConverged<real_hi>( colnorms_R, colnorms_X, cte ))
-            {
+            if (internal::iterRefConverged<real_hi>( colnorms_R, colnorms_X, cte )) {
                 iter = iiter;
                 converged = true;
                 break;
@@ -272,7 +271,7 @@ int64_t gesv_mixed_gmres(
             // excessive restarting or delayed completion.
             int j = 0;
             for (; j < restart && iiter < itermax
-                       && !internal::iterRefConverged(
+                       && ! internal::iterRefConverged(
                                 arnoldi_residual, colnorms_X, cte );
                  ++j, ++iiter) {
                 auto Vj1 = V.slice( 0, V.m()-1, j+1, j+1 );
@@ -341,15 +340,15 @@ int64_t gesv_mixed_gmres(
                     auto H_00 = H( 0, 0 );
                     for (int64_t i = 0; i < j; ++i) {
                         blas::rot( 1, &H_00.at( i, j ), 1, &H_00.at( i+1, j ), 1,
-                                  givens_alpha[i], givens_beta[i] );
+                                   givens_alpha[i], givens_beta[i] );
                     }
                     scalar_hi H_jj = H_00.at( j, j ), H_j1j = H_00.at( j+1, j );
                     blas::rotg( &H_jj, & H_j1j, &givens_alpha[j], &givens_beta[j] );
                     blas::rot( 1, &H_00.at( j, j ), 1, &H_00.at( j+1, j ), 1,
-                              givens_alpha[j], givens_beta[j] );
+                               givens_alpha[j], givens_beta[j] );
                     auto S_00 = S( 0, 0 );
                     blas::rot( 1, &S_00.at( j, 0 ), 1, &S_00.at( j+1, 0 ), 1,
-                              givens_alpha[j], givens_beta[j] );
+                               givens_alpha[j], givens_beta[j] );
                     arnoldi_residual[0] = cabs1( S_00.at( j+1, 0 ) );
                 }
                 timers[ "gesv_mixed_gmres::rotations" ] += t_gesv_mixed_gmres_rotations.stop();
@@ -411,7 +410,6 @@ int64_t gesv_mixed_gmres(
     return info;
 }
 
-
 //------------------------------------------------------------------------------
 // Explicit instantiations.
 template <>
@@ -422,7 +420,8 @@ int64_t gesv_mixed_gmres<double>(
     int& iter,
     Options const& opts)
 {
-    return gesv_mixed_gmres<double, float>( A, pivots, B, X, iter, opts );
+    return gesv_mixed_gmres<double, float>(
+        A, pivots, B, X, iter, opts );
 }
 
 template <>
@@ -433,8 +432,8 @@ int64_t gesv_mixed_gmres< std::complex<double> >(
     int& iter,
     Options const& opts)
 {
-    return gesv_mixed_gmres<std::complex<double>, std::complex<float>>(
-            A, pivots, B, X, iter, opts );
+    return gesv_mixed_gmres< std::complex<double>, std::complex<float> >(
+        A, pivots, B, X, iter, opts );
 }
 
 } // namespace slate
diff --git a/src/internal/internal.hh b/src/internal/internal.hh
index 4f93be848..7b9d8ec06 100644
--- a/src/internal/internal.hh
+++ b/src/internal/internal.hh
@@ -675,20 +675,10 @@ void unmbr_tb2bd(Side side, Op op,
 //-----------------------------------------
 // potrf()
 template <Target target=Target::HostTask, typename scalar_t>
-void potrf(HermitianMatrix<scalar_t>&& A,
-           int priority=0, int64_t queue_index=0,
-           lapack::device_info_int* device_info=nullptr);
-
-// forward real-symmetric matrices to potrf;
-// disabled for complex
-template <Target target=Target::HostTask, typename scalar_t>
-void potrf(SymmetricMatrix<scalar_t>&& A,
-           int priority=0, int64_t queue_index=0,
-           lapack::device_info_int* device_info=nullptr,
-           enable_if_t< ! is_complex<scalar_t>::value >* = nullptr)
-{
-    potrf<target>(SymmetricMatrix<scalar_t>(A), priority);
-}
+int64_t potrf(
+    HermitianMatrix<scalar_t>&& A,
+    int priority=0, int64_t queue_index=0,
+    lapack::device_info_int* device_info=nullptr );
 
 //-----------------------------------------
 // hegst()
diff --git a/src/internal/internal_potrf.cc b/src/internal/internal_potrf.cc
index e20918a20..3e49b5dca 100644
--- a/src/internal/internal_potrf.cc
+++ b/src/internal/internal_potrf.cc
@@ -18,12 +18,13 @@ namespace internal {
 /// @ingroup posv_internal
 ///
 template <Target target, typename scalar_t>
-void potrf(HermitianMatrix< scalar_t >&& A,
-           int priority, int64_t queue_index,
-           lapack::device_info_int* device_info)
+int64_t potrf(
+    HermitianMatrix< scalar_t >&& A,
+    int priority, int64_t queue_index,
+    lapack::device_info_int* device_info)
 {
-    potrf(internal::TargetType<target>(), A, priority,
-          queue_index, device_info);
+    return potrf( internal::TargetType<target>(), A, priority,
+                  queue_index, device_info );
 }
 
 //------------------------------------------------------------------------------
@@ -31,19 +32,21 @@ void potrf(HermitianMatrix< scalar_t >&& A,
 /// @ingroup posv_internal
 ///
 template <typename scalar_t>
-void potrf(internal::TargetType<Target::HostTask>,
-           HermitianMatrix<scalar_t>& A,
-           int priority, int64_t queue_index,
-           lapack::device_info_int* device_info)
+int64_t potrf(
+    internal::TargetType<Target::HostTask>,
+    HermitianMatrix<scalar_t>& A,
+    int priority, int64_t queue_index,
+    lapack::device_info_int* device_info)
 {
     assert(A.mt() == 1);
     assert(A.nt() == 1);
 
-    if (A.tileIsLocal(0, 0))
-        {
-            A.tileGetForWriting(0, 0, LayoutConvert::ColMajor);
-            potrf(A(0, 0));
-        }
+    int64_t info = 0;
+    if (A.tileIsLocal( 0, 0 )) {
+        A.tileGetForWriting( 0, 0, LayoutConvert::ColMajor );
+        info = potrf( A( 0, 0 ) );
+    }
+    return info;
 }
 
 //------------------------------------------------------------------------------
@@ -51,14 +54,16 @@ void potrf(internal::TargetType<Target::HostTask>,
 /// @ingroup posv_internal
 ///
 template <typename scalar_t>
-void potrf(internal::TargetType<Target::Devices>,
-           HermitianMatrix<scalar_t>& A,
-           int priority, int64_t queue_index,
-           lapack::device_info_int* device_info)
+int64_t potrf(
+    internal::TargetType<Target::Devices>,
+    HermitianMatrix<scalar_t>& A,
+    int priority, int64_t queue_index,
+    lapack::device_info_int* device_info)
 {
     assert(A.mt() == 1);
     assert(A.nt() == 1);
 
+    int64_t info = 0;
     if (A.tileIsLocal(0, 0)) {
         int device = A.tileDevice( 0, 0 );
         A.tileGetForWriting(0, 0, device, LayoutConvert::ColMajor);
@@ -67,63 +72,67 @@ void potrf(internal::TargetType<Target::Devices>,
         lapack::potrf(
             A00.uploPhysical(), A00.mb(), A00.data(),
             A00.stride(), device_info, *queue );
+        lapack::device_info_int host_info;
+        blas::device_memcpy( &host_info, device_info, 1, *queue );
         queue->sync();
+        info = int64_t( host_info );
     }
+    return info;
 }
 
 //------------------------------------------------------------------------------
 // Explicit instantiations.
 // ----------------------------------------
 template
-void potrf<Target::HostTask, float>(
+int64_t potrf<Target::HostTask, float>(
     HermitianMatrix<float>&& A,
     int priority, int64_t queue_index,
     lapack::device_info_int* device_info);
 
 // ----------------------------------------
 template
-void potrf<Target::HostTask, double>(
+int64_t potrf<Target::HostTask, double>(
     HermitianMatrix<double>&& A,
     int priority, int64_t queue_index,
     lapack::device_info_int* device_info);
 
 // ----------------------------------------
 template
-void potrf< Target::HostTask, std::complex<float> >(
+int64_t potrf< Target::HostTask, std::complex<float> >(
     HermitianMatrix< std::complex<float> >&& A,
     int priority, int64_t queue_index,
     lapack::device_info_int* device_info);
 
 // ----------------------------------------
 template
-void potrf< Target::HostTask, std::complex<double> >(
+int64_t potrf< Target::HostTask, std::complex<double> >(
     HermitianMatrix< std::complex<double> >&& A,
     int priority, int64_t queue_index,
     lapack::device_info_int* device_info);
 
 template
-void potrf<Target::Devices, float>(
+int64_t potrf<Target::Devices, float>(
     HermitianMatrix<float>&& A,
     int priority, int64_t queue_index,
     lapack::device_info_int* device_info);
 
 // ----------------------------------------
 template
-void potrf<Target::Devices, double>(
+int64_t potrf<Target::Devices, double>(
     HermitianMatrix<double>&& A,
     int priority, int64_t queue_index,
     lapack::device_info_int* device_info);
 
 // ----------------------------------------
 template
-void potrf< Target::Devices, std::complex<float> >(
+int64_t potrf< Target::Devices, std::complex<float> >(
     HermitianMatrix< std::complex<float> >&& A,
     int priority, int64_t queue_index,
     lapack::device_info_int* device_info);
 
 // ----------------------------------------
 template
-void potrf< Target::Devices, std::complex<double> >(
+int64_t potrf< Target::Devices, std::complex<double> >(
     HermitianMatrix< std::complex<double> >&& A,
     int priority, int64_t queue_index,
     lapack::device_info_int* device_info);
diff --git a/src/pbsv.cc b/src/pbsv.cc
index 1b4fef936..fd342d52b 100644
--- a/src/pbsv.cc
+++ b/src/pbsv.cc
@@ -59,50 +59,51 @@ namespace slate {
 ///       - HostBatch: batched BLAS on CPU host.
 ///       - Devices:   batched BLAS on GPU device.
 ///
-/// TODO: return value
-/// @retval 0 successful exit
-/// @retval >0 for return value = $i$, the leading minor of order $i$ of $A$ is not
+/// @return 0: successful exit
+/// @return i > 0: the leading minor of order $i$ of $A$ is not
 ///         positive definite, so the factorization could not
 ///         be completed, and the solution has not been computed.
 ///
 /// @ingroup pbsv
 ///
 template <typename scalar_t>
-void pbsv(HermitianBandMatrix<scalar_t>& A,
-          Matrix<scalar_t>& B,
-          Options const& opts)
+int64_t pbsv(
+    HermitianBandMatrix<scalar_t>& A,
+    Matrix<scalar_t>& B,
+    Options const& opts )
 {
     // factorization
-    pbtrf(A, opts);
+    int64_t info = pbtrf(A, opts);
 
     // solve
-    pbtrs(A, B, opts);
-
-    // todo: return value for errors?
+    if (info == 0) {
+        pbtrs(A, B, opts);
+    }
+    return info;
 }
 
 //------------------------------------------------------------------------------
 // Explicit instantiations.
 template
-void pbsv<float>(
+int64_t pbsv<float>(
     HermitianBandMatrix<float>& A,
     Matrix<float>& B,
     Options const& opts);
 
 template
-void pbsv<double>(
+int64_t pbsv<double>(
     HermitianBandMatrix<double>& A,
     Matrix<double>& B,
     Options const& opts);
 
 template
-void pbsv< std::complex<float> >(
+int64_t pbsv< std::complex<float> >(
     HermitianBandMatrix< std::complex<float> >& A,
     Matrix< std::complex<float> >& B,
     Options const& opts);
 
 template
-void pbsv< std::complex<double> >(
+int64_t pbsv< std::complex<double> >(
     HermitianBandMatrix< std::complex<double> >& A,
     Matrix< std::complex<double> >& B,
     Options const& opts);
diff --git a/src/pbtrf.cc b/src/pbtrf.cc
index a3d10c8cd..3cdc2b15a 100644
--- a/src/pbtrf.cc
+++ b/src/pbtrf.cc
@@ -22,7 +22,7 @@ namespace impl {
 /// Warning: ColMajor layout is assumed
 ///
 template <Target target, typename scalar_t>
-void pbtrf(
+int64_t pbtrf(
     HermitianBandMatrix<scalar_t> A,
     Options const& opts )
 {
@@ -42,6 +42,7 @@ void pbtrf(
     if (A.uplo() == Uplo::Upper)
         A = conj_transpose( A );
 
+    int64_t info = 0;
     int64_t A_nt = A.nt();
 
     // OpenMP needs pointer types, but vectors are exception safe
@@ -56,69 +57,76 @@ void pbtrf(
 
     #pragma omp parallel
     #pragma omp master
-    for (int64_t k = 0; k < A_nt; ++k) {
-
-        int64_t ij_end = std::min(k + kdt + 1, A_nt);
-
-        // panel, high priority
-        #pragma omp task depend(inout:column[k]) priority(1)
-        {
-            // factor A(k, k)
-            internal::potrf<Target::HostTask>(A.sub(k, k), 1);
-
-            // send A(k, k) down col A( k+1:ij_end-1, k )
-            if (k+1 < ij_end)
-                A.tileBcast(k, k, A.sub(k+1, ij_end-1, k, k), layout);
-
-            // A(k+1:ij_end-1, k) * A(k, k)^{-H}
-            if (k+1 < ij_end) {
-                auto Akk = A.sub(k, k);
-                auto Tkk = TriangularMatrix< scalar_t >(Diag::NonUnit, Akk);
-                internal::trsm<Target::HostTask>(
-                    Side::Right,
-                    one, conj_transpose( Tkk ),
-                    A.sub(k+1, ij_end-1, k, k), 1);
-            }
+    {
+        int64_t kk = 0;  // column index (not block-column)
+        for (int64_t k = 0; k < A_nt; ++k) {
 
-            BcastList bcast_list_A;
-            for (int64_t i = k+1; i < ij_end; ++i) {
-                // send A(i, k) across row A(i, k+1:i) and
-                // down col A(i:ij_end-1, i).
-                bcast_list_A.push_back({i, k, {A.sub(i, i, k+1, i),
-                                               A.sub(i, ij_end-1, i, i)}});
-            }
-            A.template listBcast(bcast_list_A, layout);
-        }
+            int64_t ij_end = std::min(k + kdt + 1, A_nt);
 
-        // update trailing submatrix, normal priority
-        if (k+1+lookahead < ij_end) {
-            #pragma omp task depend(in:column[k]) \
-                             depend(inout:column[k+1+lookahead]) \
-                             depend(inout:column[A_nt-1])
+            // panel, high priority
+            #pragma omp task depend(inout:column[k]) priority(1) \
+                shared( info )
             {
-                internal::herk<Target::HostTask>(
-                    -r_one, A.sub(k+1+lookahead, ij_end-1, k, k),
-                    r_one,  A.sub(k+1+lookahead, ij_end-1) );
+                // factor A(k, k)
+                int64_t iinfo = internal::potrf<Target::HostTask>( A.sub( k, k ), 1 );
+                if (iinfo != 0 && info == 0)
+                    info = kk + iinfo;
+
+                // send A(k, k) down col A( k+1:ij_end-1, k )
+                if (k+1 < ij_end)
+                    A.tileBcast(k, k, A.sub(k+1, ij_end-1, k, k), layout);
+
+                // A(k+1:ij_end-1, k) * A(k, k)^{-H}
+                if (k+1 < ij_end) {
+                    auto Akk = A.sub(k, k);
+                    auto Tkk = TriangularMatrix< scalar_t >(Diag::NonUnit, Akk);
+                    internal::trsm<Target::HostTask>(
+                        Side::Right,
+                        one, conj_transpose( Tkk ),
+                        A.sub(k+1, ij_end-1, k, k), 1);
+                }
+
+                BcastList bcast_list_A;
+                for (int64_t i = k+1; i < ij_end; ++i) {
+                    // send A(i, k) across row A(i, k+1:i) and
+                    // down col A(i:ij_end-1, i).
+                    bcast_list_A.push_back({i, k, {A.sub(i, i, k+1, i),
+                                                   A.sub(i, ij_end-1, i, i)}});
+                }
+                A.template listBcast(bcast_list_A, layout);
             }
-        }
 
-        // update lookahead column(s), normal priority
-        for (int64_t j = k+1; j < k+1+lookahead && j < ij_end; ++j) {
-            #pragma omp task depend(in:column[k]) \
-                             depend(inout:column[j])
-            {
-                internal::herk<Target::HostTask>(
-                    -r_one, A.sub(j, j, k, k),
-                    r_one,  A.sub(j, j) );
-
-                if (j+1 <= A_nt-1) {
-                    auto Ajk = A.sub(j, j, k, k);
-                    internal::gemm<Target::HostTask>(
-                        -one, A.sub(j+1, ij_end-1, k, k),
-                              conj_transpose( Ajk ),
-                        one,  A.sub(j+1, ij_end-1, j, j), layout );
+            // update trailing submatrix, normal priority
+            if (k+1+lookahead < ij_end) {
+                #pragma omp task depend(in:column[k]) \
+                                 depend(inout:column[k+1+lookahead]) \
+                                 depend(inout:column[A_nt-1])
+                {
+                    internal::herk<Target::HostTask>(
+                        -r_one, A.sub(k+1+lookahead, ij_end-1, k, k),
+                        r_one,  A.sub(k+1+lookahead, ij_end-1) );
+                }
+            }
+
+            // update lookahead column(s), normal priority
+            for (int64_t j = k+1; j < k+1+lookahead && j < ij_end; ++j) {
+                #pragma omp task depend(in:column[k]) \
+                                 depend(inout:column[j])
+                {
+                    internal::herk<Target::HostTask>(
+                        -r_one, A.sub(j, j, k, k),
+                        r_one,  A.sub(j, j) );
+
+                    if (j+1 <= A_nt-1) {
+                        auto Ajk = A.sub(j, j, k, k);
+                        internal::gemm<Target::HostTask>(
+                            -one, A.sub(j+1, ij_end-1, k, k),
+                                  conj_transpose( Ajk ),
+                            one,  A.sub(j+1, ij_end-1, j, j), layout );
+                    }
                 }
             }
+            kk += A.tileNb( k );
         }
     }
 
@@ -128,6 +136,9 @@ void pbtrf(
     A.releaseWorkspace();
 
     // Debug::printTilesMaps(A);
+
+    internal::reduce_info( &info, A.mpiComm() );
+    return info;
 }
 
 } // namespace impl
@@ -135,7 +146,7 @@ void pbtrf(
 //------------------------------------------------------------------------------
 /// Distributed parallel band Cholesky factorization.
 ///
-/// Computes the Cholesky factorization of a hermitian positive definite band
+/// Computes the Cholesky factorization of a Hermitian positive definite band
 /// matrix $A$.
 ///
 /// The factorization has the form
@@ -153,26 +164,18 @@ void pbtrf(
 ///     One of float, double, std::complex<float>, std::complex<double>.
 //------------------------------------------------------------------------------
 /// @param[in,out] A
-///     On entry, the hermitian band matrix $A$ to be factored.
+///     On entry, the Hermitian band matrix $A$ to be factored.
 ///     Tiles outside the bandwidth do not need to exist.
 ///     For tiles that are partially outside the bandwidth,
 ///     data outside the bandwidth should be explicitly set to zero.
-///     On exit, the factors $L$ and $U$ from the factorization $A = L U$;
-///     the unit diagonal elements of $L$ are not stored.
-///     The upper bandwidth is increased to accomodate fill-in of $U$.
-///
-/// @param[out] pivots
-///     The pivot indices that define the permutations.
+///     On exit, the factor $L$ or $U$ from the factorization
+///     $A = L L^H$ or $A = U^H U$.
 ///
 /// @param[in] opts
 ///     Additional options, as map of name = value pairs. Possible options:
 ///     - Option::Lookahead:
 ///       Number of panels to overlap with matrix updates.
 ///       lookahead >= 0. Default 1.
-///     - Option::InnerBlocking:
-///       Inner blocking to use for panel. Default 16.
-///     - Option::MaxPanelThreads:
-///       Number of threads to use for panel. Default omp_get_max_threads()/2.
 ///     - Option::Target:
 ///       Implementation to target. Possible values:
 ///       - HostTask:  OpenMP tasks on CPU host [default].
@@ -180,17 +183,15 @@ void pbtrf(
 ///       - HostBatch: batched BLAS on CPU host.
 ///       - Devices:   batched BLAS on GPU device.
 ///
-/// TODO: return value
-/// @retval 0 successful exit
-/// @retval >0 for return value = $i$, $U(i,i)$ is exactly zero. The
-///         factorization has been completed, but the factor $U$ is exactly
-///         singular, and division by zero will occur if it is used
-///         to solve a system of equations.
+/// @return 0: successful exit
+/// @return i > 0: the leading minor of order $i$ of $A$ is not
+///         positive definite, so the factorization could not
+///         be completed.
 ///
 /// @ingroup pbsv_computational
 ///
 template <typename scalar_t>
-void pbtrf(
+int64_t pbtrf(
     HermitianBandMatrix<scalar_t>& A,
     Options const& opts )
 {
@@ -199,43 +200,39 @@ void pbtrf(
     switch (target) {
         case Target::Host:
         case Target::HostTask:
-            impl::pbtrf<Target::HostTask>( A, opts );
-            break;
+            return impl::pbtrf<Target::HostTask>( A, opts );
 
         case Target::HostNest:
-            impl::pbtrf<Target::HostNest>( A, opts );
-            break;
+            return impl::pbtrf<Target::HostNest>( A, opts );
 
         case Target::HostBatch:
-            impl::pbtrf<Target::HostBatch>( A, opts );
-            break;
+            return impl::pbtrf<Target::HostBatch>( A, opts );
 
         case Target::Devices:
-            impl::pbtrf<Target::Devices>( A, opts );
-            break;
+            return impl::pbtrf<Target::Devices>( A, opts );
     }
-    // todo: return value for errors?
+    return -2;  // shouldn't happen
 }
 
 //------------------------------------------------------------------------------
 // Explicit instantiations.
 template
-void pbtrf<float>(
+int64_t pbtrf<float>(
     HermitianBandMatrix<float>& A,
     Options const& opts);
 
 template
-void pbtrf<double>(
+int64_t pbtrf<double>(
     HermitianBandMatrix<double>& A,
     Options const& opts);
 
 template
-void pbtrf< std::complex<float> >(
+int64_t pbtrf< std::complex<float> >(
     HermitianBandMatrix< std::complex<float> >& A,
     Options const& opts);
 
 template
-void pbtrf< std::complex<double> >(
+int64_t pbtrf< std::complex<double> >(
     HermitianBandMatrix< std::complex<double> >& A,
     Options const& opts);
 
diff --git a/src/posv.cc b/src/posv.cc
index b0af6e9fa..71940b2ac 100644
--- a/src/posv.cc
+++ b/src/posv.cc
@@ -61,18 +61,18 @@ namespace slate {
 ///       - HostBatch: batched BLAS on CPU host.
 ///       - Devices:   batched BLAS on GPU device.
 ///
-/// TODO: return value
-/// @retval 0 successful exit
-/// @retval >0 for return value = $i$, the leading minor of order $i$ of $A$ is not
+/// @return 0: successful exit
+/// @return i > 0: the leading minor of order $i$ of $A$ is not
 ///         positive definite, so the factorization could not
 ///         be completed, and the solution has not been computed.
 ///
 /// @ingroup posv
 ///
 template <typename scalar_t>
-void posv(HermitianMatrix<scalar_t>& A,
-          Matrix<scalar_t>& B,
-          Options const& opts)
+int64_t posv(
+    HermitianMatrix<scalar_t>& A,
+    Matrix<scalar_t>& B,
+    Options const& opts)
 {
     Timer t_posv;
 
@@ -80,41 +80,42 @@ void posv(HermitianMatrix<scalar_t>& A,
 
     // factorization
     Timer t_potrf;
-    potrf(A, opts);
+    int64_t info = potrf( A, opts );
     timers[ "posv::potrf" ] = t_potrf.stop();
 
     // solve
     Timer t_potrs;
-    potrs(A, B, opts);
+    if (info == 0) {
+        potrs( A, B, opts );
+    }
     timers[ "posv::potrs" ] = t_potrs.stop();
 
-    // todo: return value for errors?
-
     timers[ "posv" ] = t_posv.stop();
+    return info;
 }
 
 //------------------------------------------------------------------------------
 // Explicit instantiations.
 template
-void posv<float>(
+int64_t posv<float>(
     HermitianMatrix<float>& A,
     Matrix<float>& B,
     Options const& opts);
 
 template
-void posv<double>(
+int64_t posv<double>(
     HermitianMatrix<double>& A,
     Matrix<double>& B,
     Options const& opts);
 
 template
-void posv< std::complex<float> >(
+int64_t posv< std::complex<float> >(
     HermitianMatrix< std::complex<float> >& A,
     Matrix< std::complex<float> >& B,
     Options const& opts);
 
 template
-void posv< std::complex<double> >(
+int64_t posv< std::complex<double> >(
     HermitianMatrix< std::complex<double> >& A,
     Matrix< std::complex<double> >& B,
     Options const& opts);
diff --git a/src/posv_mixed.cc b/src/posv_mixed.cc
index c0fd0432b..d7f1d4676 100644
--- a/src/posv_mixed.cc
+++ b/src/posv_mixed.cc
@@ -60,7 +60,7 @@ namespace slate {
 ///     (return value = 0 and iter < 0, see description below), then the
 ///     array $A$ contains the factor $U$ or $L$ from the Cholesky
 ///     factorization $A = U^H U$ or $A = L L^H$.
-///     If scalar_t is real, $A$ can be a SymmetricMatrix object.
+///     If scalar_t is real, $A$ can be a SymmetricMatrix.
 ///
 /// @param[in] B
 ///     On entry, the n-by-nrhs right hand side matrix $B$.
@@ -69,9 +69,13 @@ namespace slate {
 ///     On exit, if return value = 0, the n-by-nrhs solution matrix $X$.
 ///
 /// @param[out] iter
-///     The number of the iterations in the iterative refinement
-///     process, needed for the convergence. If failed, it is set
-///     to be -(1+itermax), where itermax = 30.
+///     > 0: The number of the iterations the iterative refinement
+///          process needed for convergence.
+///     < 0: Iterative refinement failed; it falls back to a double
+///          precision factorization and solve.
+///          -3: single precision matrix was not positive definite in potrf.
+///          -(itermax+1): iterative refinement failed to converge in
+///          itermax iterations.
 ///
 /// @param[in] opts
 ///     Additional options, as map of name = value pairs. Possible options:
@@ -92,39 +96,39 @@ namespace slate {
 ///       If true and iterative refinement fails to convergene, the problem is
 ///       resolved with partial-pivoted LU. Default true
 ///
-/// TODO: return value
-/// @retval 0 successful exit
-/// @retval >0 for return value = $i$, the leading minor of order $i$ of $A$ is not
+/// @return 0: successful exit
+/// @return i > 0: the leading minor of order $i$ of $A$ is not
 ///         positive definite, so the factorization could not
 ///         be completed, and the solution has not been computed.
 ///
 /// @ingroup posv
 ///
 template <typename scalar_hi, typename scalar_lo>
-void posv_mixed(
+int64_t posv_mixed(
     HermitianMatrix<scalar_hi>& A,
     Matrix<scalar_hi>& B,
     Matrix<scalar_hi>& X,
     int& iter,
     Options const& opts)
 {
-    Timer t_posv_mixed;
+    using real_hi = blas::real_type<scalar_hi>;
 
-    // XXX This is only used for the memory management and may be inconsistent
-    // with the routines called in this routine.
-    Target target = get_option( opts, Option::Target, Target::HostTask );
+    Timer t_posv_mixed;
 
+    // Constants
     // Assumes column major
     const Layout layout = Layout::ColMajor;
-
-    bool converged = false;
-    using real_hi = blas::real_type<scalar_hi>;
     const real_hi eps = std::numeric_limits<real_hi>::epsilon();
-    const scalar_hi one_hi      = 1.0;
+    const scalar_hi one_hi = 1.0;
 
+    // Options
+    // XXX target is used only for the memory management and may be inconsistent
+    // with the routines called in this routine.
+    Target target = get_option( opts, Option::Target, Target::HostTask );
     int64_t itermax = get_option<int64_t>( opts, Option::MaxIterations, 30 );
     double tol = get_option<double>( opts, Option::Tolerance, eps*std::sqrt(A.m()) );
     bool use_fallback = get_option<int64_t>( opts, Option::UseFallbackSolver, true );
+    bool converged = false;
     iter = 0;
 
     assert( B.mt() == A.mt() );
@@ -178,94 +182,101 @@ void posv_mixed(
 
     // Compute the Cholesky factorization of A_lo.
     Timer t_potrf_lo;
-    potrf(  A_lo, opts );
+    int64_t info = potrf( A_lo, opts );
     timers[ "posv_mixed::potrf_lo" ] = t_potrf_lo.stop();
-
-    // Solve the system A_lo * X_lo = B_lo.
-    Timer t_potrs_lo;
-    potrs( A_lo, X_lo, opts );
-    timers[ "posv_mixed::potrs_lo" ] = t_potrs_lo.stop();
-
-    // Convert X_lo to high precision.
-    copy( X_lo, X, opts );
-
-    // Compute R = B - A * X.
-    slate::copy( B, R, opts );
-    Timer t_hemm_hi;
-    hemm<scalar_hi>(
-        Side::Left,
-        -one_hi, A,
-                 X,
-        one_hi,  R, opts );
-    timers[ "posv_mixed::hemm_hi" ] = t_hemm_hi.stop();
-
-    // Check whether the nrhs normwise backward error satisfies the
-    // stopping criterion. If yes, set iter=0 and return.
-    colNorms( Norm::Max, X, colnorms_X.data(), opts );
-    colNorms( Norm::Max, R, colnorms_R.data(), opts );
-
-    if (internal::iterRefConverged<real_hi>( colnorms_R, colnorms_X, cte) ) {
-        iter = 0;
-        converged = true;
+    if (info != 0) {
+        iter = -3;
     }
-
-    // iterative refinement
-    timers[ "posv_mixed::add_hi" ] = 0;
-    for (int iiter = 0; iiter < itermax && ! converged; ++iiter) {
-        // Convert R from high to low precision, store result in X_lo.
-        copy( R, X_lo, opts );
-
-        // Solve the system A_lo * X_lo = R_lo.
-        t_potrs_lo.start();
+    else {
+        // Solve the system A_lo * X_lo = B_lo.
+        Timer t_potrs_lo;
         potrs( A_lo, X_lo, opts );
-        timers[ "posv_mixed::potrs_lo" ] += t_potrs_lo.stop();
+        timers[ "posv_mixed::potrs_lo" ] = t_potrs_lo.stop();
 
-        // Convert X_lo back to double precision and update the current iterate.
-        copy( X_lo, R, opts );
-        Timer t_add_hi;
-        add<scalar_hi>(
-              one_hi, R,
-              one_hi, X, opts );
-        timers[ "posv_mixed::add_hi" ] += t_add_hi.stop();
+        // Convert X_lo to high precision.
+        copy( X_lo, X, opts );
 
         // Compute R = B - A * X.
         slate::copy( B, R, opts );
-        t_hemm_hi.start();
+        Timer t_hemm_hi;
         hemm<scalar_hi>(
             Side::Left,
             -one_hi, A,
                      X,
             one_hi,  R, opts );
-        timers[ "posv_mixed::hemm_hi" ] += t_hemm_hi.stop();
+        timers[ "posv_mixed::hemm_hi" ] = t_hemm_hi.stop();
 
-        // Check whether nrhs normwise backward error satisfies the
-        // stopping criterion. If yes, set iter = iiter > 0 and return.
+        // Check whether the nrhs normwise backward error satisfies the
+        // stopping criterion. If yes, set iter=0 and return.
         colNorms( Norm::Max, X, colnorms_X.data(), opts );
         colNorms( Norm::Max, R, colnorms_R.data(), opts );
 
         if (internal::iterRefConverged<real_hi>( colnorms_R, colnorms_X, cte )) {
-            iter = iiter+1;
+            iter = 0;
             converged = true;
         }
+
+        // iterative refinement
+        timers[ "posv_mixed::add_hi" ] = 0;
+        for (int iiter = 0; iiter < itermax && ! converged; ++iiter) {
+            // Convert R from high to low precision, store result in X_lo.
+            copy( R, X_lo, opts );
+
+            // Solve the system A_lo * X_lo = R_lo.
+            t_potrs_lo.start();
+            potrs( A_lo, X_lo, opts );
+            timers[ "posv_mixed::potrs_lo" ] += t_potrs_lo.stop();
+
+            // Convert X_lo back to double precision and update the current iterate.
+            copy( X_lo, R, opts );
+            Timer t_add_hi;
+            add<scalar_hi>(
+                  one_hi, R,
+                  one_hi, X, opts );
+            timers[ "posv_mixed::add_hi" ] += t_add_hi.stop();
+
+            // Compute R = B - A * X.
+            slate::copy( B, R, opts );
+            t_hemm_hi.start();
+            hemm<scalar_hi>(
+                Side::Left,
+                -one_hi, A,
+                         X,
+                one_hi,  R, opts );
+            timers[ "posv_mixed::hemm_hi" ] += t_hemm_hi.stop();
+
+            // Check whether nrhs normwise backward error satisfies the
+            // stopping criterion. If yes, set iter = iiter > 0 and return.
+            colNorms( Norm::Max, X, colnorms_X.data(), opts );
+            colNorms( Norm::Max, R, colnorms_R.data(), opts );
+
+            if (internal::iterRefConverged<real_hi>( colnorms_R, colnorms_X, cte )) {
+                iter = iiter+1;
+                converged = true;
+            }
+        }
     }
 
     if (! converged) {
-        // If we are at this place of the code, this is because we have performed
-        // iter = itermax iterations and never satisfied the stopping criterion,
-        // set up the iter flag accordingly and follow up with double precision
-        // routine.
-        iter = -itermax - 1;
+        if (info == 0) {
+            // If we performed iter = itermax iterations and never satisfied
+            // the stopping criterion, set up the iter flag accordingly.
+            iter = -itermax - 1;
+        }
 
         if (use_fallback) {
+            // Fall back to double precision factor and solve.
             // Compute the Cholesky factorization of A.
             Timer t_potrf_hi;
-            potrf( A, opts );
+            info = potrf( A, opts );
             timers[ "posv_mixed::potrf_hi" ] = t_potrf_hi.stop();
 
             // Solve the system A * X = B.
-            slate::copy( B, X, opts );
             Timer t_potrs_hi;
-            potrs( A, X, opts );
+            if (info == 0) {
+                slate::copy( B, X, opts );
+                potrs( A, X, opts );
+            }
             timers[ "posv_mixed::potrs_hi" ] = t_potrs_hi.stop();
         }
     }
@@ -278,31 +289,33 @@ void posv_mixed(
     }
     timers[ "posv_mixed" ] = t_posv_mixed.stop();
 
-    // todo: return value for errors?
+    return info;
 }
 
 //------------------------------------------------------------------------------
 // Explicit instantiations.
 template <>
-void posv_mixed<double>(
+int64_t posv_mixed<double>(
     HermitianMatrix<double>& A,
     Matrix<double>& B,
     Matrix<double>& X,
     int& iter,
     Options const& opts)
 {
-    posv_mixed<double, float>( A, B, X, iter, opts );
+    return posv_mixed<double, float>(
+        A, B, X, iter, opts );
 }
 
 template <>
-void posv_mixed< std::complex<double> >(
+int64_t posv_mixed< std::complex<double> >(
     HermitianMatrix< std::complex<double> >& A,
     Matrix< std::complex<double> >& B,
     Matrix< std::complex<double> >& X,
     int& iter,
     Options const& opts)
 {
-    posv_mixed<std::complex<double>, std::complex<float>>( A, B, X, iter, opts );
+    return posv_mixed< std::complex<double>, std::complex<float> >(
+        A, B, X, iter, opts );
 }
 
 } // namespace slate
diff --git a/src/posv_mixed_gmres.cc b/src/posv_mixed_gmres.cc
index 59140b348..ac6e1defb 100644
--- a/src/posv_mixed_gmres.cc
+++ b/src/posv_mixed_gmres.cc
@@ -68,9 +68,14 @@ namespace slate {
 ///     On exit, if return value = 0, the n-by-nrhs solution matrix $X$.
 ///
 /// @param[out] iter
-///     The number of the iterations in the iterative refinement
-///     process, needed for the convergence. If failed, it is set
-///     to be -(1+itermax), where itermax is controlled by the opts argument.
+/// @param[out] iter
+///     > 0: The number of the iterations the iterative refinement
+///          process needed for convergence.
+///     < 0: Iterative refinement failed; it falls back to a double
+///          precision factorization and solve.
+///          -3: single precision matrix was exactly singular in getrf.
+///          -(itermax+1): iterative refinement failed to converge in
+///          itermax iterations.
 ///
 /// @param[in] opts
 ///     Additional options, as map of name = value pairs. Possible options:
@@ -95,16 +100,15 @@ namespace slate {
 ///       If true and iterative refinement fails to convergene, the problem is
 ///       resolved with partial-pivoted LU. Default true
 ///
-/// TODO: return value
-/// @retval 0 successful exit
-/// @retval >0 for return value = $i$, the computed $U(i,i)$ is exactly zero.
-///         The factorization has been completed, but the factor U is exactly
-///         singular, so the solution could not be computed.
+/// @return 0: successful exit
+/// @return i > 0: the leading minor of order $i$ of $A$ is not
+///         positive definite, so the factorization could not
+///         be completed, and the solution has not been computed.
 ///
 /// @ingroup posv
 ///
 template <typename scalar_hi, typename scalar_lo>
-void posv_mixed_gmres(
+int64_t posv_mixed_gmres(
     HermitianMatrix<scalar_hi>& A,
     Matrix<scalar_hi>& B,
     Matrix<scalar_hi>& X,
@@ -118,56 +122,58 @@ void posv_mixed_gmres(
     // Constants
     const real_hi eps = std::numeric_limits<real_hi>::epsilon();
     const int64_t mpi_rank = A.mpiRank();
+    const scalar_hi zero = 0.0;
+    const scalar_hi one  = 1.0;
     // Assumes column major
     const Layout layout = Layout::ColMajor;
 
     // Options
     Target target = get_option( opts, Option::Target, Target::HostTask );
-
-    bool converged = false;
     int64_t itermax = get_option<int64_t>( opts, Option::MaxIterations, 30 );
     double tol = get_option<double>( opts, Option::Tolerance, eps*std::sqrt(A.m()) );
     bool use_fallback = get_option<int64_t>( opts, Option::UseFallbackSolver, true );
-    const int64_t restart = std::min(
-            std::min( int64_t( 30 ), itermax ), A.tileMb( 0 )-1 );
+    int64_t restart = blas::min( 30, itermax, A.tileMb( 0 )-1 );
+    bool converged = false;
     iter = 0;
 
-    assert(B.mt() == A.mt());
+    assert( B.mt() == A.mt() );
+    assert( A.tileMb( 0 ) >= restart );
 
     // TODO: implement block gmres
     if (B.n() != 1) {
-        slate_not_implemented("block-GMRES is not yet supported");
+        slate_not_implemented( "block-GMRES for multiple RHS is not yet supported" );
     }
 
     // workspace
     auto R    = B.emptyLike();
-    R.   insertLocalTiles(target);
+    R.insertLocalTiles( target );
     auto A_lo = A.template emptyLike<scalar_lo>();
-    A_lo.insertLocalTiles(target);
+    A_lo.insertLocalTiles( target );
     auto X_lo = X.template emptyLike<scalar_lo>();
-    X_lo.insertLocalTiles(target);
+    X_lo.insertLocalTiles( target );
 
-    std::vector<real_hi> colnorms_X(X.n());
-    std::vector<real_hi> colnorms_R(R.n());
+    std::vector<real_hi> colnorms_X( X.n() );
+    std::vector<real_hi> colnorms_R( R.n() );
 
     // test basis.  First column corresponds to the residual
-    auto V = internal::alloc_basis(A, restart+1, target);
-    // solution basis.  Columns correspond to those in V.  First column is unused
-    auto W = internal::alloc_basis(A, restart+1, target);
+    auto V = internal::alloc_basis( A, restart+1, target );
+    // solution basis.  Columns correspond to those in V. First column is unused
+    auto W = internal::alloc_basis( A, restart+1, target );
 
     // workspace vector for the orthogonalization process
     auto z = X.template emptyLike<scalar_hi>();
-    z.insertLocalTiles(target);
-
-    // Hessenberg Matrix.  Allocate as a single tile
-    slate::Matrix<scalar_hi> H(restart+1, restart+1, restart+1, 1, 1, A.mpiComm());
-    H.insertLocalTiles(Target::Host);
-    // least squares RHS.  Allocate as a single tile
-    slate::Matrix<scalar_hi> S(restart+1, 1, restart+1, 1, 1, A.mpiComm());
-    S.insertLocalTiles(Target::Host);
+    z.insertLocalTiles( target );
+
+    // Hessenberg Matrix. Allocate as a single tile
+    slate::Matrix<scalar_hi> H(
+            restart+1, restart+1, restart+1, 1, 1, A.mpiComm() );
+    H.insertLocalTiles( Target::Host );
+    // least squares RHS. Allocate as a single tile
+    slate::Matrix<scalar_hi> S( restart+1, 1, restart+1, 1, 1, A.mpiComm() );
+    S.insertLocalTiles( Target::Host );
     // Rotations
-    std::vector<real_hi>   givens_alpha(restart);
-    std::vector<scalar_hi> givens_beta (restart);
+    std::vector<real_hi>   givens_alpha( restart );
+    std::vector<scalar_hi> givens_beta ( restart );
 
 
     if (target == Target::Devices) {
@@ -176,211 +182,217 @@ void posv_mixed_gmres(
         {
             #pragma omp task default(shared)
             {
-                A.tileGetAndHoldAllOnDevices(LayoutConvert(layout));
+                A.tileGetAndHoldAllOnDevices( LayoutConvert( layout ) );
             }
             #pragma omp task default(shared)
             {
-                B.tileGetAndHoldAllOnDevices(LayoutConvert(layout));
+                B.tileGetAndHoldAllOnDevices( LayoutConvert( layout ) );
             }
             #pragma omp task default(shared)
             {
-                X.tileGetAndHoldAllOnDevices(LayoutConvert(layout));
+                X.tileGetAndHoldAllOnDevices( LayoutConvert( layout ) );
             }
         }
     }
 
     // norm of A
-    real_hi Anorm = norm(Norm::Inf, A, opts);
+    real_hi Anorm = norm( Norm::Inf, A, opts );
 
     // stopping criteria
     real_hi cte = Anorm * tol;
 
     // Compute the Cholesky factorization of A in single-precision.
-    slate::copy(A, A_lo, opts);
+    slate::copy( A, A_lo, opts );
     Timer t_potrf_lo;
-    potrf(A_lo, opts);
+    int64_t info = potrf( A_lo, opts );
     timers[ "posv_mixed_gmres::potrf_lo" ] = t_potrf_lo.stop();
-
-
-    // Solve the system A * X = B in low precision.
-    slate::copy(B, X_lo, opts);
-    Timer t_potrs_lo;
-    potrs(A_lo, X_lo, opts);
-    timers[ "posv_mixed_gmres::potrs_lo" ] = t_potrs_lo.stop();
-    slate::copy(X_lo, X, opts);
-
-
-    // IR
-    int iiter = 0;
-    timers[ "posv_mixed_gmres::add_hi" ] = 0;
-    while (iiter < itermax) {
-
-        // Check for convergence
-        slate::copy(B, R, opts);
-        Timer t_hemm_hi;
-        hemm<scalar_hi>(
-            Side::Left,
-            scalar_hi(-1.0), A,
-                             X,
-            scalar_hi(1.0),  R,
-            opts);
-        timers[ "posv_mixed_gmres::hemm_hi" ] = t_hemm_hi.stop();
-        colNorms( Norm::Max, X, colnorms_X.data(), opts );
-        colNorms( Norm::Max, R, colnorms_R.data(), opts );
-        if (internal::iterRefConverged<real_hi>(colnorms_R, colnorms_X, cte)) {
-            iter = iiter;
-            converged = true;
-            break;
-        }
-
-        // GMRES
-
-        // Compute initial vector
-        auto v0 = V.slice(0, V.m()-1, 0, 0);
-        slate::copy(R, v0, opts);
-
-        std::vector<real_hi> arnoldi_residual = {norm(Norm::Fro, v0, opts)};
-        if (arnoldi_residual[0] == 0) {
-            // Solver broke down, but residual is not small enough yet.
-            iter = iiter;
-            converged = false;
-            break;
-        }
-        scale(1.0, arnoldi_residual[0], v0, opts);
-        if (S.tileRank(0, 0) == mpi_rank) {
-            S.tileGetForWriting(0, 0, LayoutConvert::ColMajor);
-            auto S_00 = S(0, 0);
-            S_00.at(0, 0) = arnoldi_residual[0];
-            for (int i = 1; i < S_00.mb(); ++i) {
-                S_00.at(i, 0) = 0.0;
+    if (info != 0) {
+        iter = -3;
+    }
+    else {
+        // Solve the system A * X = B in low precision.
+        slate::copy( B, X_lo, opts );
+        Timer t_potrs_lo;
+        potrs( A_lo, X_lo, opts );
+        timers[ "posv_mixed_gmres::potrs_lo" ] = t_potrs_lo.stop();
+        slate::copy( X_lo, X, opts );
+
+        // IR
+        int iiter = 0;
+        timers[ "posv_mixed_gmres::add_hi" ] = 0;
+        while (iiter < itermax) {
+
+            // Check for convergence
+            slate::copy( B, R, opts );
+            Timer t_hemm_hi;
+            hemm<scalar_hi>(
+                Side::Left,
+                -one, A,
+                      X,
+                one,  R,
+                opts);
+            timers[ "posv_mixed_gmres::hemm_hi" ] = t_hemm_hi.stop();
+            colNorms( Norm::Max, X, colnorms_X.data(), opts );
+            colNorms( Norm::Max, R, colnorms_R.data(), opts );
+            if (internal::iterRefConverged<real_hi>( colnorms_R, colnorms_X, cte )) {
+                iter = iiter;
+                converged = true;
+                break;
             }
-        }
 
+            // GMRES
 
-        // N.B. convergence is detected using norm(X) at the beginning of the
-        // outer iteration. Thus, changes in the magnitude of X may lead to
-        // excessive restarting or delayed completion.
-        int j = 0;
-        for (; j < restart && iiter < itermax
-                   && !internal::iterRefConverged(arnoldi_residual, colnorms_X, cte);
-             j++, iiter++) {
-            auto Vj1 = V.slice(0, V.m()-1, j+1, j+1);
-            auto Wj1 = W.slice(0, W.m()-1, j+1, j+1);
+            // Compute initial vector
+            auto v0 = V.slice( 0, V.m()-1, 0, 0 );
+            slate::copy( R, v0, opts );
 
-            auto Vj = V.slice(0, V.m()-1, j, j);
-
-            // Wj1 = M^-1 A Vj
-            slate::copy(Vj, X_lo, opts);
-            t_potrs_lo.start();
-            potrs(A_lo, X_lo, opts);
-            timers[ "posv_mixed_gmres::potrs_lo" ] += t_potrs_lo.stop();
-            slate::copy(X_lo, Wj1, opts);
+            std::vector<real_hi> arnoldi_residual = { norm( Norm::Fro, v0, opts ) };
+            if (arnoldi_residual[0] == 0) {
+                // Solver broke down, but residual is not small enough yet.
+                iter = iiter;
+                converged = false;
+                break;
+            }
+            scale( 1.0, arnoldi_residual[0], v0, opts );
+            if (S.tileRank( 0, 0 ) == mpi_rank) {
+                S.tileGetForWriting( 0, 0, LayoutConvert::ColMajor );
+                auto S_00 = S( 0, 0 );
+                S_00.at( 0, 0 ) = arnoldi_residual[0];
+                for (int i = 1; i < S_00.mb(); ++i) {
+                    S_00.at( i, 0 ) = 0.0;
+                }
+            }
 
-            t_hemm_hi.start();
-            hemm<scalar_hi>(
-                Side::Left,
-                scalar_hi(1.0), A,
-                                Wj1,
-                scalar_hi(0.0), Vj1,
-                opts);
-            timers[ "posv_mixed_gmres::hemm_hi" ] += t_hemm_hi.stop();
+            // N.B. convergence is detected using norm(X) at the beginning of the
+            // outer iteration. Thus, changes in the magnitude of X may lead to
+            // excessive restarting or delayed completion.
+            int j = 0;
+            for (; j < restart && iiter < itermax
+                       && ! internal::iterRefConverged(
+                                arnoldi_residual, colnorms_X, cte );
+                 ++j, ++iiter) {
+                auto Vj1 = V.slice( 0, V.m()-1, j+1, j+1 );
+                auto Wj1 = W.slice( 0, W.m()-1, j+1, j+1 );
+
+                auto Vj = V.slice( 0, V.m()-1, j, j );
+
+                // Wj1 = M^-1 A Vj
+                slate::copy( Vj, X_lo, opts );
+                t_potrs_lo.start();
+                potrs( A_lo, X_lo, opts );
+                timers[ "posv_mixed_gmres::potrs_lo" ] += t_potrs_lo.stop();
+                slate::copy( X_lo, Wj1, opts );
+
+                t_hemm_hi.start();
+                hemm<scalar_hi>(
+                    Side::Left,
+                    one,  A,
+                          Wj1,
+                    zero, Vj1,
+                    opts );
+                timers[ "posv_mixed_gmres::hemm_hi" ] += t_hemm_hi.stop();
+
+                // orthogonalize w/ CGS2
+                auto V0j = V.slice( 0, V.m()-1, 0, j );
+                auto V0jT = conj_transpose( V0j );
+                auto Hj = H.slice( 0, j, j, j );
+                Timer t_gemm_hi;
+                gemm<scalar_hi>(
+                    one,  V0jT,
+                          Vj1,
+                    zero, Hj,
+                    opts );
+                gemm<scalar_hi>(
+                    -one, V0j,
+                          Hj,
+                    one,  Vj1,
+                    opts );
+                timers[ "posv_mixed_gmres::gemm_hi" ] = t_gemm_hi.stop();
+                auto zj = z.slice( 0, j, 0, 0 );
+                t_gemm_hi.start();
+                gemm<scalar_hi>(
+                    one,  V0jT,
+                          Vj1,
+                    zero, zj,
+                    opts );
+                gemm<scalar_hi>(
+                    -one, V0j,
+                          zj,
+                    one,  Vj1,
+                    opts );
+                timers[ "posv_mixed_gmres::gemm_hi" ] += t_gemm_hi.stop();
+                Timer t_add_hi;
+                add( one, zj, one, Hj, opts );
+                timers[ "posv_mixed_gmres::add_hi" ] += t_add_hi.stop();
+                auto Vj1_norm = norm( Norm::Fro, Vj1, opts );
+                scale( 1.0, Vj1_norm, Vj1, opts );
+                if (H.tileRank( 0, 0 ) == mpi_rank) {
+                    H.tileGetForWriting( 0, 0, LayoutConvert::ColMajor );
+                    auto H_00 = H( 0, 0 );
+                    H_00.at( j+1, j ) = Vj1_norm;
+                }
 
-            // orthogonalize w/ CGS2
-            auto V0j = V.slice(0, V.m()-1, 0, j);
-            auto V0jT = conj_transpose(V0j);
-            auto Hj = H.slice(0, j, j, j);
+                // apply givens rotations
+                Timer t_posv_mixed_gmres_rotations;
+                if (H.tileRank( 0, 0 ) == mpi_rank) {
+                    auto H_00 = H( 0, 0 );
+                    for (int64_t i = 0; i < j; ++i) {
+                        blas::rot( 1, &H_00.at( i, j ), 1, &H_00.at( i+1, j ), 1,
+                                   givens_alpha[i], givens_beta[i] );
+                    }
+                    scalar_hi H_jj = H_00.at( j, j ), H_j1j = H_00.at( j+1, j );
+                    blas::rotg( &H_jj, & H_j1j, &givens_alpha[j], &givens_beta[j] );
+                    blas::rot( 1, &H_00.at( j, j ), 1, &H_00.at( j+1, j ), 1,
+                               givens_alpha[j], givens_beta[j] );
+                    auto S_00 = S( 0, 0 );
+                    blas::rot( 1, &S_00.at( j, 0 ), 1, &S_00.at( j+1, 0 ), 1,
+                               givens_alpha[j], givens_beta[j] );
+                    arnoldi_residual[0] = cabs1( S_00.at( j+1, 0 ) );
+                }
+                timers[ "posv_mixed_gmres::rotations" ] = t_posv_mixed_gmres_rotations.stop();
+                MPI_Bcast( arnoldi_residual.data(), arnoldi_residual.size(),
+                           mpi_type<scalar_hi>::value, S.tileRank( 0, 0 ),
+                           A.mpiComm() );
+            }
+            // update X
+            auto H_j = H.slice( 0, j-1, 0, j-1 );
+            auto S_j = S.slice( 0, j-1, 0, 0 );
+            auto H_tri = TriangularMatrix<scalar_hi>(
+                    Uplo::Upper, Diag::NonUnit, H_j );
+            Timer t_trsm_hi;
+            trsm( Side::Left, one, H_tri, S_j, opts );
+            timers[ "posv_mixed_gmres::trsm_hi" ] = t_trsm_hi.stop();
+            auto W_0j = W.slice( 0, W.m()-1, 1, j ); // first column of W is unused
             Timer t_gemm_hi;
             gemm<scalar_hi>(
-                scalar_hi(1.0), V0jT,
-                                Vj1,
-                scalar_hi(0.0), Hj,
-                opts);
-            gemm<scalar_hi>(
-                scalar_hi(-1.0), V0j,
-                                 Hj,
-                scalar_hi(1.0),  Vj1,
-                opts);
-            timers[ "posv_mixed_gmres::gemm_hi" ] = t_gemm_hi.stop();
-            auto zj = z.slice(0, j, 0, 0);
-            t_gemm_hi.start();
-            gemm<scalar_hi>(
-                scalar_hi(1.0), V0jT,
-                                Vj1,
-                scalar_hi(0.0), zj,
-                opts);
-            gemm<scalar_hi>(
-                scalar_hi(-1.0), V0j,
-                                 zj,
-                scalar_hi(1.0),  Vj1,
-                opts);
+                one, W_0j,
+                     S_j,
+                one, X,
+                opts );
             timers[ "posv_mixed_gmres::gemm_hi" ] += t_gemm_hi.stop();
-            Timer t_add_hi;
-            add(scalar_hi(1.0), zj, scalar_hi(1.0), Hj,
-                opts);
-            timers[ "posv_mixed_gmres::add_hi" ] += t_add_hi.stop();
-            auto Vj1_norm = norm(Norm::Fro, Vj1, opts);
-            scale(1.0, Vj1_norm, Vj1, opts);
-            if (H.tileRank(0, 0) == mpi_rank) {
-                H.tileGetForWriting(0, 0, LayoutConvert::ColMajor);
-                auto H_00 = H(0, 0);
-                H_00.at(j+1, j) = Vj1_norm;
-            }
-
-            // apply givens rotations
-            Timer t_posv_mixed_gmres_rotations;
-            if (H.tileRank(0, 0) == mpi_rank) {
-                auto H_00 = H(0, 0);
-                for (int64_t i = 0; i < j; ++i) {
-                    blas::rot(1, &H_00.at(i, j), 1, &H_00.at(i+1, j), 1,
-                              givens_alpha[i], givens_beta[i]);
-                }
-                scalar_hi H_jj = H_00.at(j, j), H_j1j = H_00.at(j+1, j);
-                blas::rotg(&H_jj, & H_j1j, &givens_alpha[j], &givens_beta[j]);
-                blas::rot(1, &H_00.at(j, j), 1, &H_00.at(j+1, j), 1,
-                          givens_alpha[j], givens_beta[j]);
-                auto S_00 = S(0, 0);
-                blas::rot(1, &S_00.at(j, 0), 1, &S_00.at(j+1, 0), 1,
-                          givens_alpha[j], givens_beta[j]);
-                arnoldi_residual[0] = cabs1(S_00.at(j+1, 0));
-            }
-            timers[ "posv_mixed_gmres::rotations" ] = t_posv_mixed_gmres_rotations.stop();
-            MPI_Bcast(arnoldi_residual.data(), arnoldi_residual.size(),
-                      mpi_type<scalar_hi>::value, S.tileRank(0, 0), A.mpiComm());
         }
-        // update X
-        auto H_j = H.slice(0, j-1, 0, j-1);
-        auto S_j = S.slice(0, j-1, 0, 0);
-        auto H_tri = TriangularMatrix<scalar_hi>(Uplo::Upper, Diag::NonUnit, H_j);
-        Timer t_trsm_hi;
-        trsm(Side::Left, scalar_hi(1.0), H_tri, S_j, opts);
-        timers[ "posv_mixed_gmres::trsm_hi" ] = t_trsm_hi.stop();
-        auto W_0j = W.slice(0, W.m()-1, 1, j); // first column of W is unused
-        Timer t_gemm_hi;
-        gemm<scalar_hi>(
-            scalar_hi(1.0), W_0j,
-                            S_j,
-            scalar_hi(1.0), X,
-            opts);
-        timers[ "posv_mixed_gmres::gemm_hi" ] += t_gemm_hi.stop();
     }
 
     if (! converged) {
-        // If we are at this place of the code, this is because we have performed
-        // iter = itermax iterations and never satisfied the stopping criterion,
-        // set up the iter flag accordingly and follow up with double precision
-        // routine.
-        iter = -iiter-1;
+        if (info == 0) {
+            // If we performed iter = itermax iterations and never satisfied
+            // the stopping criterion, set up the iter flag accordingly.
+            iter = -itermax - 1;
+        }
 
         if (use_fallback) {
+            // Fall back to double precision factor and solve.
             // Compute the Cholesky factorization of A.
             Timer t_potrf_hi;
-            potrf( A, opts );
+            info = potrf( A, opts );
             timers[ "posv_mixed_gmres::potrf_hi" ] = t_potrf_hi.stop();
 
             // Solve the system A * X = B.
-            slate::copy( B, X, opts );
             Timer t_potrs_hi;
-            potrs( A, X, opts );
+            if (info == 0) {
+                slate::copy( B, X, opts );
+                potrs( A, X, opts );
+            }
             timers[ "posv_mixed_gmres::potrs_hi" ] = t_potrs_hi.stop();
         }
     }
@@ -392,31 +404,34 @@ void posv_mixed_gmres(
         X.clearWorkspace();
     }
     timers[ "posv_mixed_gmres" ] = t_posv_mixed_gmres.stop();
-}
 
+    return info;
+}
 
 //------------------------------------------------------------------------------
 // Explicit instantiations.
 template <>
-void posv_mixed_gmres<double>(
+int64_t posv_mixed_gmres<double>(
     HermitianMatrix<double>& A,
     Matrix<double>& B,
     Matrix<double>& X,
     int& iter,
     Options const& opts)
 {
-    posv_mixed_gmres<double, float>(A, B, X, iter, opts);
+    return posv_mixed_gmres<double, float>(
+        A, B, X, iter, opts );
 }
 
 template <>
-void posv_mixed_gmres< std::complex<double> >(
+int64_t posv_mixed_gmres< std::complex<double> >(
     HermitianMatrix< std::complex<double> >& A,
     Matrix< std::complex<double> >& B,
     Matrix< std::complex<double> >& X,
     int& iter,
     Options const& opts)
 {
-    posv_mixed_gmres<std::complex<double>, std::complex<float>>(A, B, X, iter, opts);
+    return posv_mixed_gmres< std::complex<double>, std::complex<float> >(
+        A, B, X, iter, opts );
 }
 
 } // namespace slate
diff --git a/src/potrf.cc b/src/potrf.cc
index f0bd68cf4..9eed34846 100644
--- a/src/potrf.cc
+++ b/src/potrf.cc
@@ -20,7 +20,7 @@ namespace impl {
 /// @ingroup posv_impl
 ///
 template <Target target, typename scalar_t>
-void potrf(
+int64_t potrf(
     slate::internal::TargetType<target>,
     HermitianMatrix<scalar_t> A,
     Options const& opts )
@@ -54,6 +54,8 @@ void potrf(
     if (A.uplo() == Uplo::Upper) {
         A = conj_transpose( A );
     }
+
+    int64_t info = 0;
     int64_t A_nt = A.nt();
 
     // OpenMP needs pointer types, but vectors are exception safe
@@ -91,20 +93,25 @@ void potrf(
     #pragma omp parallel
     #pragma omp master
     {
+        int64_t kk = 0;  // column index (not block-column)
         for (int64_t k = 0; k < A_nt; ++k) {
             // Panel, normal priority
-            #pragma omp task depend(inout:column[k]) priority( priority_0 )
+            #pragma omp task depend(inout:column[k]) priority( priority_0 ) \
+                shared( info )
             {
                 // factor A(k, k)
+                int64_t iinfo;
                 if (target == Target::Devices) {
-                    internal::potrf<target>(
+                    iinfo = internal::potrf<target>(
                         A.sub(k, k), priority_0, queue_2,
                         device_info_array[ A.tileDevice( k, k ) ] );
                 }
                 else {
-                    internal::potrf<target>(
+                    iinfo = internal::potrf<target>(
                         A.sub(k, k), priority_0, queue_2 );
                 }
+                if (iinfo != 0 && info == 0)
+                    info = kk + iinfo;
 
                 // send A(k, k) down col A(k+1:nt-1, k)
                 if (k+1 <= A_nt-1)
@@ -192,6 +199,7 @@ void potrf(
                 // Erase local workspace on devices.
                 panel.releaseLocalWorkspace();
             }
+            kk += A.tileNb( k );
         }
     }
     A.tileUpdateAllOrigin();
@@ -205,6 +213,9 @@ void potrf(
             blas::device_free( device_info_array[dev], *queue );
         }
     }
+
+    internal::reduce_info( &info, A.mpiComm() );
+    return info;
 }
 
 } // namespace impl
@@ -249,16 +260,15 @@ void potrf(
 ///       - HostBatch: batched BLAS on CPU host.
 ///       - Devices:   batched BLAS on GPU device.
 ///
-/// TODO: return value
-/// @retval 0 successful exit
-/// @retval >0 for return value = $i$, the leading minor of order $i$ of $A$ is not
+/// @return 0: successful exit
+/// @return i > 0: the leading minor of order $i$ of $A$ is not
 ///         positive definite, so the factorization could not
-///         be completed, and the solution has not been computed.
+///         be completed.
 ///
 /// @ingroup posv_computational
 ///
 template <typename scalar_t>
-void potrf(
+int64_t potrf(
     HermitianMatrix<scalar_t>& A,
     Options const& opts)
 {
@@ -271,35 +281,33 @@ void potrf(
         case Target::HostNest:
         case Target::HostBatch:
         case Target::HostTask:
-            impl::potrf( TargetType<Target::HostTask>(), A, opts );
-            break;
+            return impl::potrf( TargetType<Target::HostTask>(), A, opts );
 
         case Target::Devices:
-            impl::potrf( TargetType<Target::Devices>(), A, opts );
-            break;
+            return impl::potrf( TargetType<Target::Devices>(), A, opts );
     }
-    // todo: return value for errors?
+    return -2;  // shouldn't happen
 }
 
 //------------------------------------------------------------------------------
 // Explicit instantiations.
 template
-void potrf<float>(
+int64_t potrf<float>(
     HermitianMatrix<float>& A,
     Options const& opts);
 
 template
-void potrf<double>(
+int64_t potrf<double>(
     HermitianMatrix<double>& A,
     Options const& opts);
 
 template
-void potrf< std::complex<float> >(
+int64_t potrf< std::complex<float> >(
     HermitianMatrix< std::complex<float> >& A,
     Options const& opts);
 
 template
-void potrf< std::complex<double> >(
+int64_t potrf< std::complex<double> >(
     HermitianMatrix< std::complex<double> >& A,
     Options const& opts);
 
diff --git a/test/test_posv.cc b/test/test_posv.cc
index ecd8795df..997622c26 100644
--- a/test/test_posv.cc
+++ b/test/test_posv.cc
@@ -158,6 +158,8 @@ void test_posv_work(Params& params, bool run)
         return;
     }
 
+    int64_t info = 0;
+
     // Matrix A: figure out local size.
     int64_t mlocA = num_local_rows_cols(n, nb, myrow, p);
     int64_t nlocA = num_local_rows_cols(n, nb, mycol, q);
@@ -276,26 +278,26 @@ void test_posv_work(Params& params, bool run)
 
         if (params.routine == "potrf" || params.routine == "potrs") {
             // Factor matrix A.
-            slate::chol_factor(A, opts);
+            info = slate::chol_factor( A, opts );
             // Using traditional BLAS/LAPACK name
             // slate::potrf(A, opts);
         }
         else if (params.routine == "posv") {
-            slate::chol_solve(A, B, opts);
+            info = slate::chol_solve( A, B, opts );
             // Using traditional BLAS/LAPACK name
             // slate::posv(A, B, opts);
         }
         else if (params.routine == "posv_mixed") {
             if constexpr (std::is_same<real_t, double>::value) {
                 int iters = 0;
-                slate::posv_mixed( A, B, X, iters, opts );
+                info = slate::posv_mixed( A, B, X, iters, opts );
                 params.iters() = iters;
             }
         }
         else if (params.routine == "posv_mixed_gmres") {
             if constexpr (std::is_same<real_t, double>::value) {
                 int iters = 0;
-                slate::posv_mixed_gmres(A, B, X, iters, opts);
+                info = slate::posv_mixed_gmres(A, B, X, iters, opts);
                 params.iters() = iters;
             }
         }
@@ -332,7 +334,7 @@ void test_posv_work(Params& params, bool run)
         // Run SLATE test: potrs
         // potrs: Solve AX = B, after factoring A above.
         //==================================================
-        if (do_potrs) {
+        if (do_potrs && info == 0) {
             double time2 = barrier_get_wtime(MPI_COMM_WORLD);
 
             if ((check && params.routine == "potrf")
@@ -352,9 +354,22 @@ void test_posv_work(Params& params, bool run)
         }
 
         if (trace) slate::trace::Trace::finish();
+
+        if (info != 0) {
+            char buf[ 80 ];
+            snprintf( buf, sizeof(buf), "info = %lld, cond = %.2e",
+                      llong( info ), params.matrix.cond_actual() );
+            params.msg() = buf;
+        }
     }
+    print_matrix( "X_out", X, params );
 
-    if (check) {
+    if (info != 0 || std::isinf( params.matrix.cond_actual() )) {
+        // info != 0 if and only if cond == inf (singular matrix).
+        // Matrices with unknown cond (nan) that are singular are marked failed.
+        params.okay() = info != 0 && std::isinf( params.matrix.cond_actual() );
+    }
+    else if (check) {
         //==================================================
         // Test results by checking the residual
         //
@@ -418,7 +433,6 @@ void test_posv_work(Params& params, bool run)
             slate_assert( myrow == myrow_ );
             slate_assert( mycol == mycol_ );
 
-            int64_t info;
             scalapack_descinit(A_desc, n, n, nb, nb, 0, 0, ictxt, mlocA, &info);
             slate_assert(info == 0);