diff --git a/GNUmakefile b/GNUmakefile index 97c97adcc..aae7fa9e1 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -624,6 +624,7 @@ ifneq ($(only_unit),1) src/pbsv.cc \ src/pbtrf.cc \ src/pbtrs.cc \ + src/pocondest.cc \ src/posv.cc \ src/posv_mixed.cc \ src/posv_mixed_gmres.cc \ @@ -731,6 +732,7 @@ tester_src += \ test/test_herk.cc \ test/test_hesv.cc \ test/test_pbsv.cc \ + test/test_pocondest.cc \ test/test_posv.cc \ test/test_potri.cc \ test/test_scale.cc \ diff --git a/include/slate/BaseMatrix.hh b/include/slate/BaseMatrix.hh index 37661d91e..8b1e3892a 100644 --- a/include/slate/BaseMatrix.hh +++ b/include/slate/BaseMatrix.hh @@ -2535,6 +2535,9 @@ void BaseMatrix::tileCopyDataLayout(Tile* src_tile, // TODO Consider inter-Queue dependencies instead of disabling async async &= !(dst_device != HostNum && src_device != HostNum); + if (dst_tile->layout() != target_layout && ! dst_tile->isTransposable()) { + storage_->tileMakeTransposable( dst_tile ); + } if (is_square || ! need_convert) { lapack::Queue* queue = comm_queue( work_device ); @@ -2546,9 +2549,6 @@ void BaseMatrix::tileCopyDataLayout(Tile* src_tile, } } else { - if (dst_tile->layout() != target_layout && ! dst_tile->isTransposable()) { - storage_->tileMakeTransposable( dst_tile ); - } dst_tile->setLayout( target_layout ); scalar_t* work_data = nullptr; diff --git a/include/slate/simplified_api.hh b/include/slate/simplified_api.hh index 1fa0e3c5b..a56cf778d 100644 --- a/include/slate/simplified_api.hh +++ b/include/slate/simplified_api.hh @@ -358,6 +358,21 @@ void lu_inverse_using_factor_out_of_place( getri(A, pivots, A_inverse, opts); } +//----------------------------------------- +// lu_condest_using_factor() + +// gecondest +template +void lu_condest_using_factor( + Norm norm, + Matrix& A, + blas::real_type Anorm, + blas::real_type *rcond, + Options const& opts = Options()) +{ + gecondest(norm, A, Anorm, rcond, opts); +} + //----------------------------------------- // Cholesky @@ -475,6 +490,21 @@ void chol_inverse_using_factor( potri(A, opts); } +//----------------------------------------- +// chol_condest_using_factor() + +// pocondest +template +void chol_condest_using_factor( + Norm norm, + HermitianMatrix& A, + blas::real_type Anorm, + blas::real_type *rcond, + Options const& opts = Options()) +{ + pocondest(norm, A, Anorm, rcond, opts); +} + //----------------------------------------- // Symmetric indefinite -- block Aasen's @@ -649,6 +679,21 @@ void lq_multiply_by_q( unmlq(side, op, A, T, C, opts); } +//----------------------------------------- +// triangular_condest() + +// trcondest +template +void triangular_condest( + Norm norm, + TriangularMatrix& A, + blas::real_type *rcond, + Options const& opts = Options()) +{ + trcondest(norm, A, rcond, opts); +} + + //------------------------------------------------------------------------------ // Symmetric/Hermitian Eigenvalues diff --git a/include/slate/slate.hh b/include/slate/slate.hh index 0269b8938..9ba21f4d0 100644 --- a/include/slate/slate.hh +++ b/include/slate/slate.hh @@ -1345,11 +1345,33 @@ void steqr2( //----------------------------------------- // gecondest() template +void gecondest( + Norm in_norm, + Matrix& A, + blas::real_type Anorm, + blas::real_type *rcond, + Options const& opts = Options()); + +template +[[deprecated( "Pass Anorm by value instead. Will be removed 2024-11." )]] void gecondest( Norm in_norm, Matrix& A, blas::real_type *Anorm, blas::real_type *rcond, + Options const& opts = Options()) +{ + gecondest( in_norm, A, *Anorm, rcond, opts ); +} + +//----------------------------------------- +// pocondest() +template +void pocondest( + Norm in_norm, + HermitianMatrix& A, + blas::real_type Anorm, + blas::real_type *rcond, Options const& opts = Options()); //----------------------------------------- diff --git a/src/gecondest.cc b/src/gecondest.cc index 26b65355c..22e5026b4 100644 --- a/src/gecondest.cc +++ b/src/gecondest.cc @@ -57,7 +57,7 @@ template void gecondest( Norm in_norm, Matrix& A, - blas::real_type *Anorm, + blas::real_type Anorm, blas::real_type *rcond, Options const& opts) { @@ -83,7 +83,7 @@ void gecondest( *rcond = 1.; return; } - else if (*Anorm == 0.) { + else if (Anorm == 0.) { return; } @@ -144,7 +144,7 @@ void gecondest( // Compute the estimate of the reciprocal condition number. if (Ainvnorm != 0.0) { - *rcond = (1.0 / Ainvnorm) / *Anorm; + *rcond = (1.0 / Ainvnorm) / Anorm; } } @@ -154,7 +154,7 @@ template void gecondest( Norm in_norm, Matrix& A, - float *Anorm, + float Anorm, float *rcond, Options const& opts); @@ -162,7 +162,7 @@ template void gecondest( Norm in_norm, Matrix& A, - double *Anorm, + double Anorm, double *rcond, Options const& opts); @@ -170,7 +170,7 @@ template void gecondest< std::complex >( Norm in_norm, Matrix< std::complex >& A, - float *Anorm, + float Anorm, float *rcond, Options const& opts); @@ -178,7 +178,7 @@ template void gecondest< std::complex >( Norm in_norm, Matrix< std::complex >& A, - double *Anorm, + double Anorm, double *rcond, Options const& opts); diff --git a/src/pocondest.cc b/src/pocondest.cc new file mode 100644 index 000000000..cf2764818 --- /dev/null +++ b/src/pocondest.cc @@ -0,0 +1,160 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include "slate/slate.hh" +#include "auxiliary/Debug.hh" +#include "slate/Matrix.hh" +#include "slate/HermitianMatrix.hh" +#include "internal/internal.hh" + +namespace slate { + +//------------------------------------------------------------------------------ +/// Distributed parallel estimate of the reciprocal of the condition number +/// of a general matrix A, in either the 1-norm or the infinity-norm. +/// Generic implementation for any target. +/// +/// The reciprocal of the condition number computed as: +/// \[ +/// rcond = \frac{1}{\|\|A\|\| \times \|\|A^{-1}\|\-} +/// \] +/// where $A$ is the output of the Cholesky factorization (potrf). +/// +//------------------------------------------------------------------------------ +/// @tparam scalar_t +/// One of float, double, std::complex, std::complex. +//------------------------------------------------------------------------------ +/// @param[in] in_norm +/// rcond to compute: +/// - Norm::One: 1-norm condition number +/// - Norm::Inf: infinity-norm condition number +/// +/// @param[in] A +/// On entry, the n-by-n matrix $A$. +/// It is the output of the Cholesky factorization of a Hermitian matrix. +/// +/// @param[in] Anorm +/// If Norm::One, the 1-norm of the original matrix A. +/// If Norm::Inf, the infinity-norm of the original matrix A. +/// +/// @param[in,out] rcond +/// The reciprocal of the condition number of the matrix A, +/// computed as stated above. +/// +/// @param[in] opts +/// Additional options, as map of name = value pairs. Possible options: +/// - Option::Target: +/// Implementation to target. Possible values: +/// - HostTask: OpenMP tasks on CPU host [default]. +/// - HostNest: nested OpenMP parallel for loop on CPU host. +/// - HostBatch: batched BLAS on CPU host. +/// - Devices: batched BLAS on GPU device. +/// +/// @ingroup cond_specialization +/// +template +void pocondest( + Norm in_norm, + HermitianMatrix& A, + blas::real_type Anorm, + blas::real_type *rcond, + Options const& opts) +{ + using blas::real; + using real_t = blas::real_type; + + int kase; + if (in_norm != Norm::One && in_norm != Norm::Inf) { + slate_error("invalid norm."); + } + + int64_t m = A.m(); + + // Quick return + *rcond = 0.; + if (m <= 1) { + *rcond = 1.; + return; + } + else if (Anorm == 0.) { + return; + } + + real_t Ainvnorm = 0.0; + + std::vector isave = {0, 0, 0, 0}; + + auto tileMb = A.tileMbFunc(); + auto tileNb = func::uniform_blocksize(1, 1); + auto tileRank = A.tileRankFunc(); + auto tileDevice = A.tileDeviceFunc(); + slate::Matrix X (m, 1, tileMb, tileNb, + tileRank, tileDevice, A.mpiComm()); + X.insertLocalTiles(Target::Host); + slate::Matrix V (m, 1, tileMb, tileNb, + tileRank, tileDevice, A.mpiComm()); + V.insertLocalTiles(Target::Host); + slate::Matrix isgn (m, 1, tileMb, tileNb, + tileRank, tileDevice, A.mpiComm()); + isgn.insertLocalTiles(Target::Host); + + // initial and final value of kase is 0 + kase = 0; + internal::norm1est( X, V, isgn, &Ainvnorm, &kase, isave, opts); + + MPI_Bcast( &isave[0], 4, MPI_INT64_T, X.tileRank(0, 0), A.mpiComm() ); + MPI_Bcast( &kase, 1, MPI_INT, X.tileRank(0, 0), A.mpiComm() ); + + while (kase != 0) + { + // A is symmetric, so both cases are equivalent + potrs( A, X, opts ); + + internal::norm1est( X, V, isgn, &Ainvnorm, &kase, isave, opts); + MPI_Bcast( &isave[0], 4, MPI_INT64_T, X.tileRank(0, 0), A.mpiComm() ); + MPI_Bcast( &kase, 1, MPI_INT, X.tileRank(0, 0), A.mpiComm() ); + } // while (kase != 0) + + // Compute the estimate of the reciprocal condition number. + if (Ainvnorm != 0.0) { + *rcond = (1.0 / Ainvnorm) / Anorm; + } +} + +//------------------------------------------------------------------------------ +// Explicit instantiations. +template +void pocondest( + Norm in_norm, + HermitianMatrix& A, + float Anorm, + float *rcond, + Options const& opts); + +template +void pocondest( + Norm in_norm, + HermitianMatrix& A, + double Anorm, + double *rcond, + Options const& opts); + +template +void pocondest< std::complex >( + Norm in_norm, + HermitianMatrix< std::complex >& A, + float Anorm, + float *rcond, + Options const& opts); + +template +void pocondest< std::complex >( + Norm in_norm, + HermitianMatrix< std::complex >& A, + double Anorm, + double *rcond, + Options const& opts); + +} // namespace slate diff --git a/test/run_tests.py b/test/run_tests.py index 4e9983c0e..0dd06e3e1 100755 --- a/test/run_tests.py +++ b/test/run_tests.py @@ -614,12 +614,12 @@ def filter_csv( values, csv ): if (opts.cond): cmds += [ [ 'gecondest', gen + dtype + n ], + [ 'pocondest', gen + dtype + n + uplo ], # Triangle [ 'trcondest', gen + dtype + n ], #[ 'gbcon', gen + dtype + la + n + kl + ku ], - #[ 'pocon', gen + dtype + la + n + uplo ], #[ 'pbcon', gen + dtype + la + n + kd + uplo ], ] diff --git a/test/scalapack_wrappers.hh b/test/scalapack_wrappers.hh index 90341f09d..c62dc465b 100644 --- a/test/scalapack_wrappers.hh +++ b/test/scalapack_wrappers.hh @@ -4624,6 +4624,126 @@ void scalapack_pgecon( Anorm, rcond, work, &lwork_, iwork, &liwork_, &info_ ); } +//============================================================================== +// Fortran prototypes +#define scalapack_pspocon BLAS_FORTRAN_NAME( pspocon, PSPOCON ) +#define scalapack_pdpocon BLAS_FORTRAN_NAME( pdpocon, PDPOCON ) +#define scalapack_pcpocon BLAS_FORTRAN_NAME( pcpocon, PCPOCON ) +#define scalapack_pzpocon BLAS_FORTRAN_NAME( pzpocon, PZPOCON ) + +extern "C" { + +void scalapack_pspocon( + const char* uplo, const char* norm, blas_int* n, + float* A, blas_int* ia, blas_int* ja, blas_int* descA, + float* Anorm, float* rcond, + float* work, blas_int* lwork, + blas_int* iwrok, blas_int* liwork, + blas_int* info ); + +void scalapack_pdpocon( + const char* uplo, const char* norm, blas_int* n, + double* A, blas_int* ia, blas_int* ja, blas_int* descA, + double* Anorm, double* rcond, + double* work, blas_int* lwork, + blas_int* iwrok, blas_int* liwork, + blas_int* info ); + +void scalapack_pcpocon( + const char* uplo, const char* norm, blas_int* n, + std::complex* A, blas_int* ia, blas_int* ja, blas_int* descA, + float* Anorm, float* rcond, + std::complex* work, blas_int* lwork, + blas_int* iwrok, blas_int* liwork, + blas_int* info ); + +void scalapack_pzpocon( + const char* uplo, const char* norm, blas_int* n, + std::complex* A, blas_int* ia, blas_int* ja, blas_int* descA, + double* Anorm, double* rcond, + std::complex* work, blas_int* lwork, + blas_int* iwrok, blas_int* liwork, + blas_int* info ); + +} // extern C + +//------------------------------------------------------------------------------ +// Low-level overloaded wrappers +inline void scalapack_ppocon( + const char* uplo, const char* norm, blas_int* n, + float* A, blas_int* ia, blas_int* ja, blas_int* descA, + float* Anorm, float* rcond, + float* work, blas_int* lwork, + blas_int* iwrok, blas_int* liwork, + blas_int* info ) +{ + scalapack_pspocon( + uplo, norm, n, A, ia, ja, descA, + Anorm, rcond, work, lwork, iwrok, liwork, info ); +} + +inline void scalapack_ppocon( + const char* uplo, const char* norm, blas_int* n, + double* A, blas_int* ia, blas_int* ja, blas_int* descA, + double* Anorm, double* rcond, + double* work, blas_int* lwork, + blas_int* iwrok, blas_int* liwork, + blas_int* info ) +{ + scalapack_pdpocon( + uplo, norm, n, A, ia, ja, descA, + Anorm, rcond, work, lwork, iwrok, liwork, info ); +} + +inline void scalapack_ppocon( + const char* uplo, const char* norm, blas_int* n, + std::complex* A, blas_int* ia, blas_int* ja, blas_int* descA, + float* Anorm, float* rcond, + std::complex* work, blas_int* lwork, + blas_int* iwrok, blas_int* liwork, + blas_int* info ) +{ + scalapack_pcpocon( + uplo, norm, n, A, ia, ja, descA, + Anorm, rcond, work, lwork, iwrok, liwork, info ); +} + +inline void scalapack_ppocon( + const char* uplo, const char* norm, blas_int* n, + std::complex* A, blas_int* ia, blas_int* ja, blas_int* descA, + double* Anorm, double* rcond, + std::complex* work, blas_int* lwork, + blas_int* iwrok, blas_int* liwork, + blas_int* info ) +{ + scalapack_pzpocon( + uplo, norm, n, A, ia, ja, descA, + Anorm, rcond, work, lwork, iwrok, liwork, info ); +} + +//------------------------------------------------------------------------------ +// Templated wrapper +template +void scalapack_ppocon( + const char* uplo, const char* norm, int64_t n, + scalar_t* A, int64_t ia, int64_t ja, blas_int* descA, + blas::real_type* Anorm, blas::real_type* rcond, + scalar_t* work, int64_t lwork, + blas_int* iwork, int64_t liwork, + int64_t* info ) +{ + blas_int n_ = to_blas_int( n ); + blas_int ia_ = to_blas_int( ia ); + blas_int ja_ = to_blas_int( ja ); + blas_int lwork_ = to_blas_int( lwork ); + blas_int liwork_ = to_blas_int( liwork ); + blas_int info_ = 0; + scalapack_ppocon( + uplo, norm, &n_, A, &ia_, &ja_, descA, + Anorm, rcond, work, &lwork_, iwork, &liwork_, &info_ ); +} + + //============================================================================== // Fortran prototypes #define scalapack_pstrcon BLAS_FORTRAN_NAME( pstrcon, PSTRCON ) diff --git a/test/test.cc b/test/test.cc index 81742fbd5..9146f8f9c 100644 --- a/test/test.cc +++ b/test/test.cc @@ -157,6 +157,7 @@ std::vector< testsweeper::routines_t > routines = { { "potri", test_potri, Section::posv }, { "", nullptr, Section::newline }, + { "pocondest", test_pocondest, Section::posv }, // ----- // symmetric indefinite diff --git a/test/test.hh b/test/test.hh index afb1ac0d8..7aa3760d3 100644 --- a/test/test.hh +++ b/test/test.hh @@ -218,8 +218,9 @@ void test_trtri (Params& params, bool run); void test_gbsv (Params& params, bool run); // Cholesky -void test_posv (Params& params, bool run); -void test_potri (Params& params, bool run); +void test_posv (Params& params, bool run); +void test_pocondest (Params& params, bool run); +void test_potri (Params& params, bool run); // Cholesky, band void test_pbsv (Params& params, bool run); diff --git a/test/test_gecondest.cc b/test/test_gecondest.cc index 0ff2f8775..7deff8801 100644 --- a/test/test_gecondest.cc +++ b/test/test_gecondest.cc @@ -179,8 +179,9 @@ void test_gecondest_work(Params& params, bool run) double time = barrier_get_wtime(MPI_COMM_WORLD); - - slate::gecondest(norm, A, &Anorm, &slate_rcond, opts); + slate::lu_condest_using_factor(norm, A, Anorm, &slate_rcond, opts); + // Using traditional BLAS/LAPACK name + // slate::gecondest(norm, A, Anorm, &slate_rcond, opts); time = barrier_get_wtime(MPI_COMM_WORLD) - time; // compute and save timing/performance params.time() = time; diff --git a/test/test_pocondest.cc b/test/test_pocondest.cc new file mode 100644 index 000000000..2913bcc49 --- /dev/null +++ b/test/test_pocondest.cc @@ -0,0 +1,309 @@ +// Copyright (c) 2017-2022, University of Tennessee. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// This program is free software: you can redistribute it and/or modify it under +// the terms of the BSD 3-Clause license. See the accompanying LICENSE file. + +#include "slate/slate.hh" +#include "test.hh" +#include "blas/flops.hh" +#include "lapack/flops.hh" +#include "print_matrix.hh" +#include "grid_utils.hh" + +#include "scalapack_wrappers.hh" +#include "scalapack_support_routines.hh" +#include "scalapack_copy.hh" + +#include +#include +#include +#include + +//------------------------------------------------------------------------------ +template +void test_pocondest_work(Params& params, bool run) +{ + using real_t = blas::real_type; + using blas::real; + + // Constants + const scalar_t zero = 0; + const scalar_t one = 1; + real_t rone = 1.; + + auto method = params.method_lu(); + + // get & mark input values + slate::Norm norm = params.norm(); + slate::Uplo uplo = params.uplo(); + int64_t n = params.dim.n(); + int64_t p = params.grid.m(); + int64_t q = params.grid.n(); + int64_t nb = params.nb(); + int64_t ib = params.ib(); + int64_t lookahead = params.lookahead(); + int64_t panel_threads = params.panel_threads(); + bool ref_only = params.ref() == 'o'; + bool ref = params.ref() == 'y' || ref_only; + bool check = params.check() == 'y' && ! ref_only; + bool trace = params.trace() == 'y'; + int verbose = params.verbose(); + SLATE_UNUSED(verbose); + slate::Origin origin = params.origin(); + slate::Target target = params.target(); + slate::GridOrder grid_order = params.grid_order(); + params.matrix.mark(); + params.matrixB.mark(); + + // mark non-standard output values + params.time(); + params.gflops(); + params.ref_time(); + params.ref_gflops(); + params.time2(); + params.time2.name( "chol time (s)" ); + params.time2.width( 12 ); + params.gflops2(); + params.gflops2.name( "chol gflop/s" ); + + params.error(); + params.error.name( "slate-exact" ); + params.error2(); + params.error2.name( "scl-exact" ); + params.error3(); + params.error3.name( "slate-scl" ); + + params.value(); + params.value.name( "slate" ); + params.value2(); + params.value2.name( "exact" ); + params.value3(); + params.value3.name( "scl" ); + + if (! run) { + params.matrix.kind.set_default( "rand_dominant" ); + params.tol() = 0.75; + return; + } + + // MPI variables + int mpi_rank, myrow, mycol; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + gridinfo( mpi_rank, grid_order, p, q, &myrow, &mycol ); + + slate::Options const opts = { + {slate::Option::Lookahead, lookahead}, + {slate::Option::Target, target}, + {slate::Option::MaxPanelThreads, panel_threads}, + {slate::Option::InnerBlocking, ib}, + {slate::Option::MethodLU, method}, + }; + + // Matrix A: figure out local size. + int64_t mlocA = num_local_rows_cols(n, nb, myrow, p); + int64_t nlocA = num_local_rows_cols(n, nb, mycol, q); + int64_t lldA = blas::max(1, mlocA); // local leading dimension of A + + std::vector A_data; + slate::HermitianMatrix A; + if (origin != slate::Origin::ScaLAPACK) { + // SLATE allocates CPU or GPU tiles. + slate::Target origin_target = origin2target(origin); + A = slate::HermitianMatrix( + uplo, n, nb, p, q, MPI_COMM_WORLD ); + A.insertLocalTiles(origin_target); + } + else { + // Create SLATE matrix from the ScaLAPACK layouts + A_data.resize( lldA * nlocA ); + A = slate::HermitianMatrix::fromScaLAPACK( + uplo, n, &A_data[0], lldA, nb, grid_order, p, q, MPI_COMM_WORLD ); + } + + slate::Matrix Id; + if (check) { + slate::Target origin_target = origin2target(origin); + Id = slate::Matrix( + n, n, nb, p, q, MPI_COMM_WORLD ); + Id.insertLocalTiles(origin_target); + } + + slate::generate_matrix(params.matrix, A); + print_matrix("A", A, params); + + // If ref is required, copy test data. + slate::HermitianMatrix Aref; + std::vector Aref_data; + if (ref) { + Aref_data.resize( lldA* nlocA ); + Aref = slate::HermitianMatrix::fromScaLAPACK( + uplo, n, &Aref_data[0], lldA, nb, + grid_order, p, q, MPI_COMM_WORLD ); + + slate::copy(A, Aref); + } + + double gflop = lapack::Gflop::getrf(n, n); + + // Compute the matrix norm + real_t Anorm = 0; + Anorm = slate::norm(norm, A, opts); + real_t slate_rcond = 0., scl_rcond = 0., exact_rcond = 0.; + + if (! ref_only) { + + if (trace) slate::trace::Trace::on(); + else slate::trace::Trace::off(); + + //================================================== + // Run SLATE test: potrf followed by pocondest + // potrf: Factor A = LL^T. + // pocondest: Solve AX = B, including factoring A. + //================================================== + + double time2 = barrier_get_wtime(MPI_COMM_WORLD); + slate::chol_factor(A, opts); + // Using traditional BLAS/LAPACK name + // slate::potrf(A, opts); + // compute and save timing/performance + time2 = barrier_get_wtime(MPI_COMM_WORLD) - time2; + params.time2() = time2; + params.gflops2() = gflop / time2; + + + + double time = barrier_get_wtime(MPI_COMM_WORLD); + slate::chol_condest_using_factor(norm, A, Anorm, &slate_rcond, opts); + // Using traditional BLAS/LAPACK name + // slate::pocondest(norm, A, Anorm, &slate_rcond, opts); + time = barrier_get_wtime(MPI_COMM_WORLD) - time; + // compute and save timing/performance + params.time() = time; + params.gflops() = gflop / time; + + if (trace) slate::trace::Trace::finish(); + } + + if (check) { + // Find the exact condition number: + // A * A^-1 = Id + // LU * A^-1 = Id + // U * A^-1 = trsm(L, Id), Id will be overwritten by the first solve + // A^-1 = trsm(U, Id), Id will be overwritten by the second solve + // Id is the inverse of the matrix A, norm(A^-1) = norm(Id) + + // Set the Id matrix to identity + set(zero, one, Id); + chol_solve_using_factor( A, Id, opts ); + exact_rcond = (1. / slate::norm(norm, Id, opts)) / Anorm; + } + + if (ref) { + #ifdef SLATE_HAVE_SCALAPACK + // A comparison with a reference routine from ScaLAPACK for timing only + + // BLACS/MPI variables + blas_int ictxt, myrow_, mycol_, p_, q_; + blas_int mpi_rank_ = 0, nprocs = 1; + // initialize BLACS and ScaLAPACK + Cblacs_pinfo(&mpi_rank_, &nprocs); + slate_assert(p*q <= nprocs); + Cblacs_get(-1, 0, &ictxt); + Cblacs_gridinit( &ictxt, grid_order2str( grid_order ), p, q ); + Cblacs_gridinfo(ictxt, &p_, &q_, &myrow_, &mycol_); + slate_assert( p == p_ ); + slate_assert( q == q_ ); + slate_assert( myrow == myrow_ ); + slate_assert( mycol == mycol_ ); + + // ScaLAPACK descriptor for the reference matrix + int64_t info; + blas_int Aref_desc[9]; + scalapack_descinit(Aref_desc, n, n, nb, nb, 0, 0, ictxt, mlocA, &info); + slate_assert(info == 0); + + //================================================== + // Run ScaLAPACK reference routine. + //================================================== + scalapack_ppotrf( + uplo2str(uplo), n, &Aref_data[0], 1, 1, Aref_desc, &info); + slate_assert( info == 0 ); + + // query for workspace size for ppocon + int64_t lwork = -1; + int64_t liwork = -1; + scalar_t dummy; + blas_int idummy; + scalapack_ppocon( uplo2str(uplo), norm2str(norm), n, + &Aref_data[0], 1, 1, Aref_desc, + &Anorm, &scl_rcond, + &dummy, lwork, &idummy, liwork, &info ); + slate_assert( info == 0 ); + lwork = (int64_t)( real( dummy ) ); + liwork = (int64_t)( real( idummy ) ); + + // Compute the condition number using scalapack + std::vector work(lwork); + std::vector iwork(liwork); + + // todo: ScaLAPCK pzpocon has a seg fault + + double time = barrier_get_wtime(MPI_COMM_WORLD); + scalapack_ppocon( uplo2str(uplo), norm2str(norm), n, + &Aref_data[0], 1, 1, Aref_desc, + &Anorm, &scl_rcond, + &work[0], lwork, &iwork[0], liwork, &info ); + slate_assert( info == 0 ); + time = barrier_get_wtime(MPI_COMM_WORLD) - time; + + params.ref_time() = time; + params.ref_gflops() = gflop / time; + + Cblacs_gridexit(ictxt); + + #else // not SLATE_HAVE_SCALAPACK + if (mpi_rank == 0) + printf( "ScaLAPACK not available\n" ); + #endif + } + + // Compute the error + params.error() = std::abs( rone/slate_rcond - rone/exact_rcond ) / (rone/exact_rcond); + params.error2() = std::abs( rone/scl_rcond - rone/exact_rcond ) / (rone/exact_rcond); + params.error3() = std::abs(slate_rcond - scl_rcond); + + // Printf out the condest + params.value() = slate_rcond; + params.value2() = exact_rcond; + params.value3() = scl_rcond; + + real_t tol = params.tol(); + params.okay() = (params.error() <= tol); + +} + +// ----------------------------------------------------------------------------- +void test_pocondest(Params& params, bool run) +{ + switch (params.datatype()) { + case testsweeper::DataType::Single: + test_pocondest_work (params, run); + break; + + case testsweeper::DataType::Double: + test_pocondest_work (params, run); + break; + + case testsweeper::DataType::SingleComplex: + test_pocondest_work> (params, run); + break; + + case testsweeper::DataType::DoubleComplex: + test_pocondest_work> (params, run); + break; + + default: + throw std::runtime_error( "unknown datatype" ); + break; + } +} diff --git a/test/test_trcondest.cc b/test/test_trcondest.cc index 6e6e2df0c..3557a5556 100644 --- a/test/test_trcondest.cc +++ b/test/test_trcondest.cc @@ -154,7 +154,9 @@ void test_trcondest_work(Params& params, bool run) double time = barrier_get_wtime(MPI_COMM_WORLD); auto R = slate::TriangularMatrix( slate::Uplo::Upper, slate::Diag::NonUnit, A ); - slate::trcondest(norm, R, &slate_rcond, opts); + slate::triangular_condest(norm, R, &slate_rcond, opts); + // Using traditional BLAS/LAPACK name + // slate::trcondest(norm, R, &slate_rcond, opts); time = barrier_get_wtime(MPI_COMM_WORLD) - time; // compute and save timing/performance params.time() = time;