#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP

#include <Teuchos_Details_MpiTypeTraits.hpp>

#include <Tpetra_Distributor.hpp>
#include <Tpetra_BlockMultiVector.hpp>

#include <Kokkos_ArithTraits.hpp>
#include <KokkosBatched_Util.hpp>
#include <KokkosBatched_Vector.hpp>
#include <KokkosBatched_Copy_Decl.hpp>
#include <KokkosBatched_Copy_Impl.hpp>
#include <KokkosBatched_AddRadial_Decl.hpp>
#include <KokkosBatched_AddRadial_Impl.hpp>
#include <KokkosBatched_Gemm_Decl.hpp>
#include <KokkosBatched_Gemm_Serial_Impl.hpp>
#include <KokkosBatched_Gemm_Team_Impl.hpp>
#include <KokkosBatched_Gemv_Decl.hpp>
#include <KokkosBatched_Gemv_Serial_Impl.hpp>
#include <KokkosBatched_Gemv_Team_Impl.hpp>
#include <KokkosBatched_Trsm_Decl.hpp>
#include <KokkosBatched_Trsm_Serial_Impl.hpp>
#include <KokkosBatched_Trsm_Team_Impl.hpp>
#include <KokkosBatched_Trsv_Decl.hpp>
#include <KokkosBatched_Trsv_Serial_Impl.hpp>
#include <KokkosBatched_Trsv_Team_Impl.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_LU_Serial_Impl.hpp>
#include <KokkosBatched_LU_Team_Impl.hpp>

#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#include "cuda_profiler_api.h"
#endif

namespace BlockTriDiContainerDetails {
using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;

template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::Unmanaged |
                                          MemoryTraitsType::RandomAccess |
                                          flag>;
template <typename ViewType>
using Unmanaged = Kokkos::View<typename ViewType::data_type,
                               typename ViewType::array_layout,
                               typename ViewType::device_type,
                               MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
template <typename ViewType>
using Const = Kokkos::View<typename ViewType::const_data_type,
                           typename ViewType::array_layout,
                           typename ViewType::device_type,
                           typename ViewType::memory_traits>;
template <typename ViewType>
using ConstUnmanaged = Const<Unmanaged<ViewType> >;
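// Unmanaged<V> drops reference counting so the view can be copied cheaply into
// kernels; ConstUnmanaged<V> additionally makes the data read-only. Most
// functor members below are declared with these decorators over the owning
// views held elsewhere.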
template<typename T> struct is_cuda { enum : bool { value = false }; };
#if defined(KOKKOS_ENABLE_CUDA)
template<> struct is_cuda<Kokkos::Cuda> { enum : bool { value = true }; };
#endif
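// is_cuda<execution_space>::value drives a compile-time choice, repeated
// throughout this file, between TeamPolicy (device) kernels and
// RangePolicy/MDRangePolicy (host) kernels.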
template<typename CommPtrType>
std::string get_msg_prefix (const CommPtrType &comm) {
  const auto rank = comm->getRank();
  const auto nranks = comm->getSize();
  std::stringstream ss;
  ss << "Rank " << rank << " of " << nranks << ": ";
  return ss.str();
}
template<typename T, int N>
struct ArrayValueType {
  T v[N];
  KOKKOS_INLINE_FUNCTION
  ArrayValueType() {
    for (int i=0;i<N;++i)
      this->v[i] = 0;
  }
  KOKKOS_INLINE_FUNCTION
  ArrayValueType(const ArrayValueType &b) {
    for (int i=0;i<N;++i)
      this->v[i] = b.v[i];
  }
};
template<typename T, int N>
KOKKOS_INLINE_FUNCTION
void
operator+=(volatile ArrayValueType<T,N> &a,
           const volatile ArrayValueType<T,N> &b) {
  for (int i=0;i<N;++i)
    a.v[i] += b.v[i];
}
template<typename T, int N>
KOKKOS_INLINE_FUNCTION
void
operator+=(ArrayValueType<T,N> &a,
           const ArrayValueType<T,N> &b) {
  for (int i=0;i<N;++i)
    a.v[i] += b.v[i];
}
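// ArrayValueType is a small fixed-size array value type for Kokkos scans and
// reductions (see the R-graph fill in the symbolic phase, which scans two
// counters at once); the volatile overload satisfies the join semantics that
// some Kokkos backends require.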
template<typename T, int N, typename ExecSpace>
struct SumReducer {
  typedef SumReducer reducer;
  typedef ArrayValueType<T,N> value_type;
  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
  value_type *value;

  KOKKOS_INLINE_FUNCTION
  SumReducer(value_type &val) : value(&val) {}

  KOKKOS_INLINE_FUNCTION
  void join(value_type &dst, value_type &src) const {
    for (int i=0;i<N;++i)
      dst.v[i] += src.v[i];
  }
  KOKKOS_INLINE_FUNCTION
  void join(volatile value_type &dst, const volatile value_type &src) const {
    for (int i=0;i<N;++i)
      dst.v[i] += src.v[i];
  }
  KOKKOS_INLINE_FUNCTION
  void init(value_type &val) const {
    for (int i=0;i<N;++i)
      val.v[i] = Kokkos::reduction_identity<T>::sum();
  }
  KOKKOS_INLINE_FUNCTION
  value_type& reference() {
    return *value;
  }
  KOKKOS_INLINE_FUNCTION
  result_view_type view() const {
    return result_view_type(value);
  }
};
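// Usage pattern (this matches the symbolic phase below):
//   typedef SumReducer<size_type,3,host_execution_space> sum_reducer_type;
//   typename sum_reducer_type::value_type sums;
//   Kokkos::parallel_reduce(policy, lambda, sum_reducer_type(sums));
// after which sums.v[0..2] hold the three reduced counters.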
template <typename MatrixType>
struct ImplType {
  typedef size_t size_type;
  typedef typename MatrixType::scalar_type scalar_type;
  typedef typename MatrixType::local_ordinal_type local_ordinal_type;
  typedef typename MatrixType::global_ordinal_type global_ordinal_type;
  typedef typename MatrixType::node_type node_type;

  typedef typename Kokkos::ArithTraits<scalar_type>::val_type impl_scalar_type;
  typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;

  typedef typename node_type::device_type device_type;
  typedef typename device_type::execution_space execution_space;
  typedef typename device_type::memory_space memory_space;
  typedef Kokkos::DefaultHostExecutionSpace host_execution_space;

  typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
  typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
  typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
  typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
  typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
  typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
  typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
  typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_type local_crs_graph_type;
  template<typename T, int l> using Vector = KokkosBatched::Experimental::Vector<T,l>;
  template<typename T> using SIMD = KokkosBatched::Experimental::SIMD<T>;
  template<typename T, typename M> using DefaultVectorLength = KokkosBatched::Experimental::DefaultVectorLength<T,M>;

  static constexpr int vector_length = DefaultVectorLength<impl_scalar_type,memory_space>::value;
  typedef Vector<SIMD<impl_scalar_type>,vector_length> vector_type;

  typedef Kokkos::View<size_type*,device_type> size_type_1d_view;
  typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
  typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
  typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
  typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
  typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
  typedef Kokkos::View<impl_scalar_type****,Kokkos::LayoutRight,device_type> impl_scalar_type_4d_view;
};
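// Layout conventions used throughout this file: the packed tridiagonal values
// are vector_type_3d_view(nblocks, blocksize, blocksize), i.e. one simd-packed
// little block per tridiag entry; on CUDA the same allocation is reinterpreted
// as impl_scalar_type_4d_view(nblocks, blocksize, blocksize, vector_length).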
template<typename MatrixType>
typename Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type>
createBlockCrsTpetraImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
  using impl_type = ImplType<MatrixType>;
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
  using tpetra_import_type = typename impl_type::tpetra_import_type;

  const auto g = A->getCrsGraph();
  const auto blocksize = A->getBlockSize();
  const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
  const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap()   , blocksize)));
  return Teuchos::rcp(new tpetra_import_type(src, tgt));
}
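// The importer above communicates a flat point multivector for a
// BlockCrsMatrix: both maps are expanded from block maps to point maps using
// the matrix blocksize. The AsyncableImport class below provides the same
// service with explicit nonblocking MPI so that communication can be
// overlapped with computation.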
template<typename MatrixType>
struct AsyncableImport {
  using impl_type = ImplType<MatrixType>;

  // MPI wrappers; stub types keep the class compiling without MPI
#if !defined(HAVE_IFPACK2_MPI)
  typedef int MPI_Request;
  typedef int MPI_Comm;
#endif
  using scalar_type = typename impl_type::scalar_type;

  template <typename T>
  static int isend(const MPI_Comm comm, const T* buf, int count, int dest, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
    MPI_Request ureq;
    const auto dt = Teuchos::Details::MpiTypeTraits<scalar_type>::getType();
    int ret = MPI_Isend(const_cast<T*>(buf), count, dt, dest, tag, comm, ireq == NULL ? &ureq : ireq);
    if (ireq == NULL) MPI_Request_free(&ureq);
    return ret;
#else
    return 0;
#endif
  }

  template <typename T>
  static int irecv(const MPI_Comm comm, T* buf, int count, int src, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
    MPI_Request ureq;
    const auto dt = Teuchos::Details::MpiTypeTraits<scalar_type>::getType();
    int ret = MPI_Irecv(buf, count, dt, src, tag, comm, ireq == NULL ? &ureq : ireq);
    if (ireq == NULL) MPI_Request_free(&ureq);
    return ret;
#else
    return 0;
#endif
  }

  static int waitany(int count, MPI_Request* reqs, int* index) {
#ifdef HAVE_IFPACK2_MPI
    return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
#else
    return 0;
#endif
  }

  static int waitall(int count, MPI_Request* reqs) {
#ifdef HAVE_IFPACK2_MPI
    return MPI_Waitall(count, reqs, MPI_STATUSES_IGNORE);
#else
    return 0;
#endif
  }
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using tpetra_import_type = typename impl_type::tpetra_import_type;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;

  using host_execution_space = typename impl_type::host_execution_space;
  using int_1d_view_host = Kokkos::View<int*,host_execution_space>;
  using local_ordinal_type_1d_view_host = typename impl_type::local_ordinal_type_1d_view::HostMirror;

  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;

#ifdef HAVE_IFPACK2_MPI
  MPI_Comm comm;
#endif
  impl_scalar_type_2d_view remote_multivector;
  local_ordinal_type blocksize;

  template<typename T>
  struct SendRecvPair {
    T send, recv;
  };

  SendRecvPair<int_1d_view_host> pids;              // neighbor process ids
  SendRecvPair<std::vector<MPI_Request> > reqs;     // one request per neighbor
  SendRecvPair<size_type_1d_view> offset;           // offsets into lids/buffer
  SendRecvPair<local_ordinal_type_1d_view> lids;    // local id lists
  SendRecvPair<impl_scalar_type_1d_view> buffer;    // contiguous data buffers

  local_ordinal_type_1d_view dm2cm;                 // permutation: domain map to column map
  void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
                       const size_type_1d_view &offs) {
    // wrap lens into a host view and copy it to device
    Kokkos::View<size_t*,host_execution_space> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
    const auto lens_device = Kokkos::create_mirror_view(memory_space(), lens_host);
    Kokkos::deep_copy(lens_device, lens_host);

    // exclusive scan: offs(i) = sum of lens[0..i)
    const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
    const local_ordinal_type lens_size = lens_device.extent(0);
    Kokkos::parallel_scan
      ("AsyncableImport::RangePolicy::setOffsetValues",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
        if (final)
          offs(i) = update;
        update += (i < lens_size ? lens_device[i] : 0);
      });
  }
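  // Example: lens = {3,2} with offs.extent(0) == 3 yields offs = {0,3,5}; the
  // final entry is the total message length, used below to size the buffers.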
  void createMpiRequests(const tpetra_import_type &import) {
    Tpetra::Distributor &distributor = import.getDistributor();

    // copy send/recv process ids from the distributor
    const auto pids_from = distributor.getProcsFrom();
    pids.recv = int_1d_view_host("pids recv", pids_from.size());
    memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));
    const auto pids_to = distributor.getProcsTo();
    pids.send = int_1d_view_host("pids send", pids_to.size());
    memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));

    // mpi requests, one per neighbor
    reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
    reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));

    // construct offsets from the per-neighbor message lengths
    const auto lengths_to = distributor.getLengthsTo();
    offset.send = size_type_1d_view("offset send", lengths_to.size() + 1);
    const auto lengths_from = distributor.getLengthsFrom();
    offset.recv = size_type_1d_view("offset recv", lengths_from.size() + 1);
    setOffsetValues(lengths_to, offset.send);
    setOffsetValues(lengths_from, offset.recv);
  }
  void createSendRecvIDs(const tpetra_import_type &import) {
    // the recv local ids are simply the importer's remote ids
    const auto remote_lids = import.getRemoteLIDs();
    const local_ordinal_type_1d_view_host
      remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
    lids.recv = local_ordinal_type_1d_view("lids recv", remote_lids.size());
    Kokkos::deep_copy(lids.recv, remote_lids_view_host);

    // the send local ids are the export ids grouped by destination process
    auto epids = import.getExportPIDs();
    auto elids = import.getExportLIDs();
    TEUCHOS_ASSERT(epids.size() == elids.size());
    lids.send = local_ordinal_type_1d_view("lids send", elids.size());
    auto lids_send_host = Kokkos::create_mirror_view(lids.send);

    for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
      const auto pid_send_value = pids.send[i];
      for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
        if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
#if !defined(__CUDA_ARCH__)
      TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset.send[i+1]);
#endif
    }
    Kokkos::deep_copy(lids.send, lids_send_host);
  }
  // pack (ToBuffer) or unpack (ToMultiVector) the rows listed in lids_
  struct ToBuffer {};
  struct ToMultiVector {};

  template<typename PackTag>
  void copy(const local_ordinal_type_1d_view &lids_,
            const impl_scalar_type_1d_view &buffer_,
            const local_ordinal_type &ibeg_,
            const local_ordinal_type &iend_,
            const impl_scalar_type_2d_view &multivector_,
            const local_ordinal_type blocksize_) {
    const local_ordinal_type num_vectors = multivector_.extent(1);
    const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
    const local_ordinal_type idiff = iend_ - ibeg_;
    const auto abase = buffer_.data() + mv_blocksize*ibeg_;
    if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
      using team_policy_type = Kokkos::TeamPolicy<execution_space>;
      const team_policy_type policy(idiff, num_vectors == 1 ? 1 : 2);
      Kokkos::parallel_for
        ("AsyncableImport::TeamPolicy::copy",
         policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
          const local_ordinal_type i = member.league_rank();
          Kokkos::parallel_for
            (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
              auto aptr = abase + blocksize_*(i + idiff*j);
              auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
              if (std::is_same<PackTag,ToBuffer>::value)
                Kokkos::parallel_for
                  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                    aptr[k] = bptr[k];
                  });
              else
                Kokkos::parallel_for
                  (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                    bptr[k] = aptr[k];
                  });
            });
        });
#endif
    } else {
#if defined(__CUDA_ARCH__)
      TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
      if (num_vectors == 1) {
        const Kokkos::RangePolicy<execution_space> policy(ibeg_, iend_);
        Kokkos::parallel_for
          ("AsyncableImport::RangePolicy::copy",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
            auto aptr = buffer_.data() + blocksize_*i;
            auto bptr = multivector_.data() + blocksize_*lids_(i);
            if (std::is_same<PackTag,ToBuffer>::value)
              memcpy(aptr, bptr, sizeof(impl_scalar_type)*blocksize_);
            else
              memcpy(bptr, aptr, sizeof(impl_scalar_type)*blocksize_);
          });
      } else {
        Kokkos::MDRangePolicy
          <execution_space, Kokkos::Rank<2>, Kokkos::IndexType<local_ordinal_type> >
          policy( { 0, 0 }, { idiff, num_vectors } );
        Kokkos::parallel_for
          ("AsyncableImport::MDRangePolicy::copy",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &i,
                                 const local_ordinal_type &j) {
            auto aptr = abase + blocksize_*(i + idiff*j);
            auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
            if (std::is_same<PackTag,ToBuffer>::value)
              for (local_ordinal_type k=0;k<blocksize_;++k) aptr[k] = bptr[k];
            else
              for (local_ordinal_type k=0;k<blocksize_;++k) bptr[k] = aptr[k];
          });
      }
#endif
    }
  }
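  // Three code paths above: a hierarchical TeamPolicy kernel on CUDA, a
  // memcpy-per-row RangePolicy for the single-vector host case, and an
  // MDRangePolicy over (row, vector) otherwise. The buffer layout keeps each
  // neighbor's rows contiguous, vector by vector.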
  void createDataBuffer(const local_ordinal_type &num_vectors) {
    const size_type extent_0 = lids.recv.extent(0)*blocksize;
    const size_type extent_1 = num_vectors;
    if (remote_multivector.extent(0) == extent_0 &&
        remote_multivector.extent(1) == extent_1) {
      // buffers are already sized for this number of vectors
      return;
    }
    remote_multivector = impl_scalar_type_2d_view("remote multivector", extent_0, extent_1);
    const auto send_buffer_size = offset.send[offset.send.extent(0)-1]*blocksize*num_vectors;
    buffer.send = impl_scalar_type_1d_view("buffer send", send_buffer_size);
    const auto recv_buffer_size = offset.recv[offset.recv.extent(0)-1]*blocksize*num_vectors;
    buffer.recv = impl_scalar_type_1d_view("buffer recv", recv_buffer_size);
  }
  AsyncableImport (const Teuchos::RCP<const tpetra_map_type>& src_map,
                   const Teuchos::RCP<const tpetra_map_type>& tgt_map,
                   const local_ordinal_type blocksize_,
                   const local_ordinal_type_1d_view dm2cm_) {
    blocksize = blocksize_;
    dm2cm = dm2cm_;

#ifdef HAVE_IFPACK2_MPI
    const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(tgt_map->getComm());
    TEUCHOS_ASSERT(!mpi_comm.is_null());
    comm = *mpi_comm->getRawMpiComm();
#endif
    const tpetra_import_type import(src_map, tgt_map);
    createMpiRequests(import);
    createSendRecvIDs(import);
  }
  void asyncSendRecv(const impl_scalar_type_2d_view &mv) {
#ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
    TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::Setup::AsyncSendRecv");
#endif
#ifdef HAVE_IFPACK2_MPI
    const local_ordinal_type num_vectors = mv.extent(1);
    const local_ordinal_type mv_blocksize = blocksize*num_vectors;

    // pack and send
    for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
      copy<ToBuffer>(lids.send, buffer.send, offset.send(i), offset.send(i+1),
                     mv, blocksize);
      isend(comm,
            buffer.send.data() + offset.send[i]*mv_blocksize,
            (offset.send[i+1] - offset.send[i])*mv_blocksize,
            pids.send[i], 42, &reqs.send[i]);
    }

    // post receives
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      irecv(comm,
            buffer.recv.data() + offset.recv[i]*mv_blocksize,
            (offset.recv[i+1] - offset.recv[i])*mv_blocksize,
            pids.recv[i], 42, &reqs.recv[i]);
    }

    // an Iprobe nudges some MPI implementations to progress the receives
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      int flag;
      MPI_Status stat;
      MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
    }
#endif
  }
  void cancel () {
#ifdef HAVE_IFPACK2_MPI
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i)
      MPI_Cancel(&reqs.recv[i]);
#endif
  }
  void syncRecv() {
#ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
    TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::Setup::SyncRecv");
#endif
#ifdef HAVE_IFPACK2_MPI
    // receive and unpack into the remote multivector
    for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
      local_ordinal_type idx = i;
      waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
      copy<ToMultiVector>(lids.recv, buffer.recv, offset.recv(idx), offset.recv(idx+1),
                          remote_multivector, blocksize);
    }
    // wait on the sends so the buffers can be reused
    waitall(reqs.send.size(), reqs.send.data());
#endif
  }
  void syncExchange(const impl_scalar_type_2d_view &mv) {
    asyncSendRecv(mv);
    syncRecv();
  }

  impl_scalar_type_2d_view getRemoteMultiVectorLocalView() const { return remote_multivector; }
};
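// Sketch of the intended call sequence (the exact call sites live in the
// solver code that owns this object):
//   async.createDataBuffer(num_vectors);
//   async.asyncSendRecv(mv);   // pack owned data, post sends/recvs
//   /* ... compute on owned data ... */
//   async.syncRecv();          // wait and unpack into remote_multivector
//   auto remote = async.getRemoteMultiVectorLocalView();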
template<typename MatrixType>
Teuchos::RCP<AsyncableImport<MatrixType> >
createBlockCrsAsyncImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
  using impl_type = ImplType<MatrixType>;
  using tpetra_map_type = typename impl_type::tpetra_map_type;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;

  const auto g = A->getCrsGraph();
  const auto blocksize = A->getBlockSize();
  const auto domain_map = g.getDomainMap();
  const auto column_map = g.getColMap();

  std::vector<global_ordinal_type> gids;
  bool separate_remotes = true, found_first = false, need_owned_permutation = false;
  for (size_t i=0;i<column_map->getNodeNumElements();++i) {
    const global_ordinal_type gid = column_map->getGlobalElement(i);
    if (!domain_map->isNodeGlobalElement(gid)) {
      found_first = true;
      gids.push_back(gid);
    } else if (found_first) {
      separate_remotes = false;
      break;
    }
    if (!need_owned_permutation &&
        domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
      // the owned parts of the domain and column maps are ordered
      // differently, so an owned-id permutation (dm2cm) is needed
      need_owned_permutation = true;
    }
  }

  if (separate_remotes) {
    const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
    const auto parsimonious_col_map
      = Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()));
    if (parsimonious_col_map->getGlobalNumElements() > 0) {
      // make the importer only if needed
      local_ordinal_type_1d_view dm2cm;
      if (need_owned_permutation) {
        dm2cm = local_ordinal_type_1d_view("dm2cm", domain_map->getNodeNumElements());
        const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
        for (size_t i=0;i<domain_map->getNodeNumElements();++i)
          dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
        Kokkos::deep_copy(dm2cm, dm2cm_host);
      }
      return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
    }
  }
  return Teuchos::null;
}
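// A null return means remote column ids are interleaved with owned ids in the
// column map; callers then fall back to the conventional Tpetra importer
// produced by createBlockCrsTpetraImporter above.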
template<typename MatrixType>
struct PartInterface {
  using local_ordinal_type = typename ImplType<MatrixType>::local_ordinal_type;
  using local_ordinal_type_1d_view = typename ImplType<MatrixType>::local_ordinal_type_1d_view;

  PartInterface() = default;
  PartInterface(const PartInterface &b) = default;

  // local row ids, ordered by partition and, within each partition, by the
  // user-given row order
  local_ordinal_type_1d_view lclrow;
  // partition i owns rows lclrow(partptr(i) : partptr(i+1)-1)
  local_ordinal_type_1d_view partptr;
  // pack i consists of parts packptr(i) .. packptr(i+1)-1
  local_ordinal_type_1d_view packptr;
  // first row index (in the lclrow ordering) of part i
  local_ordinal_type_1d_view part2rowidx0;
  // first packed row index of the pack containing part i
  local_ordinal_type_1d_view part2packrowidx0;
  // total number of packed rows (the last entry of part2packrowidx0)
  local_ordinal_type part2packrowidx0_back;
  // part owning each row index
  local_ordinal_type_1d_view rowidx2part;
  // true when lclrow is exactly [0, nrows)
  bool row_contiguous;
};
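// Packing: parts are sorted by decreasing size and grouped vector_length at a
// time so that their tridiag factorizations can share simd lanes. E.g., with
// vector_length = 4 and six parts, parts {0,1,2,3} form pack 0 and parts {4,5}
// form pack 1; a pack's row count is that of its first (largest) part.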
template<typename MatrixType>
PartInterface<MatrixType>
createPartInterface(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                    const Teuchos::Array<Teuchos::Array<typename ImplType<MatrixType>::local_ordinal_type> > &partitions) {
  using impl_type = ImplType<MatrixType>;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;

  constexpr int vector_length = impl_type::vector_length;

  const auto comm = A->getRowMap()->getComm();

  PartInterface<MatrixType> interf;

  const bool jacobi = partitions.size() == 0;
  const local_ordinal_type A_n_lclrows = A->getNodeNumRows();
  const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
#if defined(BLOCKTRIDICONTAINER_DEBUG)
  local_ordinal_type nrows = 0;
  if (jacobi)
    nrows = nparts;
  else
    for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();

  TEUCHOS_TEST_FOR_EXCEPT_MSG
    (nrows != A_n_lclrows, get_msg_prefix(comm) << "The #rows implied by the local partition is not "
     << "the same as getNodeNumRows: " << nrows << " vs " << A_n_lclrows);
#endif
  // permute parts so that larger parts come first; this maximizes simd
  // utilization when packs are formed below
  std::vector<local_ordinal_type> p;
  if (!jacobi) {
    p.resize(nparts);

    typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
    std::vector<size_idx_pair_type> partsz(nparts);
    for (local_ordinal_type i=0;i<nparts;++i)
      partsz[i] = size_idx_pair_type(partitions[i].size(), i);
    std::sort(partsz.begin(), partsz.end(),
              [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
                return x.first > y.first;
              });
    for (local_ordinal_type i=0;i<nparts;++i)
      p[i] = partsz[i].second;
  }
  // allocate the part interface views and compute them on host mirrors
  interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
  interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
  interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
  interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
  interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);

  const auto partptr = Kokkos::create_mirror_view(interf.partptr);
  const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
  const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
  const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
  const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
  // start optimistic; reset to false if any row is out of order
  interf.row_contiguous = true;

  partptr(0) = 0;
  part2rowidx0(0) = 0;
  part2packrowidx0(0) = 0;
  local_ordinal_type pack_nrows = 0;
  for (local_ordinal_type ip=0;ip<nparts;++ip) {
    const auto* part = jacobi ? NULL : &partitions[p[ip]];
    const local_ordinal_type ipnrows = jacobi ? 1 : part->size();
    TEUCHOS_ASSERT(ip == 0 || (jacobi || ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
    TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
                                get_msg_prefix(comm)
                                << "partition " << p[ip]
                                << " is empty, which is not allowed.");
    part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
    // parts are in decreasing size order, so the first part of a pack sets
    // the pack's row count
    if (ip % vector_length == 0) pack_nrows = ipnrows;
    part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
    const local_ordinal_type os = partptr(ip);
    for (local_ordinal_type i=0;i<ipnrows;++i) {
      const auto lcl_row = jacobi ? ip : (*part)[i];
      TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
                                  get_msg_prefix(comm)
                                  << "partitions[" << p[ip] << "]["
                                  << i << "] = " << lcl_row
                                  << " but input matrix implies limits of [0, " << A_n_lclrows-1
                                  << "].");
      lclrow(os+i) = lcl_row;
      rowidx2part(os+i) = ip;
      if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
        interf.row_contiguous = false;
    }
    partptr(ip+1) = os + ipnrows;
  }
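  // Example: nparts = 5, vector_length = 4, sorted part sizes {7,7,6,5,3}:
  // pack 0 (parts 0-3) contributes pack_nrows = 7 and pack 1 (part 4)
  // contributes 3, so part2packrowidx0 ends at 10 packed block rows.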
#if defined(BLOCKTRIDICONTAINER_DEBUG)
  TEUCHOS_ASSERT(partptr(nparts) == nrows);
#endif
  if (lclrow(0) != 0) interf.row_contiguous = false;

  Kokkos::deep_copy(interf.partptr, partptr);
  Kokkos::deep_copy(interf.lclrow, lclrow);

  // no overlap between parts: part2rowidx0 is identical to partptr
  interf.part2rowidx0 = interf.partptr;
  Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);

  interf.part2packrowidx0_back = part2packrowidx0(part2packrowidx0.extent(0) - 1);
  Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
  { // compute packptr
    local_ordinal_type npacks = 0;
    for (local_ordinal_type ip=1;ip<=nparts;++ip)
      if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
        ++npacks;

    interf.packptr = local_ordinal_type_1d_view("packptr", npacks + 1);
    const auto packptr = Kokkos::create_mirror_view(interf.packptr);
    packptr(0) = 0;
    for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
      if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
        packptr(k++) = ip;

    Kokkos::deep_copy(interf.packptr, packptr);
  }
  return interf;
}
template <typename MatrixType>
struct BlockTridiags {
  using impl_type = ImplType<MatrixType>;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;

  // flat_td_ptr(i) is the index of the start of the i'th tridiag in the flat
  // block ordering; pack_td_ptr is the same, but for the packed storage
  size_type_1d_view flat_td_ptr, pack_td_ptr;

  // entry index into the matrix' flat nonzeros for each tridiag block
  local_ordinal_type_1d_view A_colindsub;

  // packed (simd) tridiagonal block values
  vector_type_3d_view values;

  bool is_diagonal_only;
  // given the index of a block in a tridiag, the block row it belongs to
  template <typename idx_type>
  static KOKKOS_FORCEINLINE_FUNCTION
  idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
  // given a block row, the index of its first block in the tridiag
  template <typename idx_type>
  static KOKKOS_FORCEINLINE_FUNCTION
  idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
  // number of blocks in a tridiag with nrows block rows
  template <typename idx_type>
  static KOKKOS_FORCEINLINE_FUNCTION
  idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
};
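// Example: a part with nrows = 4 stores NumBlocks(4) = 10 blocks per tridiag
// in the order D0 U0 L1 D1 U1 L2 D2 U2 L3 D3; RowToIndex(2) = 5 points at L2
// and IndexToRow(5) = 2 recovers the block row.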
template<typename MatrixType>
BlockTridiags<MatrixType>
createBlockTridiags(const PartInterface<MatrixType> &interf) {
  using impl_type = ImplType<MatrixType>;
  using execution_space = typename impl_type::execution_space;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using size_type_1d_view = typename impl_type::size_type_1d_view;

  constexpr int vector_length = impl_type::vector_length;

  BlockTridiags<MatrixType> btdm;

  const local_ordinal_type ntridiags = interf.partptr.extent(0) - 1;

  // construct the flat index pointers into the tridiag values array
  btdm.flat_td_ptr = size_type_1d_view("btdm.flat_td_ptr", ntridiags + 1);
  {
    const Kokkos::RangePolicy<execution_space> policy(0,ntridiags + 1);
    Kokkos::parallel_scan
      ("createBlockTridiags::RangePolicy::flat_td_ptr",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
        if (final)
          btdm.flat_td_ptr(i) = update;
        if (i < ntridiags) {
          const local_ordinal_type nrows = interf.partptr(i+1) - interf.partptr(i);
          update += btdm.NumBlocks(nrows);
        }
      });
  }
  // every part has a single row iff the total block count equals ntridiags
  const auto nblocks = Kokkos::create_mirror_view_and_copy
    (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, ntridiags));
  btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
  // construct the packed index pointers
  if (vector_length == 1) {
    btdm.pack_td_ptr = btdm.flat_td_ptr;
  } else {
    const local_ordinal_type npacks = interf.packptr.extent(0) - 1;
    btdm.pack_td_ptr = size_type_1d_view("btdm.pack_td_ptr", ntridiags + 1);
    const Kokkos::RangePolicy<execution_space> policy(0,npacks);
    Kokkos::parallel_scan
      ("createBlockTridiags::RangePolicy::pack_td_ptr",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
        const local_ordinal_type parti = interf.packptr(i);
        const local_ordinal_type parti_next = interf.packptr(i+1);
        if (final) {
          const size_type nblks = update;
          for (local_ordinal_type pti=parti;pti<parti_next;++pti)
            btdm.pack_td_ptr(pti) = nblks;
          if (i == npacks - 1) {
            const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
            btdm.pack_td_ptr(ntridiags) = nblks + btdm.NumBlocks(nrows);
          }
        }
        const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
        update += btdm.NumBlocks(nrows);
      });
  }
  return btdm;
}
// set the diagonal blocks of all tridiags to identity; extraction later
// overwrites the lanes backed by actual parts
template<typename MatrixType>
void setTridiagsToIdentity(const BlockTridiags<MatrixType> &btdm,
                           const typename ImplType<MatrixType>::local_ordinal_type_1d_view &packptr) {
  using impl_type = ImplType<MatrixType>;
  using execution_space = typename impl_type::execution_space;
  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  const ConstUnmanaged<size_type_1d_view> pack_td_ptr(btdm.pack_td_ptr);
  const local_ordinal_type blocksize = btdm.values.extent(1);

  if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
    constexpr int vector_length = impl_type::vector_length;
    using impl_scalar_type = typename impl_type::impl_scalar_type;
    using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
    using team_policy_type = Kokkos::TeamPolicy<execution_space>;
    // reinterpret the simd-packed values as a plain scalar 4d view
    const impl_scalar_type_4d_view values((impl_scalar_type*)btdm.values.data(),
                                          btdm.values.extent(0),
                                          btdm.values.extent(1),
                                          btdm.values.extent(2),
                                          vector_length);
    const local_ordinal_type vector_length_value = vector_length;
    const team_policy_type policy(packptr.extent(0)-1, Kokkos::AUTO(), vector_length);
    Kokkos::parallel_for
      ("setTridiagsToIdentity::TeamPolicy",
       policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
        const local_ordinal_type k = member.league_rank();
        const local_ordinal_type ibeg = pack_td_ptr(packptr(k));
        const local_ordinal_type iend = pack_td_ptr(packptr(k+1));
        const local_ordinal_type diff = iend - ibeg;
        const local_ordinal_type icount = diff/3 + (diff%3 > 0);
        Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
          Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_length_value),[&](const int &v) {
            const local_ordinal_type i = ibeg + ii*3;  // diagonal blocks have stride 3
            for (local_ordinal_type j=0;j<blocksize;++j)
              values(i,j,j,v) = 1;
          });
        });
      });
#endif
  } else {
#if defined(__CUDA_ARCH__)
    TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
    const Unmanaged<vector_type_3d_view> values = btdm.values;
    const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
    Kokkos::parallel_for
      ("setTridiagsToIdentity::RangePolicy",
       policy, KOKKOS_LAMBDA(const local_ordinal_type k) {
        for (size_type i=pack_td_ptr(packptr(k)),iend=pack_td_ptr(packptr(k+1));i<iend;i+=3)
          for (local_ordinal_type j=0;j<blocksize;++j)
            values(i,j,j) = 1;
      });
#endif
  }
}
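// setTridiagsToIdentity runs right after btdm.values is allocated (see the
// symbolic phase): simd lanes of a pack that are not backed by an actual part
// keep identity diagonal blocks, so the packed LU and the triangular solves
// stay well defined for those lanes.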
// storage for R = A - D, the off-tridiagonal remainder of the matrix
template <typename MatrixType>
struct AmD {
  using impl_type = ImplType<MatrixType>;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;

  // rowptr points to the start of each row of A_colindsub
  size_type_1d_view rowptr, rowptr_remote;
  // entry indices in A of R's nonzeros; the _remote variants hold entries
  // whose columns lie in the remote (ghost) range
  local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;

  // currently always true
  bool is_tpetra_block_crs;

  // the values view of A when it is a BlockCrsMatrix
  impl_scalar_type_1d_view tpetra_values;

  AmD() = default;
  AmD(const AmD &b) = default;
};
template<typename MatrixType>
void
performSymbolicPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                     const PartInterface<MatrixType> &interf,
                     BlockTridiags<MatrixType> &btdm,
                     AmD<MatrixType> &amd,
                     const bool overlap_communication_and_computation) {
#ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
  TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::SymbolicPhase");
#endif
  using impl_type = ImplType<MatrixType>;
  using memory_space = typename impl_type::memory_space;
  using host_execution_space = typename impl_type::host_execution_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using global_ordinal_type = typename impl_type::global_ordinal_type;
  using size_type = typename impl_type::size_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

  constexpr int vector_length = impl_type::vector_length;

  const auto comm = A->getRowMap()->getComm();
  const auto& g = A->getCrsGraph();
  const auto blocksize = A->getBlockSize();

  // mirror the part interface to host
  const auto partptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.partptr);
  const auto lclrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.lclrow);
  const auto rowidx2part = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.rowidx2part);
  const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
  const auto packptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.packptr);

  const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);

  // map columns to rows, on host
  Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getNodeNumCols());
  Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
  {
    const auto rowmap = g.getRowMap();
    const auto colmap = g.getColMap();
    const auto dommap = g.getDomainMap();
    TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
#if !defined(__CUDA_ARCH__)
    const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
    Kokkos::parallel_for
      ("performSymbolicPhase::RangePolicy::col2row",
       policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
        const global_ordinal_type gid = rowmap->getGlobalElement(lr);
        TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
        if (dommap->isNodeGlobalElement(gid)) {
          const local_ordinal_type lc = colmap->getLocalElement(gid);
# if defined(BLOCKTRIDICONTAINER_DEBUG)
          TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
                                      get_msg_prefix(comm) << "GID " << gid
                                      << " gives an invalid local column.");
# endif
          col2row(lc) = lr;
        }
      });
#endif
  }
  // construct the D and R graphs in A = D + R
  {
    const auto& local_graph = g.getLocalGraph();
    const auto& local_graph_rowptr = local_graph.row_map;
    TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
    const auto& local_graph_colidx = local_graph.entries;

    // assume no overlap
    Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
    {
      const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
      Kokkos::parallel_for
        ("performSymbolicPhase::RangePolicy::lclrow2idx",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
          lclrow2idx[lclrow(i)] = i;
        });
    }
    // count the entries of D, owned R, and remote R in a single sweep
    typedef SumReducer<size_type,3,host_execution_space> sum_reducer_type;
    typename sum_reducer_type::value_type sum_reducer_value;
    {
      const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
      Kokkos::parallel_reduce
        ("performSymbolicPhase::RangePolicy::count_nnz",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
          const local_ordinal_type ri0 = lclrow2idx[lr];
          const local_ordinal_type pi0 = rowidx2part(ri0);
          for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
            const local_ordinal_type lc = local_graph_colidx(j);
            const local_ordinal_type lc2r = col2row[lc];
            bool incr_R = false;
            do { // breakable
              if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
                incr_R = true;
                break;
              }
              const local_ordinal_type ri = lclrow2idx[lc2r];
              const local_ordinal_type pi = rowidx2part(ri);
              if (pi != pi0) {
                incr_R = true;
                break;
              }
              // inside the tridiag band?
              if (ri0 + 1 >= ri && ri0 <= ri + 1)
                ++update.v[0]; // D
              else
                incr_R = true;
            } while (0);
            if (incr_R) {
              if (lc < nrows) ++update.v[1]; // R owned
              else ++update.v[2];            // R remote
            }
          }
        }, sum_reducer_type(sum_reducer_value));
    }
    size_type D_nnz = sum_reducer_value.v[0];
    size_type R_nnz_owned = sum_reducer_value.v[1];
    size_type R_nnz_remote = sum_reducer_value.v[2];

    if (!overlap_communication_and_computation) {
      R_nnz_owned += R_nnz_remote;
      R_nnz_remote = 0;
    }
    // construct the D graph
    const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);

    btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
    const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);

#if defined(BLOCKTRIDICONTAINER_DEBUG)
    Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
#endif

    const local_ordinal_type nparts = partptr.extent(0) - 1;
    {
      const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
      Kokkos::parallel_for
        ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
          const local_ordinal_type part_ri0 = part2rowidx0(pi0);
          local_ordinal_type offset = 0;
          for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
            const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
            offset = 1;
            const local_ordinal_type lr0 = lclrow(ri0);
            const size_type j0 = local_graph_rowptr(lr0);
            for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
              const local_ordinal_type lc = local_graph_colidx(j);
              const local_ordinal_type lc2r = col2row[lc];
              if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) continue;
              const local_ordinal_type ri = lclrow2idx[lc2r];
              const local_ordinal_type pi = rowidx2part(ri);
              if (pi != pi0) continue;
              if (ri + 1 < ri0 || ri > ri0 + 1) continue;
              const local_ordinal_type row_entry = j - j0;
              D_A_colindsub(flat_td_ptr(pi0) + ((td_row_os + ri) - ri0)) = row_entry;
            }
          }
        });
    }
#if defined(BLOCKTRIDICONTAINER_DEBUG)
    for (size_t i=0;i<D_A_colindsub.extent(0);++i)
      TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
#endif
    Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
    // allocate the packed values; the last pack_td_ptr entry is their count
    const local_ordinal_type npacks = packptr.extent(0) - 1;
    (void)npacks;
    const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, nparts);
    const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
    btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
    if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
    // construct the R graph
    amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
    amd.A_colindsub = local_ordinal_type_1d_view(Kokkos::ViewAllocateWithoutInitializing("amd.A_colindsub"), R_nnz_owned);

    const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
    const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);

    amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
    amd.A_colindsub_remote = local_ordinal_type_1d_view(Kokkos::ViewAllocateWithoutInitializing("amd.A_colindsub_remote"), R_nnz_remote);

    const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
    const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);

    { // count nnz per row
      const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
      Kokkos::parallel_for
        ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
          const local_ordinal_type ri0 = lclrow2idx[lr];
          const local_ordinal_type pi0 = rowidx2part(ri0);
          const size_type j0 = local_graph_rowptr(lr);
          for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
            const local_ordinal_type lc = local_graph_colidx(j);
            const local_ordinal_type lc2r = col2row[lc];
            if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
              const local_ordinal_type ri = lclrow2idx[lc2r];
              const local_ordinal_type pi = rowidx2part(ri);
              if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
                continue; // belongs to D
              }
            }
            if (!overlap_communication_and_computation || lc < nrows) {
              ++R_rowptr(lr);
            } else {
              ++R_rowptr_remote(lr);
            }
          }
        });
    }
    { // fill the R graph with an in-place exclusive scan over both rowptrs
      typedef ArrayValueType<size_type,2> update_type;
      Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
      Kokkos::parallel_scan
        ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
         policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
                               update_type &update,
                               const bool &final) {
          update_type val;
          val.v[0] = R_rowptr(lr);
          if (overlap_communication_and_computation)
            val.v[1] = R_rowptr_remote(lr);

          if (final) {
            R_rowptr(lr) = update.v[0];
            if (overlap_communication_and_computation)
              R_rowptr_remote(lr) = update.v[1];

            if (lr < nrows) {
              const local_ordinal_type ri0 = lclrow2idx[lr];
              const local_ordinal_type pi0 = rowidx2part(ri0);

              size_type cnt_rowptr = R_rowptr(lr);
              size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0;

              const size_type j0 = local_graph_rowptr(lr);
              for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
                const local_ordinal_type lc = local_graph_colidx(j);
                const local_ordinal_type lc2r = col2row[lc];
                if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
                  const local_ordinal_type ri = lclrow2idx[lc2r];
                  const local_ordinal_type pi = rowidx2part(ri);
                  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
                    continue;
                }
                const local_ordinal_type row_entry = j - j0;
                if (!overlap_communication_and_computation || lc < nrows)
                  R_A_colindsub(cnt_rowptr++) = row_entry;
                else
                  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
              }
            }
          }
          update += val;
        });
    }
    TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
    Kokkos::deep_copy(amd.rowptr, R_rowptr);
    Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
    if (overlap_communication_and_computation) {
      TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
      Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
      Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
    }
    // record A's values for direct access in the numeric phase and in R*x
    amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->
                         template getValues<memory_space>());
  }
}
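// On exit from the symbolic phase: btdm.A_colindsub and btdm.values describe
// the block tridiagonal D, while amd.rowptr/A_colindsub (and the _remote
// variants, when communication is overlapped) describe R = A - D.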
template<typename MatrixType>
struct ExtractAndFactorizeTridiags {
public:
  using impl_type = ImplType<MatrixType>;
  using execution_space = typename impl_type::execution_space;
  using memory_space = typename impl_type::memory_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using size_type = typename impl_type::size_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;

  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using size_type_1d_view = typename impl_type::size_type_1d_view;
  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;

  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
  static constexpr int vector_length = impl_type::vector_length;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

private:
  // part interface
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr;
  // block crs graph and values
  using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,typename impl_type::device_type>::size_type;
  const ConstUnmanaged<Kokkos::View<a_rowptr_value_type*,typename impl_type::device_type> > A_rowptr;
  const ConstUnmanaged<impl_scalar_type_1d_view> A_values;
  // block tridiags
  const ConstUnmanaged<size_type_1d_view> pack_td_ptr, flat_td_ptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
  const Unmanaged<vector_type_3d_view> vector_values;
  const Unmanaged<impl_scalar_type_4d_view> scalar_values;
  // shared information
  const local_ordinal_type blocksize, blocksize_square;
  // diagonal pivot safeguard
  const magnitude_type tiny;
  const local_ordinal_type vector_length_value;

public:
  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
                              const PartInterface<MatrixType> &interf_,
                              const Teuchos::RCP<const block_crs_matrix_type> &A_,
                              const magnitude_type& tiny_) :
    // interface
    partptr(interf_.partptr),
    lclrow(interf_.lclrow),
    packptr(interf_.packptr),
    // block crs matrix
    A_rowptr(A_->getCrsGraph().getLocalGraph().row_map),
    A_values(const_cast<block_crs_matrix_type*>(A_.get())->template getValues<memory_space>()),
    // block tridiags
    pack_td_ptr(btdm_.pack_td_ptr),
    flat_td_ptr(btdm_.flat_td_ptr),
    A_colindsub(btdm_.A_colindsub),
    vector_values(btdm_.values),
    scalar_values((impl_scalar_type*)vector_values.data(),
                  vector_values.extent(0),
                  vector_values.extent(1),
                  vector_values.extent(2),
                  vector_length),
    blocksize(btdm_.values.extent(1)),
    blocksize_square(blocksize*blocksize),
    tiny(tiny_),
    vector_length_value(vector_length) {}
private:
  KOKKOS_INLINE_FUNCTION
  void
  extract(const local_ordinal_type &packidx) const {
    local_ordinal_type partidx = packptr(packidx);
    local_ordinal_type npacks = packptr(packidx+1) - partidx;

    const size_type kps = pack_td_ptr(partidx);
    local_ordinal_type kfs[vector_length] = {};
    local_ordinal_type ri0[vector_length] = {};
    local_ordinal_type nrows[vector_length] = {};

    for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
      kfs[vi] = flat_td_ptr(partidx);
      ri0[vi] = partptr(partidx);
      nrows[vi] = partptr(partidx+1) - ri0[vi];
    }
    for (local_ordinal_type tr=0,j=0;tr<nrows[0];++tr) {
      for (local_ordinal_type e=0;e<3;++e) {
        const impl_scalar_type* block[vector_length] = {};
        for (local_ordinal_type vi=0;vi<npacks;++vi) {
          const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
          block[vi] = &A_values(Aj*blocksize_square);
        }
        const size_type pi = kps + j;
        ++j;
        for (local_ordinal_type ii=0;ii<blocksize;++ii) {
          for (local_ordinal_type jj=0;jj<blocksize;++jj) {
            const auto idx = ii*blocksize + jj;
            auto& v = vector_values(pi, ii, jj);
            for (local_ordinal_type vi=0;vi<npacks;++vi)
              v[vi] = block[vi][idx];
          }
        }
        if (nrows[0] == 1) break;
        if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
        for (local_ordinal_type vi=1;vi<npacks;++vi) {
          if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
            npacks = vi;
            break;
          }
        }
      }
    }
  }
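  // extract() walks the up-to-three bands (e = 0,1,2) of each block row tr for
  // up to vector_length parts in lockstep and scatters each blocksize x
  // blocksize block into the simd lanes of vector_values; npacks shrinks as
  // the shorter parts of the pack run out of rows.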
  KOKKOS_INLINE_FUNCTION
  void
  factorize (const local_ordinal_type& packidx) const {
    namespace KB = KokkosBatched::Experimental;
    using AlgoType = KB::Algo::Level3::Blocked;

    const auto one = Kokkos::ArithTraits<magnitude_type>::one();

    auto i0 = pack_td_ptr(packptr(packidx));
    const local_ordinal_type nrows
      = BlockTridiags<MatrixType>::IndexToRow(pack_td_ptr(packptr(packidx+1)) - i0 - 1) + 1;

    // factorize the first diagonal block
    auto A = Kokkos::subview(vector_values, i0, Kokkos::ALL(), Kokkos::ALL());
    KB::SerialLU<KB::Algo::LU::Unblocked>::invoke(A, tiny);

    auto B = A;
    auto C = A;
    for (local_ordinal_type i=1;i<nrows;++i,i0+=3) {
      B.assign_data( &vector_values(i0+1,0,0) );
      KB::SerialTrsm<KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,AlgoType>
        ::invoke(one, A, B);
      C.assign_data( &vector_values(i0+2,0,0) );
      KB::SerialTrsm<KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,AlgoType>
        ::invoke(one, A, C);
      A.assign_data( &vector_values(i0+3,0,0) );
      KB::SerialGemm<KB::Trans::NoTranspose,KB::Trans::NoTranspose,AlgoType>
        ::invoke(-one, C, B, one, A);
      KB::SerialLU<KB::Algo::LU::Unblocked>::invoke(A, tiny);
    }
  }
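  // The loop implements the block tridiagonal LU recurrence
  //   B_i <- L(D_i)^{-1} B_i,  C_i <- C_i U(D_i)^{-1},
  //   D_{i+1} <- D_{i+1} - C_i B_i, followed by LU(D_{i+1}),
  // where B/C are the super/subdiagonal blocks and `tiny` guards small
  // pivots inside the batched LU.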
  KOKKOS_INLINE_FUNCTION
  void
  extract(const member_type &member,
          const local_ordinal_type &partidx,
          const local_ordinal_type &v) const {
    const size_type kps = pack_td_ptr(partidx);
    const local_ordinal_type kfs = flat_td_ptr(partidx);
    const local_ordinal_type ri0 = partptr(partidx);
    const local_ordinal_type nrows = partptr(partidx+1) - ri0;
    for (local_ordinal_type tr=0,j=0;tr<nrows;++tr) {
      local_ordinal_type lbeg = (tr == 0 ? 1 : 0);
      local_ordinal_type lend = (tr == nrows - 1 ? 2 : 3);
      for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
        const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
        const impl_scalar_type* block = &A_values(Aj*blocksize_square);
        const size_type pi = kps + j;
        Kokkos::parallel_for
          (Kokkos::TeamThreadRange(member,blocksize_square), [&](const local_ordinal_type &ij) {
            const local_ordinal_type ii = ij/blocksize;
            const local_ordinal_type jj = ij%blocksize;
            scalar_values(pi, ii, jj, v) = block[ii*blocksize + jj];
          });
      }
    }
  }
  KOKKOS_INLINE_FUNCTION
  void
  factorize(const member_type &member,
            const local_ordinal_type &i0,
            const local_ordinal_type &nrows,
            const local_ordinal_type &v) const {
    namespace KB = KokkosBatched::Experimental;
    using AlgoType = KB::Algo::Level3::Unblocked;

    const auto one = Kokkos::ArithTraits<magnitude_type>::one();

    // factorize the first diagonal block of this lane
    auto A = Kokkos::subview(scalar_values, i0, Kokkos::ALL(), Kokkos::ALL(), 0);
    A.assign_data( &scalar_values(i0,0,0,v) );
    KB::TeamLU<member_type,KB::Algo::LU::Unblocked>::invoke(member, A, tiny);

    auto B = A;
    auto C = A;
    local_ordinal_type i = i0;
    for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
      B.assign_data( &scalar_values(i+1,0,0,v) );
      KB::TeamTrsm<member_type,KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,AlgoType>
        ::invoke(member, one, A, B);
      C.assign_data( &scalar_values(i+2,0,0,v) );
      KB::TeamTrsm<member_type,KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,AlgoType>
        ::invoke(member, one, A, C);
      A.assign_data( &scalar_values(i+3,0,0,v) );
      KB::TeamGemm<member_type,KB::Trans::NoTranspose,KB::Trans::NoTranspose,AlgoType>
        ::invoke(member, -one, C, B, one, A);
      KB::TeamLU<member_type,KB::Algo::LU::Unblocked>::invoke(member, A, tiny);
    }
  }
public:
  // host/serial path: one pack per range-policy iteration
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const local_ordinal_type &packidx) const {
    extract(packidx);
    factorize(packidx);
  }

  // cuda path: one pack per team; lanes extract their parts, then factorize
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const member_type &member) const {
    const local_ordinal_type packidx = member.league_rank();
    const local_ordinal_type partidx = packptr(packidx);
    const local_ordinal_type npacks = packptr(packidx+1) - partidx;
    const local_ordinal_type i0 = pack_td_ptr(partidx);
    const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);

    Kokkos::parallel_for
      (Kokkos::ThreadVectorRange(member, vector_length_value), [&](const local_ordinal_type &v) {
        if (v < npacks) extract(member, partidx + v, v);
        member.team_barrier();
        factorize(member, i0, nrows, v);
      });
  }
  void run() {
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
    cudaProfilerStart();
#endif
    if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
      const local_ordinal_type vl = vector_length;
      const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
      Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", policy, *this);
#endif
    } else {
#if defined(__CUDA_ARCH__)
      TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
      const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
      Kokkos::parallel_for("ExtractAndFactorize::RangePolicy::run", policy, *this);
#endif
    }
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
    cudaProfilerStop();
#endif
  }
};

template<typename MatrixType>
void
performNumericPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                    const PartInterface<MatrixType> &interf,
                    BlockTridiags<MatrixType> &btdm,
                    const typename ImplType<MatrixType>::magnitude_type tiny) {
#ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
  TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::NumericPhase");
#endif
  ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
  function.run();
}
template<typename MatrixType>
struct MultiVectorConverter {
public:
  using impl_type = ImplType<MatrixType>;
  using execution_space = typename impl_type::execution_space;

  using local_ordinal_type = typename impl_type::local_ordinal_type;
  using impl_scalar_type = typename impl_type::impl_scalar_type;
  using magnitude_type = typename impl_type::magnitude_type;
  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
  static constexpr int vector_length = impl_type::vector_length;

  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

private:
  // part interface
  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
  const local_ordinal_type blocksize;
  const local_ordinal_type num_vectors;

  // packed multivector, source/destination scalar multivector, damping
  Unmanaged<vector_type_3d_view> packed_multivector;
  Unmanaged<impl_scalar_type_2d_view> scalar_multivector;
  impl_scalar_type damping_factor;
  template<typename TagType>
  KOKKOS_INLINE_FUNCTION
  void copy_multivectors(const local_ordinal_type &j,
                         const local_ordinal_type &vi,
                         const local_ordinal_type &pri,
                         const local_ordinal_type &,
                         const local_ordinal_type &ri0) const {
    if (TagType::id == 0) {
      // scalar -> packed
      for (local_ordinal_type col=0;col<num_vectors;++col)
        for (local_ordinal_type i=0;i<blocksize;++i)
          packed_multivector(pri, i, col)[vi] = scalar_multivector(blocksize*lclrow(ri0+j)+i,col);
    } else if (TagType::id > 0) {
      // packed -> scalar, applying the damping factor
      const impl_scalar_type df = damping_factor;
      for (local_ordinal_type col=0;col<num_vectors;++col)
        for (local_ordinal_type i=0;i<blocksize;++i) {
          impl_scalar_type &y = scalar_multivector(blocksize*lclrow(ri0+j)+i,col);
          const impl_scalar_type yc = packed_multivector(pri, i, col)[vi];
          if (TagType::id == 1) y = df*yc;   // y was zero on entry
          else y += df*(yc - y);             // damped update
        }
    }
  }
  template<typename TagType>
  KOKKOS_INLINE_FUNCTION
  void copy_multivectors_with_norm(const local_ordinal_type &j,
                                   const local_ordinal_type &vi,
                                   const local_ordinal_type &pri,
                                   const local_ordinal_type &,
                                   const local_ordinal_type &ri0,
                                   magnitude_type *norm) const {
    if (TagType::id > 0) {
      const impl_scalar_type df = damping_factor;
      for (local_ordinal_type col=0;col<num_vectors;++col) {
        const local_ordinal_type offset = col*blocksize;
        for (local_ordinal_type i=0;i<blocksize;++i) {
          impl_scalar_type &y = scalar_multivector(blocksize*lclrow(ri0+j)+i,col);
          const impl_scalar_type yc = packed_multivector(pri, i, col)[vi];
          const impl_scalar_type yd = TagType::id == 1 ? yc : yc - y;
          const magnitude_type abs_yd = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
          norm[offset+i] += abs_yd*abs_yd;
          if (TagType::id == 1) y = df*yc;
          else y += df*yd;
        }
      }
    }
  }
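  // norm accumulates |dy|^2 for each of the blocksize*num_vectors (vector,
  // block-entry) pairs; see value_count below and the reducer interface that
  // sums these partial results across rows.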
public:
  // Kokkos array-reduction interface for the norms
  typedef magnitude_type value_type[];
  local_ordinal_type value_count;

  KOKKOS_INLINE_FUNCTION
  void
  init(magnitude_type *dst) const {
    for (int i=0;i<value_count;++i)
      dst[i] = Kokkos::reduction_identity<magnitude_type>::sum();
  }

  KOKKOS_INLINE_FUNCTION
  void
  join(volatile magnitude_type *dst, const volatile magnitude_type *src) const {
    for (int i=0;i<value_count;++i)
      dst[i] += src[i];
  }
  MultiVectorConverter(const PartInterface<MatrixType> &interf,
                       const vector_type_3d_view &pmv)
    : partptr(interf.partptr),
      packptr(interf.packptr),
      part2packrowidx0(interf.part2packrowidx0),
      part2rowidx0(interf.part2rowidx0),
      lclrow(interf.lclrow),
      blocksize(pmv.extent(1)),
      num_vectors(pmv.extent(2)),
      packed_multivector(pmv) {}
  // range policy: one pack per iteration
  template<typename TagType>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const TagType&, const local_ordinal_type &packidx, magnitude_type* const norm = NULL) const {
    local_ordinal_type partidx = packptr(packidx);
    local_ordinal_type npacks = packptr(packidx+1) - partidx;
    const local_ordinal_type pri0 = part2packrowidx0(partidx);

    local_ordinal_type ri0[vector_length] = {};
    local_ordinal_type nrows[vector_length] = {};
    for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
      ri0[v] = part2rowidx0(partidx);
      nrows[v] = part2rowidx0(partidx+1) - ri0[v];
    }
    for (local_ordinal_type j=0;j<nrows[0];++j) {
      // shrink npacks once shorter parts run out of rows
      local_ordinal_type cnt = 1;
      for (;cnt<npacks && j!= nrows[cnt];++cnt);
      npacks = cnt;
      const local_ordinal_type pri = pri0 + j;
      for (local_ordinal_type v=0;v<npacks;++v) {
        if (norm == NULL) copy_multivectors<TagType>(j, v, pri, nrows[v], ri0[v]);
        else copy_multivectors_with_norm<TagType>(j, v, pri, nrows[v], ri0[v], norm);
      }
    }
  }
  template<typename TagType>
  KOKKOS_INLINE_FUNCTION
  void
  operator() (const TagType&, const member_type &member, magnitude_type* const norm = NULL) const {
    const local_ordinal_type packidx = member.league_rank();
    const local_ordinal_type partidx_begin = packptr(packidx);
    const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
    const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
      const local_ordinal_type partidx = partidx_begin + v;
      const local_ordinal_type ri0 = part2rowidx0(partidx);
      const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
      Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
        const local_ordinal_type pri = pri0 + j;
        if (norm == NULL) copy_multivectors<TagType>(j, v, pri, nrows, ri0);
        else copy_multivectors_with_norm<TagType>(j, v, pri, nrows, ri0, norm);
      });
    });
  }
  // operation tags: id 0 packs, id 1 writes y = df*yc (y initially zero),
  // id 2 applies the damped update y += df*(yc - y)
  struct ToPackedMultiVectorTag       { enum : int { id = 0 }; };
  struct ToScalarMultiVectorFirstTag  { enum : int { id = 1 }; };
  struct ToScalarMultiVectorSecondTag { enum : int { id = 2 }; };
  template<typename TpetraLocalViewType>
  void to_packed_multivector(const TpetraLocalViewType &scalar_multivector_) {
#ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
    TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::MultiVectorConverter::ToPackedMultiVector");
#endif
    scalar_multivector = scalar_multivector_;

    if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
      const local_ordinal_type vl = vector_length;
      const Kokkos::TeamPolicy<execution_space,ToPackedMultiVectorTag> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
      Kokkos::parallel_for
        ("MultiVectorConverter::TeamPolicy::to_packed_multivector", policy, *this);
#endif
    } else {
#if defined(__CUDA_ARCH__)
      TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
      const Kokkos::RangePolicy<execution_space,ToPackedMultiVectorTag> policy(0, packptr.extent(0) - 1);
      Kokkos::parallel_for
        ("MultiVectorConverter::RangePolicy::to_packed_multivector", policy, *this);
#endif
    }
  }
    template<typename TpetraLocalViewType>
    void to_scalar_multivector(const TpetraLocalViewType &scalar_multivector_,
                               const impl_scalar_type &damping_factor_,
                               const bool &is_vectors_zero,
                               magnitude_type *norm = NULL) {
#ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
      TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::MultiVectorConverter::ToScalarMultiVector");
#endif
      scalar_multivector = scalar_multivector_;
      damping_factor = damping_factor_;
      if (norm != NULL) {
        value_count = blocksize*num_vectors;
        for (int i=0;i<value_count;++i)
          norm[i] = Kokkos::ArithTraits<magnitude_type>::zero();
      }
      if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
        // the norm reduction uses a single vector lane
        const local_ordinal_type vl = norm == NULL ? vector_length : 1;
        if (is_vectors_zero) {
          const Kokkos::TeamPolicy<execution_space,ToScalarMultiVectorFirstTag>
            policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
          if (norm == NULL)
            Kokkos::parallel_for
              ("MultiVectorConverter::TeamPolicy::to_scalar_multivector::first", policy, *this);
          else
            Kokkos::parallel_reduce
              ("MultiVectorConverter::TeamPolicy::to_scalar_multivector::first_w_norm", policy, *this, norm);
        } else {
          const Kokkos::TeamPolicy<execution_space,ToScalarMultiVectorSecondTag>
            policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
          if (norm == NULL)
            Kokkos::parallel_for
              ("MultiVectorConverter::TeamPolicy::to_scalar_multivector::second", policy, *this);
          else
            Kokkos::parallel_reduce
              ("MultiVectorConverter::TeamPolicy::to_scalar_multivector::second_w_norm", policy, *this, norm);
        }
#endif
      } else {
#if defined(__CUDA_ARCH__)
        TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
        if (is_vectors_zero) {
          const Kokkos::RangePolicy<execution_space,ToScalarMultiVectorFirstTag> policy(0, packptr.extent(0) - 1);
          if (norm == NULL)
            Kokkos::parallel_for
              ("MultiVectorConverter::RangePolicy::to_scalar_multivector::first", policy, *this);
          else
            Kokkos::parallel_reduce
              ("MultiVectorConverter::RangePolicy::to_scalar_multivector::first_w_norm", policy, *this, norm);
        } else {
          const Kokkos::RangePolicy<execution_space,ToScalarMultiVectorSecondTag> policy(0, packptr.extent(0) - 1);
          if (norm == NULL)
            Kokkos::parallel_for
              ("MultiVectorConverter::RangePolicy::to_scalar_multivector::second", policy, *this);
          else
            Kokkos::parallel_reduce
              ("MultiVectorConverter::RangePolicy::to_scalar_multivector::second_w_norm", policy, *this, norm);
        }
#endif
      }
    }
  };
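  // Added note: to_scalar_multivector writes the packed solve result back to
  // the Tpetra vector as a damped update. With damping factor a (and b = 1,
  // per the caller's convention in applyInverseJacobi at the end of this
  // file),
  //
  //   y(lclrow, col) = (1 - a) * y(lclrow, col) + a * pmv(packed row, col),
  //
  // and, when a norm buffer is supplied, rank-local squared norms are
  // accumulated per block entry for the termination check in NormManager.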
  ///
  /// solve tridiags
  ///
  template<typename MatrixType>
  struct SolveTridiags {
  public:
    using impl_type = ImplType<MatrixType>;
    using execution_space = typename impl_type::execution_space;

    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using size_type = typename impl_type::size_type;
    using impl_scalar_type = typename impl_type::impl_scalar_type;
    using magnitude_type = typename impl_type::magnitude_type;
    // views
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
    using size_type_1d_view = typename impl_type::size_type_1d_view;
    // vectorization
    using vector_type_3d_view = typename impl_type::vector_type_3d_view;
    using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
    static constexpr int vector_length = impl_type::vector_length;
    // team policy member type (used in cuda)
    using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
    const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
    const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
    const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
    const ConstUnmanaged<size_type_1d_view> pack_td_ptr;
    const ConstUnmanaged<vector_type_3d_view> D_vector_values;
    const Unmanaged<vector_type_3d_view> X_vector_values;
    const ConstUnmanaged<impl_scalar_type_4d_view> D_scalar_values;
    const Unmanaged<impl_scalar_type_4d_view> X_scalar_values;
    const local_ordinal_type vector_length_value;
    SolveTridiags(const PartInterface<MatrixType> &interf,
                  const BlockTridiags<MatrixType> &btdm,
                  const vector_type_3d_view &pmv)
      : partptr(interf.partptr),
        packptr(interf.packptr),
        part2packrowidx0(interf.part2packrowidx0),
        pack_td_ptr(btdm.pack_td_ptr),
        D_vector_values(btdm.values),
        X_vector_values(pmv),
        D_scalar_values((impl_scalar_type*)D_vector_values.data(),
                        D_vector_values.extent(0),
                        D_vector_values.extent(1),
                        D_vector_values.extent(2),
                        vector_length),
        X_scalar_values((impl_scalar_type*)X_vector_values.data(),
                        X_vector_values.extent(0),
                        X_vector_values.extent(1),
                        X_vector_values.extent(2),
                        vector_length),
        vector_length_value(vector_length)
    {}
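    // Added note: the *_scalar_values views constructed above alias the same
    // allocation as the *_vector_values views. Each Vector<T,vector_length>
    // entry of the packed 3-D views is reinterpreted as vector_length scalars,
    // exposed as a trailing fourth dimension. The vector views drive the host
    // (SIMD) kernels, while the scalar views let each CUDA thread address a
    // single lane v, e.g. X_scalar_values(r, k0, col, v).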
    KOKKOS_INLINE_FUNCTION
    void
    serialSolveSingleVector(const local_ordinal_type &blocksize,
                            const local_ordinal_type &packidx) const {
      namespace KB = KokkosBatched::Experimental;
      using AlgoType = KB::Algo::Level2::Unblocked;

      // base pointers
      auto A = D_vector_values.data();
      auto X = X_vector_values.data();

      // constant
      const auto one = Kokkos::ArithTraits<magnitude_type>::one();

      // subview pattern
      const local_ordinal_type astep = D_vector_values.stride_0();
      const local_ordinal_type as0 = blocksize;
      const local_ordinal_type as1 = 1;
      const local_ordinal_type xstep = X_vector_values.stride_0();
      const local_ordinal_type xs0 = 1;

      // index counting
      const local_ordinal_type partidx = packptr(packidx);
      const size_type i0 = pack_td_ptr(partidx);
      const local_ordinal_type r0 = part2packrowidx0(partidx);
      const local_ordinal_type nrows = part2packrowidx0(packptr(packidx+1)) - r0;

      // move to the starting point of this pack
      A += i0*astep;
      X += r0*xstep;

      // solve Lx = x
      KOKKOSBATCHED_SERIAL_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
        (AlgoType,KB::Diag::Unit,
         blocksize,blocksize,
         one, A, as0, as1, X, xs0);
      for (local_ordinal_type tr=1;tr<nrows;++tr) {
        KOKKOSBATCHED_SERIAL_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
          (AlgoType,
           blocksize, blocksize,
           -one,
           A+2*astep, as0, as1,
           X, xs0,
           one, X+1*xstep, xs0);
        KOKKOSBATCHED_SERIAL_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
          (AlgoType,KB::Diag::Unit,
           blocksize,blocksize,
           one,
           A+3*astep, as0, as1,
           X+1*xstep, xs0);
        A += 3*astep;
        X += 1*xstep;
      }

      // solve Ux = x
      KOKKOSBATCHED_SERIAL_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
        (AlgoType,KB::Diag::NonUnit,
         blocksize, blocksize,
         one, A, as0, as1, X, xs0);
      for (local_ordinal_type tr=nrows;tr>1;--tr) {
        A -= 3*astep;
        KOKKOSBATCHED_SERIAL_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
          (AlgoType,
           blocksize, blocksize,
           -one,
           A+1*astep, as0, as1,
           X, xs0,
           one, X-1*xstep, xs0);
        X -= 1*xstep;
        KOKKOSBATCHED_SERIAL_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
          (AlgoType,KB::Diag::NonUnit,
           blocksize, blocksize,
           one, A, as0, as1, X, xs0);
      }
    }
    KOKKOS_INLINE_FUNCTION
    void
    serialSolveMultiVector(const local_ordinal_type &,
                           const local_ordinal_type &packidx) const {
      namespace KB = KokkosBatched::Experimental;
      using AlgoType = KB::Algo::Level3::Blocked;

      const auto one = Kokkos::ArithTraits<magnitude_type>::one();

      // subviews are repointed by assign_data below; extents are what matter
      auto A  = Kokkos::subview(D_vector_values, 0, Kokkos::ALL(), Kokkos::ALL());
      auto X1 = Kokkos::subview(X_vector_values, 0, Kokkos::ALL(), Kokkos::ALL());
      auto X2 = X1;

      // index counting
      const local_ordinal_type partidx = packptr(packidx);
      size_type i = pack_td_ptr(partidx);
      local_ordinal_type r = part2packrowidx0(partidx);
      const local_ordinal_type nrows = part2packrowidx0(packptr(packidx+1)) - r;

      // solve Lx = x
      A.assign_data( &D_vector_values(i,0,0) );
      X1.assign_data( &X_vector_values(r,0,0) );
      KB::SerialTrsm<KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,AlgoType>
        ::invoke(one, A, X1);
      for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
        A.assign_data( &D_vector_values(i+2,0,0) );
        X2.assign_data( &X_vector_values(++r,0,0) );
        KB::SerialGemm<KB::Trans::NoTranspose,KB::Trans::NoTranspose,AlgoType>
          ::invoke(-one, A, X1, one, X2);
        A.assign_data( &D_vector_values(i+3,0,0) );
        KB::SerialTrsm<KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,AlgoType>
          ::invoke(one, A, X2);
        X1.assign_data( X2.data() );
      }

      // solve Ux = x
      KB::SerialTrsm<KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,AlgoType>
        ::invoke(one, A, X1);
      for (local_ordinal_type tr=nrows;tr>1;--tr) {
        i -= 3;
        A.assign_data( &D_vector_values(i+1,0,0) );
        X2.assign_data( &X_vector_values(--r,0,0) );
        KB::SerialGemm<KB::Trans::NoTranspose,KB::Trans::NoTranspose,AlgoType>
          ::invoke(-one, A, X1, one, X2);
        A.assign_data( &D_vector_values(i,0,0) );
        KB::SerialTrsm<KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,AlgoType>
          ::invoke(one, A, X2);
        X1.assign_data( X2.data() );
      }
    }
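    // Added summary of the solve structure above: the packed factors for a
    // part store the sequence [ D_0, U_0, L_1, D_1, U_1, L_2, ..., D_{n-1} ]
    // with stride 3 between diagonal blocks, where each D_r holds its own LU
    // factors from the numeric phase. The two sweeps are block forward and
    // backward substitution:
    //
    //   forward:  x_0 := Lo(D_0)^{-1} x_0,
    //             x_r := Lo(D_r)^{-1} (x_r - L_r x_{r-1}),   r = 1..n-1
    //   backward: x_{n-1} := Up(D_{n-1})^{-1} x_{n-1},
    //             x_r := Up(D_r)^{-1} (x_r - U_r x_{r+1}),   r = n-2..0
    //
    // with Lo/Up the unit-lower and upper triangular factors held in each
    // diagonal block.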
    KOKKOS_INLINE_FUNCTION
    void
    teamSolveSingleVector(const member_type &member,
                          const local_ordinal_type &blocksize,
                          const local_ordinal_type &i0,
                          const local_ordinal_type &r0,
                          const local_ordinal_type &nrows,
                          const local_ordinal_type &v) const {
      namespace KB = KokkosBatched::Experimental;
      using AlgoType = KB::Algo::Level2::Unblocked;

      // base pointers
      auto A = D_scalar_values.data();
      auto X = X_scalar_values.data();

      // constant
      const auto one = Kokkos::ArithTraits<magnitude_type>::one();

      // subview pattern (scalar view strides; lane v is the innermost index)
      const local_ordinal_type astep = D_scalar_values.stride_0();
      const local_ordinal_type as0 = blocksize*vector_length;
      const local_ordinal_type as1 = vector_length;
      const local_ordinal_type xstep = X_scalar_values.stride_0();
      const local_ordinal_type xs0 = vector_length;

      // move to the starting point of this pack and lane
      A += i0*astep + v;
      X += r0*xstep + v;

      // solve Lx = x
      KOKKOSBATCHED_TEAM_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
        (AlgoType,member,KB::Diag::Unit,
         blocksize,blocksize,
         one, A, as0, as1, X, xs0);
      for (local_ordinal_type tr=1;tr<nrows;++tr) {
        KOKKOSBATCHED_TEAM_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
          (AlgoType,member,
           blocksize, blocksize,
           -one,
           A+2*astep, as0, as1,
           X, xs0,
           one, X+1*xstep, xs0);
        KOKKOSBATCHED_TEAM_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
          (AlgoType,member,KB::Diag::Unit,
           blocksize,blocksize,
           one,
           A+3*astep, as0, as1,
           X+1*xstep, xs0);
        A += 3*astep;
        X += 1*xstep;
      }

      // solve Ux = x
      KOKKOSBATCHED_TEAM_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
        (AlgoType,member,KB::Diag::NonUnit,
         blocksize, blocksize,
         one, A, as0, as1, X, xs0);
      for (local_ordinal_type tr=nrows;tr>1;--tr) {
        A -= 3*astep;
        KOKKOSBATCHED_TEAM_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
          (AlgoType,member,
           blocksize, blocksize,
           -one,
           A+1*astep, as0, as1,
           X, xs0,
           one, X-1*xstep, xs0);
        X -= 1*xstep;
        KOKKOSBATCHED_TEAM_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
          (AlgoType,member,KB::Diag::NonUnit,
           blocksize, blocksize,
           one, A, as0, as1, X, xs0);
      }
    }
    KOKKOS_INLINE_FUNCTION
    void
    teamSolveMultiVector(const member_type &member,
                         const local_ordinal_type &blocksize,
                         const local_ordinal_type &i0,
                         const local_ordinal_type &r0,
                         const local_ordinal_type &nrows,
                         const local_ordinal_type &v) const {
      namespace KB = KokkosBatched::Experimental;
      using AlgoType = KB::Algo::Level3::Blocked;

      const auto one = Kokkos::ArithTraits<magnitude_type>::one();

      // subviews are repointed by assign_data below
      auto A  = Kokkos::subview(D_scalar_values, 0, Kokkos::ALL(), Kokkos::ALL(), 0);
      auto X1 = Kokkos::subview(X_scalar_values, 0, Kokkos::ALL(), Kokkos::ALL(), 0);
      auto X2 = X1;

      // index counting
      local_ordinal_type i = i0, r = r0;

      // solve Lx = x
      A.assign_data( &D_scalar_values(i,0,0,v) );
      X1.assign_data( &X_scalar_values(r,0,0,v) );
      KB::TeamTrsm<member_type,KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,AlgoType>
        ::invoke(member, one, A, X1);
      for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
        A.assign_data( &D_scalar_values(i+2,0,0,v) );
        X2.assign_data( &X_scalar_values(++r,0,0,v) );
        KB::TeamGemm<member_type,KB::Trans::NoTranspose,KB::Trans::NoTranspose,AlgoType>
          ::invoke(member, -one, A, X1, one, X2);
        A.assign_data( &D_scalar_values(i+3,0,0,v) );
        KB::TeamTrsm<member_type,KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,AlgoType>
          ::invoke(member, one, A, X2);
        X1.assign_data( X2.data() );
      }

      // solve Ux = x
      KB::TeamTrsm<member_type,KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,AlgoType>
        ::invoke(member, one, A, X1);
      for (local_ordinal_type tr=nrows;tr>1;--tr) {
        i -= 3;
        A.assign_data( &D_scalar_values(i+1,0,0,v) );
        X2.assign_data( &X_scalar_values(--r,0,0,v) );
        KB::TeamGemm<member_type,KB::Trans::NoTranspose,KB::Trans::NoTranspose,AlgoType>
          ::invoke(member, -one, A, X1, one, X2);
        A.assign_data( &D_scalar_values(i,0,0,v) );
        KB::TeamTrsm<member_type,KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,AlgoType>
          ::invoke(member, one, A, X2);
        X1.assign_data( X2.data() );
      }
    }
    template<int B> struct SingleVectorTag {};
    template<int B> struct MultiVectorTag {};
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleVectorTag<B> &, const local_ordinal_type &packidx) const {
      const local_ordinal_type blocksize = (B == 0 ? D_vector_values.extent(1) : B);
      serialSolveSingleVector(blocksize, packidx);
    }
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const MultiVectorTag<B> &, const local_ordinal_type &packidx) const {
      const local_ordinal_type blocksize = (B == 0 ? D_vector_values.extent(1) : B);
      serialSolveMultiVector(blocksize, packidx);
    }
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SingleVectorTag<B> &, const member_type &member) const {
      const local_ordinal_type packidx = member.league_rank();
      const local_ordinal_type partidx = packptr(packidx);
      const local_ordinal_type i0 = pack_td_ptr(partidx);
      const local_ordinal_type r0 = part2packrowidx0(partidx);
      const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
      const local_ordinal_type blocksize = (B == 0 ? D_scalar_values.extent(1) : B);
      Kokkos::parallel_for
        (Kokkos::ThreadVectorRange(member, vector_length_value), [&](const int &v) {
          teamSolveSingleVector(member, blocksize, i0, r0, nrows, v);
        });
    }
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const MultiVectorTag<B> &, const member_type &member) const {
      const local_ordinal_type packidx = member.league_rank();
      const local_ordinal_type partidx = packptr(packidx);
      const local_ordinal_type i0 = pack_td_ptr(partidx);
      const local_ordinal_type r0 = part2packrowidx0(partidx);
      const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
      const local_ordinal_type blocksize = (B == 0 ? D_scalar_values.extent(1) : B);
      Kokkos::parallel_for
        (Kokkos::ThreadVectorRange(member, vector_length_value), [&](const int &v) {
          teamSolveMultiVector(member, blocksize, i0, r0, nrows, v);
        });
    }

    // solve the packed block tridiagonal systems in place: X := D^{-1} X
    void run() {
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
      cudaProfilerStart();
#endif
#ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
      TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::SolveTridiags::Run");
#endif
      if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
        const local_ordinal_type num_vectors = X_scalar_values.extent(2);
        const local_ordinal_type vl = vector_length;
#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
        if (num_vectors == 1) {                                         \
          const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl); \
          Kokkos::parallel_for                                          \
            ("SolveTridiags::TeamPolicy::run<SingleVector>", policy, *this); \
        } else {                                                        \
          const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl); \
          Kokkos::parallel_for                                          \
            ("SolveTridiags::TeamPolicy::run<MultiVector>", policy, *this); \
        } break
        const local_ordinal_type blocksize = D_scalar_values.extent(1);
        switch (blocksize) {
        case  3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
        case  5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
        case  7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
        case  9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
        case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
        case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
        default: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
        }
#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
#endif
      } else {
#if defined(__CUDA_ARCH__)
        TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
        const local_ordinal_type num_vectors = X_vector_values.extent(2);
#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
        if (num_vectors == 1) {                                         \
          const Kokkos::RangePolicy<execution_space,SingleVectorTag<B> > policy(0, packptr.extent(0) - 1); \
          Kokkos::parallel_for                                          \
            ("SolveTridiags::RangePolicy::run<SingleVector>", policy, *this); \
        } else {                                                        \
          const Kokkos::RangePolicy<execution_space,MultiVectorTag<B> > policy(0, packptr.extent(0) - 1); \
          Kokkos::parallel_for                                          \
            ("SolveTridiags::RangePolicy::run<MultiVector>", policy, *this); \
        } break
        const local_ordinal_type blocksize = D_vector_values.extent(1);
        switch (blocksize) {
        case  3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
        case  5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
        case  7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
        case  9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
        case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
        case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
        default: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
        }
#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
#endif
      }
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
      cudaProfilerStop();
#endif
    }
  };
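  // Two added notes on SolveTridiags:
  //
  // (1) Kernel hierarchy on CUDA: the league index is one pack of up to
  //     vector_length tridiagonal parts; a ThreadVectorRange loop selects the
  //     SIMD lane v within the pack, and the team-level batched kernels
  //     parallelize over the rows/columns of each block.
  //
  // (2) Dispatch in run(): the switch over the runtime blocksize instantiates
  //     the kernels with a compile-time constant B for common block sizes so
  //     the innermost loops can be unrolled; B == 0 is the generic fallback
  //     that reads the blocksize at runtime. The break lives at the end of the
  //     helper macro, so the cases do not fall through.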
  ///
  /// compute the residual: y = b - R x
  ///
  template<typename MatrixType>
  struct ComputeResidualVector {
  public:
    using impl_type = ImplType<MatrixType>;
    using execution_space = typename impl_type::execution_space;
    using memory_space = typename impl_type::memory_space;

    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using size_type = typename impl_type::size_type;
    using impl_scalar_type = typename impl_type::impl_scalar_type;
    using magnitude_type = typename impl_type::magnitude_type;
    // tpetra interface
    using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type;
    // views
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
    using size_type_1d_view = typename impl_type::size_type_1d_view;
    using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
    using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
    // vectorization
    using vector_type_3d_view = typename impl_type::vector_type_3d_view;
    using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
    static constexpr int vector_length = impl_type::vector_length;
    // team policy member type (used in cuda)
    using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

    // stack buffer size for the serial kernels
    enum : int { max_blocksize = 32 };
    // multivector views
    ConstUnmanaged<impl_scalar_type_2d_view> b;
    ConstUnmanaged<impl_scalar_type_2d_view> x;
    ConstUnmanaged<impl_scalar_type_2d_view> x_remote;
    Unmanaged<impl_scalar_type_2d_view> y;
    Unmanaged<vector_type_3d_view> y_packed;
    Unmanaged<impl_scalar_type_4d_view> y_packed_scalar;

    // AmD information (R = A - D)
    const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
    const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
    const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

    // block crs graph information; the kokkos crs graph uses a size_type that
    // may differ from size_t
    using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,typename impl_type::device_type>::size_type;
    const ConstUnmanaged<Kokkos::View<a_rowptr_value_type*,typename impl_type::device_type> > A_rowptr;
    const ConstUnmanaged<local_ordinal_type_1d_view> A_colind;

    // blocksize
    const local_ordinal_type blocksize_requested;

    // part interface
    const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
    const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
    const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
    const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
    const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
    const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
    const bool is_dm2cm_active;
    template<typename LocalCrsGraphType>
    ComputeResidualVector(const AmD<MatrixType> &amd,
                          const LocalCrsGraphType &graph,
                          const local_ordinal_type &blocksize_requested_,
                          const PartInterface<MatrixType> &interf,
                          const local_ordinal_type_1d_view &dm2cm_)
      : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
        colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
        tpetra_values(amd.tpetra_values),
        A_rowptr(graph.row_map),
        A_colind(graph.entries),
        blocksize_requested(blocksize_requested_),
        part2packrowidx0(interf.part2packrowidx0),
        part2rowidx0(interf.part2rowidx0),
        rowidx2part(interf.rowidx2part),
        partptr(interf.partptr),
        lclrow(interf.lclrow),
        dm2cm(dm2cm_),
        is_dm2cm_active(dm2cm_.span() > 0)
    {}
    KOKKOS_INLINE_FUNCTION
    void
    serialGemv(const local_ordinal_type &blocksize,
               const impl_scalar_type * const __restrict__ AA,
               const impl_scalar_type * const __restrict__ xx,
               /* */ impl_scalar_type * __restrict__ yy) const {
      for (local_ordinal_type k0=0;k0<blocksize;++k0) {
        impl_scalar_type val = 0;
        const local_ordinal_type offset = k0*blocksize;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#pragma unroll
#endif
        for (local_ordinal_type k1=0;k1<blocksize;++k1)
          val += AA[offset+k1]*xx[k1];
        yy[k0] -= val;
      }
    }
    template<typename bbViewType, typename yyViewType>
    KOKKOS_INLINE_FUNCTION
    void
    vectorCopy(const member_type &member,
               const local_ordinal_type &blocksize,
               const bbViewType &bb,
               const yyViewType &yy) const {
      Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
        yy(k0) = bb(k0);
      });
    }
    template<typename AAViewType, typename xxViewType, typename yyViewType>
    KOKKOS_INLINE_FUNCTION
    void
    teamGemv(const member_type &member,
             const local_ordinal_type &blocksize,
             const AAViewType &AA,
             const xxViewType &xx,
             const yyViewType &yy) const {
      Kokkos::parallel_for
        (Kokkos::TeamThreadRange(member, blocksize),
         [&](const local_ordinal_type &k0) {
          impl_scalar_type val = 0;
          Kokkos::parallel_reduce
            (Kokkos::ThreadVectorRange(member, blocksize),
             [&](const local_ordinal_type &k1, impl_scalar_type &update) {
              update += AA(k0,k1)*xx(k1);
            }, val);
          Kokkos::single(Kokkos::PerThread(member), [&]() {
            yy(k0) -= val;
          });
        });
    }
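    // Added note: both serialGemv and teamGemv accumulate y := y - A*x (note
    // the minus sign in the final update). The residual kernels below
    // therefore initialize y with b and then sweep the off-block-diagonal
    // entries of the row, producing y = b - R x in a single pass.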
    // y = b - R x; sequential method, owned entries only
    struct SeqTag {};

    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SeqTag &, const local_ordinal_type &i) const {
      const local_ordinal_type blocksize = blocksize_requested;
      const local_ordinal_type blocksize_square = blocksize*blocksize;

      // constants
      const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
      const local_ordinal_type num_vectors = y.extent(1);
      const local_ordinal_type row = i*blocksize;
      for (local_ordinal_type col=0;col<num_vectors;++col) {
        // y := b
        impl_scalar_type *yy = &y(row, col);
        const impl_scalar_type * const bb = &b(row, col);
        memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);

        // y -= Rx
        const size_type A_k0 = A_rowptr[i];
        for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
          const size_type j = A_k0 + colindsub[k];
          const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
          const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
          serialGemv(blocksize,AA,xx,yy);
        }
      }
    }
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const SeqTag &, const member_type &member) const {
      namespace KB = KokkosBatched::Experimental;

      // constants
      const local_ordinal_type blocksize = blocksize_requested;
      const local_ordinal_type blocksize_square = blocksize*blocksize;

      const local_ordinal_type i = member.league_rank();
      const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
      const local_ordinal_type num_vectors = y.extent(1);

      // subview pattern
      auto bb = Kokkos::subview(b, block_range, 0);
      auto xx = Kokkos::subview(x, block_range, 0);
      auto yy = Kokkos::subview(y, block_range, 0);
      auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

      const local_ordinal_type row = i*blocksize;
      for (local_ordinal_type col=0;col<num_vectors;++col) {
        // y := b
        yy.assign_data(&y(row, col));
        bb.assign_data(&b(row, col));
        if (member.team_rank() == 0)
          vectorCopy(member, blocksize, bb, yy);
        member.team_barrier();

        // y -= Rx
        const size_type A_k0 = A_rowptr[i];
        for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
          const size_type j = A_k0 + colindsub[k];
          A_block.assign_data( &tpetra_values(j*blocksize_square) );
          xx.assign_data( &x(A_colind[j]*blocksize, col) );
          teamGemv(member, blocksize, A_block, xx, yy);
        }
      }
    }
    // y = b - R (x, x_remote); off-rank columns are read from x_remote
    template<int B> struct AsyncTag {};

    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
      const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
      const local_ordinal_type blocksize_square = blocksize*blocksize;

      // part interface
      const local_ordinal_type partidx = rowidx2part(rowidx);
      const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
      const local_ordinal_type v = partidx % vector_length;

      const local_ordinal_type num_vectors = y_packed.extent(2);
      const local_ordinal_type num_local_rows = lclrow.extent(0);

      // temporary buffer for y (compile-time size when B > 0)
      impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};

      const local_ordinal_type lr = lclrow(rowidx);
      const local_ordinal_type row = lr*blocksize;
      for (local_ordinal_type col=0;col<num_vectors;++col) {
        // y := b
        memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);

        // y -= Rx
        const size_type A_k0 = A_rowptr[lr];
        for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
          const size_type j = A_k0 + colindsub[k];
          const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
          const local_ordinal_type A_colind_at_j = A_colind[j];
          if (A_colind_at_j < num_local_rows) {
            const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            const impl_scalar_type * const xx = &x(loc*blocksize, col);
            serialGemv(blocksize, AA,xx,yy);
          } else {
            const auto loc = A_colind_at_j - num_local_rows;
            const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
            serialGemv(blocksize, AA,xx_remote,yy);
          }
        }
        // move yy to y_packed
        for (local_ordinal_type k=0;k<blocksize;++k)
          y_packed(pri, k, col)[v] = yy[k];
      }
    }
    template<int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const AsyncTag<B> &, const member_type &member) const {
      const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
      const local_ordinal_type blocksize_square = blocksize*blocksize;

      // part interface
      const local_ordinal_type rowidx = member.league_rank();
      const local_ordinal_type partidx = rowidx2part(rowidx);
      const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
      const local_ordinal_type v = partidx % vector_length;

      const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
      const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
      const local_ordinal_type num_local_rows = lclrow.extent(0);

      // subview pattern
      auto bb = Kokkos::subview(b, block_range, 0);
      auto xx = Kokkos::subview(x, block_range, 0);
      auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
      auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
      auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

      const local_ordinal_type lr = lclrow(rowidx);
      const local_ordinal_type row = lr*blocksize;
      for (local_ordinal_type col=0;col<num_vectors;++col) {
        // y := b
        bb.assign_data(&b(row, col));
        yy.assign_data(&y_packed_scalar(pri, 0, col, v));
        if (member.team_rank() == 0)
          vectorCopy(member, blocksize, bb, yy);
        member.team_barrier();

        // y -= Rx
        const size_type A_k0 = A_rowptr[lr];
        for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
          const size_type j = A_k0 + colindsub[k];
          A_block.assign_data( &tpetra_values(j*blocksize_square) );
          const local_ordinal_type A_colind_at_j = A_colind[j];
          if (A_colind_at_j < num_local_rows) {
            const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            xx.assign_data( &x(loc*blocksize, col) );
            teamGemv(member, blocksize, A_block, xx, yy);
          } else {
            const auto loc = A_colind_at_j - num_local_rows;
            xx_remote.assign_data( &x_remote(loc*blocksize, col) );
            teamGemv(member, blocksize, A_block, xx_remote, yy);
          }
        }
      }
    }
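    // Added note: entries of A_colind are local column-map indices. Values
    // below num_local_rows refer to owned rows (optionally permuted through
    // dm2cm, the domain-map-to-column-map reordering); values at or above
    // num_local_rows index the halo, which is stored separately in x_remote.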
    // y = b - R (x, x_remote), split into two passes so that communication can
    // overlap the owned-entry computation: P == 0 handles owned columns,
    // P == 1 accumulates the remote-column correction
    template <int P, int B> struct OverlapTag {};

    template<int P, int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const OverlapTag<P,B> &, const local_ordinal_type &rowidx) const {
      const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
      const local_ordinal_type blocksize_square = blocksize*blocksize;

      // part interface
      const local_ordinal_type partidx = rowidx2part(rowidx);
      const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
      const local_ordinal_type v = partidx % vector_length;

      const local_ordinal_type num_vectors = y_packed.extent(2);
      const local_ordinal_type num_local_rows = lclrow.extent(0);

      // temporary buffer for y
      impl_scalar_type yy[max_blocksize] = {};

      auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
      auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);

      const local_ordinal_type lr = lclrow(rowidx);
      const local_ordinal_type row = lr*blocksize;
      for (local_ordinal_type col=0;col<num_vectors;++col) {
        if (P == 0) {
          // y := b
          memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
        } else {
          // y := 0; the remote pass only accumulates the correction
          memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
        }

        // y -= Rx
        const size_type A_k0 = A_rowptr[lr];
        for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
          const size_type j = A_k0 + colindsub_used[k];
          const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
          const local_ordinal_type A_colind_at_j = A_colind[j];
          if (A_colind_at_j < num_local_rows) {
            const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            const impl_scalar_type * const xx = &x(loc*blocksize, col);
            serialGemv(blocksize,AA,xx,yy);
          } else {
            const auto loc = A_colind_at_j - num_local_rows;
            const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
            serialGemv(blocksize,AA,xx_remote,yy);
          }
        }
        // move yy to y_packed: assign on the owned pass, add on the remote pass
        if (P == 0) {
          for (local_ordinal_type k=0;k<blocksize;++k)
            y_packed(pri, k, col)[v] = yy[k];
        } else {
          for (local_ordinal_type k=0;k<blocksize;++k)
            y_packed(pri, k, col)[v] += yy[k];
        }
      }
    }
    template<int P, int B>
    KOKKOS_INLINE_FUNCTION
    void
    operator() (const OverlapTag<P,B> &, const member_type &member) const {
      const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
      const local_ordinal_type blocksize_square = blocksize*blocksize;

      // part interface
      const local_ordinal_type rowidx = member.league_rank();
      const local_ordinal_type partidx = rowidx2part(rowidx);
      const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
      const local_ordinal_type v = partidx % vector_length;

      const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
      const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
      const local_ordinal_type num_local_rows = lclrow.extent(0);

      // subview pattern
      auto bb = Kokkos::subview(b, block_range, 0);
      auto xx = Kokkos::subview(x, block_range, 0);
      auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
      auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
      auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
      auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
      auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);

      const local_ordinal_type lr = lclrow(rowidx);
      const local_ordinal_type row = lr*blocksize;
      for (local_ordinal_type col=0;col<num_vectors;++col) {
        yy.assign_data(&y_packed_scalar(pri, 0, col, v));
        if (P == 0) {
          // y := b; the remote pass accumulates into y_packed_scalar in place
          bb.assign_data(&b(row, col));
          if (member.team_rank() == 0)
            vectorCopy(member, blocksize, bb, yy);
          member.team_barrier();
        }

        // y -= Rx
        const size_type A_k0 = A_rowptr[lr];
        for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
          const size_type j = A_k0 + colindsub_used[k];
          A_block.assign_data( &tpetra_values(j*blocksize_square) );
          const local_ordinal_type A_colind_at_j = A_colind[j];
          if (A_colind_at_j < num_local_rows) {
            const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
            xx.assign_data( &x(loc*blocksize, col) );
            teamGemv(member, blocksize, A_block, xx, yy);
          } else {
            const auto loc = A_colind_at_j - num_local_rows;
            xx_remote.assign_data( &x_remote(loc*blocksize, col) );
            teamGemv(member, blocksize, A_block, xx_remote, yy);
          }
        }
      }
    }
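    // Intended driver sequence for the overlap path (a sketch assembled from
    // applyInverseJacobi at the end of this file):
    //
    //   async_importer->asyncSendRecv(YY);                          // post halo exchange
    //   compute_residual_vector.run(pmv, XX, YY, remote, true);     // owned pass (OverlapTag<0,B>)
    //   async_importer->syncRecv();                                 // wait for halo
    //   compute_residual_vector.run(pmv, XX, YY, remote, false);    // remote pass (OverlapTag<1,B>)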
    // y = b - R x; seq method
    template<typename MultiVectorLocalViewTypeY,
             typename MultiVectorLocalViewTypeB,
             typename MultiVectorLocalViewTypeX>
    void run(const MultiVectorLocalViewTypeY &y_,
             const MultiVectorLocalViewTypeB &b_,
             const MultiVectorLocalViewTypeX &x_) {
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
      cudaProfilerStart();
#endif
#ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
      TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::ComputeResidual::Run<SeqTag>");
#endif
      y = y_; b = b_; x = x_;
      if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
        const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, Kokkos::AUTO());
        Kokkos::parallel_for
          ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
#endif
      } else {
#if defined(__CUDA_ARCH__)
        TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
        const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
        Kokkos::parallel_for
          ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
#endif
      }
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
      cudaProfilerStop();
#endif
    }

    // y = b - R (x, x_remote); async version
    template<typename MultiVectorLocalViewTypeB,
             typename MultiVectorLocalViewTypeX,
             typename MultiVectorLocalViewTypeX_Remote>
    void run(const vector_type_3d_view &y_packed_,
             const MultiVectorLocalViewTypeB &b_,
             const MultiVectorLocalViewTypeX &x_,
             const MultiVectorLocalViewTypeX_Remote &x_remote_) {
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
      cudaProfilerStart();
#endif
#ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
      TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::ComputeResidual::Run<AsyncTag>");
#endif
      b = b_; x = x_; x_remote = x_remote_;
      if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
        y_packed_scalar = impl_scalar_type_4d_view((impl_scalar_type*)y_packed_.data(),
                                                   y_packed_.extent(0),
                                                   y_packed_.extent(1),
                                                   y_packed_.extent(2),
                                                   vector_length);
#endif
      } else {
        y_packed = y_packed_;
      }

      if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
        local_ordinal_type vl_power_of_two = 1;
        for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
        vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
        const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
          const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > policy(rowidx2part.extent(0), Kokkos::AUTO(), vl); \
          Kokkos::parallel_for                                          \
            ("ComputeResidual::TeamPolicy::run<AsyncTag>",              \
             policy, *this); } break
        switch (blocksize_requested) {
        case  3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
        case  5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
        case  7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
        case  9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
        case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
        case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
        default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
        }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
      } else {
#if defined(__CUDA_ARCH__)
        TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
          const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
          Kokkos::parallel_for                                          \
            ("ComputeResidual::RangePolicy::run<AsyncTag>",             \
             policy, *this); } break
        switch (blocksize_requested) {
        case  3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
        case  5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
        case  7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
        case  9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
        case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
        case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
        default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
        }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
      }
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
      cudaProfilerStop();
#endif
    }
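    // Added note: the vl_power_of_two computation above picks the smallest
    // power of two exceeding blocksize_requested and caps it at vector_length;
    // the result is used as the vector length of the CUDA team policy so that
    // ThreadVectorRange loops over a block row run on a power-of-two number of
    // lanes.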
    // y = b - R (x, x_remote); overlapped communication/computation version:
    // compute_owned == true runs the owned-column pass, false the remote pass
    template<typename MultiVectorLocalViewTypeB,
             typename MultiVectorLocalViewTypeX,
             typename MultiVectorLocalViewTypeX_Remote>
    void run(const vector_type_3d_view &y_packed_,
             const MultiVectorLocalViewTypeB &b_,
             const MultiVectorLocalViewTypeX &x_,
             const MultiVectorLocalViewTypeX_Remote &x_remote_,
             const bool compute_owned) {
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
      cudaProfilerStart();
#endif
#ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
      TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::ComputeResidual::Run<OverlapTag>");
#endif
      b = b_; x = x_; x_remote = x_remote_;
      if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
        y_packed_scalar = impl_scalar_type_4d_view((impl_scalar_type*)y_packed_.data(),
                                                   y_packed_.extent(0),
                                                   y_packed_.extent(1),
                                                   y_packed_.extent(2),
                                                   vector_length);
#endif
      } else {
        y_packed = y_packed_;
      }

      if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
        local_ordinal_type vl_power_of_two = 1;
        for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
        vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
        const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                  \
        if (compute_owned) {                                            \
          const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > policy(rowidx2part.extent(0), Kokkos::AUTO(), vl); \
          Kokkos::parallel_for                                          \
            ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
        } else {                                                        \
          const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > policy(rowidx2part.extent(0), Kokkos::AUTO(), vl); \
          Kokkos::parallel_for                                          \
            ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
        } break
        switch (blocksize_requested) {
        case  3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
        case  5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
        case  7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
        case  9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
        case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
        case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
        default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
        }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
      } else {
#if defined(__CUDA_ARCH__)
        TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)                  \
        if (compute_owned) {                                            \
          const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > policy(0, rowidx2part.extent(0)); \
          Kokkos::parallel_for                                          \
            ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
        } else {                                                        \
          const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > policy(0, rowidx2part.extent(0)); \
          Kokkos::parallel_for                                          \
            ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
        } break
        switch (blocksize_requested) {
        case  3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
        case  5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
        case  7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
        case  9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
        case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
        case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
        case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
        case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
        case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
        default: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
        }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
#endif
      }
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
      cudaProfilerStop();
#endif
    }
  };
  ///
  /// norm manager
  ///
  template<typename MatrixType>
  struct NormManager {
  public:
    using impl_type = ImplType<MatrixType>;
    using magnitude_type = typename impl_type::magnitude_type;

  private:
    bool collective_;
    int sweep_step_, blocksize_, num_vectors_;
#ifdef HAVE_IFPACK2_MPI
    MPI_Request mpi_request_;
    MPI_Comm comm_;
#endif
    std::vector<magnitude_type> work_;

  public:
    NormManager(const Teuchos::RCP<const Teuchos::Comm<int> >& comm) {
      sweep_step_ = 1;
      collective_ = comm->getSize() > 1;
      if (collective_) {
#ifdef HAVE_IFPACK2_MPI
        const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
        TEUCHOS_ASSERT( ! mpi_comm.is_null());
        comm_ = *mpi_comm->getRawMpiComm();
#endif
      }
    }
    int getBlocksize() const { return blocksize_; }
    int getNumVectors() const { return num_vectors_; }
    void resize(const int& blocksize, const int& num_vectors) {
      blocksize_ = blocksize;
      num_vectors_ = num_vectors;
      work_.resize((2*blocksize_ + 1)*num_vectors_);
    }
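    // Layout of work_, inferred from ireduce/checkDone/finalize below, with
    // n = blocksize_*num_vectors_:
    //   [0, n)                  : globally reduced squared norms; finalize()
    //                             overwrites the first num_vectors_ entries
    //                             with the final per-vector norms
    //   [n, 2n)                 : send buffer for the (i)allreduce
    //   [2n, 2n + num_vectors_) : reference norms captured on sweep 1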
    void setCheckFrequency(const int& sweep_step) {
      TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1");
      sweep_step_ = sweep_step;
    }

    // Get the buffer into which to store rank-local squared norms.
    magnitude_type* getBuffer() { return work_.data(); }

    // Call MPI_Iallreduce to find the global squared norms.
    void ireduce(const int& sweep, const bool force = false) {
      if ( ! force && sweep % sweep_step_) return;
      const int n = blocksize_*num_vectors_;
      if (collective_) {
        std::copy(work_.begin(), work_.begin() + n, work_.begin() + n);
#ifdef HAVE_IFPACK2_MPI
# if MPI_VERSION >= 3
        MPI_Iallreduce(work_.data() + n, work_.data(), n,
                       Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
                       MPI_SUM, comm_, &mpi_request_);
# else
        MPI_Allreduce (work_.data() + n, work_.data(), n,
                       Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
                       MPI_SUM, comm_);
# endif
#endif
      }
    }

    // Check if the norm-based termination criterion is met. tol2 is the
    // tolerance squared. Sweep is the sweep index. If not every iteration is
    // being checked, this function immediately returns false. If a check must
    // be done at this iteration, it waits for the reduction triggered by
    // ireduce to complete, then checks the global norm against the tolerance.
    bool checkDone (const int& sweep, const magnitude_type& tol2, const bool force = false) {
      bool r_val = false;
      if (sweep <= 0) return false;
#ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
      TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::NormManager::CheckDone");
#endif
      TEUCHOS_ASSERT(sweep >= 1);
      if ( ! force && (sweep - 1) % sweep_step_) return false;
      if (collective_) {
#ifdef HAVE_IFPACK2_MPI
# if MPI_VERSION >= 3
        MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
# endif
#endif
      }
      const auto n = blocksize_*num_vectors_;
      if (sweep == 1) {
        // the first sweep defines the reference norms n0
        magnitude_type* const n0 = work_.data() + 2*n;
        for (int v = 0; v < num_vectors_; ++v) {
          const magnitude_type* const dn0 = work_.data() + v*blocksize_;
          magnitude_type mdn0 = 0;
          for (int i = 0; i < blocksize_; ++i)
            mdn0 = std::max(mdn0, dn0[i]);
          n0[v] = mdn0;
        }
        r_val = false;
      } else {
        const auto n0 = work_.data() + 2*n;
        r_val = true;
        for (int v = 0; v < num_vectors_; ++v) {
          const magnitude_type* const dnf = work_.data() + v*blocksize_;
          magnitude_type mdnf = 0;
          for (int i = 0; i < blocksize_; ++i)
            mdnf = std::max(mdnf, dnf[i]);
          if (mdnf > tol2*n0[v]) {
            r_val = false;
            break;
          }
        }
      }
      return r_val;
    }

    // After termination has occurred, finalize the norms for use in
    // get_norms{0,final}.
    void finalize () {
      for (int v = 0; v < num_vectors_; ++v) {
        const magnitude_type* const dnf = work_.data() + v*blocksize_;
        magnitude_type mdnf = 0;
        for (int i = 0; i < blocksize_; ++i)
          mdnf = std::max(mdnf, dnf[i]);
        // This overwrites the receive buffer, but that's OK; at the time of
        // this write, we no longer need the data in this slot.
        work_[v] = mdnf;
      }
      for (int i = 0; i < num_vectors_; ++i)
        work_[i] = std::sqrt(work_[i]);
      magnitude_type* const nf = work_.data() + 2*blocksize_*num_vectors_;
      for (int v = 0; v < num_vectors_; ++v)
        nf[v] = std::sqrt(nf[v]);
    }

    // Report norms to the caller.
    const magnitude_type* getNorms0 () const { return work_.data() + 2*blocksize_*num_vectors_; }
    const magnitude_type* getNormsFinal () const { return work_.data(); }
  };

  ///
  /// top level apply interface
  ///
  template<typename MatrixType>
  int
  applyInverseJacobi(// importer
                     const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                     const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
                     const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
                     const bool overlap_communication_and_computation,
                     // tpetra interface
                     const typename ImplType<MatrixType>::tpetra_multivector_type &X,  // tpetra interface
                     /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Y,  // tpetra interface
                     /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Z,  // temporary tpetra interface
                     // local object interface
                     const PartInterface<MatrixType> &interf,  // mesh interface
                     const BlockTridiags<MatrixType> &btdm,    // packed block tridiagonal matrices
                     const AmD<MatrixType> &amd,               // R = A - D
                     /* */ typename ImplType<MatrixType>::vector_type_1d_view &work,  // workspace for packed multivector of right hand side
                     /* */ NormManager<MatrixType> &norm_manager,
                     // preconditioner parameters
                     const typename ImplType<MatrixType>::impl_scalar_type &damping_factor,
                     /* */ bool is_y_zero,
                     const int max_num_sweeps,
                     const typename ImplType<MatrixType>::magnitude_type tol,
                     const int check_tol_every) {
#ifdef HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS
    TEUCHOS_FUNC_TIME_MONITOR("BlockTriDi::ApplyInverseJacobi");
#endif
    using impl_type = ImplType<MatrixType>;
    using memory_space = typename impl_type::memory_space;
    using local_ordinal_type = typename impl_type::local_ordinal_type;
    using size_type = typename impl_type::size_type;
    using magnitude_type = typename impl_type::magnitude_type;
    using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
    using vector_type_1d_view = typename impl_type::vector_type_1d_view;
    using vector_type_3d_view = typename impl_type::vector_type_3d_view;
    using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;

    // either the tpetra importer or the async importer must be active, not both
    TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
                                "Neither Tpetra importer nor Async importer is null.");
    // max number of sweeps should be a positive number
    TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
                                "Maximum number of sweeps must be >= 1.");

    // const parameters
    const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
    const bool is_seq_method_requested = !tpetra_importer.is_null();
    const bool is_async_importer_active = !async_importer.is_null();
    const magnitude_type tolerance = tol*tol;
    const local_ordinal_type blocksize = btdm.values.extent(1);
    const local_ordinal_type num_vectors = Y.getNumVectors();
    const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;

    TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
                                "The seq method of applyInverseJacobi, " <<
                                "which in any case is for developer use only, " <<
                                "does not support norm-based termination.");

    // if more workspace is needed, resize it
    const size_type work_span_required = num_blockrows*num_vectors*blocksize;
    if (work.span() < work_span_required)
      work = vector_type_1d_view("vector workspace 1d view", work_span_required);

    typename AsyncableImport<MatrixType>::impl_scalar_type_2d_view remote_multivector;
    if (is_seq_method_requested) {
      // construct a copy of Y again if the number of vectors differs
      if (static_cast<local_ordinal_type>(Z.getNumVectors()) != num_vectors)
        Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
    } else {
      if (is_async_importer_active) {
        // create comm data buffer and keep it here
        async_importer->createDataBuffer(num_vectors);
        remote_multivector = async_importer->getRemoteMultiVectorLocalView();
      }
    }

    // wrap the workspace with a 3d view
    vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
    const auto XX = X.template getLocalView<memory_space>();
    const auto YY = Y.template getLocalView<memory_space>();
    const auto ZZ = Z.template getLocalView<memory_space>();

    MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
    SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv);

    const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
    ComputeResidualVector<MatrixType>
      compute_residual_vector(amd, A->getCrsGraph().getLocalGraph(), blocksize, interf,
                              is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);

    // norm manager workspace resize
    if (is_norm_manager_active) {
      norm_manager.resize(blocksize, num_vectors);
      norm_manager.setCheckFrequency(check_tol_every);
    }

    // iterate
    int sweep = 0;
    for (;sweep<max_num_sweeps;++sweep) {
      if (is_y_zero) {
        // pmv := x(lclrow)
        multivector_converter.to_packed_multivector(XX);
      } else {
        if (is_seq_method_requested) {
          // y := x - R y
          Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
          compute_residual_vector.run(YY, XX, ZZ);
          // pmv := y(lclrow)
          multivector_converter.to_packed_multivector(YY);
        } else {
          // fused y := x - R y and pmv := y(lclrow)
          if (overlap_communication_and_computation || !is_async_importer_active) {
            if (is_async_importer_active) async_importer->asyncSendRecv(YY);
            compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
            if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
              if (is_async_importer_active) async_importer->cancel();
              break;
            }
            if (is_async_importer_active) {
              async_importer->syncRecv();
              compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
            }
          } else {
            if (is_async_importer_active)
              async_importer->syncExchange(YY);
            if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
            compute_residual_vector.run(pmv, XX, YY, remote_multivector);
          }
        }
      }

      // pmv := inv(D) pmv
      solve_tridiags.run();

      // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always
      multivector_converter.to_scalar_multivector(YY, damping_factor, is_y_zero,
                                                  is_norm_manager_active ? norm_manager.getBuffer() : NULL);
      if (is_norm_manager_active) {
        if (sweep + 1 == max_num_sweeps) {
          norm_manager.ireduce(sweep, true);
          norm_manager.checkDone(sweep + 1, tolerance, true);
        } else {
          norm_manager.ireduce(sweep);
        }
      }
      is_y_zero = false;
    }

    // sqrt the norms for the caller's use
    if (is_norm_manager_active) norm_manager.finalize();

    return sweep;
  }

  ///
  /// aggregated data holder used by BlockTriDiContainer
  ///
  template<typename MatrixType>
  struct ImplObject {
    using impl_type = ImplType<MatrixType>;
    using part_interface_type = PartInterface<MatrixType>;
    using block_tridiags_type = BlockTridiags<MatrixType>;
    using amd_type = AmD<MatrixType>;
    using norm_manager_type = NormManager<MatrixType>;
    using async_import_type = AsyncableImport<MatrixType>;

    // distributed objects
    Teuchos::RCP<const typename impl_type::tpetra_block_crs_matrix_type> A;
    Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
    Teuchos::RCP<async_import_type> async_importer;
    bool overlap_communication_and_computation;

    // copy of Y (mutable to penetrate const)
    mutable typename impl_type::tpetra_multivector_type Z;

    // local objects
    part_interface_type part_interface;
    block_tridiags_type block_tridiags;  // D
    amd_type a_minus_d;                  // R = A - D
    mutable typename impl_type::vector_type_1d_view work;  // right hand side workspace
    mutable norm_manager_type norm_manager;
  };

} // namespace BlockTriDiContainerDetails

} // namespace Ifpack2

#endif