MueLu  Version of the Day
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
48 
49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR
50 
51 #include "Kokkos_UnorderedMap.hpp"
52 
54 
55 #include "MueLu_Aggregates_kokkos.hpp"
56 #include "MueLu_AmalgamationFactory_kokkos.hpp"
57 #include "MueLu_AmalgamationInfo_kokkos.hpp"
58 #include "MueLu_CoarseMapFactory_kokkos.hpp"
59 #include "MueLu_MasterList.hpp"
60 #include "MueLu_NullspaceFactory_kokkos.hpp"
61 #include "MueLu_PerfUtils.hpp"
62 #include "MueLu_Monitor.hpp"
63 #include "MueLu_Utilities_kokkos.hpp"
64 
65 namespace MueLu {
66 
67  namespace { // anonymous
68 
69  template<class LocalOrdinal, class View>
70  class ReduceMaxFunctor{
71  public:
72  ReduceMaxFunctor(View view) : view_(view) { }
73 
74  KOKKOS_INLINE_FUNCTION
75  void operator()(const LocalOrdinal &i, LocalOrdinal& vmax) const {
76  if (vmax < view_(i))
77  vmax = view_(i);
78  }
79 
80  KOKKOS_INLINE_FUNCTION
81  void join (volatile LocalOrdinal& dst, const volatile LocalOrdinal& src) const {
82  if (dst < src) {
83  dst = src;
84  }
85  }
86 
87  KOKKOS_INLINE_FUNCTION
88  void init (LocalOrdinal& dst) const {
89  dst = 0;
90  }
91  private:
92  View view_;
93  };
94 
95  // local QR decomposition
96  template<class LOType, class GOType, class SCType,class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
97  class LocalQRDecompFunctor {
98  private:
99  typedef LOType LO;
100  typedef GOType GO;
101  typedef SCType SC;
102 
103  typedef typename DeviceType::execution_space execution_space;
104  typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
105  typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
106  typedef typename impl_ATS::magnitudeType Magnitude;
107 
110 
111  private:
112 
113  NspType fineNS;
114  NspType coarseNS;
115  aggRowsType aggRows;
116  maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
117  agg2RowMapLOType agg2RowMapLO;
118  statusType statusAtomic;
119  rowsType rows;
120  rowsAuxType rowsAux;
121  colsAuxType colsAux;
122  valsAuxType valsAux;
123  bool doQRStep;
124  public:
125  LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_) :
126  fineNS(fineNS_),
127  coarseNS(coarseNS_),
128  aggRows(aggRows_),
129  maxAggDofSize(maxAggDofSize_),
130  agg2RowMapLO(agg2RowMapLO_),
131  statusAtomic(statusAtomic_),
132  rows(rows_),
133  rowsAux(rowsAux_),
134  colsAux(colsAux_),
135  valsAux(valsAux_),
136  doQRStep(doQRStep_)
137  { }
138 
139  KOKKOS_INLINE_FUNCTION
140  void operator() ( const typename Kokkos::TeamPolicy<execution_space>::member_type & thread, size_t& nnz) const {
141  auto agg = thread.league_rank();
142 
143  // size of aggregate: number of DOFs in aggregate
144  auto aggSize = aggRows(agg+1) - aggRows(agg);
145 
146  const impl_SC one = impl_ATS::one();
147  const impl_SC two = one + one;
148  const impl_SC zero = impl_ATS::zero();
149  const auto zeroM = impl_ATS::magnitude(zero);
150 
151  int m = aggSize;
152  int n = fineNS.extent(1);
153 
154  // calculate row offset for coarse nullspace
155  Xpetra::global_size_t offset = agg * n;
156 
157  if (doQRStep) {
158 
159  // Extract the piece of the nullspace corresponding to the aggregate
160  shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
161  for (int j = 0; j < n; j++)
162  for (int k = 0; k < m; k++)
163  r(k,j) = fineNS(agg2RowMapLO(aggRows(agg)+k),j);
164 #if 0
165  printf("A\n");
166  for (int i = 0; i < m; i++) {
167  for (int j = 0; j < n; j++)
168  printf(" %5.3lf ", r(i,j));
169  printf("\n");
170  }
171 #endif
172 
173  // Calculate QR decomposition (standard)
174  shared_matrix q(thread.team_shmem(), m, m); // Q
175  if (m >= n) {
176  bool isSingular = false;
177 
178  // Initialize Q^T
179  auto qt = q;
180  for (int i = 0; i < m; i++) {
181  for (int j = 0; j < m; j++)
182  qt(i,j) = zero;
183  qt(i,i) = one;
184  }
185 
186  for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize
187  // FIXME_KOKKOS: use team
188  Magnitude s = zeroM, norm, norm_x;
189  for (int i = k+1; i < m; i++)
190  s += pow(impl_ATS::magnitude(r(i,k)), 2);
191  norm = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
192 
193  if (norm == zero) {
194  isSingular = true;
195  break;
196  }
197 
198  r(k,k) -= norm*one;
199 
200  norm_x = sqrt(pow(impl_ATS::magnitude(r(k,k)), 2) + s);
201  if (norm_x == zeroM) {
202  // We have a single diagonal element in the column.
203  // No reflections required. Just need to restor r(k,k).
204  r(k,k) = norm*one;
205  continue;
206  }
207 
208  // FIXME_KOKKOS: use team
209  for (int i = k; i < m; i++)
210  r(i,k) /= norm_x;
211 
212  // Update R(k:m,k+1:n)
213  for (int j = k+1; j < n; j++) {
214  // FIXME_KOKKOS: use team in the loops
215  impl_SC si = zero;
216  for (int i = k; i < m; i++)
217  si += r(i,k) * r(i,j);
218  for (int i = k; i < m; i++)
219  r(i,j) -= two*si * r(i,k);
220  }
221 
222  // Update Q^T (k:m,k:m)
223  for (int j = k; j < m; j++) {
224  // FIXME_KOKKOS: use team in the loops
225  impl_SC si = zero;
226  for (int i = k; i < m; i++)
227  si += r(i,k) * qt(i,j);
228  for (int i = k; i < m; i++)
229  qt(i,j) -= two*si * r(i,k);
230  }
231 
232  // Fix R(k:m,k)
233  r(k,k) = norm*one;
234  for (int i = k+1; i < m; i++)
235  r(i,k) = zero;
236  }
237 
238 #if 0
239  // Q = (Q^T)^T
240  for (int i = 0; i < m; i++)
241  for (int j = 0; j < i; j++) {
242  impl_SC tmp = qt(i,j);
243  qt(i,j) = qt(j,i);
244  qt(j,i) = tmp;
245  }
246 #endif
247 
248  // Build coarse nullspace using the upper triangular part of R
249  for (int j = 0; j < n; j++)
250  for (int k = 0; k <= j; k++)
251  coarseNS(offset+k,j) = r(k,j);
252 
253  if (isSingular) {
254  statusAtomic(1) = true;
255  return;
256  }
257 
258  } else {
259  // Special handling for m < n (i.e. single node aggregates in structural mechanics)
260 
261  // The local QR decomposition is not possible in the "overconstrained"
262  // case (i.e. number of columns in qr > number of rowsAux), which
263  // corresponds to #DOFs in Aggregate < n. For usual problems this
264  // is only possible for single node aggregates in structural mechanics.
265  // (Similar problems may arise in discontinuous Galerkin problems...)
266  // We bypass the QR decomposition and use an identity block in the
267  // tentative prolongator for the single node aggregate and transfer the
268  // corresponding fine level null space information 1-to-1 to the coarse
269  // level null space part.
270 
271  // NOTE: The resulting tentative prolongation operator has
272  // (m*DofsPerNode-n) zero columns leading to a singular
273  // coarse level operator A. To deal with that one has the following
274  // options:
275  // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
276  // false) to add some identity block to the diagonal of the zero rowsAux
277  // in the coarse level operator A, such that standard level smoothers
278  // can be used again.
279  // - Use special (projection-based) level smoothers, which can deal
280  // with singular matrices (very application specific)
281  // - Adapt the code below to avoid zero columns. However, we do not
282  // support a variable number of DOFs per node in MueLu/Xpetra which
283  // makes the implementation really hard.
284  //
285  // FIXME: do we need to check for singularity here somehow? Zero
286  // columns would be easy but linear dependency would require proper QR.
287 
288  // R = extended (by adding identity rowsAux) qr
289  for (int j = 0; j < n; j++)
290  for (int k = 0; k < n; k++)
291  if (k < m)
292  coarseNS(offset+k,j) = r(k,j);
293  else
294  coarseNS(offset+k,j) = (k == j ? one : zero);
295 
296  // Q = I (rectangular)
297  for (int i = 0; i < m; i++)
298  for (int j = 0; j < n; j++)
299  q(i,j) = (j == i ? one : zero);
300  }
301 
302  // Process each row in the local Q factor and fill helper arrays to assemble P
303  for (int j = 0; j < m; j++) {
304  LO localRow = agg2RowMapLO(aggRows(agg)+j);
305  size_t rowStart = rowsAux(localRow);
306  size_t lnnz = 0;
307  for (int k = 0; k < n; k++) {
308  // skip zeros
309  if (q(j,k) != zero) {
310  colsAux(rowStart+lnnz) = offset + k;
311  valsAux(rowStart+lnnz) = q(j,k);
312  lnnz++;
313  }
314  }
315  rows(localRow+1) = lnnz;
316  nnz += lnnz;
317  }
318 
319 #if 0
320  printf("R\n");
321  for (int i = 0; i < m; i++) {
322  for (int j = 0; j < n; j++)
323  printf(" %5.3lf ", coarseNS(i,j));
324  printf("\n");
325  }
326 
327  printf("Q\n");
328  for (int i = 0; i < aggSize; i++) {
329  for (int j = 0; j < aggSize; j++)
330  printf(" %5.3lf ", q(i,j));
331  printf("\n");
332  }
333 #endif
334  } else {
336  // "no-QR" option //
338  // Local Q factor is just the fine nullspace support over the current aggregate.
339  // Local R factor is the identity.
340  // TODO I have not implemented any special handling for aggregates that are too
341  // TODO small to locally support the nullspace, as is done in the standard QR
342  // TODO case above.
343 
344  for (int j = 0; j < m; j++) {
345  LO localRow = agg2RowMapLO(aggRows(agg)+j);
346  size_t rowStart = rowsAux(localRow);
347  size_t lnnz = 0;
348  for (int k = 0; k < n; k++) {
349  const impl_SC qr_jk = fineNS(localRow,k);
350  // skip zeros
351  if (qr_jk != zero) {
352  colsAux(rowStart+lnnz) = offset + k;
353  valsAux(rowStart+lnnz) = qr_jk;
354  lnnz++;
355  }
356  }
357  rows(localRow+1) = lnnz;
358  nnz += lnnz;
359  }
360 
361  for (int j = 0; j < n; j++)
362  coarseNS(offset+j,j) = one;
363 
364  }
365 
366  }
367 
368  // amount of shared memory
369  size_t team_shmem_size( int /* team_size */ ) const {
370  if (doQRStep) {
371  int m = maxAggDofSize;
372  int n = fineNS.extent(1);
373  return shared_matrix::shmem_size(m, n) + // r
374  shared_matrix::shmem_size(m, m); // q
375  } else
376  return 0;
377  }
378  };
379 
380  } // namespace anonymous
381 
382  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
383  RCP<const ParameterList> TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::GetValidParameterList() const {
384  RCP<ParameterList> validParamList = rcp(new ParameterList());
385 
386 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
387  SET_VALID_ENTRY("tentative: calculate qr");
388  SET_VALID_ENTRY("tentative: build coarse coordinates");
389 #undef SET_VALID_ENTRY
390 
391  validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
392  validParamList->set< RCP<const FactoryBase> >("Aggregates", Teuchos::null, "Generating factory of the aggregates");
393  validParamList->set< RCP<const FactoryBase> >("Nullspace", Teuchos::null, "Generating factory of the nullspace");
394  validParamList->set< RCP<const FactoryBase> >("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
395  validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
396  validParamList->set< RCP<const FactoryBase> >("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
397  validParamList->set< RCP<const FactoryBase> >("Coordinates", Teuchos::null, "Generating factory of the coordinates");
398 
399  // Make sure we don't recursively validate options for the matrixmatrix kernels
400  ParameterList norecurse;
401  norecurse.disableRecursiveValidation();
402  validParamList->set<ParameterList> ("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
403 
404  return validParamList;
405  }
406 
407  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
408  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::DeclareInput(Level& fineLevel, Level& /* coarseLevel */) const {
409 
410  const ParameterList& pL = GetParameterList();
411  // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
412  std::string nspName = "Nullspace";
413  if(pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
414 
415  Input(fineLevel, "A");
416  Input(fineLevel, "Aggregates");
417  Input(fineLevel, nspName);
418  Input(fineLevel, "UnAmalgamationInfo");
419  Input(fineLevel, "CoarseMap");
420  if( fineLevel.GetLevelID() == 0 &&
421  fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
422  pL.get<bool>("tentative: build coarse coordinates") ) { // and we want coordinates on other levels
423  bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
424  Input(fineLevel, "Coordinates");
425  } else if (bTransferCoordinates_) {
426  Input(fineLevel, "Coordinates");
427  }
428  }
429 
430  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
431  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::Build(Level& fineLevel, Level& coarseLevel) const {
432  return BuildP(fineLevel, coarseLevel);
433  }
434 
435  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
436  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::BuildP(Level& fineLevel, Level& coarseLevel) const {
437  FactoryMonitor m(*this, "Build", coarseLevel);
438 
439  typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
440  typedef Xpetra::MultiVectorFactory<coordinate_type,LO,GO,NO> RealValuedMultiVectorFactory;
441  const ParameterList& pL = GetParameterList();
442  std::string nspName = "Nullspace";
443  if(pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
444 
445  auto A = Get< RCP<Matrix> > (fineLevel, "A");
446  auto aggregates = Get< RCP<Aggregates_kokkos> > (fineLevel, "Aggregates");
447  auto amalgInfo = Get< RCP<AmalgamationInfo_kokkos> > (fineLevel, "UnAmalgamationInfo");
448  auto fineNullspace = Get< RCP<MultiVector> > (fineLevel, nspName);
449  auto coarseMap = Get< RCP<const Map> > (fineLevel, "CoarseMap");
450  RCP<RealValuedMultiVector> fineCoords;
451  if(bTransferCoordinates_) {
452  fineCoords = Get< RCP<RealValuedMultiVector> >(fineLevel, "Coordinates");
453  }
454 
455  RCP<Matrix> Ptentative;
456  RCP<MultiVector> coarseNullspace;
457  RCP<RealValuedMultiVector> coarseCoords;
458 
459  if(bTransferCoordinates_) {
460  ArrayView<const GO> elementAList = coarseMap->getNodeElementList();
461  GO indexBase = coarseMap->getIndexBase();
462 
463  LO blkSize = 1;
464  if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
465  blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
466 
467  Array<GO> elementList;
468  ArrayView<const GO> elementListView;
469  if (blkSize == 1) {
470  // Scalar system
471  // No amalgamation required
472  elementListView = elementAList;
473 
474  } else {
475  auto numElements = elementAList.size() / blkSize;
476 
477  elementList.resize(numElements);
478 
479  // Amalgamate the map
480  for (LO i = 0; i < Teuchos::as<LO>(numElements); i++)
481  elementList[i] = (elementAList[i*blkSize]-indexBase)/blkSize + indexBase;
482 
483  elementListView = elementList;
484  }
485 
486  auto uniqueMap = fineCoords->getMap();
487  auto coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
488  elementListView, indexBase, coarseMap->getComm());
489  coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
490 
491  // Create overlapped fine coordinates to reduce global communication
492  RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
493  if (aggregates->AggregatesCrossProcessors()) {
494  auto nonUniqueMap = aggregates->GetMap();
495  auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
496 
497  ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
498  ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
499  }
500 
501  // The good new is that his graph has already been constructed for the
502  // TentativePFactory and was cached in Aggregates. So this is a no-op.
503  auto aggGraph = aggregates->GetGraph();
504  auto numAggs = aggGraph.numRows();
505 
506  auto fineCoordsView = fineCoords ->getDeviceLocalView(Xpetra::Access::ReadOnly);
507  auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
508 
509  // Fill in coarse coordinates
510  {
511  SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
512 
513  const auto dim = fineCoords->getNumVectors();
514 
515  typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
516  for (size_t j = 0; j < dim; j++) {
517  Kokkos::parallel_for("MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
518  KOKKOS_LAMBDA(const LO i) {
519  // A row in this graph represents all node ids in the aggregate
520  // Therefore, averaging is very easy
521 
522  auto aggregate = aggGraph.rowConst(i);
523 
524  coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
525  for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
526  sum += fineCoordsRandomView(aggregate(colID),j);
527 
528  coarseCoordsView(i,j) = sum / aggregate.length;
529  });
530  }
531  }
532  }
533 
534  if (!aggregates->AggregatesCrossProcessors())
535  BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
536  else
537  BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
538 
539  // If available, use striding information of fine level matrix A for range
540  // map and coarseMap as domain map; otherwise use plain range map of
541  // Ptent = plain range map of A for range map and coarseMap as domain map.
542  // NOTE:
543  // The latter is not really safe, since there is no striding information
544  // for the range map. This is not really a problem, since striding
545  // information is always available on the intermedium levels and the
546  // coarsest levels.
547  if (A->IsView("stridedMaps") == true)
548  Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
549  else
550  Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
551 
552  if(bTransferCoordinates_) {
553  Set(coarseLevel, "Coordinates", coarseCoords);
554  }
555  Set(coarseLevel, "Nullspace", coarseNullspace);
556  Set(coarseLevel, "P", Ptentative);
557 
558  if (IsPrint(Statistics2)) {
559  RCP<ParameterList> params = rcp(new ParameterList());
560  params->set("printLoadBalancingInfo", true);
561  GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
562  }
563  }
564 
565  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
566  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
567  BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates,
568  RCP<AmalgamationInfo_kokkos> amalgInfo, RCP<MultiVector> fineNullspace,
569  RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
570  RCP<MultiVector>& coarseNullspace, const int levelID) const {
571  auto rowMap = A->getRowMap();
572  auto colMap = A->getColMap();
573 
574  const size_t numRows = rowMap->getNodeNumElements();
575  const size_t NSDim = fineNullspace->getNumVectors();
576 
577  typedef Kokkos::ArithTraits<SC> ATS;
578  using impl_SC = typename ATS::val_type;
579  using impl_ATS = Kokkos::ArithTraits<impl_SC>;
580  const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
581 
582  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
583 
584  typename Aggregates_kokkos::local_graph_type aggGraph;
585  {
586  SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
587  aggGraph = aggregates->GetGraph();
588  }
589  auto aggRows = aggGraph.row_map;
590  auto aggCols = aggGraph.entries;
591 
592  // Aggregates map is based on the amalgamated column map
593  // We can skip global-to-local conversion if LIDs in row map are
594  // same as LIDs in column map
595  bool goodMap;
596  {
597  SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
598  goodMap = isGoodMap(*rowMap, *colMap);
599  }
600  // FIXME_KOKKOS: need to proofread later code for bad maps
601  TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
602  "MueLu: TentativePFactory_kokkos: for now works only with good maps "
603  "(i.e. \"matching\" row and column maps)");
604 
605  // STEP 1: do unamalgamation
606  // The non-kokkos version uses member functions from the AmalgamationInfo
607  // container class to unamalgamate the data. In contrast, the kokkos
608  // version of TentativePFactory does the unamalgamation here and only uses
609  // the data of the AmalgamationInfo container class
610 
611  // Extract information for unamalgamation
612  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
613  GO indexBase;
614  amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
615  GO globalOffset = amalgInfo->GlobalOffset();
616 
617  // Extract aggregation info (already in Kokkos host views)
618  auto procWinner = aggregates->GetProcWinner() ->getDeviceLocalView(Xpetra::Access::ReadOnly);
619  auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
620  const size_t numAggregates = aggregates->GetNumAggregates();
621 
622  int myPID = aggregates->GetMap()->getComm()->getRank();
623 
624  // Create Kokkos::View (on the device) to store the aggreate dof sizes
625  // Later used to get aggregate dof offsets
626  // NOTE: This zeros itself on construction
627  typedef typename Aggregates_kokkos::aggregates_sizes_type::non_const_type AggSizeType;
628  AggSizeType aggDofSizes;
629 
630  if (stridedBlockSize == 1) {
631  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
632 
633  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
634  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates+1);
635 
636  auto sizesConst = aggregates->ComputeAggregateSizes();
637  Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates+1)), sizesConst);
638 
639  } else {
640  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
641 
642  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
643  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
644 
645  auto nodeMap = aggregates->GetMap()->getLocalMap();
646  auto dofMap = colMap->getLocalMap();
647 
648  Kokkos::parallel_for("MueLu:TentativePF:Build:compute_agg_sizes", range_type(0,numAggregates),
649  KOKKOS_LAMBDA(const LO agg) {
650  auto aggRowView = aggGraph.rowConst(agg);
651 
652  size_t size = 0;
653  for (LO colID = 0; colID < aggRowView.length; colID++) {
654  GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
655 
656  for (LO k = 0; k < stridedBlockSize; k++) {
657  GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
658 
659  if (dofMap.getLocalElement(dofGID) != INVALID)
660  size++;
661  }
662  }
663  aggDofSizes(agg+1) = size;
664  });
665  }
666 
667  // Find maximum dof size for aggregates
668  // Later used to reserve enough scratch space for local QR decompositions
669  LO maxAggSize = 0;
670  ReduceMaxFunctor<LO,decltype(aggDofSizes)> reduceMax(aggDofSizes);
671  Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
672 
673  // parallel_scan (exclusive)
674  // The aggDofSizes View then contains the aggregate dof offsets
675  Kokkos::parallel_scan("MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1),
676  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
677  update += aggDofSizes(i);
678  if (final_pass)
679  aggDofSizes(i) = update;
680  });
681 
682  // Create Kokkos::View on the device to store mapping
683  // between (local) aggregate id and row map ids (LIDs)
684  Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
685  {
686  SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
687 
688  AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
689  Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
690 
691  Kokkos::parallel_for("MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
692  KOKKOS_LAMBDA(const LO lnode) {
693  if (procWinner(lnode, 0) == myPID) {
694  // No need for atomics, it's one-to-one
695  auto aggID = vertex2AggId(lnode,0);
696 
697  auto offset = Kokkos::atomic_fetch_add( &aggOffsets(aggID), stridedBlockSize );
698  // FIXME: I think this may be wrong
699  // We unconditionally add the whole block here. When we calculated
700  // aggDofSizes, we did the isLocalElement check. Something's fishy.
701  for (LO k = 0; k < stridedBlockSize; k++)
702  agg2RowMapLO(offset + k) = lnode*stridedBlockSize + k;
703  }
704  });
705  }
706 
707  // STEP 2: prepare local QR decomposition
708  // Reserve memory for tentative prolongation operator
709  coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
710 
711  // Pull out the nullspace vectors so that we can have random access (on the device)
712  auto fineNS = fineNullspace ->getDeviceLocalView(Xpetra::Access::ReadWrite);
713  auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
714 
715  size_t nnz = 0; // actual number of nnz
716 
717  typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
718  typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
719  typedef typename local_matrix_type::index_type::non_const_type cols_type;
720  typedef typename local_matrix_type::values_type::non_const_type vals_type;
721 
722 
723  // Device View for status (error messages...)
724  typedef Kokkos::View<int[10], DeviceType> status_type;
725  status_type status("status");
726 
727  typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
728  typename AppendTrait<status_type, Kokkos::Atomic> ::type statusAtomic = status;
729 
730  const ParameterList& pL = GetParameterList();
731  const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
732  if (!doQRStep) {
733  GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
734  if (NSDim>1)
735  GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
736  }
737 
738  size_t nnzEstimate = numRows * NSDim;
739  rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows+1);
740  cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
741  vals_type valsAux("Ptent_aux_vals", nnzEstimate);
742  rows_type rows("Ptent_rows", numRows+1);
743  {
744  // Stage 0: fill in views.
745  SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
746 
747  // The main thing to notice is initialization of vals with INVALID. These
748  // values will later be used to compress the arrays
749  Kokkos::parallel_for("MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows+1),
750  KOKKOS_LAMBDA(const LO row) {
751  rowsAux(row) = row*NSDim;
752  });
753  Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
754  KOKKOS_LAMBDA(const LO j) {
755  colsAux(j) = INVALID;
756  });
757  }
758 
759  if (NSDim == 1) {
760  // 1D is special, as it is the easiest. We don't even need to the QR,
761  // just normalize an array. Plus, no worries abot small aggregates. In
762  // addition, we do not worry about compression. It is unlikely that
763  // nullspace will have zeros. If it does, a prolongator row would be
764  // zero and we'll get singularity anyway.
765  SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
766 
767  // Set up team policy with numAggregates teams and one thread per team.
768  // Each team handles a slice of the data associated with one aggregate
769  // and performs a local QR decomposition (in this case real QR is
770  // unnecessary).
771  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
772 
773  if (doQRStep) {
774  Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:main_loop", policy,
775  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
776  auto agg = thread.league_rank();
777 
778  // size of the aggregate (number of DOFs in aggregate)
779  LO aggSize = aggRows(agg+1) - aggRows(agg);
780 
781  // Extract the piece of the nullspace corresponding to the aggregate, and
782  // put it in the flat array, "localQR" (in column major format) for the
783  // QR routine. Trivial in 1D.
784  auto norm = impl_ATS::magnitude(zero);
785 
786  // Calculate QR by hand
787  // FIXME: shouldn't there be stridedblock here?
788  // FIXME_KOKKOS: shouldn't there be stridedblock here?
789  for (decltype(aggSize) k = 0; k < aggSize; k++) {
790  auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
791  norm += dnorm*dnorm;
792  }
793  norm = sqrt(norm);
794 
795  if (norm == zero) {
796  // zero column; terminate the execution
797  statusAtomic(1) = true;
798  return;
799  }
800 
801  // R = norm
802  coarseNS(agg, 0) = norm;
803 
804  // Q = localQR(:,0)/norm
805  for (decltype(aggSize) k = 0; k < aggSize; k++) {
806  LO localRow = agg2RowMapLO(aggRows(agg)+k);
807  impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
808 
809  rows(localRow+1) = 1;
810  colsAux(localRow) = agg;
811  valsAux(localRow) = localVal;
812 
813  }
814  });
815 
816  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
817  Kokkos::deep_copy(statusHost, status);
818  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
819  if (statusHost(i)) {
820  std::ostringstream oss;
821  oss << "MueLu::TentativePFactory::MakeTentative: ";
822  switch (i) {
823  case 0: oss << "!goodMap is not implemented"; break;
824  case 1: oss << "fine level NS part has a zero column"; break;
825  }
826  throw Exceptions::RuntimeError(oss.str());
827  }
828 
829  } else {
830  Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
831  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type &thread) {
832  auto agg = thread.league_rank();
833 
834  // size of the aggregate (number of DOFs in aggregate)
835  LO aggSize = aggRows(agg+1) - aggRows(agg);
836 
837  // R = norm
838  coarseNS(agg, 0) = one;
839 
840  // Q = localQR(:,0)/norm
841  for (decltype(aggSize) k = 0; k < aggSize; k++) {
842  LO localRow = agg2RowMapLO(aggRows(agg)+k);
843  impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0);
844 
845  rows(localRow+1) = 1;
846  colsAux(localRow) = agg;
847  valsAux(localRow) = localVal;
848 
849  }
850  });
851  }
852 
853  Kokkos::parallel_reduce("MueLu:TentativeP:CountNNZ", range_type(0, numRows+1),
854  KOKKOS_LAMBDA(const LO i, size_t &nnz_count) {
855  nnz_count += rows(i);
856  }, nnz);
857 
858  } else { // NSdim > 1
859  // FIXME_KOKKOS: This code branch is completely unoptimized.
860  // Work to do:
861  // - Optimize QR decomposition
862  // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
863  // packing new values in the beginning of each row
864  // We do use auxilary view in this case, so keep a second rows view for
865  // counting nonzeros in rows
866 
867  {
868  SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
869  // Set up team policy with numAggregates teams and one thread per team.
870  // Each team handles a slice of the data associated with one aggregate
871  // and performs a local QR decomposition
872  const Kokkos::TeamPolicy<execution_space> policy(numAggregates,1); // numAggregates teams a 1 thread
873  LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
874  decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
875  decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
876  decltype(valsAux)>
877  localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
878  rows, rowsAux, colsAux, valsAux, doQRStep);
879  Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
880  }
881 
882  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
883  Kokkos::deep_copy(statusHost, status);
884  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
885  if (statusHost(i)) {
886  std::ostringstream oss;
887  oss << "MueLu::TentativePFactory::MakeTentative: ";
888  switch(i) {
889  case 0: oss << "!goodMap is not implemented"; break;
890  case 1: oss << "fine level NS part has a zero column"; break;
891  }
892  throw Exceptions::RuntimeError(oss.str());
893  }
894  }
895 
896  // Compress the cols and vals by ignoring INVALID column entries that correspond
897  // to 0 in QR.
898 
899  // The real cols and vals are constructed using calculated (not estimated) nnz
900  cols_type cols;
901  vals_type vals;
902 
903  if (nnz != nnzEstimate) {
904  {
905  // Stage 2: compress the arrays
906  SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
907 
908  Kokkos::parallel_scan("MueLu:TentativePF:Build:compress_rows", range_type(0,numRows+1),
909  KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
910  upd += rows(i);
911  if (final)
912  rows(i) = upd;
913  });
914  }
915 
916  {
917  SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
918 
919  cols = cols_type("Ptent_cols", nnz);
920  vals = vals_type("Ptent_vals", nnz);
921 
922  // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
923  // to the beginning of rows. See CoalesceDropFactory_kokkos for
924  // example.
925  Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols_vals", range_type(0,numRows),
926  KOKKOS_LAMBDA(const LO i) {
927  LO rowStart = rows(i);
928 
929  size_t lnnz = 0;
930  for (auto j = rowsAux(i); j < rowsAux(i+1); j++)
931  if (colsAux(j) != INVALID) {
932  cols(rowStart+lnnz) = colsAux(j);
933  vals(rowStart+lnnz) = valsAux(j);
934  lnnz++;
935  }
936  });
937  }
938 
939  } else {
940  rows = rowsAux;
941  cols = colsAux;
942  vals = valsAux;
943  }
944 
945  GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
946 
947  {
948  // Stage 3: construct Xpetra::Matrix
949  SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
950 
951  local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getNodeNumElements(), nnz, vals, rows, cols);
952 
953  // Managing labels & constants for ESFC
954  RCP<ParameterList> FCparams;
955  if (pL.isSublist("matrixmatrix: kernel params"))
956  FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
957  else
958  FCparams = rcp(new ParameterList);
959 
960  // By default, we don't need global constants for TentativeP
961  FCparams->set("compute global constants", FCparams->get("compute global constants", false));
962  FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
963 
964  auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
965  Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
966  }
967  }
968 
969  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
970  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
971  BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates_kokkos> /* aggregates */,
972  RCP<AmalgamationInfo_kokkos> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
973  RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
974  RCP<MultiVector>& /* coarseNullspace */) const {
975  throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
976  }
977 
978  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
979  bool TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
980  isGoodMap(const Map& rowMap, const Map& colMap) const {
981  auto rowLocalMap = rowMap.getLocalMap();
982  auto colLocalMap = colMap.getLocalMap();
983 
984  const size_t numRows = rowLocalMap.getNodeNumElements();
985  const size_t numCols = colLocalMap.getNodeNumElements();
986 
987  if (numCols < numRows)
988  return false;
989 
990  size_t numDiff = 0;
991  Kokkos::parallel_reduce("MueLu:TentativePF:isGoodMap", range_type(0, numRows),
992  KOKKOS_LAMBDA(const LO i, size_t &diff) {
993  diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
994  }, numDiff);
995 
996  return (numDiff == 0);
997  }
998 
999 } //namespace MueLu
1000 
1001 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1002 #endif // HAVE_MUELU_KOKKOS_REFACTOR
1003 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
Important warning messages (one line)
View
MueLu::DefaultLocalOrdinal LocalOrdinal
std::string toString(const T &what)
Little helper function to convert non-string types to strings.
Namespace for MueLu classes and methods.
static const NoFactory * get()
Print even more statistics.
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
Description of what is happening (more verbose)
#define SET_VALID_ENTRY(name)