Intrepid2_IntegrationToolsDef.hpp
// @HEADER
// ************************************************************************
//
// Intrepid2 Package
// Copyright (2007) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Kyungjoo Kim (kyukim@sandia.gov),
//                    Mauro Perego (mperego@sandia.gov), or
//                    Nate Roberts (nvrober@sandia.gov)
//
// ************************************************************************
// @HEADER

#ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
#define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__

#include "Teuchos_TimeMonitor.hpp"

namespace Intrepid2 {

  namespace Impl
  {
    template<class Scalar, class DeviceType, int integralViewRank>
    class F_Integrate // class name restored from the Intrepid2 sources; the line carrying it was dropped in this listing
    {
      using ExecutionSpace = typename DeviceType::execution_space;
      using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
      using TeamMember = typename TeamPolicy::member_type;

      using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
      IntegralViewType integralView_;
      TensorData<Scalar,DeviceType> leftComponent_;
      Data<Scalar,DeviceType> composedTransform_;
      TensorData<Scalar,DeviceType> rightComponent_;
      TensorData<Scalar,DeviceType> cellMeasures_;
      int a_offset_;
      int b_offset_;
      int leftComponentSpan_;  // leftComponentSpan tracks the dimensions spanned by the left component
      int rightComponentSpan_; // rightComponentSpan tracks the dimensions spanned by the right component
      int numTensorComponents_;
      int leftFieldOrdinalOffset_;
      int rightFieldOrdinalOffset_;
      bool forceNonSpecialized_; // if true, don't use the specialized (more manual) implementation(s) for particular component counts. Primary use case is for testing.

      size_t fad_size_output_ = 0; // 0 if not a fad type

      Kokkos::Array<int, 7> offsetsForComponentOrdinal_;

      // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
      // (this also makes it easier to reorder loops, etc., for further optimizations)
      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;

      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_; // total number of enumeration indices with arguments prior to the startingComponent fixed
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;

      int maxFieldsLeft_;
      int maxFieldsRight_;
      int maxPointCount_;
    public:
      F_Integrate(Data<Scalar,DeviceType> integralData, // constructor name restored from context; the line carrying it was dropped in this listing
                  TensorData<Scalar,DeviceType> leftComponent,
                  Data<Scalar,DeviceType> composedTransform,
                  TensorData<Scalar,DeviceType> rightComponent,
                  TensorData<Scalar,DeviceType> cellMeasures,
                  int a_offset,
                  int b_offset,
                  int leftFieldOrdinalOffset,
                  int rightFieldOrdinalOffset,
                  bool forceNonSpecialized)
      :
      integralView_(integralData.template getUnderlyingView<integralViewRank>()),
      leftComponent_(leftComponent),
      composedTransform_(composedTransform),
      rightComponent_(rightComponent),
      cellMeasures_(cellMeasures),
      a_offset_(a_offset),
      b_offset_(b_offset),
      leftComponentSpan_(leftComponent.extent_int(2)),
      rightComponentSpan_(rightComponent.extent_int(2)),
      numTensorComponents_(leftComponent.numTensorComponents()),
      leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
      rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
      forceNonSpecialized_(forceNonSpecialized)
      {
        INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");

        // set up bounds containers
        const int FIELD_DIM = 0;
        const int POINT_DIM = 1;
        maxFieldsLeft_  = 0;
        maxFieldsRight_ = 0;
        maxPointCount_  = 0;
        for (int r=0; r<numTensorComponents_; r++)
        {
          leftFieldBounds_[r]  = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsLeft_       = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
          rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsRight_      = std::max(maxFieldsRight_, rightFieldBounds_[r]);
          pointBounds_[r]      = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
          maxPointCount_       = std::max(maxPointCount_, pointBounds_[r]);
        }

        // set up relative enumeration spans: total number of enumeration indices with arguments prior to the startingComponent fixed. These are for *truncated* iterators; hence the -2 rather than -1 for the first startingComponent value.
        int leftRelativeEnumerationSpan  = 1;
        int rightRelativeEnumerationSpan = 1;
        for (int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
        {
          leftRelativeEnumerationSpan  *= leftFieldBounds_[startingComponent];
          rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
          leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
          rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
        }
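
        // Editor's note (illustrative example, not in the original source): with three
        // tensor components and leftFieldBounds_ = {2,3,4}, the loop above yields
        //   leftFieldRelativeEnumerationSpans_[1] = 3       (= bounds[1])
        //   leftFieldRelativeEnumerationSpans_[0] = 2*3 = 6 (= bounds[0]*bounds[1])
        // so spans[s] counts the enumeration indices generated by components s..R-1;
        // the final component R is excluded because its field arguments (i_R, j_R)
        // are held fixed by each thread in the parallel_for loops below.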

        // prepare for allocation of temporary storage
        // note: tempStorage goes "backward", starting from the final component, which needs just one entry

        const bool allocateFadStorage = !std::is_pod<Scalar>::value;
        if (allocateFadStorage)
        {
          fad_size_output_ = dimension_scalar(integralView_);
        }

        const int R = numTensorComponents_ - 1;

        int num_ij = 1; // this counts how many entries there are corresponding to components from r to R-1.
        int allocationSoFar = 0;
        offsetsForComponentOrdinal_[R] = allocationSoFar;
        allocationSoFar++; // we store one entry corresponding to R, the final component

        for (int r=R-1; r>0; r--) // last component is innermost in the for loops (requires least storage)
        {
          const int leftFields  = leftComponent.getTensorComponent(r).extent_int(0);
          const int rightFields = rightComponent.getTensorComponent(r).extent_int(0);

          num_ij *= leftFields * rightFields;

          offsetsForComponentOrdinal_[r] = allocationSoFar;
          allocationSoFar += num_ij;
        }
        offsetsForComponentOrdinal_[0] = allocationSoFar; // first component stores directly to final integralView.
      }
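
      // Editor's note (illustrative example, not in the original source): the per-thread
      // scratch layout computed above, for three components with 3 left and 3 right
      // fields in the middle component, is:
      //   offsetsForComponentOrdinal_[2] = 0   -- one entry for G_z
      //   offsetsForComponentOrdinal_[1] = 1   -- 3*3 = 9 entries for G_y, one per (i_y,j_y)
      //   offsetsForComponentOrdinal_[0] = 10  -- total scratch entries required per thread
      // Component 0 needs no scratch of its own: its contributions accumulate directly
      // into integralView_.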

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,maxComponents> &arguments,
                            const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0)
        {
          return -1;
        }
        else
        {
          int r = static_cast<int>(numComponents - 1);
          while (arguments[r] + 1 >= bounds[r])
          {
            arguments[r] = 0; // reset
            r--;
            if (r < 0) break;
          }
          if (r >= 0) ++arguments[r];
          return r;
        }
      }
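
      // Editor's note (illustrative example, not in the original source): incrementArgument()
      // advances a multi-index the way an odometer advances, with the last component
      // moving fastest; it returns the least component ordinal that was incremented,
      // or -1 once iteration is exhausted. For bounds = {2,3}:
      //   {0,0} -> {0,1} -> {0,2}   (each increment returns 1)
      //   {0,2} -> {1,0}            (returns 0: component 1 wrapped)
      //   {1,2} -> returns -1, and the arguments wrap back to {0,0}.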

      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                            const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                            const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
                              const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0)
        {
          return -1;
        }
        else
        {
          int r = static_cast<int>(numComponents - 1);
          while (arguments[r] + 1 >= bounds[r])
          {
            r--;
            if (r < 0) break;
          }
          return r;
        }
      }

      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                              const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                              const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = numComponents - 1;
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
                                   const Kokkos::Array<int,maxComponents> &bounds,
                                   const int startIndex) const
      {
        // the following mirrors what is done in TensorData
        if (numComponents == 0)
        {
          return 0;
        }
        else
        {
          int enumerationIndex = 0;
          for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
          {
            enumerationIndex += arguments[r];
            enumerationIndex *= bounds[r-1];
          }
          enumerationIndex += arguments[startIndex];
          return enumerationIndex;
        }
      }
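
      // Editor's note (illustrative example, not in the original source): relativeEnumerationIndex()
      // linearizes a multi-index with component startIndex varying fastest. For
      // bounds = {2,3,4}, arguments = {1,2,3}, and startIndex = 0:
      //   index = ((3)*3 + 2)*2 + 1 = 23
      // equivalently, index = a0 + b0*(a1 + b1*a2), the last of the 2*3*4 = 24 indices.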

      // nvcc refuses to compile the below, with the error "explicit specialization is not allowed in the current scope". Clang is OK with it. We just do a non-templated version below.
//      template<size_t numTensorComponents>
//      KOKKOS_INLINE_FUNCTION
//      void runSpecialized( const TeamMember & teamMember ) const;

//      template<>
//      KOKKOS_INLINE_FUNCTION
//      void runSpecialized<3>( const TeamMember & teamMember ) const
      KOKKOS_INLINE_FUNCTION
      void runSpecialized3( const TeamMember & teamMember ) const
      {
        constexpr int numTensorComponents = 3;

        Kokkos::Array<int,numTensorComponents> pointBounds;
        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          pointBounds[r]      = pointBounds_[r];
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const int cellDataOrdinal = teamMember.league_rank();
        const int numThreads      = teamMember.team_size(); // num threads
        const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread

        Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged> scratchView;  // for caching partial integration values
        Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
        Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z; // cache the field values for faster access
        if (fad_size_output_ > 0) {
          scratchView   = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
          pointWeights  = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  leftFieldBounds[0], pointBounds[0], fad_size_output_);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  leftFieldBounds[1], pointBounds[1], fad_size_output_);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  leftFieldBounds[2], pointBounds[2], fad_size_output_);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
        }
        else {
          scratchView   = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
          pointWeights  = Kokkos::View<Scalar*,  DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));
          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  leftFieldBounds[0], pointBounds[0]);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  leftFieldBounds[1], pointBounds[1]);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  leftFieldBounds[2], pointBounds[2]);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

        constexpr int R = numTensorComponents - 1;

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            const Data<Scalar,DeviceType> & leftTensorComponent_x  = leftComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & leftTensorComponent_y  = leftComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & leftTensorComponent_z  = leftComponent_.getTensorComponent(2);
            const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);

            const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (const int& fieldOrdinalPointOrdinal) {
              const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
              const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
              if ((fieldOrdinal < leftTensorComponent_x.extent_int(0)) && (pointOrdinal < leftTensorComponent_x.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_x.rank();
                leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < leftTensorComponent_y.extent_int(0)) && (pointOrdinal < leftTensorComponent_y.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_y.rank();
                leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < leftTensorComponent_z.extent_int(0)) && (pointOrdinal < leftTensorComponent_z.extent_int(1)))
              {
                const int leftRank = leftTensorComponent_z.rank();
                leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
              }
              if ((fieldOrdinal < rightTensorComponent_x.extent_int(0)) && (pointOrdinal < rightTensorComponent_x.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_x.rank();
                rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
              }
              if ((fieldOrdinal < rightTensorComponent_y.extent_int(0)) && (pointOrdinal < rightTensorComponent_y.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_y.rank();
                rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
              }
              if ((fieldOrdinal < rightTensorComponent_z.extent_int(0)) && (pointOrdinal < rightTensorComponent_z.extent_int(1)))
              {
                const int rightRank = rightTensorComponent_z.rank();
                rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
              }
            });

            if (composedTransform_.underlyingMatchesLogical())
            {
              const auto & composedTransformView = composedTransform_.getUnderlyingView4();
              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
                pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
              });
            }
            else
            {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
                pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
              });
            }

//            approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

            // synchronize threads
            teamMember.team_barrier();

            // Setting scratchView to 0 here is not necessary from an algorithmic point of view, but *might* help with performance (due to a first-touch policy)
            const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
            for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
            {
              scratchView(i) = 0.0;
            }

            // TODO: consider adding an innerLoopRange that is sized to be the maximum of the size of the inner loops we'd like to parallelize over. (note that this means we do the work in the outer loop redundantly that many times…)
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
              const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
              const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;

              // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
              // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
              Kokkos::Array<int,numTensorComponents> leftFieldArguments;
              Kokkos::Array<int,numTensorComponents> rightFieldArguments;
              rightFieldArguments[R] = jz;
              leftFieldArguments[R]  = iz;

              Kokkos::Array<int,numTensorComponents> pointArguments;
              for (int i=0; i<numTensorComponents; i++)
              {
                pointArguments[i] = 0;
              }

              for (int lx=0; lx<pointBounds[0]; lx++)
              {
                pointArguments[0] = lx;

                // clear Gy scratch:
                // in scratch, Gz (1 entry) comes first, then Gy entries.
                const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
                const int Gy_end_index   = scratchOffsetForThread + offsetsForComponentOrdinal_[0];

                for (int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
                {
                  scratchView(Gy_index) = 0;
                }

                for (int ly=0; ly<pointBounds[1]; ly++)
                {
                  pointArguments[1] = ly;

                  Scalar * Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
                  *Gz = 0;

                  for (int lz=0; lz < pointBounds[2]; lz++)
                  {
                    const Scalar & leftValue  = leftFields_z(iz,lz);
                    const Scalar & rightValue = rightFields_z(jz,lz);

                    pointArguments[2] = lz;
                    int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);

                    *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;

//                    approximateFlopCount += 3; // 2 multiplies, 1 sum
                  } // lz

                  for (int iy=0; iy<leftFieldBounds_[1]; iy++)
                  {
                    leftFieldArguments[1] = iy;
                    const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);

                    const Scalar & leftValue = leftFields_y(iy,ly);

                    for (int jy=0; jy<rightFieldBounds_[1]; jy++)
                    {
                      rightFieldArguments[1] = jy;

                      const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
                      const Scalar & rightValue = rightFields_y(jy,ly);

                      const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
                      const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;

                      const int Gz_index = 0;
                      const Scalar & Gz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);

                      Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);

                      Gy += leftValue * Gz * rightValue;
//                      approximateFlopCount += 3; // 2 multiplies, 1 sum
                    }
                  }
                } // ly
                for (int ix=0; ix<leftFieldBounds_[0]; ix++)
                {
                  leftFieldArguments[0] = ix;
                  const Scalar & leftValue = leftFields_x(ix,lx);

                  for (int iy=0; iy<leftFieldBounds_[1]; iy++)
                  {
                    leftFieldArguments[1] = iy;

                    const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);

                    for (int jx=0; jx<rightFieldBounds_[0]; jx++)
                    {
                      rightFieldArguments[0] = jx;
                      const Scalar & rightValue = rightFields_x(jx,lx);

                      for (int jy=0; jy<rightFieldBounds_[1]; jy++)
                      {
                        rightFieldArguments[1] = jy;
                        const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);

                        const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];

                        const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
                        Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);

                        // compute enumeration indices to get field indices into output view
                        int leftEnumerationIndex  = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments,  leftFieldBounds, 0);
                        int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
                        const int leftFieldIndex  = leftEnumerationIndex  + leftFieldOrdinalOffset_;
                        const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;

                        if (integralViewRank == 3)
                        {
//                          if ((leftFieldIndex==0) && (rightFieldIndex==2))
//                          {
//                            using std::cout;
//                            using std::endl;
//                            cout << "***** Contribution to (0,0,2) *****\n";
//                            cout << "lx = " << lx << endl;
//                            cout << "ix = " << ix << endl;
//                            cout << "iy = " << iy << endl;
//                            cout << "jx = " << jx << endl;
//                            cout << "jy = " << jy << endl;
//                            cout << "iz = " << iz << endl;
//                            cout << "jz = " << jz << endl;
//                            cout << " leftValue = " << leftValue << endl;
//                            cout << "rightValue = " << rightValue << endl;
//                            cout << "Gy = " << Gy << endl;
//
//                            cout << endl;
//                          }

                          // shape (C,F1,F2)
                          integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
                        }
                        else
                        {
                          // shape (F1,F2)
                          integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
                        }
//                        approximateFlopCount += 3; // 2 multiplies, 1 sum
                      } // jy
                    } // jx
                  } // iy
                } // ix
              } // lx
            }); // TeamThreadRange parallel_for - (iz,jz) loop
          }
        }
//        std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
      }
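
      // Editor's note (not in the original source): runSpecialized3 is a sum-factorization
      // pass. For a degree-p tensor-product basis in 3D (roughly p+1 fields and p+1
      // quadrature points per component dimension), the staged Gz/Gy accumulations above
      // cost roughly O(p^7) operations per cell, versus O(p^9) for a naive loop nest over
      // all test fields, trial fields, and expanded quadrature points.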

      template<size_t numTensorComponents>
      KOKKOS_INLINE_FUNCTION
      void run( const TeamMember & teamMember ) const
      {
        Kokkos::Array<int,numTensorComponents> pointBounds;
        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          pointBounds[r]      = pointBounds_[r];
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const int cellDataOrdinal = teamMember.league_rank();
        const int numThreads      = teamMember.team_size(); // num threads
        const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread
        Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged> scratchView;  // for caching partial integration values
        Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
        Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values for faster access
        if (fad_size_output_ > 0) {
          scratchView  = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
          pointWeights = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
          leftFields   = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_,  maxPointCount_, fad_size_output_);
          rightFields  = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
        }
        else {
          scratchView  = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
          pointWeights = Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));
          leftFields   = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_,  maxPointCount_);
          rightFields  = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

        constexpr int R = numTensorComponents - 1;

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (const int& r) {
              const Data<Scalar,DeviceType> & leftTensorComponent  = leftComponent_.getTensorComponent(r);
              const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.getTensorComponent(r);
              const int leftFieldCount  = leftTensorComponent.extent_int(0);
              const int pointCount      = leftTensorComponent.extent_int(1);
              const int leftRank        = leftTensorComponent.rank();
              const int rightFieldCount = rightTensorComponent.extent_int(0);
              const int rightRank       = rightTensorComponent.rank();
              for (int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
              {
                // slightly weird logic here in effort to avoid branch divergence
                const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
                for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
                {
                  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
                  leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
                }
              }
              for (int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
              {
                // slightly weird logic here in effort to avoid branch divergence
                const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
                for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
                {
                  const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
                  rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
                }
              }
            });

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
              pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
            });
//            approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

            // synchronize threads
            teamMember.team_barrier();

            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
              const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
              const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
              const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;

              // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
              // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
              Kokkos::Array<int,numTensorComponents> leftFieldArguments;
              Kokkos::Array<int,numTensorComponents> rightFieldArguments;
              rightFieldArguments[R] = j_R;
              leftFieldArguments[R]  = i_R;

              // TODO: I believe that this can be moved outside the thread parallel_for
              for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
              {
                scratchView(i) = 0.0;
              }
              Kokkos::Array<int,numTensorComponents> pointArguments;
              for (unsigned i=0; i<numTensorComponents; i++)
              {
                pointArguments[i] = 0;
              }

              int r = R;
              while (r >= 0)
              {
                // integrate in final component dimension; this is where we need the M weight, as well as the weighted measure
                const int pointBounds_R = pointBounds[R];
                int & pointArgument_R = pointArguments[R];
                for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
                {
                  Scalar * G_R;
                  if (R != 0)
                  {
                    G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
                  }
                  else
                  {
                    const int leftFieldIndex  = i_R + leftFieldOrdinalOffset_;
                    const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;

                    if (integralViewRank == 3)
                    {
                      // shape (C,F1,F2)
                      G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
                    }
                    else
                    {
                      // shape (F1,F2)
                      G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
                    }
                  }

                  const int & pointIndex_R = pointArguments[R];

                  const Scalar & leftValue  = leftFields(R,i_R,pointIndex_R);
                  const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);

                  int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);

                  *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;

//                  approximateFlopCount += 3; // 2 multiplies, 1 sum
                } // pointArgument_R

                const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents-1 means that we won't touch the [R] argument, which is treated in for loop above
                const int r_min  = (r_next >= 0) ? r_next : 0;

                for (int s=R-1; s>=r_min; s--)
                {
                  const int & pointIndex_s = pointArguments[s];

                  // want to cover all the multi-indices from s to R-1
                  for (int s1=s; s1<R; s1++)
                  {
                    leftFieldArguments[s1] = 0;
                  }

                  // i_s, j_s are the indices into the "current" tensor component; these are references, so they actually vary as the arguments are incremented.
                  const int & i_s = leftFieldArguments[s];
                  const int & j_s = rightFieldArguments[s];

                  int sLeft = s; // hereafter, sLeft is the return from the left field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indexes from rank s through R-1 (inclusive).
                  const int & rightEnumerationSpan_s  = rightFieldRelativeEnumerationSpans_[s];
                  const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];

                  while (sLeft >= s)
                  {
                    const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);

                    // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
                    const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);

                    const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);

                    for (int s1=s; s1<R; s1++)
                    {
                      rightFieldArguments[s1] = 0;
                    }
                    int sRight = s; // hereafter, sRight is the return from the right field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indexes from rank s through R-1 (inclusive).
                    while (sRight >= s)
                    {
                      const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);

                      // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
                      const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);

                      const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);

                      const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;

                      Scalar* G_s;

                      // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage
                      // (above, we have set the leftEnumerationIndex_sp, rightEnumerationIndex_sp to be 0 in this case, so G_sp_index will then also be 0)
                      const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;

                      const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);

                      if (s != 0)
                      {
                        G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
                      }
                      else
                      {
                        // compute enumeration indices
                        int leftEnumerationIndex  = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments,  leftFieldBounds, 0);
                        int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
                        const int leftFieldIndex  = leftEnumerationIndex  + leftFieldOrdinalOffset_;
                        const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;

                        if (integralViewRank == 3)
                        {
                          // shape (C,F1,F2)
                          G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
                        }
                        else
                        {
                          // shape (F1,F2)
                          G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
                        }
                      }

                      *G_s += leftValue * G_sp * rightValue;

//                      approximateFlopCount += 3; // 2 multiplies, 1 sum

                      // increment rightField
                      sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
                    }

                    // increment leftField
                    sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
                  }
                }

                // clear tempStorage for r_next+1 to R
                if (r_min+1 <= R)
                {
                  const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
                  for (int i=scratchOffsetForThread; i<endIndex; i++)
                  {
                    scratchView(i) = 0.0;
                  }
//                  auto tempStorageSubview = Kokkos::subview(scratchView, Kokkos::pair<int,int>{0,offsetsForComponentOrdinal_[r_min]});
//                  Kokkos::deep_copy(tempStorageSubview, 0.0);
                }

                // proceed to next point
                r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents-1 means that we won't touch the [R] argument, which is treated in the G_R integration for loop above
              }
            }); // TeamThreadRange parallel_for
          }
        }
//        std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
      }

      KOKKOS_INLINE_FUNCTION
      void operator()( const TeamMember & teamMember ) const
      {
        switch (numTensorComponents_)
        {
          case 1: run<1>(teamMember); break;
          case 2: run<2>(teamMember); break;
          case 3:
            if (forceNonSpecialized_)
              run<3>(teamMember);
            else
              runSpecialized3(teamMember);
            break;
          case 4: run<4>(teamMember); break;
          case 5: run<5>(teamMember); break;
          case 6: run<6>(teamMember); break;
          case 7: run<7>(teamMember); break;
#ifdef INTREPID2_HAVE_DEBUG
          default:
            INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
#endif
        }
      }

      //! returns an approximate count of the floating point operations performed per cell (the signature line was dropped in this listing; the name used here is restored from context)
      int approximateFlopCountPerCell() const
      {
        // compute flop count on a single cell, then multiply by the number of cells
        int flopCount = 0;

        const int R = numTensorComponents_ - 1;

        // we cache the value of M_ab * cellMeasure at each point.
        // access to cellMeasures involves cellMeasures.numTensorComponents() - 1 multiplies, so total is the below:
        const int flopsPerPoint_ab = cellMeasures_.numTensorComponents(); // the access involves multiplying all the components together

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const Data<Scalar,DeviceType> & leftFinalComponent  = leftComponent_.getTensorComponent(R);
            const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);

            const int numLeftFieldsFinal  = leftFinalComponent.extent_int(0);  // shape (F,P[,D])
            const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])

            flopCount += flopsPerPoint_ab * cellMeasures_.extent_int(1);

            int flopsPer_i_R_j_R = 0;

            // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
            // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
            Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
            leftFieldArguments[R] = 0;

            Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
            for (int i=0; i<numTensorComponents_; i++)
            {
              pointArguments[i] = 0;
            }

            // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
            // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
            Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
            rightFieldArguments[R] = 0;

            int r = R;
            while (r >= 0)
            {
              // integrate in final component dimension
              for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
              {
                flopsPer_i_R_j_R += 4;
              }
              // TODO: figure out why the below is not the same thing as the above -- the below overcounts, somehow
//              if (0 < pointBounds_[R])
//              {
//                flopsPer_i_R_j_R += pointBounds_[R] * 4;
//              }

              const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
              const int r_min  = (r_next >= 0) ? r_next : 0;

              for (int s=R-1; s>=r_min; s--)
              {
                // want to cover all the multi-indices from s to R-1: for each we have 2 multiplies and one add (3 flops)
                int numLeftIterations  = leftFieldRelativeEnumerationSpans_[s];
                int numRightIterations = rightFieldRelativeEnumerationSpans_[s];

                flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
              }

              // proceed to next point
              r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
            }

            flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
          }
        }
//        std::cout << "flop count per cell: " << flopCount << std::endl;
        return flopCount;
      }

      int teamSize(const int &maxTeamSizeFromKokkos) const
      {
        const int R = numTensorComponents_ - 1;
        const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
        return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
      }

      size_t team_shmem_size (int team_size) const
      {
        // we use shared memory to create a fast buffer for intermediate values, as well as fast access to the current-cell's field values
        size_t shmem_size = 0;

        if (fad_size_output_ > 0)
        {
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1), fad_size_output_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_,  maxPointCount_, fad_size_output_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
        }
        else
        {
          shmem_size += Kokkos::View<Scalar*,   DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
          shmem_size += Kokkos::View<Scalar*,   DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1));
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_,  maxPointCount_);
          shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
        }

        return shmem_size;
      }
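
      // Editor's note (illustrative example, not in the original source): for POD
      // Scalar = double, team_size = 32, a per-thread scratch of 10 entries, 64 expanded
      // quadrature points, and 3 components with maxFieldsLeft_ = maxFieldsRight_ = 4 and
      // maxPointCount_ = 4, the request is roughly
      //   (10*32 + 64 + 3*4*4 + 3*4*4) * sizeof(double) = 480 * 8 = 3840 bytes,
      // plus whatever per-view alignment padding Kokkos::View<...>::shmem_size includes.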
    };

    template<class Scalar, class DeviceType, int integralViewRank>
    class F_IntegratePointValueCache // class name restored from the Intrepid2 sources; the line carrying it was dropped in this listing
    {
      using ExecutionSpace = typename DeviceType::execution_space;
      using TeamPolicy = Kokkos::TeamPolicy<DeviceType>;
      using TeamMember = typename TeamPolicy::member_type;

      using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
      IntegralViewType integralView_;
      TensorData<Scalar,DeviceType> leftComponent_;
      Data<Scalar,DeviceType> composedTransform_;
      TensorData<Scalar,DeviceType> rightComponent_;
      TensorData<Scalar,DeviceType> cellMeasures_;
      int a_offset_;
      int b_offset_;
      int leftComponentSpan_;  // leftComponentSpan tracks the dimensions spanned by the left component
      int rightComponentSpan_; // rightComponentSpan tracks the dimensions spanned by the right component
      int numTensorComponents_;
      int leftFieldOrdinalOffset_;
      int rightFieldOrdinalOffset_;

      size_t fad_size_output_ = 0; // 0 if not a fad type

      // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
      // (this also makes it easier to reorder loops, etc., for further optimizations)
      Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
      Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;

      int maxFieldsLeft_;
      int maxFieldsRight_;
      int maxPointCount_;
    public:
      F_IntegratePointValueCache(Data<Scalar,DeviceType> integralData, // constructor name restored from context; the line carrying it was dropped in this listing
                                 TensorData<Scalar,DeviceType> leftComponent,
                                 Data<Scalar,DeviceType> composedTransform,
                                 TensorData<Scalar,DeviceType> rightComponent,
                                 TensorData<Scalar,DeviceType> cellMeasures,
                                 int a_offset,
                                 int b_offset,
                                 int leftFieldOrdinalOffset,
                                 int rightFieldOrdinalOffset)
      :
      integralView_(integralData.template getUnderlyingView<integralViewRank>()),
      leftComponent_(leftComponent),
      composedTransform_(composedTransform),
      rightComponent_(rightComponent),
      cellMeasures_(cellMeasures),
      a_offset_(a_offset),
      b_offset_(b_offset),
      leftComponentSpan_(leftComponent.extent_int(2)),
      rightComponentSpan_(rightComponent.extent_int(2)),
      numTensorComponents_(leftComponent.numTensorComponents()),
      leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
      rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
      {
        INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");

        const int FIELD_DIM = 0;
        const int POINT_DIM = 1;
        maxFieldsLeft_  = 0;
        maxFieldsRight_ = 0;
        maxPointCount_  = 0;
        for (int r=0; r<numTensorComponents_; r++)
        {
          leftFieldBounds_[r]  = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsLeft_       = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
          rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
          maxFieldsRight_      = std::max(maxFieldsRight_, rightFieldBounds_[r]);
          pointBounds_[r]      = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
          maxPointCount_       = std::max(maxPointCount_, pointBounds_[r]);
        }

        // prepare for allocation of temporary storage
        // note: tempStorage goes "backward", starting from the final component, which needs just one entry

        const bool allocateFadStorage = !std::is_pod<Scalar>::value;
        if (allocateFadStorage)
        {
          fad_size_output_ = dimension_scalar(integralView_);
        }
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,maxComponents> &arguments,
                            const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      KOKKOS_INLINE_FUNCTION
      int incrementArgument(      Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                            const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                            const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          arguments[r] = 0; // reset
          r--;
          if (r < 0) break;
        }
        if (r >= 0) ++arguments[r];
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
                              const Kokkos::Array<int,maxComponents> &bounds) const
      {
        if (numComponents == 0) return -1;
        int r = static_cast<int>(numComponents - 1);
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      KOKKOS_INLINE_FUNCTION
      int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
                              const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
                              const int &numComponents) const
      {
        if (numComponents == 0) return -1;
        int r = numComponents - 1;
        while (arguments[r] + 1 >= bounds[r])
        {
          r--;
          if (r < 0) break;
        }
        return r;
      }

      template<size_t maxComponents, size_t numComponents = maxComponents>
      KOKKOS_INLINE_FUNCTION
      int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
                                   const Kokkos::Array<int,maxComponents> &bounds,
                                   const int startIndex) const
      {
        // the following mirrors what is done in TensorData
        if (numComponents == 0)
        {
          return 0;
        }
        int enumerationIndex = 0;
        for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
        {
          enumerationIndex += arguments[r];
          enumerationIndex *= bounds[r-1];
        }
        enumerationIndex += arguments[startIndex];
        return enumerationIndex;
      }

      template<int rank>
      KOKKOS_INLINE_FUNCTION
      enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
      integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
      {
        return integralView(cellDataOrdinal,i,j);
      }

      template<int rank>
      KOKKOS_INLINE_FUNCTION
      enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
      integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
      {
        return integralView(i,j);
      }
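
      // Editor's note (illustrative usage, not in the original source): the enable_if_t
      // overloads above allow rank-agnostic accumulation; exactly one overload survives
      // substitution for a given integralViewRank, e.g.
      //   integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += value;
      // resolves to integralView(cellDataOrdinal,i,j) for a rank-3 (C,F1,F2) view, and to
      // integralView(i,j) (cell ordinal ignored) for a rank-2 (F1,F2) view.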

      KOKKOS_INLINE_FUNCTION
      void runSpecialized3( const TeamMember & teamMember ) const
      {
        constexpr int numTensorComponents = 3;

        const int pointBounds_x = pointBounds_[0];
        const int pointBounds_y = pointBounds_[1];
        const int pointBounds_z = pointBounds_[2];
        const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;

        const int leftFieldBounds_x  = leftFieldBounds_[0];
        const int rightFieldBounds_x = rightFieldBounds_[0];
        const int leftFieldBounds_y  = leftFieldBounds_[1];
        const int rightFieldBounds_y = rightFieldBounds_[1];
        const int leftFieldBounds_z  = leftFieldBounds_[2];
        const int rightFieldBounds_z = rightFieldBounds_[2];

        Kokkos::Array<int,numTensorComponents> leftFieldBounds;
        Kokkos::Array<int,numTensorComponents> rightFieldBounds;
        for (unsigned r=0; r<numTensorComponents; r++)
        {
          leftFieldBounds[r]  = leftFieldBounds_[r];
          rightFieldBounds[r] = rightFieldBounds_[r];
        }

        const auto integralView            = integralView_;
        const auto leftFieldOrdinalOffset  = leftFieldOrdinalOffset_;
        const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;

        const int cellDataOrdinal = teamMember.league_rank();
        const int threadNumber    = teamMember.team_rank();

        const int numThreads   = teamMember.team_size(); // num threads
        const int GyEntryCount = pointBounds_z; // for each thread: store one Gy value per z coordinate
        Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals;  // for caching Gx values: we integrate out the first component dimension for each coordinate in the remaining dimensions
        Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals;  // for caching Gy values (each thread gets a stack, of the same height as tensorComponents - 1)
        Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GzIntegral;   // for one Gz value that we sum into before summing into the destination matrix
        Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team

        Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
        Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
        Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
        if (fad_size_output_ > 0) {
          GxIntegrals  = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
          GyIntegrals  = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
          GzIntegral   = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads, fad_size_output_);
          pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);

          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  leftFieldBounds_x, pointBounds_x, fad_size_output_);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  leftFieldBounds_y, pointBounds_y, fad_size_output_);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  leftFieldBounds_z, pointBounds_z, fad_size_output_);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
        }
        else {
          GxIntegrals  = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
          GyIntegrals  = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
          GzIntegral   = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads);
          pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), composedTransform_.extent_int(1));

          leftFields_x  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  leftFieldBounds_x, pointBounds_x);
          rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
          leftFields_y  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  leftFieldBounds_y, pointBounds_y);
          rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
          leftFields_z  = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(),  leftFieldBounds_z, pointBounds_z);
          rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
        }

//        int approximateFlopCount = 0;
//        int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;

//        approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access

        // synchronize threads
        teamMember.team_barrier();

        for (int a_component=0; a_component < leftComponentSpan_; a_component++)
        {
          const int a = a_offset_ + a_component;
          for (int b_component=0; b_component < rightComponentSpan_; b_component++)
          {
            const int b = b_offset_ + b_component;

            const Data<Scalar,DeviceType> & leftTensorComponent_x  = leftComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
            const Data<Scalar,DeviceType> & leftTensorComponent_y  = leftComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
            const Data<Scalar,DeviceType> & leftTensorComponent_z  = leftComponent_.getTensorComponent(2);
            const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);

            const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (const int& fieldOrdinal) {
              if (fieldOrdinal < leftTensorComponent_x.extent_int(0))
              {
                const int pointCount = leftTensorComponent_x.extent_int(1);
                const int leftRank   = leftTensorComponent_x.rank();
                for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
                {
                  leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
                }
              }
              if (fieldOrdinal < leftTensorComponent_y.extent_int(0))
1282  {
1283  const int pointCount = leftTensorComponent_y.extent_int(1);
1284  const int leftRank = leftTensorComponent_y.rank();
1285  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1286  {
1287  leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1288  }
1289  }
1290  if (fieldOrdinal < leftTensorComponent_z.extent_int(0))
1291  {
1292  const int pointCount = leftTensorComponent_z.extent_int(1);
1293  const int leftRank = leftTensorComponent_z.rank();
1294  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1295  {
1296  leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1297  }
1298  }
1299  if (fieldOrdinal < rightTensorComponent_x.extent_int(0))
1300  {
1301  const int pointCount = rightTensorComponent_x.extent_int(1);
1302  const int rightRank = rightTensorComponent_x.rank();
1303  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1304  {
1305  rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1306  }
1307  }
1308  if (fieldOrdinal < rightTensorComponent_y.extent_int(0))
1309  {
1310  const int pointCount = rightTensorComponent_y.extent_int(1);
1311  const int rightRank = rightTensorComponent_y.rank();
1312  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1313  {
1314  rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1315  }
1316  }
1317  if (fieldOrdinal < rightTensorComponent_z.extent_int(0))
1318  {
1319  const int pointCount = rightTensorComponent_z.extent_int(1);
1320  const int rightRank = rightTensorComponent_z.rank();
1321  for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1322  {
1323  rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1324  }
1325  }
1326  });
1327 
1328  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1329  pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1330  });
1331 
1332  // synchronize threads
1333  teamMember.team_barrier();
1334 
1335  for (int i0=0; i0<leftFieldBounds_x; i0++)
1336  {
1337  for (int j0=0; j0<rightFieldBounds_x; j0++)
1338  {
1339  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1340  // first component is fastest-moving; we can get a tensorPointEnumerationOffset just by multiplying by the pointBounds in x
1341  const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions; // compute offset for pointWeights container, for which x is the fastest-moving
1342 
1343  Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1344 
1345  Gx = 0;
1346  if (fad_size_output_ == 0)
1347  {
1348  // not a Fad type; we're allowed to use a vector range
1349  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1350  {
1351  integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1352  }, Gx);
1353  }
1354  else
1355  {
1356  for (int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1357  {
1358  Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1359  }
1360  }
1361  });
1362 
1363  // synchronize threads
1364  teamMember.team_barrier();
1365 
1366  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (const int& i1j1) {
1367  const int i1 = i1j1 % leftFieldBounds_y;
1368  const int j1 = i1j1 / leftFieldBounds_y;
1369 
1370  int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1371 
1372  int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
1373  for (int lz=0; lz<pointBounds_z; lz++)
1374  {
1375  Scalar & Gy = GyIntegrals(Gy_index);
1376  Gy = 0.0;
1377 
1378  for (int ly=0; ly<pointBounds_y; ly++)
1379  {
1380  const Scalar & leftValue = leftFields_y(i1,ly);
1381  const Scalar & rightValue = rightFields_y(j1,ly);
1382 
1383  Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1384 
1385  pointEnumerationIndex++;
1386  }
1387  Gy_index++;
1388  }
1389 
1390  Scalar & Gz = GzIntegral(threadNumber); // one entry per thread
1391  for (int i2=0; i2<leftFieldBounds_z; i2++)
1392  {
1393  for (int j2=0; j2<rightFieldBounds_z; j2++)
1394  {
1395  Gz = 0.0;
1396 
1397  int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1398 
1399  for (int lz=0; lz<pointBounds_z; lz++)
1400  {
1401  const Scalar & leftValue = leftFields_z(i2,lz);
1402  const Scalar & rightValue = rightFields_z(j2,lz);
1403 
1404  Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1405 
1406  Gy_index++;
1407  }
1408 
1409  const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1410  const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1411  // the above are an optimization of the below, undertaken on the occasion of a weird Intel compiler segfault, possibly a compiler bug.
1412 // const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset;
1413 // const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset;
1414 
1415  integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1416  }
1417  }
1418  });
1419  // synchronize threads
1420  teamMember.team_barrier();
1421  }
1422  }
1423  }
1424  }
1425  }
1426 
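To make the parallel bookkeeping above easier to follow, here is a minimal serial sketch of the same Gx/Gy/Gz sum-factorization cascade. It is purely illustrative, not part of Intrepid2: all names are hypothetical, the left and right bases are taken to be equal in each dimension for brevity, and weights plays the role of the pointWeights container above (transform times cell measure per tensor point, with x fastest-moving).

#include <vector>

// Integrals of tensor-product fields f_i = fx_{i0}(x) fy_{i1}(y) fz_{i2}(z) against one
// another: rather than one big (fields^2 x points^3) contraction, integrate x away first
// (Gx), then y (Gy), then z (Gz), as the kernel above does in parallel.
void sumFactorizedIntegral3D(int nfx, int nfy, int nfz,             // 1D field counts
                             int npx, int npy, int npz,             // 1D point counts
                             const std::vector<double> &fieldsX,    // (nfx,npx), row-major
                             const std::vector<double> &fieldsY,    // (nfy,npy)
                             const std::vector<double> &fieldsZ,    // (nfz,npz)
                             const std::vector<double> &weights,    // per tensor point, x fastest-moving
                             std::vector<double> &integral)         // (nf,nf), nf = nfx*nfy*nfz
{
  const int nf = nfx * nfy * nfz;
  integral.assign(static_cast<size_t>(nf) * nf, 0.0);
  std::vector<double> Gx(npy * npz), Gy(npz);
  for (int i0 = 0; i0 < nfx; i0++)
  {
    for (int j0 = 0; j0 < nfx; j0++)
    {
      for (int lz = 0; lz < npz; lz++)        // Gx: integrate out x for each (ly,lz)
        for (int ly = 0; ly < npy; ly++)
        {
          double sum = 0.0;
          for (int lx = 0; lx < npx; lx++)
            sum += fieldsX[i0*npx+lx] * fieldsX[j0*npx+lx] * weights[(lz*npy + ly)*npx + lx];
          Gx[lz*npy + ly] = sum;
        }
      for (int i1 = 0; i1 < nfy; i1++)
        for (int j1 = 0; j1 < nfy; j1++)
        {
          for (int lz = 0; lz < npz; lz++)    // Gy: integrate out y for each lz
          {
            double sum = 0.0;
            for (int ly = 0; ly < npy; ly++)
              sum += fieldsY[i1*npy+ly] * fieldsY[j1*npy+ly] * Gx[lz*npy + ly];
            Gy[lz] = sum;
          }
          for (int i2 = 0; i2 < nfz; i2++)    // Gz: integrate out z, then accumulate
            for (int j2 = 0; j2 < nfz; j2++)
            {
              double Gz = 0.0;
              for (int lz = 0; lz < npz; lz++)
                Gz += fieldsZ[i2*npz+lz] * fieldsZ[j2*npz+lz] * Gy[lz];
              const int i = i0 + (i1 + i2*nfy)*nfx; // first component fastest-moving
              const int j = j0 + (j1 + j2*nfy)*nfx;
              integral[static_cast<size_t>(i)*nf + j] += Gz;
            }
        }
    }
  }
}

The deepest nest (the Gz accumulation, six field indices plus one point index) is the source of the (p+1)^7-style scaling in 3D noted in the algorithm comments further below.
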
1427  template<size_t numTensorComponents>
1428  KOKKOS_INLINE_FUNCTION
1429  void run( const TeamMember & teamMember ) const
1430  {
1431  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::logic_error, "implementation incomplete");
1432 // Kokkos::Array<int,numTensorComponents> pointBounds;
1433 // Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1434 // Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1435 //
1436 // int pointsInNonzeroComponentDimensions = 1;
1437 // for (unsigned r=0; r<numTensorComponents; r++)
1438 // {
1439 // pointBounds[r] = pointBounds_[r];
1440 // if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1441 // leftFieldBounds[r] = leftFieldBounds_[r];
1442 // rightFieldBounds[r] = rightFieldBounds_[r];
1443 // }
1444 //
1445 // const int cellDataOrdinal = teamMember.league_rank();
1446 // const int numThreads = teamMember.team_size(); // num threads
1447 // const int G_k_StackHeight = numTensorComponents - 1; // per thread
1448 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_0_IntegralsView; // for caching G0 values: we integrate out the first component dimension for each coordinate in the remaining dimensions
1449 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_k_StackView; // for caching G_k values (each thread gets a stack, of the same height as tensorComponents - 1)
1450 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1451 // Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values at each level for faster access
1452 // if (fad_size_output_ > 0) {
1453 // G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads, fad_size_output_);
1454 // G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1455 // pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1456 // leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1457 // rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1458 // }
1459 // else {
1460 // G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads);
1461 // G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1462 // pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1463 // leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1464 // rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1465 // }
1466 //
1469 //
1470 // constexpr int R = numTensorComponents - 1;
1471 //
1472 // // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1473 //
1474 // // synchronize threads
1475 // teamMember.team_barrier();
1476 //
1477 // for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1478 // {
1479 // const int a = a_offset_ + a_component;
1480 // for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1481 // {
1482 // const int b = b_offset_ + b_component;
1483 //
1484 // const Data<Scalar,DeviceType> & leftFirstComponent = leftComponent_.getTensorComponent(0);
1485 // const Data<Scalar,DeviceType> & rightFirstComponent = rightComponent_.getTensorComponent(0);
1486 //
1487 // const int numLeftFieldsFirst = leftFirstComponent.extent_int(0); // shape (F,P[,D])
1488 // const int numRightFieldsFirst = rightFirstComponent.extent_int(0); // shape (F,P[,D])
1489 //
1490 // const int numPointsFirst = leftFirstComponent.extent_int(1);
1491 //
1492 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1493 // pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1494 // });
1495 //
1496 // // synchronize threads
1497 // teamMember.team_barrier();
1498 //
1499 // for (int i0=0; i0<numLeftFieldsFirst; i0++)
1500 // {
1501 // for (int j0=0; j0<numRightFieldsFirst; j0++)
1502 // {
1503 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1504 // const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1505 // const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1506 // leftFields(pointOrdinal) = leftFirstComponent(fieldOrdinal,pointOrdinal);
1507 // });
1508 //
1509 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numRightFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1510 // const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1511 // const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1512 // rightFields(pointOrdinal) = rightFirstComponent(fieldOrdinal,pointOrdinal);
1513 // });
1514 //
1515 // // synchronize threads
1516 // teamMember.team_barrier();
1517 //
1518 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1519 // Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1520 // int remainingIndex = pointEnumerationIndexLaterDimensions;
1521 //
1522 // for (int d=R-1; d>0; d--) // last component (z in 3D hypercube) is fastest-moving // TODO: consider doing first component as fastest-moving. That would make indexing into pointWeights simpler
1523 // {
1524 // pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1];
1525 // remainingIndex /= pointBounds[d+1];
1526 // }
1527 // pointArgumentsInLaterDimensions[0] = remainingIndex;
1528 //
1529 // int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1530 // for (int d=R; d>0; d--)
1531 // {
1532 // tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1533 // tensorPointEnumerationOffset *= pointBounds[d-1];
1534 // }
1535 //
1536 // Scalar integralValue = 0;
1537 // if (fad_size_output_ == 0)
1538 // {
1539 // // not a Fad type; we're allowed to use a vector range
1540 // Kokkos::parallel_reduce("first component integral", Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1541 // {
1542 // integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1543 // }, integralValue);
1544 // }
1545 // else
1546 // {
1547 // for (int pointOrdinal=0; pointOrdinal<numPointsFirst; pointOrdinal++)
1548 // {
1549 // integralValue += leftFields(pointOrdinal) * rightFields(pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1550 // }
1551 // }
1552 //
1553 // G_0_IntegralsView(pointEnumerationIndexLaterDimensions) = integralValue;
1554 // });
1555 //
1556 // // synchronize threads
1557 // teamMember.team_barrier();
1558 //
1559 // // TODO: finish this, probably after having written up the algorithm for arbitrary component count. (I have it written down for 3D.)
1560 // }
1561 // }
1562 // }
1563 // }
1565  }
1566 
1567  KOKKOS_INLINE_FUNCTION
1568  void operator()( const TeamMember & teamMember ) const
1569  {
1570  switch (numTensorComponents_)
1571  {
1572  case 1: run<1>(teamMember); break;
1573  case 2: run<2>(teamMember); break;
1574  case 3: runSpecialized3(teamMember); break;
1575  case 4: run<4>(teamMember); break;
1576  case 5: run<5>(teamMember); break;
1577  case 6: run<6>(teamMember); break;
1578  case 7: run<7>(teamMember); break;
1579 #ifdef INTREPID2_HAVE_DEBUG
1580  default:
1581  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
1582 #endif
1583  }
1584  }
1585 
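The switch in operator() above is the standard idiom for turning a runtime component count into a compile-time template parameter, so that run<N>() can size its Kokkos::Arrays and loop bounds statically (with the hand-coded runSpecialized3() substituted for the common 3-component case). A minimal sketch of the pattern, with hypothetical names:

#include <stdexcept>

struct DispatchExample
{
  template<int N>
  void run() const
  {
    // here N is a compile-time constant: containers can be sized as e.g. Kokkos::Array<int,N>
  }

  void operator()(int numComponents) const // numComponents known only at runtime
  {
    switch (numComponents)
    {
      case 1: run<1>(); break;
      case 2: run<2>(); break;
      case 3: run<3>(); break; // a hand-tuned specialization could be dispatched here instead
      default: throw std::invalid_argument("Unsupported component count");
    }
  }
};
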
1586  //! returns an estimate of the number of floating point operations per cell.
1587  long approximateFlopCountPerCell() const
1588  {
1589  // compute flop count on a single cell
1590  int flopCount = 0;
1591 
1592  constexpr int numTensorComponents = 3;
1593  Kokkos::Array<int,numTensorComponents> pointBounds;
1594  Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1595  Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1596 
1597  int pointsInNonzeroComponentDimensions = 1;
1598  for (unsigned r=0; r<numTensorComponents; r++)
1599  {
1600  pointBounds[r] = pointBounds_[r];
1601  if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1602  leftFieldBounds[r] = leftFieldBounds_[r];
1603  rightFieldBounds[r] = rightFieldBounds_[r];
1604  }
1605 
1606  for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1607  {
1608  for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1609  {
1610 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1611 // pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1612 // });
1613  flopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1614 
1615  for (int i0=0; i0<leftFieldBounds[0]; i0++)
1616  {
1617  for (int j0=0; j0<rightFieldBounds[0]; j0++)
1618  {
1619  flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3; // 3 flops per integration point in the loop commented out below
1620 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1621 // Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1622 // int remainingIndex = pointEnumerationIndexLaterDimensions;
1623 //
1624 // for (int d=0; d<R-1; d++) // first component is fastest-moving; this is determined by order of access in the lz/ly loop to compute Gy (integrals in y dimension)
1625 // {
1626 // pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1]; // d+1 because x dimension is being integrated away
1627 // remainingIndex /= pointBounds[d+1];
1628 // }
1629 // pointArgumentsInLaterDimensions[R-1] = remainingIndex;
1630 //
1631 // int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1632 // for (int d=R; d>0; d--)
1633 // {
1634 // tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1635 // tensorPointEnumerationOffset *= pointBounds[d-1];
1636 // }
1637 //
1638 // Scalar integralValue = 0;
1639 // if (fad_size_output_ == 0)
1640 // {
1641 // // not a Fad type; we're allowed to use a vector range
1642 // Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1643 // {
1644 // integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1645 // }, integralValue);
1646 // }
1647 // else
1648 // {
1649 // for (int x_pointOrdinal=0; x_pointOrdinal<numPointsFirst; x_pointOrdinal++)
1650 // {
1651 // integralValue += leftFields_x(x_pointOrdinal) * rightFields_x(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1652 // }
1653 // }
1654 //
1655 // GxIntegrals(pointEnumerationIndexLaterDimensions) = integralValue;
1656 // });
1657 
1658 
1659  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3; // 3 flops for each Gy += line in the commented-out code below
1660  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3; // 3 flops for each Gz += line in the commented-out code below
1661  flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1; // 1 flop for the integralView += line below
1662 
1663 // Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds[1] * rightFieldBounds[1]), [&] (const int& i1j1) {
1664 // const int i1 = i1j1 % leftFieldBounds[1];
1665 // const int j1 = i1j1 / leftFieldBounds[1];
1666 //
1668 //
1669 // int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
1670 // for (int lz=0; lz<pointBounds[2]; lz++)
1671 // {
1672 // Scalar & Gy = GyIntegrals(Gy_index);
1673 // Gy = 0.0;
1674 //
1675 // const bool leftRankIs3 = ( leftFields_y.rank() == 3);
1676 // const bool rightRankIs3 = (rightFields_y.rank() == 3);
1677 // for (int ly=0; ly<pointBounds[1]; ly++)
1678 // {
1679 // const Scalar & leftValue = leftRankIs3 ? leftFields_y(i1,ly,a_component) : leftFields_y(i1,ly);
1680 // const Scalar & rightValue = rightRankIs3 ? rightFields_y(j1,ly,b_component) : rightFields_y(j1,ly);
1681 //
1682 // Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1683 //
1684 // pointEnumerationIndex++;
1685 // }
1686 // Gy_index++;
1687 // }
1688 //
1689 // for (int i2=0; i2<leftFieldBounds[2]; i2++)
1690 // {
1691 // for (int j2=0; j2<rightFieldBounds[2]; j2++)
1692 // {
1693 // Scalar Gz = 0.0;
1694 //
1695 // int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1696 //
1697 // const bool leftRankIs3 = ( leftFields_z.rank() == 3);
1698 // const bool rightRankIs3 = (rightFields_z.rank() == 3);
1699 // for (int lz=0; lz<pointBounds[2]; lz++)
1700 // {
1701 // const Scalar & leftValue = leftRankIs3 ? leftFields_z(i2,lz,a_component) : leftFields_z(i2,lz);
1702 // const Scalar & rightValue = rightRankIs3 ? rightFields_z(j2,lz,b_component) : rightFields_z(j2,lz);
1703 //
1704 // Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1705 //
1706 // Gy_index++;
1707 // }
1708 //
1709 // Kokkos::Array<int,3> leftArguments {i0,i1,i2};
1710 // Kokkos::Array<int,3> rightArguments {j0,j1,j2};
1711 //
1712 // const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset_;
1713 // const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset_;
1714 //
1715 // if (integralViewRank == 2)
1716 // {
1717 // integralView_.access(i,j,0) += Gz;
1718 // }
1719 // else
1720 // {
1721 // integralView_.access(cellDataOrdinal,i,j) += Gz;
1722 // }
1723 // }
1724 // }
1725 // });
1726  }
1727  }
1728  }
1729  }
1730  return flopCount;
1731  }
1732 
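The commented-out code above refers to relativeEnumerationIndex(), which linearizes per-component field arguments with the first component fastest-moving; the hand-inlined i/j expressions in runSpecialized3() compute the same thing for three components. A sketch of that enumeration (illustrative only; the actual member is declared elsewhere in this file):

#include <Kokkos_Core.hpp>

template<size_t N>
int relativeEnumerationIndexSketch(const Kokkos::Array<int,N> &arguments,
                                   const Kokkos::Array<int,N> &bounds,
                                   const int startDim)
{
  // first component fastest-moving: index = a0 + b0*(a1 + b1*(a2 + ...))
  int index = 0;
  for (int d = static_cast<int>(N) - 1; d >= startDim; d--)
  {
    index = index * bounds[d] + arguments[d];
  }
  return index;
}

For N = 3 and startDim = 0 this reduces to i0 + (i1 + i2*bounds[1])*bounds[0], matching the optimized expressions in runSpecialized3().
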
1734  int teamSize(const int &maxTeamSizeFromKokkos) const
1735  {
1736  // TODO: fix this to match the actual parallelism expressed
1737  const int R = numTensorComponents_ - 1;
1738  const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1739  return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1740  }
1741 
1742  //! Provide the shared memory capacity.
1743  size_t team_shmem_size (int numThreads) const
1744  {
1745  // we use shared memory to create a fast buffer for intermediate values, as well as for fast access to the current cell's field values
1746  size_t shmem_size = 0;
1747 
1748  const int GyEntryCount = pointBounds_[2]; // for each thread: store one Gy value per z coordinate
1749 
1750  int pointsInNonzeroComponentDimensions = 1;
1751  for (int d=1; d<numTensorComponents_; d++)
1752  {
1753  pointsInNonzeroComponentDimensions *= pointBounds_[d];
1754  }
1755 
1756  if (fad_size_output_ > 0)
1757  {
1758  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_); // GxIntegrals: entries with x integrated away
1759  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_); // GyIntegrals: entries with x,y integrated away
1760  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads, fad_size_output_); // GzIntegral: entry with x,y,z integrated away
1761  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1), fad_size_output_); // pointWeights
1762 
1763  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_); // leftFields_x
1764  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_); // rightFields_x
1765  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_); // leftFields_y
1766  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_); // rightFields_y
1767  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_); // leftFields_z
1768  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_); // rightFields_z
1769  }
1770  else
1771  {
1772  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions); // GxIntegrals: entries with x integrated away
1773  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads); // GyIntegrals: entries with x,y integrated away
1774  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads); // GzIntegral: entry with x,y,z integrated away
1775  shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1)); // pointWeights
1776 
1777  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]); // leftFields_x
1778  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]); // rightFields_x
1779  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]); // leftFields_y
1780  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]); // rightFields_y
1781  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]); // leftFields_z
1782  shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]); // rightFields_z
1783  }
1784 
1785  return shmem_size;
1786  }
1787  };
1788  }
1789 
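It may help to see the team_shmem_size() idiom used by the functors above in isolation: when a functor defines team_shmem_size(), Kokkos reserves that much team-level scratch, and the kernel then places unmanaged Views into teamMember.team_shmem(). A self-contained sketch under the same assumptions (hypothetical names; double scalars, no Fad dimension):

#include <Kokkos_Core.hpp>

struct ScratchBufferExample
{
  using ExecutionSpace = Kokkos::DefaultExecutionSpace;
  using TeamPolicy     = Kokkos::TeamPolicy<ExecutionSpace>;
  using TeamMember     = typename TeamPolicy::member_type;
  using ScratchView    = Kokkos::View<double*, ExecutionSpace, Kokkos::MemoryUnmanaged>;

  int bufferLength_ = 128;

  //! called by Kokkos to size the team-level scratch allocation
  size_t team_shmem_size(int /*teamSize*/) const
  {
    return ScratchView::shmem_size(bufferLength_);
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(const TeamMember &teamMember) const
  {
    ScratchView buffer(teamMember.team_shmem(), bufferLength_); // placed in scratch, not heap-allocated
    Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, bufferLength_), [&] (const int i) {
      buffer(i) = static_cast<double>(i); // intermediate values shared by the whole team
    });
    teamMember.team_barrier(); // ensure writes are visible before any team-wide reads
  }
};

// usage sketch: Kokkos::parallel_for(ScratchBufferExample::TeamPolicy(numTeams, Kokkos::AUTO()), ScratchBufferExample{});
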
1790 template<typename DeviceType>
1791 template<class Scalar>
1792 Data<Scalar,DeviceType> IntegrationTools<DeviceType>::allocateIntegralData(const TransformedVectorData<Scalar,DeviceType> vectorDataLeft,
1793  const TensorData<Scalar,DeviceType> cellMeasures,
1794  const TransformedVectorData<Scalar,DeviceType> vectorDataRight)
1795 {
1796  // allocates a (C,F,F) container for storing integral data
1797 
1798  // determine cellDataExtent and variation type. We currently support CONSTANT, MODULAR, and GENERAL as possible output variation types, depending on the inputs.
1799  // If cellMeasures has non-trivial tensor structure, the rank-1 cell Data object is the first component.
1800  // If cellMeasures has trivial tensor structure, then the first and only component has the cell index in its first dimension.
1801  // I.e., either way the relevant Data object is cellMeasures.getTensorComponent(0)
1802  const int CELL_DIM = 0;
1803  const auto cellMeasureData = cellMeasures.getTensorComponent(0);
1804  const auto leftTransform = vectorDataLeft.transform();
1805 
1806  DimensionInfo combinedCellDimInfo = cellMeasureData.getDimensionInfo(CELL_DIM);
1807  combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataLeft.transform().getDimensionInfo(CELL_DIM));
1808  combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataRight.transform().getDimensionInfo(CELL_DIM));
1809 
1810  DataVariationType cellVariationType = combinedCellDimInfo.variationType;
1811  int cellDataExtent = combinedCellDimInfo.dataExtent;
1812 
1813  const int numCells = vectorDataLeft.numCells();
1814  const int numFieldsLeft = vectorDataLeft.numFields();
1815  const int numFieldsRight = vectorDataRight.numFields();
1816 
1817  Kokkos::Array<int,3> extents {numCells, numFieldsLeft, numFieldsRight};
1818  Kokkos::Array<DataVariationType,3> variationTypes {cellVariationType,GENERAL,GENERAL};
1819 
1820  if (cellVariationType != CONSTANT)
1821  {
1822  Kokkos::View<Scalar***,DeviceType> data("Intrepid2 integral data",cellDataExtent,numFieldsLeft,numFieldsRight);
1823  return Data<Scalar,DeviceType>(data, extents, variationTypes);
1824  }
1825  else
1826  {
1827  Kokkos::View<Scalar**,DeviceType> data("Intrepid2 integral data",numFieldsLeft,numFieldsRight);
1828  return Data<Scalar,DeviceType>(data, extents, variationTypes);
1829  }
1830 }
1831 
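Before the definition of integrate() below, it is worth recalling the identity that its axis-aligned branch exploits: on a tensor-product quadrature rule, a separable integrand factors into a product of one-dimensional sums. A tiny illustrative sketch (hypothetical names):

#include <vector>

// sum_{lx,ly} f(lx) g(ly) wx(lx) wy(ly) == ( sum_lx f(lx) wx(lx) ) * ( sum_ly g(ly) wy(ly) )
double separableIntegral2D(const std::vector<double> &f, const std::vector<double> &wx,
                           const std::vector<double> &g, const std::vector<double> &wy)
{
  double Ix = 0.0, Iy = 0.0;
  for (size_t l = 0; l < f.size(); l++) { Ix += f[l] * wx[l]; }
  for (size_t l = 0; l < g.size(); l++) { Iy += g[l] * wy[l]; }
  return Ix * Iy; // the componentIntegrals container below stores exactly such 1D factors
}
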
1835 template<typename DeviceType>
1836 template<class Scalar>
1837 void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals, const TransformedVectorData<Scalar,DeviceType> & vectorDataLeft,
1838  const TensorData<Scalar,DeviceType> & cellMeasures,
1839  const TransformedVectorData<Scalar,DeviceType> & vectorDataRight, const bool sumInto,
1840  double* approximateFlops)
1841 {
1842  using ExecutionSpace = typename DeviceType::execution_space;
1843 
1844  if (approximateFlops != NULL)
1845  {
1846  *approximateFlops = 0;
1847  }
1848 
1849  // integral data may have shape (C,F1,F2) or (if the variation type is CONSTANT in the cell dimension) shape (F1,F2)
1850  const int integralViewRank = integrals.getUnderlyingViewRank();
1851 
1852  if (!sumInto)
1853  {
1854  integrals.clear();
1855  }
1856 
1857  const int cellDataExtent = integrals.getDataExtent(0);
1858  const int numFieldsLeft = vectorDataLeft.numFields();
1859  const int numFieldsRight = vectorDataRight.numFields();
1860  const int spaceDim = vectorDataLeft.spaceDim();
1861 
1862  INTREPID2_TEST_FOR_EXCEPTION(vectorDataLeft.spaceDim() != vectorDataRight.spaceDim(), std::invalid_argument, "vectorDataLeft and vectorDataRight must agree on the space dimension");
1863 
1864  // TODO: add more checks against validity of inputs
1865 
1866  // we require that the number of tensor components in the vectors are the same for each vector entry
1867  // this is not strictly necessary, but it makes implementation easier, and we don't at present anticipate other use cases
1868  int numTensorComponentsLeft = -1;
1869  const auto &refVectorLeft = vectorDataLeft.vectorData();
1870  int numFamiliesLeft = refVectorLeft.numFamilies();
1871  int numVectorComponentsLeft = refVectorLeft.numComponents();
1872  Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
1873  Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
1874  for (int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
1875  {
1876  for (int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
1877  {
1878  const TensorData<Scalar,DeviceType> &tensorData = refVectorLeft.getComponent(familyOrdinal,vectorComponent);
1879  if (tensorData.numTensorComponents() > 0)
1880  {
1881  if (numTensorComponentsLeft == -1)
1882  {
1883  numTensorComponentsLeft = tensorData.numTensorComponents();
1884  }
1885  INTREPID2_TEST_FOR_EXCEPTION(numTensorComponentsLeft != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
1886  for (int r=0; r<numTensorComponentsLeft; r++)
1887  {
1888  maxFieldsForComponentLeft[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentLeft[r]);
1889  }
1890  }
1891  }
1892  }
1893  int numTensorComponentsRight = -1;
1894  const auto &refVectorRight = vectorDataRight.vectorData();
1895  int numFamiliesRight = refVectorRight.numFamilies();
1896  int numVectorComponentsRight = refVectorRight.numComponents();
1897  for (int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
1898  {
1899  for (int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
1900  {
1901  const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
1902  if (tensorData.numTensorComponents() > 0)
1903  {
1904  if (numTensorComponentsRight == -1)
1905  {
1906  numTensorComponentsRight = tensorData.numTensorComponents();
1907  }
1908  INTREPID2_TEST_FOR_EXCEPTION(numTensorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataRight must have the same number of tensor components as every other");
1909  for (int r=0; r<numTensorComponentsRight; r++)
1910  {
1911  maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
1912  }
1913  }
1914  }
1915  }
1916  INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != numVectorComponentsRight, std::invalid_argument, "Left and right vector entries must have the same number of tensorial components");
1917  const int numPointTensorComponents = cellMeasures.numTensorComponents() - 1;
1918 
1919  if ((numPointTensorComponents == numTensorComponentsLeft) && vectorDataLeft.axisAligned() && vectorDataRight.axisAligned())
1920  {
1921  // cellMeasures is a non-trivial tensor product, and the pullbacks are all diagonals.
1922 
1923  // in this case, the integrals in each tensorial direction are entirely separable
1924  // allocate some temporary storage for the integrals in each tensorial direction
1925 
1926  // if cellMeasures is a nontrivial tensor product, that means that all cells have the same shape, up to scaling.
1927 
1928  Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
1929  for (int r=0; r<numPointTensorComponents; r++)
1930  {
1931  // first tensorial component of cell measures is the cell dimension; after that we have (P1,P2,…)
1932  pointDimensions[r] = cellMeasures.getTensorComponent(r+1).extent_int(0);
1933  }
1934 
1935  Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents> componentIntegrals; // these are reference-space integrals; no Jacobians or cell measures applied as yet
1936 
1937  for (int r=0; r<numPointTensorComponents; r++)
1938  {
1939  for (int d=0; d<spaceDim; d++)
1940  {
1941  /*
1942  Four cases to consider:
1943  1. Both vector data containers are filled with axial components - first component in 3D has form (f,0,0), second (0,f,0), third (0,0,f).
1944  2. Both vector data containers have arbitrary components - in 3D: (f1,f2,f3) where f1 is given by the first component, f2 by the second, f3 by the third.
1945  3. First container is axial, second arbitrary.
1946  4. First is arbitrary, second axial.
1947 
1948  But note that in all four cases, the structure of the integral is the same: you have three vector component integrals that get summed. The actual difference between
1949  the cases does not show up in the reference-space integrals here, but in the accumulation in physical space below, where the tensor field numbering comes into play.
1950 
1951  The choice between axial and arbitrary affects the way the fields are numbered; the arbitrary components' indices refer to the same vector function, so they correspond,
1952  while the axial components refer to distinct scalar functions, so their numbering in the data container is cumulative.
1953  */
1954 
1955  const Data<Scalar,DeviceType> & leftComponent = vectorDataLeft.vectorData().getComponent(d).getTensorComponent(r);
1956  const Data<Scalar,DeviceType> & rightComponent = vectorDataRight.vectorData().getComponent(d).getTensorComponent(r);
1957 
1958  // It may be worth considering the possibility that some of these components point to the same data -- if so, we could possibly get better data locality by making the corresponding componentIntegral entries point to the same location as well. (And we could avoid some computations here.)
1959 
1960  const Data<Scalar,DeviceType> & quadratureWeights = cellMeasures.getTensorComponent(r+1);
1961 
1962  int leftComponentCount = leftComponent.extent_int(0);
1963  int rightComponentCount = rightComponent.extent_int(0);
1964 
1965  const bool allocateFadStorage = !std::is_pod<Scalar>::value;
1966  if (allocateFadStorage)
1967  {
1968  auto fad_size_output = dimension_scalar(integrals.getUnderlyingView());
1969  componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftComponentCount, rightComponentCount, fad_size_output);
1970  }
1971  else
1972  {
1973  componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftComponentCount, rightComponentCount);
1974  }
1975 
1976  Kokkos::deep_copy(componentIntegrals[r][d], 0.0); // initialize
1977 
1978  const int & numPoints = pointDimensions[r];
1979  const int leftComponentRank = leftComponent.rank();
1980  const int rightComponentRank = rightComponent.rank();
1981 
1982  const int leftComponentDimSpan = leftComponent.extent_int(2);
1983  const int rightComponentDimSpan = rightComponent.extent_int(2);
1984  INTREPID2_TEST_FOR_EXCEPTION(( leftComponentDimSpan != rightComponentDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
1985 
1986 // // TODO: add support for cases where the component ranks are 3, and spaceDim > 1.
1987 // INTREPID2_TEST_FOR_EXCEPTION(( leftComponentRank == 3) && ( leftComponent.extent_int(2) > 1), std::invalid_argument, "spaceDim > 1 not yet supported by this integrate() use case.");
1988 // INTREPID2_TEST_FOR_EXCEPTION((rightComponentRank == 3) && (rightComponent.extent_int(2) > 1), std::invalid_argument, "spaceDim > 1 not yet supported by this integrate() use case.");
1989 
1990  auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftComponentCount,rightComponentCount});
1991 
1992  for (int a=0; a<leftComponentDimSpan; a++)
1993  {
1994  Kokkos::parallel_for("compute componentIntegrals", policy,
1995  KOKKOS_LAMBDA (const int &i, const int &j) {
1996  Scalar refSpaceIntegral = 0.0;
1997  for (int ptOrdinal=0; ptOrdinal<numPoints; ptOrdinal++)
1998  {
1999  const Scalar & leftValue = ( leftComponentRank == 2) ? leftComponent(i,ptOrdinal) : leftComponent(i,ptOrdinal,a);
2000  const Scalar & rightValue = (rightComponentRank == 2) ? rightComponent(j,ptOrdinal) : rightComponent(j,ptOrdinal,a);
2001  refSpaceIntegral += leftValue * rightValue * quadratureWeights(ptOrdinal);
2002  }
2003  componentIntegrals[r][d](i,j) = refSpaceIntegral;
2004  });
2005  }
2006 
2007  if (approximateFlops != NULL)
2008  {
2009  *approximateFlops += leftComponentCount*rightComponentCount*numPoints*(3); // two multiplies, one add in innermost loop
2010  }
2011  }
2012  }
2013  ExecutionSpace().fence(); // ensure that kernels launched above have completed before the kernels below launch.
2014 
2015  // only one of these will be a valid container:
2016  Kokkos::View<Scalar**, DeviceType> integralView2;
2017  Kokkos::View<Scalar***, DeviceType> integralView3;
2018  if (integralViewRank == 3)
2019  {
2020  integralView3 = integrals.getUnderlyingView3();
2021  }
2022  else
2023  {
2024  integralView2 = integrals.getUnderlyingView2();
2025  }
2026 
2027  const int leftFamilyCount = vectorDataLeft.vectorData().numFamilies();
2028  const int rightFamilyCount = vectorDataRight.vectorData().numFamilies();
2029 
2030  for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2031  {
2032  bool haveLaunchedContributionToLeftFamily = false; // for tracking whether we need a fence() below
2033 
2034  int a_offset = 0; // left vector component offset
2035  int leftFieldOffset = vectorDataLeft.vectorData().familyFieldOrdinalOffset(leftFamilyOrdinal);
2036 
2037  const int leftVectorComponentCount = vectorDataLeft.vectorData().numComponents();
2038  for (int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
2039  {
2040  const auto & leftComponent = vectorDataLeft.vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal);
2041  if (!leftComponent.isValid())
2042  {
2043  a_offset++; // empty components are understood to take up one dimension
2044  continue;
2045  }
2046  const int leftDimSpan = leftComponent.extent_int(2);
2047 
2048  const int leftComponentFieldCount = leftComponent.extent_int(0);
2049 
2050  for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2051  {
2052  bool haveLaunchedContributionToRightFamily = false; // for tracking whether we need a fence() below
2053 
2054  int b_offset = 0; // right vector component offset
2055  int rightFieldOffset = vectorDataRight.vectorData().familyFieldOrdinalOffset(rightFamilyOrdinal);
2056 
2057  const int rightVectorComponentCount = vectorDataRight.vectorData().numComponents();
2058  for (int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2059  {
2060  const auto & rightComponent = vectorDataRight.vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal);
2061  if (!rightComponent.isValid())
2062  {
2063  b_offset++; // empty components are understood to take up one dimension
2064  continue;
2065  }
2066  const int rightDimSpan = rightComponent.extent_int(2);
2067 
2068  const int rightComponentFieldCount = rightComponent.extent_int(0);
2069 
2070  // we only accumulate for a == b (since this is the axis-aligned case). Do the a, b spans intersect?
2071  if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset)) // no intersection
2072  {
2073  b_offset += rightDimSpan;
2074 
2075  continue;
2076  }
2077 
2078  // if the a, b spans intersect, by assumption they should align exactly
2079  INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error, "left and right dimension offsets should match.");
2080  INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2081 
2082  const int d_start = a_offset;
2083  const int d_end = d_start + leftDimSpan;
2084 
2085  if (haveLaunchedContributionToLeftFamily && haveLaunchedContributionToRightFamily)
2086  {
2087  // accumulation to the same block in integrals container; fence to ensure that that has completed before the next kernel is launched
2088  ExecutionSpace().fence();
2089  }
2090  haveLaunchedContributionToLeftFamily = true;
2091  haveLaunchedContributionToRightFamily = true;
2092 
2093  Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount}; // separately declared in effort to get around Intel 17.0.1 compiler weirdness.
2094  Kokkos::Array<int,3> lowerBounds {0,0,0};
2095  auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2096  // TODO: note that for best performance, especially with Fad types, we should replace this parallel for with a Functor and use hierarchical parallelism
2097  Kokkos::parallel_for("compute field integrals", policy,
2098  KOKKOS_LAMBDA (const int &cellDataOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
2099  const Scalar & cellMeasureWeight = cellMeasures.getTensorComponent(0)(cellDataOrdinal);
2100 
2101  TensorArgumentIterator leftTensorIterator(leftComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2102  leftTensorIterator.setEnumerationIndex(leftFieldOrdinal);
2103 
2104  TensorArgumentIterator rightTensorIterator(rightComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2105  rightTensorIterator.setEnumerationIndex(rightFieldOrdinal);
2106 
2107  Scalar integralSum = 0.0;
2108  for (int d=d_start; d<d_end; d++)
2109  {
2110  const Scalar & transformLeft_d = vectorDataLeft.transformWeight(cellDataOrdinal,0,d,d);
2111  const Scalar & transformRight_d = vectorDataRight.transformWeight(cellDataOrdinal,0,d,d);
2112 
2113  const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
2114  // approximateFlopCount++;
2115 
2116  Scalar integral_d = 1.0;
2117 
2118  for (int r=0; r<numPointTensorComponents; r++)
2119  {
2120  integral_d *= componentIntegrals[r][d](leftTensorIterator.argument(r),rightTensorIterator.argument(r));
2121  // approximateFlopCount++; // product
2122  }
2123  integralSum += leftRightTransform_d * integral_d;
2124  // approximateFlopCount += 2; // multiply and sum
2125  }
2126  // accumulate once per (cell,i,j), now that the sum over dimensions d is complete
2127  const int i = leftFieldOrdinal + leftFieldOffset;
2128  const int j = rightFieldOrdinal + rightFieldOffset;
2129 
2130  if (integralViewRank == 3)
2131  {
2132  integralView3(cellDataOrdinal,i,j) += cellMeasureWeight * integralSum;
2133  }
2134  else
2135  {
2136  integralView2(i,j) += cellMeasureWeight * integralSum;
2137  }
2138  });
2139 
2140  b_offset += rightDimSpan;
2141  } // rightVectorComponentOrdinal loop
2142  } // rightFamilyOrdinal loop
2143 
2144  a_offset += leftDimSpan;
2145  } // leftVectorComponentOrdinal
2146  } // leftFamilyOrdinal
2147 
2148  if (approximateFlops != NULL)
2149  {
2150  // TODO: check the accuracy of this
2151  *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2152  }
2153  }
2154  else // general case (not axis-aligned + affine tensor-product structure)
2155  {
2156  // prepare composed transformation matrices
2157  const Data<Scalar,DeviceType> & leftTransform = vectorDataLeft.transform();
2158  const Data<Scalar,DeviceType> & rightTransform = vectorDataRight.transform();
2159  const bool transposeLeft = true;
2160  const bool transposeRight = false;
2161 // auto timer = Teuchos::TimeMonitor::getNewTimer("mat-mat");
2162 // timer->start();
2163  Data<Scalar,DeviceType> composedTransform = leftTransform.allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
2164  composedTransform.storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2165 // timer->stop();
2166 // std::cout << "Completed mat-mat in " << timer->totalElapsedTime() << " seconds.\n";
2167 // timer->reset();
2168 
2169  // if the composedTransform matrices are full, the following is a good estimate. If they have some diagonal portions, this will overcount.
2170  if (approximateFlops != NULL)
2171  {
2172  *approximateFlops += composedTransform.getUnderlyingViewSize() * (spaceDim - 1) * 2;
2173  }
2174 
2175  const int leftFamilyCount = vectorDataLeft. vectorData().numFamilies();
2176  const int rightFamilyCount = vectorDataRight.vectorData().numFamilies();
2177  const int leftComponentCount = vectorDataLeft. vectorData().numComponents();
2178  const int rightComponentCount = vectorDataRight.vectorData().numComponents();
2179 
2180  int leftFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2181  for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2182  {
2183  // "a" keeps track of the spatial dimension over which we are integrating in the left vector
2184  // components are allowed to span several dimensions; we keep track of the offset for the component in a_offset
2185  int a_offset = 0;
2186  bool haveLaunchedContributionToCurrentFamilyLeft = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2187  for (int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2188  {
2189  const TensorData<Scalar,DeviceType> & leftComponent = vectorDataLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal);
2190  if (!leftComponent.isValid())
2191  {
2192  // represents zero
2193  a_offset += vectorDataLeft.vectorData().numDimsForComponent(leftComponentOrdinal);
2194  continue;
2195  }
2196 
2197  int rightFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2198  for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2199  {
2200  // "b" keeps track of the spatial dimension over which we are integrating in the right vector
2201  // components are allowed to span several dimensions; we keep track of the offset for the component in b_offset
2202  bool haveLaunchedContributionToCurrentFamilyRight = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2203  int b_offset = 0;
2204  for (int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2205  {
2206  const TensorData<Scalar,DeviceType> & rightComponent = vectorDataRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal);
2207  if (!rightComponent.isValid())
2208  {
2209  // represents zero
2210  b_offset += vectorDataRight.vectorData().numDimsForComponent(rightComponentOrdinal);
2211  continue;
2212  }
2213 
2214  INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(leftComponent.numTensorComponents() != rightComponent.numTensorComponents(), std::invalid_argument, "left TensorData and right TensorData have different number of tensor components. This is not supported.");
2215 
2216  const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2217  Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
2218 
2219  // TODO: expose the options for forceNonSpecialized and usePointCacheForRank3Tensor through an IntegrationAlgorithm enumeration.
2220  // AUTOMATIC: let Intrepid2 choose an algorithm based on the inputs (and, perhaps, the execution space)
2221  // STANDARD: don't use sum factorization or axis alignment -- just do the simple contraction, a (p+1)^9 algorithm in 3D
2222  // SUM_FACTORIZATION // (p+1)^7 algorithm in 3D
2223  // SUM_FACTORIZATION_AXIS_ALIGNED // (p+1)^6 algorithm in 3D
2224  // SUM_FACTORIZATION_FORCE_GENERIC_IMPLEMENTATION // mainly intended for testing purposes (specialized implementations perform better when they are provided)
2225  // SUM_FACTORIZATION_WITH_POINT_CACHE // novel (p+1)^7 (in 3D) algorithm in Intrepid2; unclear as yet when and whether this may be a superior approach
2226  bool forceNonSpecialized = false; // We might expose this in the integrate() arguments in the future. We *should* default to false in the future.
2227  bool usePointCacheForRank3Tensor = true; // EXPERIMENTAL; has better performance under CUDA, but slightly worse performance than standard on serial CPU
2228 
2229  // in one branch or another below, we will launch a parallel kernel that contributes to (leftFamily, rightFamily) field ordinal pairs.
2230  // if we have already launched something that contributes to that part of the integral container, we need a Kokkos fence() to ensure that these do not interfere with each other.
2231  if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2232  {
2233  ExecutionSpace().fence();
2234  }
2235  haveLaunchedContributionToCurrentFamilyLeft = true;
2236  haveLaunchedContributionToCurrentFamilyRight = true;
2237 
2238  if (integralViewRank == 2)
2239  {
2240  if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2241  {
2242  auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2243 
2244  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2245  const int teamSize = functor.teamSize(recommendedTeamSize);
2246 
2247  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2248 
2249  Kokkos::parallel_for( policy , functor, "F_IntegratePointValueCache rank 2");
2250 
2251  if (approximateFlops != NULL)
2252  {
2253  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2254  }
2255  }
2256  else
2257  {
2258  auto functor = Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2259 
2260  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2261  const int teamSize = functor.teamSize(recommendedTeamSize);
2262 
2263  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2264 
2265  Kokkos::parallel_for( policy , functor, "F_Integrate rank 2");
2266 
2267  if (approximateFlops != NULL)
2268  {
2269  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2270  }
2271  }
2272  }
2273  else if (integralViewRank == 3)
2274  {
2275  if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2276  {
2277  auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2278 
2279  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2280  const int teamSize = functor.teamSize(recommendedTeamSize);
2281 
2282  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2283 
2284  Kokkos::parallel_for( policy , functor, "F_IntegratePointValueCache rank 3");
2285 
2286  if (approximateFlops != NULL)
2287  {
2288  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2289  }
2290  }
2291  else
2292  {
2293  auto functor = Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2294 
2295  const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2296  const int teamSize = functor.teamSize(recommendedTeamSize);
2297 
2298  policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2299 
2300  Kokkos::parallel_for( policy , functor, "F_Integrate rank 3");
2301 
2302  if (approximateFlops != NULL)
2303  {
2304  *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2305  }
2306  }
2307  }
2308  b_offset += vectorDataRight.vectorData().numDimsForComponent(rightComponentOrdinal);
2309  }
2310  rightFieldOrdinalOffset += vectorDataRight.vectorData().numFieldsInFamily(rightFamilyOrdinal);
2311  }
2312  a_offset += vectorDataLeft.vectorData().numDimsForComponent(leftComponentOrdinal);
2313  }
2314  leftFieldOrdinalOffset += vectorDataLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal);
2315  }
2316  }
2317 // if (approximateFlops != NULL)
2318 // {
2319 // std::cout << "Approximate flop count (new): " << *approximateFlops << std::endl;
2320 // }
2321  ExecutionSpace().fence(); // make sure we've finished writing to integrals container before exiting…
2322 }
2323 
2324 } // end namespace Intrepid2
2325 
2326 #endif
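
For reference, a minimal usage sketch of the interface defined in this file, exercising the allocateIntegralData() and integrate() entry points documented above. This assumes vectorDataLeft and vectorDataRight (TransformedVectorData containers) and cellMeasures (a TensorData container) have been constructed elsewhere; those names and the DeviceType alias are illustrative placeholders, not part of this file.

// A sketch, not part of the Intrepid2 source: assumes pre-built vectorDataLeft,
// vectorDataRight (TransformedVectorData) and cellMeasures (TensorData) containers.
using DeviceType = Kokkos::DefaultExecutionSpace::device_type;

// Allocate an integrals container sized to match the left and right field counts.
auto integrals = Intrepid2::IntegrationTools<DeviceType>::allocateIntegralData(vectorDataLeft, cellMeasures, vectorDataRight);

// Contract the left and right data on point and space dimensions, weighting each
// point by cellMeasures. sumInto=false overwrites integrals; the final argument
// optionally accumulates an approximate flop count.
double approximateFlops = 0.0;
Intrepid2::IntegrationTools<DeviceType>::integrate(integrals, vectorDataLeft, cellMeasures, vectorDataRight,
                                                   /*sumInto=*/false, &approximateFlops);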