// Header preamble (include guard and backend-conditional includes), followed by
// default_outer_direction<ExecSpace>: the outer iteration direction substituted
// when an iteration pattern leaves its outer direction as Iterate::Default.
// `value` is Iterate::Left when KOKKOS_ENABLE_CUDA or KOKKOS_ENABLE_ROCM is
// defined; the Iterate::Right definition is presumably the #else branch (the
// #else/#endif directives are not visible in this listing).
// NOTE(review): the visible selection depends only on build macros, not on
// ExecSpace - confirm this is intended for host spaces in device-enabled builds.
44 #ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP 45 #define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP 47 #include <initializer_list> 49 #include<impl/KokkosExp_Host_IterateTile.hpp> 50 #include <Kokkos_ExecPolicy.hpp> 53 #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA ) 54 #include<Cuda/KokkosExp_Cuda_IterateTile.hpp> 55 #include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp> 58 #if defined( __HCC__ ) && defined( KOKKOS_ENABLE_ROCM ) 60 #include <ROCm/KokkosExp_ROCm_IterateTile_Refactor.hpp> 74 template <
typename ExecSpace>
75 struct default_outer_direction
78 #if defined( KOKKOS_ENABLE_CUDA)||defined( KOKKOS_ENABLE_ROCM) 79 static constexpr Iterate value = Iterate::Left;
81 static constexpr Iterate value = Iterate::Right;
// default_inner_direction<ExecSpace>: the inner iteration direction substituted
// when an iteration pattern leaves its inner direction as Iterate::Default.
// `value` is Iterate::Left when KOKKOS_ENABLE_CUDA or KOKKOS_ENABLE_ROCM is
// defined; the Iterate::Right definition is presumably the #else branch (the
// #else/#endif directives are not visible in this listing).
// NOTE(review): ExecSpace is not referenced in the visible lines - confirm the
// choice is meant to depend on build configuration only.
85 template <
typename ExecSpace>
86 struct default_inner_direction
89 #if defined( KOKKOS_ENABLE_CUDA)||defined( KOKKOS_ENABLE_ROCM) 90 static constexpr Iterate value = Iterate::Left;
92 static constexpr Iterate value = Iterate::Right;
// Rank<N, OuterDir, InnerDir>: compile-time description of a multi-dimensional
// iteration pattern (the struct header and the declaration of N are not visible
// in this listing). Both direction parameters default to Iterate::Default.
99 , Iterate OuterDir = Iterate::Default
100 , Iterate InnerDir = Iterate::Default
// Accepted ranks are 2 through 6: 0 and 1 are rejected explicitly and N must
// be smaller than 7.
104 static_assert( N != 0u,
"Kokkos Error: rank 0 undefined");
105 static_assert( N != 1u,
"Kokkos Error: rank 1 is not a multi-dimensional range");
106 static_assert( N < 7u,
"Kokkos Error: Unsupported rank...");
// Self alias plus the template arguments republished as static constants.
108 using iteration_pattern = Rank<N, OuterDir, InnerDir>;
110 static constexpr
int rank = N;
111 static constexpr Iterate outer_direction = OuterDir;
112 static constexpr Iterate inner_direction = InnerDir;
// MDRangePolicy<Properties...>: multi-dimensional range execution policy.
// Derives publicly from Kokkos::Impl::PolicyTraits<Properties...>, which supplies
// the execution space, schedule type, index type, iteration pattern, work tag
// and launch bounds referenced below. (The struct header line and several
// member declarations are not visible in this listing.)
117 template <
typename... Properties>
119 :
public Kokkos::Impl::PolicyTraits<Properties ...>
121 using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
122 using range_policy = RangePolicy<Properties...>;
// RangePolicy assembled from the traits' execution space, schedule type and
// index type (any further template arguments are not visible here).
124 using impl_range_policy = RangePolicy<
typename traits::execution_space
125 ,
typename traits::schedule_type
126 ,
typename traits::index_type
129 typedef MDRangePolicy execution_policy;
// An iteration pattern must be present among Properties; a void pattern is a
// compile-time error.
131 static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
132 ,
"Kokkos Error: MD iteration pattern not defined" );
134 using iteration_pattern =
typename traits::iteration_pattern;
135 using work_tag =
typename traits::work_tag;
136 using launch_bounds =
typename traits::launch_bounds;
137 using member_type =
typename range_policy::member_type;
// Number of dimensions, taken from the iteration pattern and exposed as int.
139 enum {
rank =
static_cast<int>(iteration_pattern::rank) };
141 using index_type =
typename traits::index_type;
142 using array_index_type = long;
// Filled in by the constructors: per-dimension tile count (m_tile_end), the
// product of those counts (m_num_tiles) and the product of the tile extents
// (m_prod_tile_dims).
156 point_type m_tile_end;
157 index_type m_num_tiles;
158 index_type m_prod_tile_dims;
// Effective directions as ints: the pattern's own choice unless it is
// Iterate::Default, in which case the default_*_direction trait for the
// policy's execution space is used.
177 static constexpr
int outer_direction =
static_cast<int> (
178 (iteration_pattern::outer_direction != Iterate::Default)
179 ? iteration_pattern::outer_direction
180 : default_outer_direction< typename traits::execution_space>::value );
182 static constexpr
int inner_direction =
static_cast<int> (
183 iteration_pattern::inner_direction != Iterate::Default
184 ? iteration_pattern::inner_direction
185 : default_inner_direction< typename traits::execution_space>::value ) ;
// Integer values of Iterate::Right / Iterate::Left, used for comparisons with
// the direction constants above.
188 static constexpr
int Right =
static_cast<int>( Iterate::Right );
189 static constexpr
int Left =
static_cast<int>( Iterate::Left );
// Constructor from explicit lower/upper bound points and an optional tile shape
// (defaults to a value-initialized tile_type). Parts of the member initializer
// list and several statements of the body are not visible in this listing.
191 MDRangePolicy( point_type
const& lower, point_type
const& upper, tile_type
const& tile = tile_type{} )
196 , m_prod_tile_dims(1)
// Non-device path: the visible condition fragments exclude Kokkos::Cuda and
// Kokkos::Experimental::ROCm execution spaces when the corresponding backend
// macro is defined.
200 #
if defined(KOKKOS_ENABLE_CUDA)
201 && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
203 #
if defined(KOKKOS_ENABLE_ROCM)
204 && !std::is_same< typename traits::execution_space, Kokkos::Experimental::ROCm >::value
// Per dimension: compute the span, replace a non-positive tile extent with a
// default (one visible default is the whole span, or 1 for an empty span), then
// accumulate the tile count ceil(span / tile) and the tile-extent product.
209 for (
int i=0; i<
rank; ++i) {
210 span = upper[i] - lower[i];
211 if ( m_tile[i] <= 0 ) {
212 if ( ((
int)inner_direction == (
int)Right && (i <
rank-1))
213 || ((
int)inner_direction == (
int)Left && (i > 0)) )
218 m_tile[i] = (span == 0 ? 1 : span);
221 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
222 m_num_tiles *= m_tile_end[i];
223 m_prod_tile_dims *= m_tile[i];
// CUDA build: dimensions are visited from rank_start to rank_end in steps of
// increment (their setup, keyed on the inner direction, is not visible here).
// The default tile choice depends on whether the running tile-extent product is
// still below 256; the assignments themselves are not visible.
226 #if defined(KOKKOS_ENABLE_CUDA) 233 if((
int)inner_direction == (int)Right) {
238 for (
int i=rank_start; i!=rank_end; i+=increment) {
239 span = m_upper[i] - m_lower[i];
240 if ( m_tile[i] <= 0 ) {
243 if ( ((
int)inner_direction == (int)Right && (i <
rank-1))
244 || ((
int)inner_direction == (int)Left && (i > 0)) )
246 if ( m_prod_tile_dims < 256 ) {
256 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
257 m_num_tiles *= m_tile_end[i];
258 m_prod_tile_dims *= m_tile[i];
// Abort when the tile-extent product exceeds 1024, which the message below
// identifies as the maximum number of threads per block.
260 if ( m_prod_tile_dims > 1024 ) {
261 printf(
" Tile dimensions exceed Cuda limits\n");
262 Kokkos::abort(
" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
// ROCm build: the visible statements mirror the CUDA section above, with
// ROCm-specific messages.
267 #if defined(KOKKOS_ENABLE_ROCM) 274 if((
int)inner_direction == (
int)Right) {
279 for (
int i=rank_start; i!=rank_end; i+=increment) {
280 span = m_upper[i] - m_lower[i];
281 if ( m_tile[i] <= 0 ) {
284 if ( ((
int)inner_direction == (
int)Right && (i <
rank-1))
285 || ((
int)inner_direction == (
int)Left && (i > 0)) )
287 if ( m_prod_tile_dims < 256 ) {
297 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
298 m_num_tiles *= m_tile_end[i];
299 m_prod_tile_dims *= m_tile[i];
301 if ( m_prod_tile_dims > 1024 ) {
302 printf(
" Tile dimensions exceed ROCm limits\n");
303 Kokkos::abort(
" ROCm ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
// Constructor from initializer lists of bounds and an optional tile list
// (defaults to empty); each element is converted to array_index_type. Several
// statements of the body are not visible in this listing.
311 template <
typename LT ,
typename UT ,
typename TT = array_index_type >
312 MDRangePolicy( std::initializer_list<LT>
const& lower, std::initializer_list<UT>
const& upper, std::initializer_list<TT>
const& tile = {} )
// NOTE(review): this size check inspects the members m_lower/m_upper rather than
// the `lower`/`upper` arguments that are indexed in the loop below. The member
// declarations are not visible here; if they are fixed-size arrays of length
// `rank`, the check can never fire and a too-short list would be read out of
// bounds - confirm, and consider checking lower.size()/upper.size() instead.
315 if(static_cast<int>(m_lower.size()) !=
rank || static_cast<int>(m_upper.size()) !=
rank)
316 Kokkos::abort(
"MDRangePolicy: Constructor initializer lists have wrong size");
// Copy the bounds element by element. The tile list is used only when it has
// exactly `rank` entries (the alternative branch is not visible in this listing).
318 for (
auto i = 0; i <
rank; ++i ) {
319 m_lower[i] =
static_cast<array_index_type
>(lower.begin()[i]);
320 m_upper[i] =
static_cast<array_index_type
>(upper.begin()[i]);
321 if(static_cast<int>(tile.size())==
rank)
322 m_tile[i] =
static_cast<array_index_type
>(tile.begin()[i]);
328 m_prod_tile_dims = 1;
// Non-device path: the visible condition fragments exclude Kokkos::Cuda and
// Kokkos::Experimental::ROCm execution spaces when the corresponding backend
// macro is defined.
332 #
if defined(KOKKOS_ENABLE_CUDA)
333 && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
335 #if defined(KOKKOS_ENABLE_ROCM) 336 && !std::is_same< typename traits::execution_space, Kokkos::Experimental::ROCm >::value
// Per dimension: compute the span, replace a non-positive tile extent with a
// default (one visible default is the whole span, or 1 for an empty span), then
// accumulate the tile count ceil(span / tile) and the tile-extent product.
341 for (
int i=0; i<
rank; ++i) {
342 span = m_upper[i] - m_lower[i];
343 if ( m_tile[i] <= 0 ) {
344 if ( ((
int)inner_direction == (
int)Right && (i <
rank-1))
345 || ((int)inner_direction == (
int)Left && (i > 0)) )
350 m_tile[i] = (span == 0 ? 1 : span);
353 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
354 m_num_tiles *= m_tile_end[i];
355 m_prod_tile_dims *= m_tile[i];
// CUDA build: dimensions are visited from rank_start to rank_end in steps of
// increment (their setup, keyed on the inner direction, is not visible here).
// The default tile choice depends on whether the running tile-extent product is
// still below 256; the assignments themselves are not visible.
358 #if defined(KOKKOS_ENABLE_CUDA) 365 if((
int)inner_direction == (int)Right) {
370 for (
int i=rank_start; i!=rank_end; i+=increment) {
371 span = m_upper[i] - m_lower[i];
372 if ( m_tile[i] <= 0 ) {
375 if ( ((
int)inner_direction == (int)Right && (i <
rank-1))
376 || ((
int)inner_direction == (int)Left && (i > 0)) )
378 if ( m_prod_tile_dims < 256 ) {
388 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
389 m_num_tiles *= m_tile_end[i];
390 m_prod_tile_dims *= m_tile[i];
// Abort when the tile-extent product exceeds 1024, which the message below
// identifies as the maximum number of threads per block.
392 if ( m_prod_tile_dims > 1024 ) {
393 printf(
" Tile dimensions exceed Cuda limits\n");
394 Kokkos::abort(
" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
// ROCm build: the visible statements mirror the CUDA section above, with
// ROCm-specific messages.
399 #if defined(KOKKOS_ENABLE_ROCM) 406 if((
int)inner_direction == (
int)Right) {
411 for (
int i=rank_start; i!=rank_end; i+=increment) {
412 span = m_upper[i] - m_lower[i];
413 if ( m_tile[i] <= 0 ) {
416 if ( ((
int)inner_direction == (
int)Right && (i <
rank-1))
417 || ((
int)inner_direction == (
int)Left && (i > 0)) )
419 if ( m_prod_tile_dims < 256 ) {
429 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
430 m_num_tiles *= m_tile_end[i];
431 m_prod_tile_dims *= m_tile[i];
433 if ( m_prod_tile_dims > 1024 ) {
434 printf(
" Tile dimensions exceed ROCm limits\n");
435 Kokkos::abort(
" ROCm ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
// Using-declarations that make Kokkos::MDRangePolicy and Kokkos::Iterate
// nameable as Kokkos::Experimental::MDRangePolicy / Kokkos::Experimental::Iterate.
// (Other declarations in this namespace block may be missing from this listing.)
447 namespace Kokkos {
namespace Experimental {
448 using Kokkos::MDRangePolicy;
450 using Kokkos::Iterate;
454 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE 459 namespace Kokkos {
namespace Experimental {
// md_parallel_for, range-first overload with an optional label (`str`, default
// ""). It follows the `#ifdef KOKKOS_ENABLE_DEPRECATED_CODE` directive.
// The enable_if condition removes this overload when the policy's execution
// space is Kokkos::Cuda (with KOKKOS_ENABLE_CUDA) or Kokkos::Experimental::ROCm
// (with KOKKOS_ENABLE_ROCM). The functor parameter declaration and the dispatch
// that follows the visible body lines are not visible in this listing.
461 template <
typename MDRange,
typename Functor,
typename Enable =
void>
462 void md_parallel_for( MDRange
const& range
464 ,
const std::string& str =
"" 465 ,
typename std::enable_if<(
true 466 #
if defined( KOKKOS_ENABLE_CUDA)
467 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
469 #
if defined( KOKKOS_ENABLE_ROCM)
470 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
// Wrap the range and the user functor `f` in an MDFunctor (value type void).
475 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
477 using range_policy =
typename MDRange::impl_range_policy;
// md_parallel_for, label-first overload (`str` precedes `range`, no default).
// The enable_if condition removes this overload when the policy's execution
// space is Kokkos::Cuda (with KOKKOS_ENABLE_CUDA) or Kokkos::Experimental::ROCm
// (with KOKKOS_ENABLE_ROCM). The functor parameter declaration and the dispatch
// that follows the visible body lines are not visible in this listing.
482 template <
typename MDRange,
typename Functor>
483 void md_parallel_for(
const std::string& str
484 , MDRange
const& range
486 ,
typename std::enable_if<(
true 487 #
if defined( KOKKOS_ENABLE_CUDA)
488 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
490 #
if defined( KOKKOS_ENABLE_ROCM)
491 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
// Wrap the range and the user functor `f` in an MDFunctor (value type void).
496 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
498 using range_policy =
typename MDRange::impl_range_policy;
// md_parallel_for, label-first overload for the Cuda execution space. Compiled
// only when both __CUDACC__ and KOKKOS_ENABLE_CUDA are defined, and enabled only
// when the policy's execution space is Kokkos::Cuda.
// The functor parameter declaration and whatever is done with `closure` after
// its construction are not visible in this listing.
504 #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA ) 505 template <
typename MDRange,
typename Functor>
506 void md_parallel_for(
const std::string& str
507 , MDRange
const& range
509 ,
typename std::enable_if<(
true 510 #
if defined( KOKKOS_ENABLE_CUDA)
511 && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
// Build the tiled device closure from the range, the functor `f`, and the
// policy's work tag.
516 Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
// md_parallel_for, range-first overload for the Cuda execution space with an
// optional label (`str`, default ""). Enabled only when the policy's execution
// space is Kokkos::Cuda.
// The functor parameter declaration and whatever is done with `closure` after
// its construction are not visible in this listing.
520 template <
typename MDRange,
typename Functor>
521 void md_parallel_for( MDRange
const& range
523 ,
const std::string& str =
"" 524 ,
typename std::enable_if<(
true 525 #
if defined( KOKKOS_ENABLE_CUDA)
526 && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
// Build the tiled device closure from the range, the functor `f`, and the
// policy's work tag.
531 Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
// md_parallel_reduce, range-first overload with an optional label (`str`,
// default ""). The enable_if condition removes this overload when the policy's
// execution space is Kokkos::Cuda (with KOKKOS_ENABLE_CUDA) or
// Kokkos::Experimental::ROCm (with KOKKOS_ENABLE_ROCM).
// The functor and result parameter declarations and the dispatch that follows
// the visible body lines are not visible in this listing.
540 template <
typename MDRange,
typename Functor,
typename ValueType>
541 void md_parallel_reduce( MDRange
const& range
544 ,
const std::string& str =
"" 545 ,
typename std::enable_if<(
true 546 #
if defined( KOKKOS_ENABLE_CUDA)
547 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
549 #
if defined( KOKKOS_ENABLE_ROCM)
550 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
// Wrap the range and the user functor `f` in an MDFunctor parameterized on the
// reduction value type.
555 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
557 using range_policy =
typename MDRange::impl_range_policy;
// md_parallel_reduce, label-first overload (`str` precedes `range`, no default).
// The enable_if condition removes this overload when the policy's execution
// space is Kokkos::Cuda (with KOKKOS_ENABLE_CUDA) or Kokkos::Experimental::ROCm
// (with KOKKOS_ENABLE_ROCM).
// The functor and result parameter declarations and the dispatch that follows
// the visible body lines are not visible in this listing.
561 template <
typename MDRange,
typename Functor,
typename ValueType>
562 void md_parallel_reduce(
const std::string& str
563 , MDRange
const& range
566 ,
typename std::enable_if<(
true 567 #
if defined( KOKKOS_ENABLE_CUDA)
568 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
570 #
if defined( KOKKOS_ENABLE_ROCM)
571 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
// Wrap the range and the user functor `f` in an MDFunctor parameterized on the
// reduction value type.
576 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
578 using range_policy =
typename MDRange::impl_range_policy;
588 #endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename Impl::enable_if< Kokkos::Impl::is_execution_policy< ExecPolicy >::value >::type *=0)
Execute functor in parallel according to the execution policy.
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename Impl::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=0)
Parallel reduction.
Declaration of parallel operators.
KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View< D, P... > &V)
Temporary free function rank() until rank() is implemented in the View.