Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
Kokkos_Parallel_MP_Vector.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #ifndef KOKKOS_PARALLEL_MP_VECTOR_HPP
43 #define KOKKOS_PARALLEL_MP_VECTOR_HPP
44 
45 #include "Sacado_MP_Vector.hpp"
46 #include "Kokkos_Core.hpp"
47 
48 //----------------------------------------------------------------------------
49 // Kokkos execution policies useful for Sacado::MP::Vector scalar type
50 //----------------------------------------------------------------------------
51 
52 namespace Kokkos {
53 
57  template< class ExecSpace, class Tag = void >
59 
61  typedef ExecSpace execution_space ;
62  typedef Tag work_tag ;
63 
64  size_t range;
65  size_t team;
66  size_t shared;
67 
68  MPVectorWorkConfig( const size_t range_,
69  const size_t team_,
70  const size_t shared_ = 0 ) :
71  range(range_), team(team_), shared(shared_) {}
72 
73  ExecSpace space() const { return ExecSpace(); }
74 };
75 
76 namespace Impl {
77 
78 #if defined( KOKKOS_ENABLE_THREADS )
79 // Specialization of ParallelFor<> for MPVectorWorkConfig and Threads
80 // The default implementation ignores the team size and uses the standard
81 // work-range implementation. In the future maybe we should try and use
82 // hyperthreads in a useful way. That would require:
83 // -- interpreting the team-size differently, rather as the sacado size
84 // -- determining the vector size of the architecture
85 // -- laying out the threads differently to use hyperthreads across the
86 // the sacado dimension
87 template< class FunctorType, class Tag >
88 class ParallelFor< FunctorType , MPVectorWorkConfig< Threads, Tag > > :
89  public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, Threads > > {
90  typedef Kokkos::RangePolicy< Tag, Threads > Policy ;
91 public:
92  ParallelFor( const FunctorType & functor ,
93  const MPVectorWorkConfig< Threads, Tag > & work_config ) :
94  ParallelFor< FunctorType , Policy >( functor ,
95  Policy( 0, work_config.range ) ) {}
96 };
97 #endif
98 
99 #if defined( KOKKOS_ENABLE_OPENMP )
100 // Specialization of ParallelFor<> for MPVectorWorkConfig and OpenMP
101 // The default implementation ignores the team size and uses the standard
102 // work-range implementation. In the future maybe we should try and use
103 // hyperthreads in a useful way. That would require:
104 // -- interpreting the team-size differently, rather as the sacado size
105 // -- determining the vector size of the architecture
106 // -- laying out the threads differently to use hyperthreads across the
107 // the sacado dimension
108 template< class FunctorType, class Tag >
109 class ParallelFor< FunctorType , MPVectorWorkConfig< OpenMP, Tag > > :
110  public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, OpenMP > > {
111  typedef Kokkos::RangePolicy< Tag, OpenMP > Policy ;
112 public:
113  ParallelFor( const FunctorType & functor ,
114  const MPVectorWorkConfig< OpenMP, Tag > & work_config ) :
115  ParallelFor< FunctorType , Policy >( functor ,
116  Policy( 0, work_config.range ) ) {}
117 };
118 #endif
119 
120 #if defined(KOKKOS_ENABLE_SERIAL)
121 // Specialization of ParallelFor<> for MPVectorWorkConfig and Serial
122 // The default implementation ignores the team size and uses the standard
123 // work-range implementation. In the future maybe we should try and use
124 // hyperthreads in a useful way. That would require:
125 // -- interpreting the team-size differently, rather as the sacado size
126 // -- determining the vector size of the architecture
127 // -- laying out the threads differently to use hyperthreads across the
128 // the sacado dimension
129 template< class FunctorType, class Tag >
130 class ParallelFor< FunctorType , MPVectorWorkConfig< Serial, Tag > > :
131  public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, Serial > > {
132  typedef Kokkos::RangePolicy< Tag, Serial > Policy ;
133 public:
134  ParallelFor( const FunctorType & functor ,
135  const MPVectorWorkConfig< Serial, Tag > & work_config ) :
136  ParallelFor< FunctorType , Policy >( functor ,
137  Policy( 0, work_config.range ) ) {}
138 };
139 #endif // defined(KOKKOS_ENABLE_SERIAL)
140 
141 #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
142 
143 // Specialization of ParallelFor<> for MPVectorWorkConfig on Cuda
144 // Here we use threadIdx.x for each entry in the specified team-size
145 template< class FunctorType, class Tag >
146 class ParallelFor< FunctorType , MPVectorWorkConfig< Cuda, Tag > > {
147 public:
148 
149  typedef Kokkos::RangePolicy< Tag, Cuda > Policy;
150 
151  const FunctorType m_functor ;
152  const MPVectorWorkConfig< Cuda, Tag > m_config;
153  const Cuda::size_type m_work ;
154  const Policy m_policy;
155 
156  template <class TagType>
157  inline __device__
158  typename std::enable_if<std::is_same<TagType, void>::value>::type
159  exec_range(const Cuda::size_type i, Cuda::size_type j) const {
160  m_functor(i, j);
161  }
162 
163  template <class TagType>
164  inline __device__
165  typename std::enable_if<!std::is_same<TagType, void>::value>::type
166  exec_range(const Cuda::size_type i, Cuda::size_type j) const {
167  m_functor(TagType(), i, j);
168  }
169 
170  Policy const& get_policy() const { return m_policy; }
171 
172  inline
173  __device__
174  void operator()(void) const
175  {
176  const Cuda::size_type work_stride = blockDim.y * gridDim.x ;
177 
178  for ( Cuda::size_type iwork = threadIdx.y + blockDim.y * blockIdx.x ;
179  iwork < m_work ;
180  iwork += work_stride ) {
181  this->template exec_range<Tag>(iwork, threadIdx.x);
182  }
183  }
184 
185  ParallelFor( const FunctorType & functor ,
186  const MPVectorWorkConfig< Cuda, Tag > & work_config )
187  : m_functor( functor ) ,
188  m_config( work_config ) ,
189  m_work( work_config.range ),
190  m_policy()
191  {
192  }
193 
194  inline
195  void execute() const
196  {
197  // To do: query number of registers used by functor and adjust
198  // nwarp accordingly to get maximum occupancy
199 
200  Cuda::size_type nwarp = 0;
201  if (m_config.team > CudaTraits::WarpSize) {
202  const Cuda::size_type warps_per_team =
203  ( m_config.team + CudaTraits::WarpSize-1 ) / CudaTraits::WarpSize;
204  nwarp = cuda_internal_maximum_warp_count() / warps_per_team;
205  }
206  else {
207  const Cuda::size_type teams_per_warp =
208  CudaTraits::WarpSize / m_config.team ;
209  nwarp = cuda_internal_maximum_warp_count() * teams_per_warp;
210  }
211  const dim3 block( m_config.team , nwarp , 1 );
212 
213  Cuda::size_type nblock =
214  std::min( (m_work + block.y - 1 ) / block.y ,
215  cuda_internal_maximum_grid_count() );
216  const dim3 grid( nblock , 1 , 1 );
217 
218  const Cuda::size_type shared = m_config.shared;
219  CudaParallelLaunch< ParallelFor >( *this , grid , block , shared , m_policy.space().impl_internal_space_instance(), false );
220  }
221 };
222 
223 #endif
224 
225 } // namespace Impl
226 
227 } // namespace Kokkos
228 
229 //----------------------------------------------------------------------------
230 //----------------------------------------------------------------------------
231 
232 #endif /* #ifndef KOKKOS_ATOMIC_MP_VECTOR_HPP */
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
MPVectorWorkConfig(const size_t range_, const size_t team_, const size_t shared_=0)
Team-based parallel work configuration for Sacado::MP::Vector.