42#ifndef KOKKOS_PARALLEL_MP_VECTOR_HPP
43#define KOKKOS_PARALLEL_MP_VECTOR_HPP
46#include "Kokkos_Core.hpp"
// Template header taking an execution space type and a tag type; Tag
// defaults to void when it is not supplied.
// NOTE(review): the declaration this header introduces is not visible in this
// listing - presumably MPVectorWorkConfig, which the ParallelFor
// specializations in this file instantiate as MPVectorWorkConfig< Space, Tag >.
57 template<
class ExecSpace,
class Tag =
void >
// Tail of a constructor parameter list: shared_ is optional and defaults to 0.
// The preceding parameters and the member initializers are not visible here.
70 const size_t shared_ = 0 ) :
// Returns a default-constructed ExecSpace instance.
73 ExecSpace
space()
const {
return ExecSpace(); }
78#if defined( KOKKOS_ENABLE_THREADS )
// ParallelFor specialization for an MPVectorWorkConfig on the Threads
// execution space. It derives from the ParallelFor specialization for
// Kokkos::RangePolicy< Tag, Threads > and, in the lines visible here, only
// adds a converting constructor.
87template<
class FunctorType,
class Tag >
88class ParallelFor< FunctorType , MPVectorWorkConfig< Threads, Tag > > :
89 public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, Threads > > {
// Range policy type handed to the base class.
90 typedef Kokkos::RangePolicy< Tag, Threads > Policy ;
// Builds the base ParallelFor from the functor and a Policy constructed with
// the bounds 0 and work_config.range. No other field of work_config is read
// in this constructor.
92 ParallelFor(
const FunctorType & functor ,
93 const MPVectorWorkConfig< Threads, Tag > & work_config ) :
94 ParallelFor< FunctorType , Policy >( functor ,
95 Policy( 0, work_config.range ) ) {}
99#if defined( KOKKOS_ENABLE_OPENMP )
// ParallelFor specialization for an MPVectorWorkConfig on the OpenMP
// execution space. It derives from the ParallelFor specialization for
// Kokkos::RangePolicy< Tag, OpenMP > and, in the lines visible here, only
// adds a converting constructor.
108template<
class FunctorType,
class Tag >
109class ParallelFor< FunctorType , MPVectorWorkConfig< OpenMP, Tag > > :
110 public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, OpenMP > > {
// Range policy type handed to the base class.
111 typedef Kokkos::RangePolicy< Tag, OpenMP > Policy ;
// Builds the base ParallelFor from the functor and a Policy constructed with
// the bounds 0 and work_config.range. No other field of work_config is read
// in this constructor.
113 ParallelFor(
const FunctorType & functor ,
114 const MPVectorWorkConfig< OpenMP, Tag > & work_config ) :
115 ParallelFor< FunctorType , Policy >( functor ,
116 Policy( 0, work_config.range ) ) {}
120#if defined(KOKKOS_ENABLE_SERIAL)
// ParallelFor specialization for an MPVectorWorkConfig on the Serial
// execution space. It derives from the ParallelFor specialization for
// Kokkos::RangePolicy< Tag, Serial > and, in the lines visible here, only
// adds a converting constructor.
129template<
class FunctorType,
class Tag >
130class ParallelFor< FunctorType , MPVectorWorkConfig< Serial, Tag > > :
131 public ParallelFor< FunctorType , Kokkos::RangePolicy< Tag, Serial > > {
// Range policy type handed to the base class.
132 typedef Kokkos::RangePolicy< Tag, Serial > Policy ;
// Builds the base ParallelFor from the functor and a Policy constructed with
// the bounds 0 and work_config.range. No other field of work_config is read
// in this constructor.
134 ParallelFor(
const FunctorType & functor ,
135 const MPVectorWorkConfig< Serial, Tag > & work_config ) :
136 ParallelFor< FunctorType , Policy >( functor ,
137 Policy( 0, work_config.range ) ) {}
141#if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
// ParallelFor specialization for an MPVectorWorkConfig on the Cuda execution
// space. Unlike the host specializations it does not derive from a
// RangePolicy-based ParallelFor: it stores the functor and configuration
// itself and computes its own block/grid dimensions before launching.
145template<
class FunctorType,
class Tag >
146class ParallelFor< FunctorType , MPVectorWorkConfig< Cuda, Tag > > {
// Policy type of the m_policy member returned by get_policy().
149 typedef Kokkos::RangePolicy< Tag, Cuda > Policy;
// Copy of the user functor invoked from exec_range().
151 const FunctorType m_functor ;
// Copy of the work configuration; its team and shared fields are read when
// the launch dimensions are computed below.
152 const MPVectorWorkConfig< Cuda, Tag > m_config;
// Work count, initialized from work_config.range in the constructor.
153 const Cuda::size_type m_work ;
154 const Policy m_policy;
// exec_range overload enabled only when TagType is void.
// NOTE(review): the body of this overload is not visible in this listing.
156 template <
class TagType>
158 typename std::enable_if<std::is_same<TagType, void>::value>::type
159 exec_range(
const Cuda::size_type i, Cuda::size_type
j)
const {
// exec_range overload enabled only when TagType is not void: calls the
// functor with a default-constructed TagType followed by i and j.
163 template <
class TagType>
165 typename std::enable_if<!std::is_same<TagType, void>::value>::type
166 exec_range(
const Cuda::size_type i, Cuda::size_type
j)
const {
167 m_functor(TagType(), i,
j);
// Accessor returning the stored policy by const reference.
170 Policy
const& get_policy()
const {
return m_policy; }
// Call operator containing the per-thread work loop. Each thread starts at
// iwork = threadIdx.y + blockDim.y * blockIdx.x and advances by
// blockDim.y * gridDim.x, passing threadIdx.x to exec_range as the second
// index.
// NOTE(review): the loop's termination condition is not visible in this
// listing - presumably bounded by m_work; confirm against the full source.
174 void operator()(
void)
const
176 const Cuda::size_type work_stride = blockDim.y * gridDim.x ;
178 for ( Cuda::size_type iwork = threadIdx.y + blockDim.y * blockIdx.x ;
180 iwork += work_stride ) {
181 this->
template exec_range<Tag>(iwork, threadIdx.x);
// Constructor: copies the functor and the work configuration and caches
// work_config.range in m_work.
// NOTE(review): the remaining initializers (e.g. for m_policy) and the header
// of the function containing the launch code below are not visible here.
185 ParallelFor(
const FunctorType & functor ,
186 const MPVectorWorkConfig< Cuda, Tag > & work_config )
187 : m_functor( functor ) ,
188 m_config( work_config ) ,
189 m_work( work_config.range ),
// Choose nwarp, which becomes the y-dimension of the thread block.
200 Cuda::size_type nwarp = 0;
// Team larger than a warp: round the team size up to whole warps and divide
// the maximum warp count by the number of warps each team needs.
201 if (m_config.team > CudaTraits::WarpSize) {
202 const Cuda::size_type warps_per_team =
203 ( m_config.team + CudaTraits::WarpSize-1 ) / CudaTraits::WarpSize;
204 nwarp = cuda_internal_maximum_warp_count() / warps_per_team;
// Otherwise (the branch header is not visible in this listing): count how
// many teams fit in one warp and multiply the maximum warp count by it.
// NOTE(review): this divides by m_config.team; no guard against team == 0 is
// visible here - confirm callers never pass a zero team size.
207 const Cuda::size_type teams_per_warp =
208 CudaTraits::WarpSize / m_config.team ;
209 nwarp = cuda_internal_maximum_warp_count() * teams_per_warp;
// Thread block: x spans m_config.team, y spans nwarp.
211 const dim3 block( m_config.team , nwarp , 1 );
// Number of blocks: m_work divided by block.y rounded up, capped at the first
// entry of cuda_internal_maximum_grid_count().
213 Cuda::size_type nblock =
214 std::min( (m_work + block.y - 1 ) / block.y ,
215 cuda_internal_maximum_grid_count()[0] );
216 const dim3 grid( nblock , 1 , 1 );
// The shared value from the work configuration is forwarded to the launch.
218 const Cuda::size_type shared = m_config.shared;
// Launch this object with the computed grid, block and shared values on the
// internal space instance obtained from m_policy.
219 CudaParallelLaunch< ParallelFor >( *
this , grid , block , shared , m_policy.space().impl_internal_space_instance() );
Team-based parallel work configuration for Sacado::MP::Vector.
ExecSpace execution_space
MPVectorWorkConfig(const size_t range_, const size_t team_, const size_t shared_=0)
MPVectorWorkConfig execution_policy