Stokhos_Cuda_CooProductTensor.hpp
// @HEADER
// ***********************************************************************
//
//                           Stokhos Package
//                 Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
//
// ***********************************************************************
// @HEADER

#ifndef STOKHOS_CUDA_COO_PRODUCT_TENSOR_HPP
#define STOKHOS_CUDA_COO_PRODUCT_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

49#include "Stokhos_Multiply.hpp"
52
53#include "cuda_profiler_api.h"
54
55namespace Stokhos {
56
57//----------------------------------------------------------------------------
58
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar,
          bool Pack >
class Multiply<
  BlockCrsMatrix< CooProductTensor< TensorScalar, Kokkos::Cuda, Pack >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda >,
  Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef CooProductTensor< TensorScalar, execution_space, Pack > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type;

  typedef int rows_type;
  static const rows_type invalid_row = -1;

  class CooKernel {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_entries_per_thread;
    const size_type m_block_size;

    CooKernel( const matrix_type & A,
               const vector_type & x,
               const vector_type & y,
               const size_type entries_per_thread,
               const size_type block_size )
      : m_A(A),
        m_x(x),
        m_y(y),
        m_entries_per_thread(entries_per_thread),
        m_block_size(block_size) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();
      const size_type num_entries = m_A.block.entry_count();

      // Thread dimensions and index
      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // Shared memory
      VectorScalar * const sh_x =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_A = sh_x + m_block_size*dim;
      VectorScalar * const sh_y = sh_A + m_block_size*dim;
      volatile VectorScalar * const vals = sh_y + dim;
      volatile rows_type * const rows =
        reinterpret_cast<volatile rows_type*>(vals + nid);
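
      // Shared memory layout implied by the pointer arithmetic above
      // (sizes in scalars):
      //   sh_x : m_block_size*dim  staged FEM-column blocks of x
      //   sh_A : m_block_size*dim  staged FEM-column blocks of A
      //   sh_y : dim               accumulated output block of y
      //   vals : nid               per-thread partial sums
      //   rows : nid               tensor row index owned by each thread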

      // Product tensor entries which this warp will iterate:
      const size_type entries_per_warp = blockDim.x * m_entries_per_thread;
      const size_type lBeg = threadIdx.y * entries_per_warp + threadIdx.x;
      const size_type lEnd = ( lBeg + entries_per_warp < num_entries ?
                               lBeg + entries_per_warp : num_entries );

      // blockIdx.x == row in the deterministic (finite element) system
      const size_type femBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type femEnd = m_A.graph.row_map[ blockIdx.x + 1 ];

      // Zero y
      for (size_type l = tid; l < dim; l += nid) {
        sh_y[l] = 0.0;
      }

      // Initialize rows & vals arrays
      rows[tid] = invalid_row;
      vals[tid] = 0.0;

      // Loop over columns in the discrete (finite element) system.
      for (size_type femEntry=femBeg; femEntry<femEnd; femEntry+=m_block_size) {
        const size_type block_size =
          femEntry + m_block_size < femEnd ? m_block_size : femEnd - femEntry;

        // Wait for X and A to be used in the previous iteration
        // before reading new values.
        __syncthreads();

        // Coalesced read blocks of X and A into shared memory
        for (size_type col = 0; col < block_size; ++col) {

          const size_type femColumn = m_A.graph.entries( femEntry + col );
          const VectorScalar * const x = & m_x( 0, femColumn );
          const MatrixScalar * const A = & m_A.values( 0, femEntry + col );

          // Coalesced read by the whole block from global memory:
          for (size_type l = tid; l < dim; l += nid) {
            sh_x[col + l * m_block_size] = x[l];
            sh_A[col + l * m_block_size] = A[l];
          }

        }

        __syncthreads(); // wait for X and A to be read before using them

        // This cuda block is responsible for computing all values of 'y'
        for (size_type l = lBeg; l < lEnd; l += blockDim.x) {

          // Read 'blockDim.x' entries from the tensor (coalesced)
          size_type i, j, k;
          m_A.block.coord(l,i,j,k);
          const TensorScalar v = m_A.block.value( l );
          j *= m_block_size;
          k *= m_block_size;
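
          // j and k are pre-scaled by m_block_size so that sh_A[col+j] and
          // sh_x[col+k] below address the column-interleaved blocks staged
          // in shared memory above.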

          // Register storing local accumulation for row i
          VectorScalar y = 0;

          // Check for carry from previous row
          if (threadIdx.x == 0) {
            if (i == rows[tid+31])
              y += vals[tid+31];
            else
              sh_y[rows[tid+31]] += vals[tid+31];
          }

          // Accumulate local row for the set of FEM columns
          for (size_type col = 0; col < block_size; ++col) {
            y += v * ( sh_A[col+j] * sh_x[col+k] + sh_A[col+k] * sh_x[col+j] );
          }

          // Store row and value into shared arrays
          rows[tid] = i;
          vals[tid] = y;

          // Reduce 'y' within 'blockDim.x' to the right for threads
          // on the same row
          if (threadIdx.x >=  1 && i == rows[tid- 1]) vals[tid] += vals[tid- 1];
          if (threadIdx.x >=  2 && i == rows[tid- 2]) vals[tid] += vals[tid- 2];
          if (threadIdx.x >=  4 && i == rows[tid- 4]) vals[tid] += vals[tid- 4];
          if (threadIdx.x >=  8 && i == rows[tid- 8]) vals[tid] += vals[tid- 8];
          if (threadIdx.x >= 16 && i == rows[tid-16]) vals[tid] += vals[tid-16];
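
          // The five strided steps above form a warp-synchronous segmented
          // inclusive scan keyed on the row index i: afterwards vals[tid]
          // holds the warp's partial sum for row i up to and including this
          // lane.  Correctness relies on lock-step warp execution over the
          // volatile shared arrays (the classic pre-Volta idiom).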

          // Add local accumulation of y into sh_y for threads on
          // distinct rows.  We don't store thread 31 and instead carry it
          // to the next iteration to eliminate race conditions between warps
          if (threadIdx.x < 31 && i != rows[tid + 1])
            sh_y[i] += vals[tid];

        }

        // At this point we have blockDim.y values that need to be
        // reduced and stored.  Move these row/value pairs to the beginning
        __syncthreads();
        if (threadIdx.x == 31) {
          rows[threadIdx.y] = rows[tid];
          vals[threadIdx.y] = vals[tid];
        }
        __syncthreads();

        // Reduce these values to the right using the first warp
        // This assumes blockDim.x >= blockDim.y
        if (threadIdx.y == 0 && threadIdx.x < blockDim.y) {
          const size_type i = rows[tid];
          if (threadIdx.x >=  1 && i == rows[tid- 1]) vals[tid] += vals[tid- 1];
          if (threadIdx.x >=  2 && i == rows[tid- 2]) vals[tid] += vals[tid- 2];
          if (threadIdx.x >=  4 && i == rows[tid- 4]) vals[tid] += vals[tid- 4];
          if (threadIdx.x >=  8 && i == rows[tid- 8]) vals[tid] += vals[tid- 8];
          if (threadIdx.x >= 16 && i == rows[tid-16]) vals[tid] += vals[tid-16];

          if ((threadIdx.x == blockDim.y-1) ||
              (threadIdx.x < blockDim.y-1 && i != rows[tid+1]))
            sh_y[i] += vals[tid];
        }

        // Reset rows and vals to prohibit carry across FEM columns
        rows[tid] = invalid_row;
        vals[tid] = 0.0;

      }

      // Wait for all contributions of y to be completed
      __syncthreads();

      // Store result back in global memory
      for (size_type l = tid; l < dim; l += nid) {
        m_y( l, blockIdx.x ) = sh_y[ l ];
      }
    }
  };

  //------------------------------------

  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
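    // Launch configuration: one CUDA block per FEM row, warp_size x
    // num_warp threads per block, each thread covering entries_per_thread
    // product-tensor entries (all computed below).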
    const size_type fem_rows = A.graph.row_map.extent(0) - 1;
    const size_type stoch_rows = A.block.dimension();
    const size_type stoch_entries = A.block.entry_count();
    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;

#ifdef STOKHOS_DEBUG
    // Use fewer warps in debug mode to prevent launch failures
    const size_type num_warp_max = 16;
#else
    const size_type num_warp_max = 20;
#endif
    const size_type num_warp =
      std::min( num_warp_max, stoch_entries / warp_size );
    const dim3 dBlock( warp_size, num_warp, 1 );
    const dim3 dGrid( fem_rows, 1, 1 );

    const size_type num_thread = dBlock.x*dBlock.y;
    const size_type entries_per_thread =
      (stoch_entries + num_thread-1) / num_thread;
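    // Ceiling division above ensures every tensor entry is assigned to a
    // thread even when num_thread does not divide stoch_entries evenly.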

    // Use at most half of shared memory to get 2 blocks per SMP
    const size_type size_rows = sizeof(rows_type) * num_thread;
    const size_type size_vals = sizeof(VectorScalar) * num_thread;
    const size_type shcap =
      Kokkos::Cuda().impl_internal_space_instance()->m_maxShmemPerBlock / 2;
    size_type bs =
      ((shcap-size_rows-size_vals) / (sizeof(VectorScalar)*stoch_rows) - 1) / 2;
    if (bs % 2 == 0) --bs; // Make block-size odd to reduce bank conflicts
    const size_type block_size_max = 31;
    const size_type block_size = std::min(bs, block_size_max);
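    // Shared memory request: 2*block_size blocks of length stoch_rows for
    // the staged x and A columns, one more block for y, plus the vals and
    // rows scratch arrays used by the segmented reductions.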
    const size_type shmem =
      sizeof(VectorScalar) * ((2*block_size+1) * stoch_rows) + // A, x, y
      size_vals + size_rows;

#if 0
    //std::cout << std::endl << A.block << std::endl;
    const size_type fem_nnz = A.values.extent(1);
    std::cout << "Multiply< BlockCrsMatrix< CooProductTensor ... > >::apply"
              << std::endl
              << "  grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << "  block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << "  block_size(" << block_size << ")" << std::endl
              << "  shmem(" << shmem/1024 << " kB)" << std::endl
              << "  fem_rows(" << fem_rows << ")" << std::endl
              << "  fem_nnz(" << fem_nnz << ")" << std::endl
              << "  stoch_rows(" << stoch_rows << ")" << std::endl
              << "  stoch_nnz(" << stoch_entries << ")" << std::endl
              << "  threads_per_block(" << num_thread << ")" << std::endl
              << "  entries_per_thread(" << entries_per_thread << ")" << std::endl
              ;
#endif

    //cudaProfilerStart();
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
      ( CooKernel( A, x, y, entries_per_thread, block_size ) );
    //cudaProfilerStop();
  }
};
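
// Usage sketch (assuming matching A, x, y objects of matrix_type and
// vector_type have already been constructed and filled): this partial
// specialization can be invoked directly as
//   Multiply< matrix_type, vector_type, vector_type >::apply( A, x, y );
// and is presumably reached through the generic multiply entry points
// declared in Stokhos_Multiply.hpp (included above).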

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_COO_PRODUCT_TENSOR_HPP */