Kokkos_CrsMatrix_UQ_PCE.hpp
// @HEADER
// ***********************************************************************
//
//                           Stokhos Package
//                 Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
//
// ***********************************************************************
// @HEADER

#ifndef KOKKOS_CRSMATRIX_UQ_PCE_HPP
#define KOKKOS_CRSMATRIX_UQ_PCE_HPP

#include "Sacado_UQ_PCE.hpp"
#include "Kokkos_View_UQ_PCE.hpp"
#include "Kokkos_InnerProductSpaceTraits_UQ_PCE.hpp"
#include "KokkosSparse_CrsMatrix.hpp"
#include "KokkosSparse_spmv.hpp"

#include "Kokkos_Blas1_UQ_PCE.hpp" // for some utilities

#include "Stokhos_Multiply.hpp"
#include "Stokhos_CrsProductTensor.hpp"
namespace Stokhos {

//----------------------------------------------------------------------------
// Specialization of KokkosSparse::CrsMatrix for Sacado::UQ::PCE scalar type
//----------------------------------------------------------------------------

// Kernel implementing y = A * x where
//   A == KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<...>,...>,
//   x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
//   x and y are rank 1
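//
// A sketch of what each kernel below computes per block row i (pseudo-code
// only; cijk denotes the sparse triple-product tensor <psi_c psi_j psi_k>
// attached to the matrix values, up to its internal symmetry conventions):
//
//   for each PCE coefficient c < dim :  y(c,i) = b * y(c,i)
//   for each CRS entry (i,col), each tensor entry (c,j,k):
//     y(c,i) += a * cijk * ( A(entry,j) * x(k,col) + A(entry,k) * x(j,col) )
//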
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal,
                                         MatrixDevice,
                                         MatrixMemory,
                                         MatrixSize>,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
                              InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
                              OutputP... >
                >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef typename MatrixDevice::execution_space execution_space;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
  typedef typename tensor_type::size_type size_type;
  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;

private:

  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef typename matrix_values_type::array_type matrix_array_type;
  typedef typename input_vector_type::array_type input_array_type;
  typedef typename output_vector_type::array_type output_array_type;

  typedef typename MatrixValue::value_type matrix_scalar;
  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;
  typedef typename tensor_type::value_type tensor_scalar;

  const matrix_array_type   m_A_values ;
  const matrix_graph_type   m_A_graph ;
  const input_array_type    m_x ;
  const output_array_type   m_y ;
  const tensor_type         m_tensor ;
  const input_scalar        m_a ;
  const output_scalar       m_b ;

  Multiply( const matrix_type & A ,
            const input_vector_type & x ,
            const output_vector_type & y ,
            const input_scalar & a ,
            const output_scalar & b )
    : m_A_values( A.values )
    , m_A_graph( A.graph )
    , m_x( x )
    , m_y( y )
    , m_tensor( Kokkos::cijk(A.values) )
    , m_a( a )
    , m_b( b )
    {}

public:

  //
  // Non-team functor interface -- no threads within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type iBlockRow ) const
  {
    // Prefer that y[ m_tensor.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    output_scalar * const y = & m_y(0,iBlockRow);

    const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
    const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
        y[j] = 0 ;
    else
      for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
        y[j] = m_b * y[j] ;
    // Loop over columns of A
    for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
      const input_scalar * const x  = & m_x( 0 , m_A_graph.entries(iEntry) );
      const matrix_scalar * const A = & m_A_values( iEntry , 0 );

      BlockMultiply< tensor_type >::apply(
        m_tensor , A , x , y , m_a );
    }

  }

#if defined(__MIC__)

  //
  // Team functor interface with threading within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  // This is a MIC-specific version that processes multiple FEM columns
  // at a time to reduce tensor reads
  //
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);
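
    // compute_work_range splits [0, m_tensor.dimension()) into roughly
    // equal contiguous chunks, one per team member; e.g. dimension == 10
    // with team_size == 4 yields ranges like [0,3), [3,6), [6,8), [8,10)
    // (the exact split, including any alignment padding for output_scalar,
    // is defined by details::compute_work_range).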

    // Prefer that y[ m_tensor.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    output_scalar * const y = & m_y(0,iBlockRow);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = 0 ;
    else
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = m_b * y[j] ;

    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 9;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
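
    // numBlock rounds up: e.g. a row with 20 column entries and
    // BlockSize == 9 gives numBlock == (20+8)/9 == 3, processed as
    // blocks of 9, 9, and 2 entries.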

    const matrix_scalar* sh_A[BlockSize];
    const input_scalar* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
        sh_x[col] = & m_x( 0 , iBlockColumn );
        sh_A[col] = & m_A_values( iBlockEntry + col , 0);
      }

      for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

        const size_type nEntry = m_tensor.num_entry(iy);
        const size_type iEntryBeg = m_tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
              size_type iEntry    = iEntryBeg;

        output_scalar ytmp = 0 ;

        // Do entries with a blocked loop of size blocksize
        const size_type nBlock = nEntry / tensor_type::vectorsize;
        const size_type nEntryB = nBlock * tensor_type::vectorsize;
        const size_type iEnd = iEntryBeg + nEntryB;

        typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
        typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
        typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
        VecTV vy;
        vy.zero();
        for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
          const size_type *j = &m_tensor.coord(iEntry,0);
          const size_type *k = &m_tensor.coord(iEntry,1);
          ValTV c(&(m_tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV xj(sh_x[col], j), xk(sh_x[col], k);

            // vy += c * ( aj * xk + ak * xj)
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            vy.multiply_add(c, aj);
          }
        }
        ytmp += vy.sum();

        // The number of nonzeros is always constrained to be a multiple of 8

        const size_type rem = iEntryEnd-iEntry;
        if (rem >= 8) {
          typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
          typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
          typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
          const size_type *j = &m_tensor.coord(iEntry,0);
          const size_type *k = &m_tensor.coord(iEntry,1);
          ValTV2 c(&(m_tensor.value(iEntry)));

          for ( size_type col = 0; col < block_size; ++col ) {
            MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
            VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);

            // vy += c * ( aj * xk + ak * xj)
            aj.times_equal(xk);
            aj.multiply_add(ak, xj);
            aj.times_equal(c);
            ytmp += aj.sum();
          }
        }

        y[iy] += m_a * ytmp ;
      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();
    }

  }

#else

  //
  // Team functor interface with threading within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  // This is a general, hand-vectorized version that processes multiple FEM
  // columns at a time to reduce tensor reads.  Note that auto-vectorization
  // doesn't work here because of the inner-loop over FEM columns.
  //
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);

    // Prefer that y[ m_tensor.dimension() ] be scratch space
    // on the local thread, but cannot dynamically allocate
    output_scalar * const y = & m_y(0,iBlockRow);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = 0 ;
    else
      for ( size_type j = work_range.first ; j < work_range.second ; ++j )
        y[j] = m_b * y[j] ;

    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 14;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const matrix_scalar* sh_A[BlockSize];
    const input_scalar* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      for ( size_type col = 0; col < block_size; ++col ) {
        const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
        sh_x[col] = & m_x( 0 , iBlockColumn );
        sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
      }

      for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

        const size_type nEntry = m_tensor.num_entry(iy);
        const size_type iEntryBeg = m_tensor.entry_begin(iy);
        const size_type iEntryEnd = iEntryBeg + nEntry;
              size_type iEntry    = iEntryBeg;

        output_scalar ytmp = 0 ;

        // Do entries with a blocked loop of size blocksize
        if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
          const size_type nBlock = nEntry / tensor_type::vectorsize;
          const size_type nEntryB = nBlock * tensor_type::vectorsize;
          const size_type iEnd = iEntryBeg + nEntryB;

          typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
          typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
          typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
          VecTV vy;
          vy.zero();
          for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
            const size_type *j = &m_tensor.coord(iEntry,0);
            const size_type *k = &m_tensor.coord(iEntry,1);
            ValTV c(&(m_tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj * xk + ak * xj)
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              vy.multiply_add(c, aj);
            }
          }
          ytmp += vy.sum();
        }

        // Do remaining entries with a scalar loop
        for ( ; iEntry<iEntryEnd; ++iEntry) {
          const size_type j = m_tensor.coord(iEntry,0);
          const size_type k = m_tensor.coord(iEntry,1);
          tensor_scalar cijk = m_tensor.value(iEntry);

          for ( size_type col = 0; col < block_size; ++col ) {
            ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
                             sh_A[col][k] * sh_x[col][j] );
          }

        }

        y[iy] += m_a * ytmp ;
      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();
    }

  }

#endif

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    // Generally the block algorithm seems to perform better on the MIC,
    // as long as the stochastic size isn't too big, but doesn't perform
    // any better on the CPU (probably because the CPU has a fat L3 cache
    // to store the sparse 3 tensor).
#ifdef __MIC__
    const bool use_block_algorithm = true;
#else
    const bool use_block_algorithm = false;
#endif

    const size_t row_count = A.graph.row_map.extent(0) - 1 ;
    if (use_block_algorithm) {
#ifdef __MIC__
      const size_t team_size = 4;  // 4 hyperthreads for MIC
#else
      const size_t team_size = 2;  // 2 for everything else
#endif
      const size_t league_size = row_count;
      Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
      Kokkos::parallel_for( config , Multiply(A,x,y,a,b) );
    }
    else {
      Kokkos::parallel_for( row_count , Multiply(A,x,y,a,b) );
    }
  }
};
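
// A minimal usage sketch for the specialization above (illustrative only;
// matrix_type, input_vector_type, and output_vector_type stand for the
// application's UQ::PCE matrix and vector types):
//
//   // y = 0.5 * A * x, one block row per work item
//   Stokhos::Multiply< matrix_type, input_vector_type,
//                      output_vector_type >::apply( A, x, y, 0.5, 0.0 );
//
// apply() launches a flat range parallel_for on CPUs and a TeamPolicy with
// team_size 4 only under __MIC__, where the blocked team kernel pays off.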

// Kernel implementing y = A * x where
//   A == KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<...>,...>,
//   x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
//   x and y are rank 2
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal,
                                         MatrixDevice,
                                         MatrixMemory,
                                         MatrixSize>,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
                              InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
                              OutputP... >
                >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef typename MatrixDevice::execution_space execution_space;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
  typedef typename tensor_type::size_type size_type;
  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;

private:

  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef typename matrix_values_type::array_type matrix_array_type;
  typedef typename input_vector_type::array_type input_array_type;
  typedef typename output_vector_type::array_type output_array_type;

  typedef typename MatrixValue::value_type matrix_scalar;
  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;
  typedef typename tensor_type::value_type tensor_scalar;

  const matrix_array_type   m_A_values ;
  const matrix_graph_type   m_A_graph ;
  const input_array_type    m_x ;
  const output_array_type   m_y ;
  const tensor_type         m_tensor ;
  const input_scalar        m_a ;
  const output_scalar       m_b ;

  Multiply( const matrix_type & A ,
            const input_vector_type & x ,
            const output_vector_type & y ,
            const input_scalar & a ,
            const output_scalar & b )
    : m_A_values( A.values )
    , m_A_graph( A.graph )
    , m_x( x )
    , m_y( y )
    , m_tensor( Kokkos::cijk(A.values) )
    , m_a( a )
    , m_b( b )
    {}

public:

  //
  // Non-team functor interface -- no threads within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  KOKKOS_INLINE_FUNCTION
  void operator()( const size_type iBlockRow ) const
  {
    const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
    const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];

    const size_type num_col = m_y.extent(2);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
          m_y(j, iBlockRow, col) = 0 ;
    else
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
          m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;

    // Put the x-column loop inside the A-column loop to reuse entries in A.
    // This way all of the entries for that particular column of A should stay
    // in L1 cache for all of the columns of x.

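    // The resulting loop nest, schematically:
    //
    //   for each CRS entry (iBlockRow, iBlockCol):    // columns of A
    //     for each right-hand side col:               // columns of x
    //       y(:, iBlockRow, col) += PCE block of A(iEntry) applied
    //                               to x(:, iBlockCol, col)
    //
    // so each PCE block of A is read once per row, not once per
    // right-hand side.
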
    for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
      const matrix_scalar * const A = &m_A_values( iEntry, 0 );
      const size_type iBlockCol = m_A_graph.entries(iEntry);

      for (size_type col=0; col<num_col; ++col) {
        output_scalar * const y      = &m_y( 0, iBlockRow, col );
        const input_scalar * const x = &m_x( 0, iBlockCol, col );
        BlockMultiply< tensor_type >::apply( m_tensor , A , x , y , m_a );
      }

    }

  }

#if defined(__MIC__)

  //
  // Team functor interface with threading within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  // This is a MIC-specific version that processes multiple FEM columns
  // at a time to reduce tensor reads
  //
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;

  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);

    const size_type num_col = m_y.extent(2);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = 0 ;
    else
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;

    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 9;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const matrix_scalar* sh_A[BlockSize];
    const input_scalar* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      // Loop over columns of x, y
      for (size_type vec_col=0; vec_col<num_col; ++vec_col) {

        output_scalar * const y = & m_y( 0 , iBlockRow , vec_col );

        for ( size_type col = 0; col < block_size; ++col ) {
          const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
          sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
          sh_A[col] = & m_A_values( iBlockEntry + col , 0);
        }

        for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

          const size_type nEntry = m_tensor.num_entry(iy);
          const size_type iEntryBeg = m_tensor.entry_begin(iy);
          const size_type iEntryEnd = iEntryBeg + nEntry;
                size_type iEntry    = iEntryBeg;

          output_scalar ytmp = 0 ;

          // Do entries with a blocked loop of size blocksize
          const size_type nBlock = nEntry / tensor_type::vectorsize;
          const size_type nEntryB = nBlock * tensor_type::vectorsize;
          const size_type iEnd = iEntryBeg + nEntryB;

          typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
          typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
          typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
          VecTV vy;
          vy.zero();
          for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
            const size_type *j = &m_tensor.coord(iEntry,0);
            const size_type *k = &m_tensor.coord(iEntry,1);
            ValTV c(&(m_tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj * xk + ak * xj)
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              vy.multiply_add(c, aj);
            }
          }
          ytmp += vy.sum();

          // The number of nonzeros is always constrained to be a multiple of 8

          const size_type rem = iEntryEnd-iEntry;
          if (rem >= 8) {
            typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
            typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
            typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
            const size_type *j = &m_tensor.coord(iEntry,0);
            const size_type *k = &m_tensor.coord(iEntry,1);
            ValTV2 c(&(m_tensor.value(iEntry)));

            for ( size_type col = 0; col < block_size; ++col ) {
              MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
              VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);

              // vy += c * ( aj * xk + ak * xj)
              aj.times_equal(xk);
              aj.multiply_add(ak, xj);
              aj.times_equal(c);
              ytmp += aj.sum();
            }
          }

          y[iy] += m_a * ytmp ;
        }

      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();

    }

  }

#else

  //
  // Team functor interface with threading within PCE multiply
  //
  // Note: Remember that the matrix is currently always LayoutRight!
  //
  // This is a general, hand-vectorized version that processes multiple FEM
  // columns at a time to reduce tensor reads.  Note that auto-vectorization
  // doesn't work here because of the inner-loop over FEM columns.
  //
  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;

  KOKKOS_INLINE_FUNCTION
  void operator()( const team_member & device ) const
  {
    const size_type iBlockRow = device.league_rank();

    // Check for valid row
    const size_type row_count = m_A_graph.row_map.extent(0)-1;
    if (iBlockRow >= row_count)
      return;

    const size_type num_thread = device.team_size();
    const size_type thread_idx = device.team_rank();
    const Kokkos::pair<size_type,size_type> work_range =
      details::compute_work_range<output_scalar>(
        device, m_tensor.dimension(), num_thread, thread_idx);

    const size_type num_col = m_y.extent(2);

    // Leading dimension guaranteed contiguous for LayoutLeft
    if ( m_b == output_scalar(0) )
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = 0 ;
    else
      for (size_type col=0; col<num_col; ++col)
        for ( size_type j = work_range.first ; j < work_range.second ; ++j )
          m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;

    const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
    const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
    const size_type BlockSize = 14;
    const size_type numBlock =
      (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;

    const matrix_scalar* sh_A[BlockSize];
    const input_scalar* sh_x[BlockSize];

    size_type iBlockEntry = iBlockEntryBeg;
    for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
      const size_type block_size =
        block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;

      // Loop over columns of x, y
      for (size_type vec_col=0; vec_col<num_col; ++vec_col) {

        output_scalar * const y = & m_y( 0 , iBlockRow , vec_col );

        for ( size_type col = 0; col < block_size; ++col ) {
          const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
          sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
          sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
        }

        for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {

          const size_type nEntry = m_tensor.num_entry(iy);
          const size_type iEntryBeg = m_tensor.entry_begin(iy);
          const size_type iEntryEnd = iEntryBeg + nEntry;
                size_type iEntry    = iEntryBeg;

          output_scalar ytmp = 0 ;

          // Do entries with a blocked loop of size blocksize
          if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
            const size_type nBlock = nEntry / tensor_type::vectorsize;
            const size_type nEntryB = nBlock * tensor_type::vectorsize;
            const size_type iEnd = iEntryBeg + nEntryB;

            typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
            typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
            typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
            VecTV vy;
            vy.zero();
            for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
              const size_type *j = &m_tensor.coord(iEntry,0);
              const size_type *k = &m_tensor.coord(iEntry,1);
              ValTV c(&(m_tensor.value(iEntry)));

              for ( size_type col = 0; col < block_size; ++col ) {
                MatTV aj(sh_A[col], j), ak(sh_A[col], k);
                VecTV xj(sh_x[col], j), xk(sh_x[col], k);

                // vy += c * ( aj * xk + ak * xj)
                aj.times_equal(xk);
                aj.multiply_add(ak, xj);
                vy.multiply_add(c, aj);
              }
            }
            ytmp += vy.sum();
          }

          // Do remaining entries with a scalar loop
          for ( ; iEntry<iEntryEnd; ++iEntry) {
            const size_type j = m_tensor.coord(iEntry,0);
            const size_type k = m_tensor.coord(iEntry,1);
            tensor_scalar cijk = m_tensor.value(iEntry);

            for ( size_type col = 0; col < block_size; ++col ) {
              ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
                               sh_A[col][k] * sh_x[col][j] );
            }

          }

          y[iy] += m_a * ytmp ;
        }

      }

      // Add a team barrier to keep the thread team in-sync before going on
      // to the next block
      device.team_barrier();
    }

  }

#endif

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    // Generally the block algorithm seems to perform better on the MIC,
    // as long as the stochastic size isn't too big, but doesn't perform
    // any better on the CPU (probably because the CPU has a fat L3 cache
    // to store the sparse 3 tensor).
#ifdef __MIC__
    const bool use_block_algorithm = true;
#else
    const bool use_block_algorithm = false;
#endif

    const size_t row_count = A.graph.row_map.extent(0) - 1 ;
    if (use_block_algorithm) {
#ifdef __MIC__
      const size_t team_size = 4;  // 4 hyperthreads for MIC
#else
      const size_t team_size = 2;  // 2 for everything else
#endif
      const size_t league_size = row_count;
      Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
      Kokkos::parallel_for( config , Multiply(A,x,y,a,b) );
    }
    else {
      Kokkos::parallel_for( row_count , Multiply(A,x,y,a,b) );
    }
  }
};
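
// Note on the indexing above: for rank-2 PCE views the underlying flat
// array_type is rank 3 with the PCE coefficient first, so
// m_y(j, iBlockRow, col) is coefficient j of block row iBlockRow in
// right-hand side col, and num_col == m_y.extent(2).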

// Kernel implementing y = A * x where
//   A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
//   x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
//   x and y are rank 1
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal,
                                         MatrixDevice,
                                         MatrixMemory,
                                         MatrixSize>,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
                              InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
                              OutputP... >
                >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::const_type const_matrix_type;

  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;

  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const_matrix_type cA = A;
    Multiply< const_matrix_type, input_vector_type, output_vector_type >::apply(
      cA, x, y, a, b);
  }
};

// Kernel implementing y = A * x where
//   A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
//   x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
//   x and y are rank 2
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class Multiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                         MatrixOrdinal,
                                         MatrixDevice,
                                         MatrixMemory,
                                         MatrixSize>,
                Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
                              InputP... >,
                Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
                              OutputP... >
                >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::const_type const_matrix_type;

  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;

  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const_matrix_type cA = A;
    Multiply< const_matrix_type, input_vector_type, output_vector_type >::apply(
      cA, x, y, a, b);
  }
};

template <typename MatrixType, typename InputViewType, typename OutputViewType>
class MeanMultiply {};

// Kernel implementing y = A * x where PCE size of A is 1
//   A == KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
//   x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
//   x and y are rank 1
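//
// When every matrix entry holds a single PCE coefficient (the mean), the
// triple-product contraction collapses and each PCE coefficient of y
// becomes an independent scalar multiply (a sketch, with A0 the mean
// coefficient of each matrix entry):
//
//   y(c,i) = b * y(c,i) + a * sum over entries (i,col) of
//                               A0(entry) * x(c,col),    c = 0 .. dim-1
//
// The BlockKernel and Kernel functors below implement this, blocking over
// c so a BlockSize-wide accumulator can stay in registers.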
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal,
                                             MatrixDevice,
                                             MatrixMemory,
                                             MatrixSize>,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
                                  InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
                                  OutputP... >
                    >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename MatrixValue::ordinal_type size_type;
  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;

  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef typename MatrixValue::value_type matrix_scalar;
  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  template <int BlockSize>
  struct BlockKernel {
    typedef typename MatrixDevice::execution_space execution_space;
    typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
    typedef typename input_vector_type::array_type input_array_type;
    typedef typename output_vector_type::array_type output_array_type;

    const matrix_array_type   m_A_values ;
    const matrix_graph_type   m_A_graph ;
    const output_array_type   v_y ;
    const input_array_type    v_x ;
    const input_scalar        m_a ;
    const output_scalar       m_b ;
    const size_type           dim ;
    const size_type           numBlocks ;
    const size_type           rem ;
    const size_type           dim_block ;

    BlockKernel( const matrix_type & A ,
                 const input_vector_type & x ,
                 const output_vector_type & y ,
                 const input_scalar & a ,
                 const output_scalar & b )
      : m_A_values( A.values )
      , m_A_graph( A.graph )
      , v_y( y )
      , v_x( x )
      , m_a( a )
      , m_b( b )
      , dim( dimension_scalar(x) )
      , numBlocks( dim / BlockSize )
      , rem( dim % BlockSize )
      , dim_block( numBlocks*BlockSize )
      {}
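
    // Example of the precomputed blocking fields: with PCE dimension
    // dim == 20 and BlockSize == 8, numBlocks == 2, rem == 4, and
    // dim_block == 16, so operator() below makes two full 8-wide passes
    // plus one 4-wide remainder pass.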

    KOKKOS_INLINE_FUNCTION
    void operator()( const size_type iBlockRow ) const
    {
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
      output_scalar s[BlockSize] __attribute__((aligned(64))) = {};
#else
      output_scalar s[BlockSize] = {};
#endif

      const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
      const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];
      size_type pce_block = 0;
      for (; pce_block < dim_block; pce_block+=BlockSize) {
        output_scalar * const y = &v_y(pce_block, iBlockRow);
        if (m_b == output_scalar(0))
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < BlockSize; ++k)
            s[k] = 0.0;
        else
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < BlockSize; ++k)
            s[k] = m_b*y[k];
        for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
          const matrix_scalar aA = m_a*m_A_values(iEntry);
          const size_type col = m_A_graph.entries(iEntry);
          const input_scalar * const x = &v_x(pce_block, col);
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < BlockSize; ++k)
            s[k] += aA*x[k];
        }
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < BlockSize; ++k) {
          y[k] = s[k];
        }
      }

      // Remaining coeffs
      if (rem > 0) {
        output_scalar * const y = &v_y(pce_block, iBlockRow);
        if (m_b == output_scalar(0))
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < rem; ++k)
            s[k] = 0.0;
        else
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < rem; ++k)
            s[k] = m_b*y[k];
        for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
          const matrix_scalar aA = m_a*m_A_values(iEntry);
          const size_type col = m_A_graph.entries(iEntry);
          const input_scalar * const x = &v_x(pce_block, col);
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
          for (size_type k = 0; k < rem; ++k)
            s[k] += aA*x[k];
        }
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < rem; ++k) {
          y[k] = s[k];
        }
      }

    }

  };

  struct Kernel {
    typedef typename MatrixDevice::execution_space execution_space;
    typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
    typedef typename input_vector_type::array_type input_array_type;
    typedef typename output_vector_type::array_type output_array_type;

    const matrix_array_type   m_A_values ;
    const matrix_graph_type   m_A_graph ;
    const output_array_type   v_y ;
    const input_array_type    v_x ;
    const input_scalar        m_a ;
    const output_scalar       m_b ;
    const size_type           dim ;

    Kernel( const matrix_type & A ,
            const input_vector_type & x ,
            const output_vector_type & y ,
            const input_scalar & a ,
            const output_scalar & b )
      : m_A_values( A.values )
      , m_A_graph( A.graph )
      , v_y( y )
      , v_x( x )
      , m_a( a )
      , m_b( b )
      , dim( dimension_scalar(x) )
      {}

    KOKKOS_INLINE_FUNCTION
    void operator()( const size_type iBlockRow ) const
    {
      const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
      const size_type iEntryEnd   = m_A_graph.row_map[ iBlockRow + 1 ];
      output_scalar * const y = &v_y(0, iBlockRow);
      if (m_b == output_scalar(0))
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < dim; ++k)
          y[k] = 0.0;
      else
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < dim; ++k)
          y[k] = m_b*y[k];
      for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
        const matrix_scalar aA = m_a*m_A_values(iEntry);
        const size_type col = m_A_graph.entries(iEntry);
        const input_scalar * const x = &v_x(0, col);
#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
#pragma ivdep
//#pragma vector aligned
#endif
        for (size_type k = 0; k < dim; ++k)
          y[k] += aA*x[k];
      }
    }

  };

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const size_t row_count = A.graph.row_map.extent(0) - 1 ;
    const size_type dim = Kokkos::dimension_scalar(x);

    // Choose block size appropriately for PCE dimension
#if defined (__MIC__)
    if (dim >= 128)
      Kokkos::parallel_for( row_count , Kernel(A,x,y,a,b) );
    else if (dim >= 64)
      Kokkos::parallel_for( row_count , BlockKernel<64>(A,x,y,a,b) );
    else if (dim >= 32)
      Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
    else if (dim >= 16)
      Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
    else if (dim >= 8)
      Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
    else
      Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
#else
    if (dim >= 32)
      Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
    else if (dim >= 16)
      Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
    else if (dim >= 8)
      Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
    else
      Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
#endif
  }
};
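
// Dispatch examples for apply() above on the non-MIC path: dim == 24
// selects BlockKernel<16> and dim == 100 selects BlockKernel<32>; only
// under __MIC__ do dimensions >= 128 fall through to the unblocked Kernel.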

// Kernel implementing y = A * x where A has PCE size = 1
//   A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
//   x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
//   x and y are rank 2
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal,
                                             MatrixDevice,
                                             MatrixMemory,
                                             MatrixSize>,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
                                  InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
                                  OutputP... >
                    >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< const MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;

  typedef typename MatrixDevice::execution_space execution_space;
  typedef typename MatrixValue::ordinal_type size_type;
  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    typedef Kokkos::View< const InputVectorValue*,
                          InputP... > input_vector_1d_type;
    typedef Kokkos::View< OutputVectorValue*,
                          OutputP... > output_vector_1d_type;
    typedef MeanMultiply<matrix_type, input_vector_1d_type,
                         output_vector_1d_type> MeanMultiply1D;
    const size_type num_col = x.extent(1);
    for (size_type i=0; i<num_col; ++i) {
      input_vector_1d_type x_col =
        Kokkos::subview(x, Kokkos::ALL(), i);
      output_vector_1d_type y_col =
        Kokkos::subview(y, Kokkos::ALL(), i);
      MeanMultiply1D::apply( A, x_col, y_col, a, b );
    }
  }
};
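
// The rank-2 mean multiply above peels off one right-hand side at a time:
// each Kokkos::subview(x, Kokkos::ALL(), i) is a rank-1 PCE view, so the
// rank-1 MeanMultiply kernel is reused per column at the cost of one
// parallel_for launch per right-hand side.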

// Kernel implementing y = A * x where PCE size of A is 1
//   A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
//   x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
//   x and y are rank 1
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal,
                                             MatrixDevice,
                                             MatrixMemory,
                                             MatrixSize>,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
                                  InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
                                  OutputP... >
                    >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::const_type const_matrix_type;

  typedef Kokkos::View< const InputVectorValue*,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue*,
                        OutputP... > output_vector_type;

  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const_matrix_type cA = A;
    MeanMultiply< const_matrix_type, input_vector_type, output_vector_type >::apply(
      cA, x, y, a, b);
  }
};

// Kernel implementing y = A * x where PCE size of A is 1
//   A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
//   x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
//   x and y are rank 2
template <typename MatrixDevice,
          typename MatrixStorage,
          typename MatrixOrdinal,
          typename MatrixMemory,
          typename MatrixSize,
          typename InputStorage,
          typename ... InputP,
          typename OutputStorage,
          typename ... OutputP>
class MeanMultiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
                                             MatrixOrdinal,
                                             MatrixDevice,
                                             MatrixMemory,
                                             MatrixSize>,
                    Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
                                  InputP... >,
                    Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
                                  OutputP... >
                    >
{
public:
  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;

  typedef KokkosSparse::CrsMatrix< MatrixValue,
                                   MatrixOrdinal,
                                   MatrixDevice,
                                   MatrixMemory,
                                   MatrixSize> matrix_type;
  typedef typename matrix_type::const_type const_matrix_type;

  typedef Kokkos::View< const InputVectorValue**,
                        InputP... > input_vector_type;
  typedef Kokkos::View< OutputVectorValue**,
                        OutputP... > output_vector_type;

  typedef typename InputVectorValue::value_type input_scalar;
  typedef typename OutputVectorValue::value_type output_scalar;

  static void apply( const matrix_type & A ,
                     const input_vector_type & x ,
                     const output_vector_type & y ,
                     const input_scalar & a = input_scalar(1) ,
                     const output_scalar & b = output_scalar(0) )
  {
    const_matrix_type cA = A;
    MeanMultiply< const_matrix_type, input_vector_type, output_vector_type >::apply(
      cA, x, y, a, b);
  }
};

} // namespace Stokhos

namespace KokkosSparse {

template <typename AlphaType,
          typename BetaType,
          typename MatrixType,
          typename InputType,
          typename ... InputP,
          typename OutputType,
          typename ... OutputP>
typename std::enable_if<
  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value
  >::type
spmv(
  const char mode[],
  const AlphaType& a,
  const MatrixType& A,
  const Kokkos::View< InputType, InputP... >& x,
  const BetaType& b,
  const Kokkos::View< OutputType, OutputP... >& y,
  const RANK_ONE)
{
  typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
  typedef Kokkos::View< InputType, InputP... > InputVectorType;
  typedef Stokhos::Multiply<MatrixType, typename InputVectorType::const_type,
                            OutputVectorType> multiply_type;
  typedef Stokhos::MeanMultiply<MatrixType, typename InputVectorType::const_type,
                                OutputVectorType> mean_multiply_type;

  if(mode[0]!='N') {
    Kokkos::Impl::raise_error(
      "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
  }

  if (!Sacado::is_constant(a) || !Sacado::is_constant(b)) {
    Kokkos::Impl::raise_error(
      "Stokhos spmv not implemented for non-constant a or b");
  }
  if (dimension_scalar(A.values) == 1 && dimension_scalar(x) != 1) {
    mean_multiply_type::apply( A, x, y,
                               Sacado::Value<AlphaType>::eval(a),
                               Sacado::Value<BetaType>::eval(b) );
  }
  else
    multiply_type::apply( A, x, y,
                          Sacado::Value<AlphaType>::eval(a),
                          Sacado::Value<BetaType>::eval(b) );
}
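
// Usage sketch for the spmv() overloads in this namespace (illustrative;
// assumes A, x, and y are built on Sacado::UQ::PCE scalar types sharing a
// consistent cijk tensor):
//
//   KokkosSparse::spmv("N", a, A, x, b, y);   // y = b*y + a*A*x
//
// This UQ::PCE path rejects transpose/conjugate modes and expansion-valued
// a or b, and routes matrices holding a single PCE coefficient (applied to
// a multi-coefficient x) to MeanMultiply; everything else goes to Multiply.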

template <typename AlphaType,
          typename BetaType,
          typename MatrixType,
          typename InputType,
          typename ... InputP,
          typename OutputType,
          typename ... OutputP>
typename std::enable_if<
  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value
  >::type
spmv(
  KokkosKernels::Experimental::Controls,
  const char mode[],
  const AlphaType& a,
  const MatrixType& A,
  const Kokkos::View< InputType, InputP... >& x,
  const BetaType& b,
  const Kokkos::View< OutputType, OutputP... >& y,
  const RANK_ONE)
{
  spmv(mode, a, A, x, b, y, RANK_ONE());
}

template <typename AlphaType,
          typename BetaType,
          typename MatrixType,
          typename InputType,
          typename ... InputP,
          typename OutputType,
          typename ... OutputP>
typename std::enable_if<
  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value
  >::type
spmv(
  const char mode[],
  const AlphaType& a,
  const MatrixType& A,
  const Kokkos::View< InputType, InputP... >& x,
  const BetaType& b,
  const Kokkos::View< OutputType, OutputP... >& y,
  const RANK_TWO)
{
  if(mode[0]!='N') {
    Kokkos::Impl::raise_error(
      "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
  }
  if (y.extent(1) == 1) {
    auto y_1D = subview(y, Kokkos::ALL(), 0);
    auto x_1D = subview(x, Kokkos::ALL(), 0);
    spmv(mode, a, A, x_1D, b, y_1D, RANK_ONE());
  }
  else {
    typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
    typedef Kokkos::View< InputType, InputP... > InputVectorType;

    typedef Stokhos::Multiply<MatrixType, typename InputVectorType::const_type,
                              OutputVectorType> multiply_type;
    typedef Stokhos::MeanMultiply<MatrixType, typename InputVectorType::const_type,
                                  OutputVectorType> mean_multiply_type;

    if (!Sacado::is_constant(a) || !Sacado::is_constant(b)) {
      Kokkos::Impl::raise_error(
        "Stokhos spmv not implemented for non-constant a or b");
    }

    typename InputVectorType::const_type x_const = x;

    if (dimension_scalar(A.values) == 1 && dimension_scalar(x) != 1) {
      mean_multiply_type::apply( A, x_const, y,
                                 Sacado::Value<AlphaType>::eval(a),
                                 Sacado::Value<BetaType>::eval(b));
    }
    else
      multiply_type::apply( A, x_const, y,
                            Sacado::Value<AlphaType>::eval(a),
                            Sacado::Value<BetaType>::eval(b));
  }
}

template <typename AlphaType,
          typename BetaType,
          typename MatrixType,
          typename InputType,
          typename ... InputP,
          typename OutputType,
          typename ... OutputP>
typename std::enable_if<
  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value
  >::type
spmv(
  KokkosKernels::Experimental::Controls,
  const char mode[],
  const AlphaType& a,
  const MatrixType& A,
  const Kokkos::View< InputType, InputP... >& x,
  const BetaType& b,
  const Kokkos::View< OutputType, OutputP... >& y,
  const RANK_TWO)
{
  spmv(mode, a, A, x, b, y, RANK_TWO());
}

} // namespace KokkosSparse

#endif /* #ifndef KOKKOS_CRSMATRIX_UQ_PCE_HPP */