#include <cusp/detail/device/arch.h>
#include <cusp/detail/device/common.h>
#include <cusp/detail/device/utils.h>
#include <cusp/detail/device/texture.h>
#include <thrust/device_ptr.h>
#include <cudaProfiler.h>
#include <cuda_profiler_api.h>
#include <algorithm>

#include "Stokhos_config.h"
#if 0 && defined(HAVE_STOKHOS_CUSPARSE)
#define USE_CUSPARSE_ROW 0
#define USE_CUSPARSE_COL 1
#else
#define USE_CUSPARSE_ROW 0
#define USE_CUSPARSE_COL 0
#endif
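// USE_CUSPARSE_ROW / USE_CUSPARSE_COL select the cuSPARSE csrmm paths for
// row- and column-major multivectors; when both are 0 the hand-written
// kernels below are used.  Note the '#if 0 &&' above: it currently disables
// the cuSPARSE path even when HAVE_STOKHOS_CUSPARSE is defined.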
#if USE_CUSPARSE_ROW || USE_CUSPARSE_COL
#include <stdexcept>
#include <string>
#include <cuda_runtime.h>
#include <cusparse.h>
#endif

namespace cusp {
namespace detail {
namespace device {
#if USE_CUSPARSE_ROW || USE_CUSPARSE_COL

// Singleton holding the cuSPARSE library handle and a general, zero-based
// matrix descriptor shared by all cuSPARSE calls in this file.
class CudaSparseSingleton {
public:

  cusparseStatus_t   status;
  cusparseHandle_t   handle;
  cusparseMatDescr_t descra;

  static CudaSparseSingleton & singleton();

private:

  CudaSparseSingleton()
  {
    status = cusparseCreate(&handle);
    if (status != CUSPARSE_STATUS_SUCCESS)
    {
      throw std::runtime_error( std::string(
        "ERROR - CUSPARSE Library Initialization failed" ) );
    }

    status = cusparseCreateMatDescr(&descra);
    if (status != CUSPARSE_STATUS_SUCCESS)
    {
      throw std::runtime_error( std::string(
        "ERROR - CUSPARSE Library Matrix descriptor failed" ) );
    }

    cusparseSetMatType(descra, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descra, CUSPARSE_INDEX_BASE_ZERO);
  }

  // Not copyable: declared but intentionally not defined
  CudaSparseSingleton( const CudaSparseSingleton & );
  CudaSparseSingleton & operator = ( const CudaSparseSingleton & );
};

CudaSparseSingleton & CudaSparseSingleton::singleton()
{
  static CudaSparseSingleton s; return s;
}

#endif
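// Illustrative use of the singleton (sketch): the cuSPARSE wrappers below do
//
//   CudaSparseSingleton & s = CudaSparseSingleton::singleton();
//   ... cusparseDcsrmm(s.handle, ..., s.descra, ...);
//
// The function-local static is constructed on first use, so cusparseCreate()
// runs exactly once per process.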
#if USE_CUSPARSE_ROW

void __spmm_csr_vector(
  const csr_matrix<int,double,device_memory>& A,
  const array2d<double, device_memory, column_major>& x,
  array2d<double, device_memory, column_major>& y,
  cusp::row_major)
{
  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
  const double alpha = 1, beta = 0;

  cusparseStatus_t status =
    cusparseDcsrmm2(s.handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    A.num_rows, x.num_cols, A.num_cols, A.num_entries,
                    &alpha,
                    s.descra,
                    thrust::raw_pointer_cast(&A.values[0]),
                    thrust::raw_pointer_cast(&A.row_offsets[0]),
                    thrust::raw_pointer_cast(&A.column_indices[0]),
                    thrust::raw_pointer_cast(&(x.values)[0]),
                    x.num_cols,  // ldb
                    &beta,
                    thrust::raw_pointer_cast(&(y.values)[0]),
                    y.num_rows); // ldc

  if ( CUSPARSE_STATUS_SUCCESS != status ) {
    throw std::runtime_error( std::string( "ERROR - cusparseDcsrmm2" ) );
  }
}
#else

// One block processes blockDim.y consecutive rows of A: threadIdx.y selects
// the row and threadIdx.x strides across the columns of x.  Chunks of at
// most MAX_NNZ_PER_ROW entries of each sparse row are staged in shared
// memory and reused for every column of x.  x and y are stored row-major.
// The threads cooperating on one row share a (half-)warp, so the kernel
// relies on warp-synchronous execution (note the volatile shared arrays)
// rather than __syncthreads().
template <typename IndexType,
          typename ValueType,
          unsigned MAX_NNZ_PER_ROW>
__global__ void
spmm_csr_vector_kernel_row(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ar,
                           const IndexType * Ac,
                           const ValueType * Aval,
                           const ValueType * x,
                           ValueType * y)
{
  // Values first, then column indices; entry i of the chunk for row-slot
  // threadIdx.y lives at offset i*blockDim.y + threadIdx.y
  extern __shared__ int sh[];
  volatile ValueType * const sh_Aval = (ValueType*) sh;
  volatile IndexType * const sh_Ac =
    (IndexType*) (sh_Aval + MAX_NNZ_PER_ROW * blockDim.y);

  const IndexType row = threadIdx.y + blockDim.y * blockIdx.x;
  if (row < Anum_rows) {
    const IndexType row_start = Ar[row];
    const IndexType row_end   = Ar[row+1];

    // Initialize this row of y
    for (IndexType j = threadIdx.x; j < xnum_cols; j += blockDim.x) {
      y[j+xnum_cols*row] = 0.0;
    }

    // Process the sparse row in chunks of at most MAX_NNZ_PER_ROW entries
    for (IndexType block_col = row_start; block_col < row_end;
         block_col += MAX_NNZ_PER_ROW) {
      const IndexType r = block_col + MAX_NNZ_PER_ROW < row_end ?
        MAX_NNZ_PER_ROW : row_end-block_col;

      // Stage the chunk in shared memory
      for (IndexType i = threadIdx.x; i < r; i += blockDim.x) {
        sh_Aval[i*blockDim.y+threadIdx.y] = Aval[i+block_col];
        sh_Ac  [i*blockDim.y+threadIdx.y] = Ac  [i+block_col];
      }

      // Accumulate the chunk's contribution to each column of this row of y
      for (IndexType j = threadIdx.x; j < xnum_cols; j += blockDim.x) {
        ValueType sum = 0.0;
        for (IndexType jj = 0; jj < r; jj++) {
          const IndexType J = sh_Ac[jj*blockDim.y+threadIdx.y];
          sum += sh_Aval[jj*blockDim.y+threadIdx.y] * x[j+xnum_cols*J];
        }
        y[j+xnum_cols*row] += sum;
      }
    }
  }
}
template <typename Matrix,
          typename Vector2,
          typename Vector3>
void __spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y,
                       cusp::row_major)
{
  typedef typename Matrix::index_type IndexType;
  typedef typename Matrix::value_type ValueType;
  typedef typename Matrix::memory_space MemorySpace;

  // 16 threads across the columns of x, 8 rows of A per block, and chunks
  // of at most 32 sparse-row entries staged in shared memory
  const unsigned MAX_NNZ_PER_ROW = 32;
  const size_t COLS_PER_BLOCK = 16;
  const size_t ROWS_PER_BLOCK = 8;
  const size_t shared =
    ROWS_PER_BLOCK * MAX_NNZ_PER_ROW * (sizeof(IndexType) + sizeof(ValueType));
  const size_t NUM_BLOCKS = (A.num_rows + ROWS_PER_BLOCK-1) / ROWS_PER_BLOCK;

  dim3 block(COLS_PER_BLOCK, ROWS_PER_BLOCK, 1);
  dim3 grid(NUM_BLOCKS, 1);

  spmm_csr_vector_kernel_row<IndexType, ValueType, MAX_NNZ_PER_ROW>
    <<<grid, block, shared>>>
    (A.num_rows, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&A.row_offsets[0]),
     thrust::raw_pointer_cast(&A.column_indices[0]),
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&(x.values)[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}
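// Dynamic shared memory for the launch above, with IndexType = int and
// ValueType = double: 8 rows/block * 32 entries * (4 + 8) bytes = 3072 bytes.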
#endif

#if USE_CUSPARSE_COL

void __spmm_csr_vector(
  const csr_matrix<int,double,device_memory>& A,
  const array2d<double, device_memory, column_major>& x,
  array2d<double, device_memory, column_major>& y,
  cusp::column_major)
{
  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
  const double alpha = 1, beta = 0;

  cusparseStatus_t status =
    cusparseDcsrmm(s.handle,
                   CUSPARSE_OPERATION_NON_TRANSPOSE,
                   A.num_rows, x.num_cols, A.num_cols, A.num_entries,
                   &alpha,
                   s.descra,
                   thrust::raw_pointer_cast(&A.values[0]),
                   thrust::raw_pointer_cast(&A.row_offsets[0]),
                   thrust::raw_pointer_cast(&A.column_indices[0]),
                   thrust::raw_pointer_cast(&(x.values)[0]),
                   x.num_rows,  // ldb
                   &beta,
                   thrust::raw_pointer_cast(&(y.values)[0]),
                   y.num_rows); // ldc

  if ( CUSPARSE_STATUS_SUCCESS != status ) {
    throw std::runtime_error( std::string( "ERROR - cusparseDcsrmm" ) );
  }
}
#else

// CUSP-style "vector" kernel: each row of A is processed by a vector of
// THREADS_PER_VECTOR threads, and a block covers VECTORS_PER_BLOCK rows at
// a time.  x and y are stored column-major.
template <typename IndexType, typename ValueType,
          unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
__global__ void
spmm_csr_vector_kernel_col(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ap,
                           const IndexType * Aj,
                           const ValueType * Ax,
                           const ValueType * x,
                           ValueType * y)
{
  // Padded by THREADS_PER_VECTOR/2 so the reduction needs no bounds checks
  __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];
  __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];

  const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;
  const IndexType thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index
  const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);       // index within the vector
  const IndexType vector_id   = thread_id / THREADS_PER_VECTOR;               // global vector index
  const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR;             // vector index within the block
  const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x;                // total number of active vectors

  for (IndexType row = vector_id; row < Anum_rows; row += num_vectors) {
    // Use two threads to fetch Ap[row] and Ap[row+1]
    if (thread_lane < 2)
      ptrs[vector_lane][thread_lane] = Ap[row + thread_lane];
    const IndexType row_start = ptrs[vector_lane][0]; // same as Ap[row]
    const IndexType row_end   = ptrs[vector_lane][1]; // same as Ap[row+1]

    // Loop over the columns of x
    for (IndexType j = 0; j < xnum_cols; j++) {
      ValueType sum = 0;

      if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) {
        // Ensure aligned memory access to Aj and Ax
        IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;

        // Accumulate local sums
        if (jj >= row_start && jj < row_end)
          sum += Ax[jj] * x[j*xnum_rows+Aj[jj]];
        for (jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Ax[jj] * x[j*xnum_rows+Aj[jj]];
      }
      else {
        // Accumulate local sums
        for (IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Ax[jj] * x[j*xnum_rows+Aj[jj]];
      }

      // Store local sum in shared memory and reduce to the row sum
      sdata[threadIdx.x] = sum;
      if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
      if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
      if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
      if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
      if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];

      // First thread of each vector writes the result
      if (thread_lane == 0)
        y[j*Anum_rows+row] = sdata[threadIdx.x];
    }
  }
}
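// Example of the sdata padding: with THREADS_PER_VECTOR = 32 and
// VECTORS_PER_BLOCK = 4, sdata holds 4*32 + 16 entries, so the first
// reduction step may read sdata[threadIdx.x + 16] for every thread without
// going out of bounds.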
// Variant of the kernel above that stages each sparse row (up to 32 entries
// per vector) in dynamic shared memory and reuses it for every column of x.
// It has the same signature as the kernel above, and the launch below passes
// 0 bytes of dynamic shared memory, so this variant is compiled out here.
#if 0
template <typename IndexType, typename ValueType,
          unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
__global__ void
spmm_csr_vector_kernel_col(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ar,
                           const IndexType * Ac,
                           const ValueType * Aval,
                           const ValueType * x,
                           ValueType * y)
{
  __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];
  __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];

  // Dynamic shared storage: one 32-entry segment of values and of column
  // indices per vector
  extern __shared__ int sha[];
  ValueType * const sh_Aval = (ValueType*) sha;
  IndexType * const sh_Ac   = (IndexType*) (sh_Aval + 32 * VECTORS_PER_BLOCK);

  const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;
  const IndexType thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;
  const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);
  const IndexType vector_id   = thread_id / THREADS_PER_VECTOR;
  const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR;
  const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x;

  for (IndexType row = vector_id; row < Anum_rows; row += num_vectors) {
    // Use two threads to fetch Ar[row] and Ar[row+1]
    if (thread_lane < 2)
      ptrs[vector_lane][thread_lane] = Ar[row + thread_lane];
    const IndexType row_start = ptrs[vector_lane][0];
    const IndexType row_end   = ptrs[vector_lane][1];
    const IndexType r = row_end - row_start;

    // Stage this row of A in shared memory
    for (IndexType i = thread_lane; i < r; i += THREADS_PER_VECTOR) {
      sh_Aval[vector_lane*32+i] = Aval[row_start+i];
      sh_Ac  [vector_lane*32+i] = Ac  [row_start+i];
    }

    // Loop over the columns of x
    for (IndexType j = 0; j < xnum_cols; j++) {
      ValueType sum = 0;

      if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32) {
        // Long rows: read A from global memory with aligned access
        IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;
        if (jj >= row_start && jj < row_end)
          sum += Aval[jj] * x[j*xnum_rows+Ac[jj]];
        for (jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Aval[jj] * x[j*xnum_rows+Ac[jj]];
      }
      else {
        // Short rows: use the staged copy
        for (IndexType jj = thread_lane; jj < r; jj += THREADS_PER_VECTOR)
          sum += sh_Aval[vector_lane*32+jj] * x[j*xnum_rows+sh_Ac[vector_lane*32+jj]];
      }

      // Store local sum in shared memory and reduce to the row sum
      sdata[threadIdx.x] = sum;
      if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
      if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
      if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
      if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
      if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];

      // First thread of each vector writes the result
      if (thread_lane == 0)
        y[j*Anum_rows+row] = sdata[threadIdx.x];
    }
  }
}
#endif
template <bool UseCache,
          unsigned int THREADS_PER_VECTOR,
          typename Matrix,
          typename Vector2,
          typename Vector3>
void __spmm_csr_vector_col(const Matrix& A, const Vector2& x, Vector3& y)
{
  typedef typename Matrix::index_type IndexType;
  typedef typename Matrix::value_type ValueType;

  const size_t THREADS_PER_BLOCK = 128;
  const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
  const size_t SHARED = 0;

  // Size the grid to the occupancy limit, but no larger than needed
  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(
    spmm_csr_vector_kernel_col<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR>,
    THREADS_PER_BLOCK, SHARED);
  const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, VECTORS_PER_BLOCK));

  spmm_csr_vector_kernel_col<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR>
    <<<NUM_BLOCKS, THREADS_PER_BLOCK, SHARED>>>
    (A.num_rows, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&A.row_offsets[0]),
     thrust::raw_pointer_cast(&A.column_indices[0]),
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&(x.values)[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}
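// Grid-sizing example (illustrative): THREADS_PER_VECTOR = 4 gives
// VECTORS_PER_BLOCK = 128/4 = 32 rows per block; a 10,000-row matrix then
// wants DIVIDE_INTO(10000,32) = 313 blocks, clamped to the occupancy limit
// reported by max_active_blocks.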
template <typename Matrix,
          typename Vector2,
          typename Vector3>
void __spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y,
                       cusp::column_major)
{
#if 0
  // Multiply one column at a time
  typedef typename Vector2::index_type IndexType;
  for (IndexType col=0; col<x.num_cols; ++col) {
    multiply(A, x.column(col), y.column(col));
  }
#else
  typedef typename Matrix::index_type IndexType;

  // Pick THREADS_PER_VECTOR from the average number of nonzeros per row
  const IndexType nnz_per_row = A.num_entries / A.num_rows;

  if (nnz_per_row <=  2) { __spmm_csr_vector_col<false, 2>(A, x, y); return; }
  if (nnz_per_row <=  4) { __spmm_csr_vector_col<false, 4>(A, x, y); return; }
  if (nnz_per_row <=  8) { __spmm_csr_vector_col<false, 8>(A, x, y); return; }
  if (nnz_per_row <= 16) { __spmm_csr_vector_col<false,16>(A, x, y); return; }

  __spmm_csr_vector_col<false,32>(A, x, y);
#endif
}

#endif
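// E.g. a matrix averaging 7 nonzeros per row takes the nnz_per_row <= 8
// branch and is processed with 8 threads per row.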
template <typename Matrix,
          typename Vector2,
          typename Vector3>
void spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y)
{
  y.resize(A.num_rows, x.num_cols);

  // Dispatch on the storage order of the multivector
  __spmm_csr_vector(A, x, y, typename Vector2::orientation());
}

} // end namespace device
} // end namespace detail
} // end namespace cusp
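// Typical use (sketch, assuming the standard CUSP containers):
//
//   cusp::csr_matrix<int, double, cusp::device_memory> A;          // m x n
//   cusp::array2d<double, cusp::device_memory, cusp::column_major>
//     x(n, k), y;
//   cusp::detail::device::spmm_csr_vector(A, x, y);  // resizes y to m x k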