/* Source: hpddm.cu — PETSc KSPHPDDM CUDA support (mixed-precision staging for HPDDM's iterative solvers) */

  1: #define HPDDM_MIXED_PRECISION 1
  2: #include <petsc/private/petschpddm.h>
  3: #include <petscdevice_cuda.h>
  4: #include <thrust/device_ptr.h>
  5: #include <thrust/copy.h>

/*
  KSPSolve_HPDDM_CUDA_Private - Drives HPDDM::IterativeMethod::solve() when the right-hand side b
  and the solution x live in device (GPU) memory.

  Since HPDDM's solver currently only accepts host pointers (see the TODO below), this routine
  stages b and x on the host, solves there, and copies the solution back to the device.
  Two paths:
    - mixed precision (data->precision != PETSC_SCALAR_PRECISION): b and x are first converted
      on the device from PetscScalar to the companion precision K via thrust, then staged;
    - matching precision: b and x are staged to the host directly with cudaMemcpy.

  Input parameters:
    data - KSP_HPDDM context (provides the operator data->op and the requested precision)
    b    - device pointer to the right-hand side(s), n * getDof() entries
    n    - number of right-hand sides (columns)
    comm - communicator for the (parallel) solve

  Input/output parameter:
    x    - device pointer to the initial guess on entry, the solution on exit
*/
PetscErrorCode KSPSolve_HPDDM_CUDA_Private(KSP_HPDDM *data, const PetscScalar *b, PetscScalar *x, PetscInt n, MPI_Comm comm)
{
  const PetscInt N = data->op->getDof() * n; /* total number of entries per vector block: dof count times number of right-hand sides */
#if PetscDefined(USE_REAL_DOUBLE)
  typedef HPDDM::downscaled_type<PetscScalar> K; /* with double scalars, K is the lower-precision companion type used for the mixed solve */
#endif
#if PetscDefined(USE_REAL_SINGLE)
  typedef HPDDM::upscaled_type<PetscScalar> K; /* with single scalars, K is the higher-precision companion type used for the mixed solve */
#endif

  PetscFunctionBegin; // TODO: remove all cudaMemcpy() once HPDDM::IterativeMethod::solve() handles device pointers
  if (data->precision != PETSC_SCALAR_PRECISION) {
    /* mixed-precision path: convert PetscScalar -> K on the device, then stage both vectors on the host */
    const thrust::device_ptr<const PetscScalar> db = thrust::device_pointer_cast(b);
    const thrust::device_ptr<PetscScalar>       dx = thrust::device_pointer_cast(x);
    K                                          *ptr, *host_ptr; /* device and host staging buffers, each holding [b | x] contiguously */
    thrust::device_ptr<K>                       dptr[2];

    PetscCall(PetscMalloc1(2 * N, &host_ptr));
    PetscCallCUDA(cudaMalloc((void **)&ptr, 2 * N * sizeof(K)));
    dptr[0] = thrust::device_pointer_cast(ptr);     /* first half: b in precision K */
    dptr[1] = thrust::device_pointer_cast(ptr + N); /* second half: x in precision K */
    /* element-wise converting copies (PetscScalar -> K), enqueued on PETSc's stream */
    thrust::copy_n(thrust::cuda::par.on(PetscDefaultCudaStream), db, N, dptr[0]);
    thrust::copy_n(thrust::cuda::par.on(PetscDefaultCudaStream), dx, N, dptr[1]);
    /* NOTE(review): the blocking cudaMemcpy below is assumed to order after the thrust copies on
       PetscDefaultCudaStream (holds when that stream has legacy default-stream semantics) — confirm */
    PetscCallCUDA(cudaMemcpy(host_ptr, ptr, 2 * N * sizeof(K), cudaMemcpyDeviceToHost));
#if PetscDefined(USE_COMPLEX)
    /* reinterpret thrust::complex<> as std::complex<> so that HPDDM deduces a type with BLAS/LAPACK and MPI support */
    std::complex<HPDDM::underlying_type<K>> *const hb = reinterpret_cast<std::complex<HPDDM::underlying_type<K>> *>(host_ptr);
    PetscCall(HPDDM::IterativeMethod::solve(*data->op, hb, hb + N, n, comm));
#else
    PetscCall(HPDDM::IterativeMethod::solve(*data->op, host_ptr, host_ptr + N, n, comm));
#endif
    /* only the solution half needs to go back to the device; b is unchanged by the solve */
    PetscCallCUDA(cudaMemcpy(ptr + N, host_ptr + N, N * sizeof(K), cudaMemcpyHostToDevice));
    /* converting copy back (K -> PetscScalar) into the caller's x */
    thrust::copy_n(thrust::cuda::par.on(PetscDefaultCudaStream), dptr[1], N, dx);
    /* cudaFree() synchronizes with outstanding work, so the async copy above completes before ptr is released */
    PetscCallCUDA(cudaFree(ptr));
    PetscCall(PetscFree(host_ptr));
    /* log the traffic: 2N entries staged down (b and x), N entries (solution) sent back up */
    PetscCall(PetscLogGpuToCpu(2 * N * sizeof(K)));
    PetscCall(PetscLogCpuToGpu(N * sizeof(K)));
  } else {
    /* matching-precision path: no conversion needed, stage b and x on the host directly */
    PetscScalar *host_ptr; /* host staging buffer holding [b | x] contiguously */

    PetscCall(PetscMalloc1(2 * N, &host_ptr));
    PetscCallCUDA(cudaMemcpy(host_ptr, b, N * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(host_ptr + N, x, N * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
#if PetscDefined(USE_COMPLEX)
    /* same reinterpretation trick as above: present std::complex<> to HPDDM for BLAS/LAPACK/MPI support */
    std::complex<PetscReal> *const hb = reinterpret_cast<std::complex<PetscReal> *>(host_ptr);
    PetscCall(HPDDM::IterativeMethod::solve(*data->op, hb, hb + N, n, comm));
#else
    PetscCall(HPDDM::IterativeMethod::solve(*data->op, host_ptr, host_ptr + N, n, comm));
#endif
    /* copy only the solution back; b on the device was never modified */
    PetscCallCUDA(cudaMemcpy(x, host_ptr + N, N * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(PetscFree(host_ptr));
    /* log the traffic: 2N entries staged down (b and x), N entries (solution) sent back up */
    PetscCall(PetscLogGpuToCpu(2 * N * sizeof(PetscScalar)));
    PetscCall(PetscLogCpuToGpu(N * sizeof(PetscScalar)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}