#pragma once

#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
#include <petsc/private/matimpl.h> /*I <petscmat.h> I*/

#ifdef __cplusplus
#include <petsc/private/deviceimpl.h>
#include <petsc/private/cupmsolverinterface.hpp>
#include <petsc/private/cupmobject.hpp>

#include "../src/sys/objects/device/impls/cupm/cupmthrustutility.hpp"
#include "../src/sys/objects/device/impls/cupm/kernels.hpp"

#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/transform.h>
#include <thrust/copy.h>

namespace Petsc
{

namespace vec
{

namespace cupm
{

namespace impl
{

// Forward declarations of the CUPM vector implementations. Only needed here so that
// MATDENSECUPM_BASE_HEADER below can alias them for use by deriving matrix classes.
template <device::cupm::DeviceType>
class VecSeq_CUPM;
template <device::cupm::DeviceType>
class VecMPI_CUPM;

} // namespace impl

} // namespace cupm

} // namespace vec

namespace mat
{

namespace cupm
{

namespace impl
{

// ==========================================================================================
// MatDense_CUPM_Base
//
// A base class to separate out the CRTP code from the common CUPM stuff (like the composed
// function names).
// ==========================================================================================

template <device::cupm::DeviceType T>
class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_CUPM_Base : protected device::cupm::impl::CUPMObject<T> {
public:
  PETSC_CUPMOBJECT_HEADER(T);

  // Declares a static constexpr member MatDenseCUPM<OP_NAME>() returning the composed
  // function name string (as used with PetscObjectComposeFunction()), selecting the
  // "MatDenseCUDA..." or "MatDenseHIP..." spelling based on the device type T.
#define MatDenseCUPMComposedOpDecl(OP_NAME) \
  PETSC_NODISCARD static constexpr const char *PetscConcat(MatDenseCUPM, OP_NAME)() noexcept \
  { \
    return T == device::cupm::DeviceType::CUDA ? PetscStringize(PetscConcat(MatDenseCUDA, OP_NAME)) : PetscStringize(PetscConcat(MatDenseHIP, OP_NAME)); \
  }

  // clang-format off
  MatDenseCUPMComposedOpDecl(GetArray_C)
  MatDenseCUPMComposedOpDecl(GetArrayRead_C)
  MatDenseCUPMComposedOpDecl(GetArrayWrite_C)
  MatDenseCUPMComposedOpDecl(RestoreArray_C)
  MatDenseCUPMComposedOpDecl(RestoreArrayRead_C)
  MatDenseCUPMComposedOpDecl(RestoreArrayWrite_C)
  MatDenseCUPMComposedOpDecl(PlaceArray_C)
  MatDenseCUPMComposedOpDecl(ReplaceArray_C)
  MatDenseCUPMComposedOpDecl(ResetArray_C)
  MatDenseCUPMComposedOpDecl(SetPreallocation_C)
  // clang-format on

#undef MatDenseCUPMComposedOpDecl

  // Device-flavored matrix type names (MATSEQDENSECUDA vs MATSEQDENSEHIP, etc.) and the
  // matching matrix solver package name, selected at compile time from T
  PETSC_NODISCARD static constexpr MatType       MATSEQDENSECUPM() noexcept;
  PETSC_NODISCARD static constexpr MatType       MATMPIDENSECUPM() noexcept;
  PETSC_NODISCARD static constexpr MatType       MATDENSECUPM() noexcept;
  PETSC_NODISCARD static constexpr MatSolverType MATSOLVERCUPM() noexcept;
};

// ==========================================================================================
// MatDense_CUPM_Base -- Public API
// ==========================================================================================

// Sequential dense matrix type name for this device backend
template <device::cupm::DeviceType T>
inline constexpr MatType MatDense_CUPM_Base<T>::MATSEQDENSECUPM() noexcept
{
  return T == device::cupm::DeviceType::CUDA ? MATSEQDENSECUDA : MATSEQDENSEHIP;
}

// Parallel (MPI) dense matrix type name for this device backend
template <device::cupm::DeviceType T>
inline constexpr MatType MatDense_CUPM_Base<T>::MATMPIDENSECUPM() noexcept
{
  return T == device::cupm::DeviceType::CUDA ? MATMPIDENSECUDA : MATMPIDENSEHIP;
}

// Generic (sequential-or-parallel) dense matrix type name for this device backend
template <device::cupm::DeviceType T>
inline constexpr MatType MatDense_CUPM_Base<T>::MATDENSECUPM() noexcept
{
  return T == device::cupm::DeviceType::CUDA ? MATDENSECUDA : MATDENSEHIP;
}

// Matrix solver package name for this device backend
template <device::cupm::DeviceType T>
inline constexpr MatSolverType MatDense_CUPM_Base<T>::MATSOLVERCUPM() noexcept
{
  return T == device::cupm::DeviceType::CUDA ? MATSOLVERCUDA : MATSOLVERHIP;
}

// Imports the MatDense_CUPM_Base<T> API (composed-function name getters, type-name getters)
// and the CUPM vector implementation aliases into a deriving class
#define MATDENSECUPM_BASE_HEADER(T) \
  PETSC_CUPMOBJECT_HEADER(T); \
  using VecSeq_CUPM = ::Petsc::vec::cupm::impl::VecSeq_CUPM<T>; \
  using VecMPI_CUPM = ::Petsc::vec::cupm::impl::VecMPI_CUPM<T>; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATSEQDENSECUPM; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATMPIDENSECUPM; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATDENSECUPM; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATSOLVERCUPM; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArray_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayRead_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayWrite_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArray_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayRead_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayWrite_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMPlaceArray_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMReplaceArray_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMResetArray_C; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMSetPreallocation_C

// forward declare
template <device::cupm::DeviceType>
class MatDense_Seq_CUPM;
template <device::cupm::DeviceType>
class MatDense_MPI_CUPM;

// ==========================================================================================
// MatDense_CUPM
//
// The true "base" class for
MatDenseCUPM. The reason MatDense_CUPM and MatDense_CUPM_Base 150 // exist is to separate out the CRTP code from the non-crtp code so that the generic functions 151 // can be called via templates below. 152 // ========================================================================================== 153 154 template <device::cupm::DeviceType T, typename Derived> 155 class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_CUPM : protected MatDense_CUPM_Base<T> { 156 private: 157 static PetscErrorCode CheckSaneSequentialMatSizes_(Mat) noexcept; 158 159 protected: 160 MATDENSECUPM_BASE_HEADER(T); 161 162 template <PetscMemType, PetscMemoryAccessMode> 163 class MatrixArray; 164 165 // Cast the Mat to its host struct, i.e. return the result of (Mat_SeqDense *)m->data 166 template <typename U = Derived> 167 PETSC_NODISCARD static constexpr auto MatIMPLCast(Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(U::MatIMPLCast_(m)) 168 PETSC_NODISCARD static constexpr MatType MATIMPLCUPM() noexcept; 169 170 static PetscErrorCode CreateIMPLDenseCUPM(MPI_Comm, PetscInt, PetscInt, PetscInt, PetscInt, PetscScalar *, Mat *, PetscDeviceContext, bool) noexcept; 171 static PetscErrorCode SetPreallocation(Mat, PetscDeviceContext, PetscScalar *) noexcept; 172 173 template <typename F> 174 static PetscErrorCode DiagonalUnaryTransform(Mat, PetscDeviceContext, F &&) noexcept; 175 176 static PetscErrorCode Shift(Mat, PetscScalar) noexcept; 177 static PetscErrorCode GetDiagonal(Mat, Vec) noexcept; 178 179 PETSC_NODISCARD static auto DeviceArrayRead(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>{dctx, m}) 180 PETSC_NODISCARD static auto DeviceArrayWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>{dctx, m}) 181 PETSC_NODISCARD static auto DeviceArrayReadWrite(PetscDeviceContext dctx, Mat m) noexcept 
PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, m}) 182 PETSC_NODISCARD static auto HostArrayRead(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>{dctx, m}) 183 PETSC_NODISCARD static auto HostArrayWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>{dctx, m}) 184 PETSC_NODISCARD static auto HostArrayReadWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, m}) 185 }; 186 187 // ========================================================================================== 188 // MatDense_CUPM::MatrixArray 189 // ========================================================================================== 190 191 template <device::cupm::DeviceType T, typename D> 192 template <PetscMemType MT, PetscMemoryAccessMode MA> 193 class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_CUPM<T, D>::MatrixArray : public device::cupm::impl::RestoreableArray<T, MT, MA> { 194 using base_type = device::cupm::impl::RestoreableArray<T, MT, MA>; 195 196 public: 197 MatrixArray(PetscDeviceContext, Mat) noexcept; 198 ~MatrixArray() noexcept; 199 200 // must declare move constructor since we declare a destructor 201 constexpr MatrixArray(MatrixArray &&) noexcept; 202 203 private: 204 Mat m_ = nullptr; 205 }; 206 207 // ========================================================================================== 208 // MatDense_CUPM::MatrixArray -- Public API 209 // ========================================================================================== 210 211 template <device::cupm::DeviceType T, typename D> 212 template <PetscMemType MT, PetscMemoryAccessMode MA> 213 inline MatDense_CUPM<T, D>::MatrixArray<MT, MA>::MatrixArray(PetscDeviceContext dctx, Mat m) noexcept : base_type{dctx}, m_{m} 214 
{ 215 PetscFunctionBegin; 216 PetscCallAbort(PETSC_COMM_SELF, D::template GetArray<MT, MA>(m, &this->ptr_, dctx)); 217 PetscFunctionReturnVoid(); 218 } 219 220 template <device::cupm::DeviceType T, typename D> 221 template <PetscMemType MT, PetscMemoryAccessMode MA> 222 inline MatDense_CUPM<T, D>::MatrixArray<MT, MA>::~MatrixArray() noexcept 223 { 224 PetscFunctionBegin; 225 PetscCallAbort(PETSC_COMM_SELF, D::template RestoreArray<MT, MA>(m_, &this->ptr_, this->dctx_)); 226 PetscFunctionReturnVoid(); 227 } 228 229 template <device::cupm::DeviceType T, typename D> 230 template <PetscMemType MT, PetscMemoryAccessMode MA> 231 inline constexpr MatDense_CUPM<T, D>::MatrixArray<MT, MA>::MatrixArray(MatrixArray &&other) noexcept : base_type{std::move(other)}, m_{util::exchange(other.m_, nullptr)} 232 { 233 } 234 235 // ========================================================================================== 236 // MatDense_CUPM -- Private API 237 // ========================================================================================== 238 239 template <device::cupm::DeviceType T, typename D> 240 inline PetscErrorCode MatDense_CUPM<T, D>::CheckSaneSequentialMatSizes_(Mat A) noexcept 241 { 242 PetscFunctionBegin; 243 if (PetscDefined(USE_DEBUG)) { 244 PetscBool isseq; 245 246 PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), D::MATSEQDENSECUPM(), &isseq)); 247 if (isseq) { 248 // doing this check allows both sequential and parallel implementations to just pass in 249 // A, otherwise they would need to specify rstart, rend, and cols separately! 
250 PetscCheck(A->rmap->rstart == 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Sequential matrix row start %" PetscInt_FMT " != 0?", A->rmap->rstart); 251 PetscCheck(A->rmap->rend == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Sequential matrix row end %" PetscInt_FMT " != number of rows %" PetscInt_FMT, A->rmap->rend, A->rmap->n); 252 PetscCheck(A->cmap->n == A->cmap->N, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Sequential matrix number of local columns %" PetscInt_FMT " != number of global columns %" PetscInt_FMT, A->cmap->n, A->cmap->n); 253 } 254 } 255 PetscFunctionReturn(PETSC_SUCCESS); 256 } 257 258 // ========================================================================================== 259 // MatDense_CUPM -- Protected API 260 // ========================================================================================== 261 262 template <device::cupm::DeviceType T, typename D> 263 inline constexpr MatType MatDense_CUPM<T, D>::MATIMPLCUPM() noexcept 264 { 265 return D::MATIMPLCUPM_(); 266 } 267 268 // Common core for MatCreateSeqDenseCUPM() and MatCreateMPIDenseCUPM() 269 template <device::cupm::DeviceType T, typename D> 270 inline PetscErrorCode MatDense_CUPM<T, D>::CreateIMPLDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscScalar *data, Mat *A, PetscDeviceContext dctx, bool preallocate) noexcept 271 { 272 Mat mat; 273 274 PetscFunctionBegin; 275 PetscAssertPointer(A, 7); 276 PetscCall(MatCreate(comm, &mat)); 277 PetscCall(MatSetSizes(mat, m, n, M, N)); 278 PetscCall(MatSetType(mat, D::MATIMPLCUPM())); 279 if (preallocate) { 280 PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx)); 281 PetscCall(D::SetPreallocation(mat, dctx, data)); 282 } 283 *A = mat; 284 PetscFunctionReturn(PETSC_SUCCESS); 285 } 286 287 template <device::cupm::DeviceType T, typename D> 288 inline PetscErrorCode MatDense_CUPM<T, D>::SetPreallocation(Mat A, PetscDeviceContext dctx, PetscScalar *device_array) noexcept 289 { 290 PetscFunctionBegin; 291 // 
cannot use PetscValidHeaderSpecificType(..., MATIMPLCUPM()) since the incoming matrix 292 // might be the local (sequential) matrix of a MatMPIDense_CUPM. Since this would be called 293 // from the MPI matrix'es impl MATIMPLCUPM() would return MATMPIDENSECUPM(). 294 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 295 PetscCheckTypeNames(A, D::MATSEQDENSECUPM(), D::MATMPIDENSECUPM()); 296 PetscCall(PetscLayoutSetUp(A->rmap)); 297 PetscCall(PetscLayoutSetUp(A->cmap)); 298 PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx)); 299 PetscCall(D::SetPreallocation_(A, dctx, device_array)); 300 A->preallocated = PETSC_TRUE; 301 A->assembled = PETSC_TRUE; 302 PetscFunctionReturn(PETSC_SUCCESS); 303 } 304 305 namespace detail 306 { 307 308 #if CCCL_VERSION >= 3000000 309 template <class T> 310 using iter_difference_t = cuda::std::iter_difference_t<T>; 311 #else 312 template <class T> 313 using iter_difference_t = typename thrust::iterator_difference<T>::type; 314 #endif 315 316 // ========================================================================================== 317 // MatrixIteratorBase 318 // 319 // A base class for creating thrust iterators over the local sub-matrix. This will set up the 320 // proper iterator definitions so thrust knows how to handle things properly. Template 321 // parameters are as follows: 322 // 323 // - Iterator: 324 // The type of the primary array iterator. Usually this is 325 // thrust::device_pointer<PetscScalar>::iterator. 326 // 327 // - IndexFunctor: 328 // This should be a functor which contains an operator() that when called with an index `i`, 329 // returns the i'th permuted index into the array. For example, it could return the i'th 330 // diagonal entry. 
// ==========================================================================================
template <typename Iterator, typename IndexFunctor>
class MatrixIteratorBase {
public:
  using array_iterator_type = Iterator;
  using index_functor_type  = IndexFunctor;

  using difference_type     = iter_difference_t<array_iterator_type>;
  using CountingIterator    = thrust::counting_iterator<difference_type>;
  using TransformIterator   = thrust::transform_iterator<index_functor_type, CountingIterator>;
  using PermutationIterator = thrust::permutation_iterator<array_iterator_type, TransformIterator>;
  using iterator            = PermutationIterator; // type of the begin/end iterator

  constexpr MatrixIteratorBase(array_iterator_type first, array_iterator_type last, index_functor_type idx_func) noexcept : first{std::move(first)}, last{std::move(last)}, func{std::move(idx_func)} { }

  // Iterator over the permuted range: dereferencing the k'th position yields first[func(k)]
  PETSC_NODISCARD iterator begin() const noexcept
  {
    return PermutationIterator{
      first, TransformIterator{CountingIterator{0}, func}
    };
  }

protected:
  array_iterator_type first; // start of the underlying (unpermuted) array range
  array_iterator_type last;  // end of the underlying array range; used by derived end()
  index_functor_type  func;  // maps a linear index to the corresponding permuted array index
};

// ==========================================================================================
// StridedIndexFunctor
//
// Iterator which permutes a linear index range into strided matrix indices. Usually used to
// get the diagonal.
// ==========================================================================================
template <typename T>
struct StridedIndexFunctor {
  // Maps linear index i to the strided flat-array index stride * i
  PETSC_NODISCARD PETSC_HOSTDEVICE_INLINE_DECL constexpr T operator()(const T &i) const noexcept { return stride * i; }

  T stride;
};

// Iterates over the diagonal entries of a dense matrix stored with leading dimension lda by
// striding through the flat array (stride is expected to be lda + 1)
template <typename Iterator>
class DiagonalIterator : public MatrixIteratorBase<Iterator, StridedIndexFunctor<iter_difference_t<Iterator>>> {
public:
  using base_type = MatrixIteratorBase<Iterator, StridedIndexFunctor<iter_difference_t<Iterator>>>;

  using difference_type = typename base_type::difference_type;
  using iterator        = typename base_type::iterator;

  constexpr DiagonalIterator(Iterator first, Iterator last, difference_type stride) noexcept : base_type{std::move(first), std::move(last), {stride}} { }

  // One past the final diagonal entry: begin() advanced by ceil((last - first) / stride)
  PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->last - this->first + this->func.stride - 1) / this->func.stride; }
};

// Construct a DiagonalIterator over the diagonal entries owned by the local row block
// [rstart, rend) of a matrix with `cols` global columns and leading dimension lda; data is
// the device pointer to the local flat array. Note end - begin = (rend2 - rstart)*(lda + 1),
// so the iterator yields exactly rend2 - rstart entries (rend2 clips rows past the last
// column, which have no diagonal entry).
template <typename T>
inline DiagonalIterator<typename thrust::device_vector<T>::iterator> MakeDiagonalIterator(T *data, PetscInt rstart, PetscInt rend, PetscInt cols, PetscInt lda) noexcept
{
  const auto        rend2 = std::min(rend, cols);
  const std::size_t begin = rstart * lda;
  const std::size_t end   = rend2 - rstart + rend2 * lda;
  const auto        dptr  = thrust::device_pointer_cast(data);

  return {dptr + begin, dptr + end, lda + 1};
}

} // namespace detail

// Apply `functor` in-place to every locally owned diagonal entry of A on the device stream
// associated with dctx; no-op when the local diagonal block is empty
template <device::cupm::DeviceType T, typename D>
template <typename F>
inline PetscErrorCode MatDense_CUPM<T, D>::DiagonalUnaryTransform(Mat A, PetscDeviceContext dctx, F &&functor) noexcept
{
  const auto rstart = A->rmap->rstart;
  const auto rend   = A->rmap->rend;
  const auto gcols  = A->cmap->N;
  const auto rend2  = std::min(rend, gcols); // rows past the last global column carry no diagonal entry

  PetscFunctionBegin;
  PetscCall(CheckSaneSequentialMatSizes_(A));
  if (rend2 > rstart) {
    const auto   da = D::DeviceArrayReadWrite(dctx, A);
    cupmStream_t stream;
    PetscInt     lda;

    PetscCall(MatDenseGetLDA(A, &lda));
    PetscCall(D::GetHandlesFrom_(dctx, &stream));
    {
      auto diagonal = detail::MakeDiagonalIterator(da.data(), rstart, rend, gcols, lda);

      // clang-format off
      PetscCallThrust(
        THRUST_CALL(
          thrust::transform,
          stream,
          diagonal.begin(), diagonal.end(), diagonal.begin(),
          std::forward<F>(functor)
        )
      );
      // clang-format on
    }
    PetscCall(PetscLogGpuFlops(rend2 - rstart));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// MatShift() implementation: add alpha to each diagonal entry
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode MatDense_CUPM<T, D>::Shift(Mat A, PetscScalar alpha) noexcept
{
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(GetHandles_(&dctx));
  PetscCall(DiagonalUnaryTransform(A, dctx, device::cupm::functors::make_plus_equals(alpha)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// MatGetDiagonal() implementation: copy the locally owned diagonal entries of A into v on
// the device
template <device::cupm::DeviceType T, typename D>
inline PetscErrorCode MatDense_CUPM<T, D>::GetDiagonal(Mat A, Vec v) noexcept
{
  const auto         rstart = A->rmap->rstart;
  const auto         rend   = A->rmap->rend;
  const auto         gcols  = A->cmap->N;
  PetscInt           lda;
  PetscDeviceContext dctx;

  PetscFunctionBegin;
  PetscCall(CheckSaneSequentialMatSizes_(A));
  PetscCall(GetHandles_(&dctx));
  PetscCall(MatDenseGetLDA(A, &lda));
  {
    auto dv       = VecSeq_CUPM::DeviceArrayWrite(dctx, v);
    auto da       = D::DeviceArrayRead(dctx, A);
    auto diagonal = detail::MakeDiagonalIterator(da.data(), rstart, rend, gcols, lda);
    // We must have this cast outside of THRUST_CALL(). Without it, GCC 6.4 - 7.5, and 11.3.0
    // throw spurious warnings:
    //
    // warning: 'MatDense_CUPM<...>::GetDiagonal(Mat, Vec)::<lambda()>' declared with greater
    // visibility than the type of its field 'MatDense_CUPM<...>::GetDiagonal(Mat,
    // Vec)::<lambda()>::<dv capture>' [-Wattributes]
    //   PetscCallThrust(
    //   ^~~~~~~~~~~~~~~~
    auto         dvp = thrust::device_pointer_cast(dv.data());
    cupmStream_t stream;

    PetscCall(GetHandlesFrom_(dctx, &stream));
    PetscCallThrust(THRUST_CALL(thrust::copy, stream, diagonal.begin(), diagonal.end(), dvp));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Compose either the host fallback or the device implementation of an operation onto the
// PetscObject, depending on use_host
#define MatComposeOp_CUPM(use_host, pobj, op_str, op_host, ...) \
  do { \
    if (use_host) { \
      PetscCall(PetscObjectComposeFunction(pobj, op_str, op_host)); \
    } else { \
      PetscCall(PetscObjectComposeFunction(pobj, op_str, __VA_ARGS__)); \
    } \
  } while (0)

// Install either the host fallback or the device implementation into the Mat ops table,
// depending on use_host
#define MatSetOp_CUPM(use_host, mat, op_name, op_host, ...) \
  do { \
    if (use_host) { \
      (mat)->ops->op_name = op_host; \
    } else { \
      (mat)->ops->op_name = __VA_ARGS__; \
    } \
  } while (0)

// Imports the MatDense_CUPM<T, Derived> protected API into a deriving implementation class;
// __VA_ARGS__ is the Derived (CRTP) class
#define MATDENSECUPM_HEADER(T, ...) \
  MATDENSECUPM_BASE_HEADER(T); \
  friend class ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::MatIMPLCast; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::MATIMPLCUPM; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::CreateIMPLDenseCUPM; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::SetPreallocation; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DeviceArrayRead; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DeviceArrayWrite; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DeviceArrayReadWrite; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::HostArrayRead; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::HostArrayWrite; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::HostArrayReadWrite; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DiagonalUnaryTransform; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::Shift; \
  using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::GetDiagonal

} // namespace impl

// ==========================================================================================
// MatDense_CUPM -- Implementations
// ==========================================================================================

// Dispatch to the composed get-array method matching the compile-time access mode; a write
// access bumps the object state so cached consumers notice the change
template <device::cupm::DeviceType T, PetscMemoryAccessMode access>
inline PetscErrorCode MatDenseCUPMGetArray_Private(Mat A, PetscScalar **array) noexcept
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(array, 2);
  switch (access) {
  case PETSC_MEMORY_ACCESS_READ:
    PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayRead_C(), (Mat, PetscScalar **), (A, array));
    break;
  case PETSC_MEMORY_ACCESS_WRITE:
    PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayWrite_C(), (Mat, PetscScalar **), (A, array));
    break;
  case PETSC_MEMORY_ACCESS_READ_WRITE:
    PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArray_C(), (Mat, PetscScalar **), (A, array));
    break;
  }
  if (PetscMemoryAccessWrite(access)) PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Dispatch to the composed restore-array method matching the compile-time access mode; a
// write access marks the device copy as authoritative and nullifies the caller's pointer
template <device::cupm::DeviceType T, PetscMemoryAccessMode access>
inline PetscErrorCode MatDenseCUPMRestoreArray_Private(Mat A, PetscScalar **array) noexcept
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (array) PetscAssertPointer(array, 2);
  switch (access) {
  case PETSC_MEMORY_ACCESS_READ:
    PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayRead_C(), (Mat, PetscScalar **), (A, array));
    break;
  case PETSC_MEMORY_ACCESS_WRITE:
    PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayWrite_C(), (Mat, PetscScalar **), (A, array));
    break;
  case PETSC_MEMORY_ACCESS_READ_WRITE:
    PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArray_C(), (Mat, PetscScalar **), (A, array));
    break;
  }
  if (PetscMemoryAccessWrite(access)) {
    PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }
  if (array) *array = nullptr;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Public read-write device array access (MatDenseCUDAGetArray()/MatDenseHIPGetArray())
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMGetArray(Mat A, PetscScalar **array) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatDenseCUPMGetArray_Private<T, PETSC_MEMORY_ACCESS_READ_WRITE>(A, array));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Public read-only device array access
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMGetArrayRead(Mat A, const PetscScalar **array) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatDenseCUPMGetArray_Private<T, PETSC_MEMORY_ACCESS_READ>(A, const_cast<PetscScalar **>(array)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Public write-only device array access
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMGetArrayWrite(Mat A, PetscScalar **array) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatDenseCUPMGetArray_Private<T, PETSC_MEMORY_ACCESS_WRITE>(A, array));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Restore an array obtained with MatDenseCUPMGetArray()
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMRestoreArray(Mat A, PetscScalar **array) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatDenseCUPMRestoreArray_Private<T, PETSC_MEMORY_ACCESS_READ_WRITE>(A, array));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Restore an array obtained with MatDenseCUPMGetArrayRead()
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMRestoreArrayRead(Mat A, const PetscScalar **array) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatDenseCUPMRestoreArray_Private<T, PETSC_MEMORY_ACCESS_READ>(A, const_cast<PetscScalar **>(array)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Restore an array obtained with MatDenseCUPMGetArrayWrite()
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMRestoreArrayWrite(Mat A, PetscScalar **array) noexcept
{
  PetscFunctionBegin;
  PetscCall(MatDenseCUPMRestoreArray_Private<T, PETSC_MEMORY_ACCESS_WRITE>(A, array));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Temporarily replace the device array with a user-provided one (undone by ResetArray)
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMPlaceArray(Mat A, const PetscScalar *array) noexcept
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMPlaceArray_C(), (Mat, const PetscScalar *), (A, array));
  PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Permanently replace the device array with a user-provided one (matrix takes ownership)
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMReplaceArray(Mat A, const PetscScalar *array) noexcept
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMReplaceArray_C(), (Mat, const PetscScalar *), (A, array));
  PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Restore the original device array after a MatDenseCUPMPlaceArray()
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMResetArray(Mat A) noexcept
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMResetArray_C(), (Mat), (A));
  PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Preallocate the device storage, optionally adopting device_data; dctx defaults to the
// null context, which the implementation resolves internally
template <device::cupm::DeviceType T>
inline PetscErrorCode MatDenseCUPMSetPreallocation(Mat A, PetscScalar *device_data, PetscDeviceContext dctx = nullptr) noexcept
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMSetPreallocation_C(), (Mat, PetscDeviceContext, PetscScalar *), (A, dctx, device_data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

} // namespace cupm

} // namespace mat

} // namespace Petsc

#endif // __cplusplus