1e4a0ef16SKarl Rupp 2e4a0ef16SKarl Rupp /* 3e4a0ef16SKarl Rupp Defines the basic matrix operations for the AIJ (compressed row) 4e4a0ef16SKarl Rupp matrix storage format. 5e4a0ef16SKarl Rupp */ 6e4a0ef16SKarl Rupp 7aaa7dc30SBarry Smith #include <petscconf.h> 899acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 9aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10aaa7dc30SBarry Smith #include <petscbt.h> 11aaa7dc30SBarry Smith #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 13e4a0ef16SKarl Rupp 14aaa7dc30SBarry Smith #include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h> 15e4a0ef16SKarl Rupp 16e4a0ef16SKarl Rupp #include <algorithm> 17e4a0ef16SKarl Rupp #include <vector> 18e4a0ef16SKarl Rupp #include <string> 19e4a0ef16SKarl Rupp 20e4a0ef16SKarl Rupp #include "viennacl/linalg/prod.hpp" 21e4a0ef16SKarl Rupp 228713a8baSPatrick Sanan PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat); 2372367587SKarl Rupp PETSC_INTERN PetscErrorCode MatGetFactor_seqaij_petsc(Mat, MatFactorType, Mat *); 244222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 258713a8baSPatrick Sanan 26d71ae5a4SJacob Faibussowitsch PetscErrorCode MatViennaCLCopyToGPU(Mat A) 27d71ae5a4SJacob Faibussowitsch { 28e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr; 29e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 30e4a0ef16SKarl Rupp 31e4a0ef16SKarl Rupp PetscFunctionBegin; 32bf1781e8SStefano Zampini if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { //some OpenCL SDKs have issues with buffers of size 0 33c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 349566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_ViennaCLCopyToGPU, A, 0, 0, 0)); 35e4a0ef16SKarl Rupp 36e4a0ef16SKarl Rupp try { 37e4a0ef16SKarl Rupp if (a->compressedrow.use) { 38a3430c56SKarl Rupp if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix(); 39e4a0ef16SKarl Rupp 40a3430c56SKarl Rupp // Since PetscInt is different from cl_uint, we have to convert: 41a3430c56SKarl Rupp viennacl::backend::mem_handle dummy; 42e4a0ef16SKarl Rupp 439371c9d4SSatish Balay viennacl::backend::typesafe_host_array<unsigned int> row_buffer; 449371c9d4SSatish Balay row_buffer.raw_resize(dummy, a->compressedrow.nrows + 1); 459371c9d4SSatish Balay for (PetscInt i = 0; i <= a->compressedrow.nrows; ++i) row_buffer.set(i, (a->compressedrow.i)[i]); 46e4a0ef16SKarl Rupp 479371c9d4SSatish Balay viennacl::backend::typesafe_host_array<unsigned int> row_indices; 489371c9d4SSatish Balay row_indices.raw_resize(dummy, a->compressedrow.nrows); 499371c9d4SSatish Balay for (PetscInt i = 0; i < a->compressedrow.nrows; ++i) row_indices.set(i, (a->compressedrow.rindex)[i]); 50a3430c56SKarl Rupp 519371c9d4SSatish Balay viennacl::backend::typesafe_host_array<unsigned int> col_buffer; 529371c9d4SSatish Balay col_buffer.raw_resize(dummy, a->nz); 539371c9d4SSatish Balay for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]); 54a3430c56SKarl Rupp 55a3430c56SKarl Rupp viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz); 569566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(((2 * a->compressedrow.nrows) + 1 + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar))); 57e4a0ef16SKarl Rupp } else { 58a3430c56SKarl Rupp if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix(); 59e4a0ef16SKarl Rupp 60e4a0ef16SKarl Rupp // Since PetscInt is in general different from cl_uint, we have to convert: 61e4a0ef16SKarl Rupp viennacl::backend::mem_handle dummy; 62e4a0ef16SKarl Rupp 639371c9d4SSatish Balay viennacl::backend::typesafe_host_array<unsigned int> row_buffer; 649371c9d4SSatish Balay row_buffer.raw_resize(dummy, A->rmap->n + 1); 659371c9d4SSatish Balay for (PetscInt i = 0; i <= A->rmap->n; ++i) row_buffer.set(i, (a->i)[i]); 66e4a0ef16SKarl Rupp 679371c9d4SSatish Balay viennacl::backend::typesafe_host_array<unsigned int> col_buffer; 689371c9d4SSatish Balay col_buffer.raw_resize(dummy, a->nz); 699371c9d4SSatish Balay for (PetscInt i = 0; i < a->nz; ++i) col_buffer.set(i, (a->j)[i]); 70e4a0ef16SKarl Rupp 71e4a0ef16SKarl Rupp viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz); 729566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(((A->rmap->n + 1) + a->nz) * sizeof(PetscInt) + (a->nz) * sizeof(PetscScalar))); 73e4a0ef16SKarl Rupp } 744cf1874eSKarl Rupp ViennaCLWaitForGPU(); 75d71ae5a4SJacob Faibussowitsch } catch (std::exception const &ex) { 76d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 77d71ae5a4SJacob Faibussowitsch } 78e4a0ef16SKarl Rupp 79a3430c56SKarl Rupp // Create temporary vector for v += A*x: 80a3430c56SKarl Rupp if (viennaclstruct->tempvec) { 819b66742cSDave May if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(A->rmap->n)) { 82a3430c56SKarl Rupp delete (ViennaCLVector *)viennaclstruct->tempvec; 839b66742cSDave May viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n); 84a3430c56SKarl Rupp } else { 85a3430c56SKarl Rupp viennaclstruct->tempvec->clear(); 86a3430c56SKarl Rupp } 87a3430c56SKarl Rupp } else { 889b66742cSDave May viennaclstruct->tempvec = new ViennaCLVector(A->rmap->n); 89a3430c56SKarl Rupp } 90a3430c56SKarl Rupp 91c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 92e4a0ef16SKarl Rupp 939566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_ViennaCLCopyToGPU, A, 0, 0, 0)); 94e4a0ef16SKarl Rupp } 9567c87b7fSKarl Rupp } 963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 97e4a0ef16SKarl Rupp } 98e4a0ef16SKarl Rupp 99d71ae5a4SJacob Faibussowitsch PetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu) 100d71ae5a4SJacob Faibussowitsch { 101f38c1e66SStefano Zampini Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr; 102e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 103e4a0ef16SKarl Rupp PetscInt m = A->rmap->n; 104e4a0ef16SKarl Rupp 105e4a0ef16SKarl Rupp PetscFunctionBegin; 1063ba16761SJacob Faibussowitsch if (A->offloadmask == PETSC_OFFLOAD_BOTH) PetscFunctionReturn(PETSC_SUCCESS); 107c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED && Agpu) { 108e4a0ef16SKarl Rupp try { 1092c71b3e2SJacob Faibussowitsch PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 1102c71b3e2SJacob Faibussowitsch PetscCheck((PetscInt)Agpu->size1() == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "GPU matrix has %lu rows, should be %" PetscInt_FMT, Agpu->size1(), m); 111e4a0ef16SKarl Rupp a->nz = Agpu->nnz(); 112e4a0ef16SKarl Rupp a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 113e4a0ef16SKarl Rupp A->preallocated = PETSC_TRUE; 114e4a0ef16SKarl Rupp if (a->singlemalloc) { 1159566063dSJacob Faibussowitsch if (a->a) PetscCall(PetscFree3(a->a, a->j, a->i)); 116e4a0ef16SKarl Rupp } else { 1179566063dSJacob Faibussowitsch if (a->i) PetscCall(PetscFree(a->i)); 1189566063dSJacob Faibussowitsch if (a->j) PetscCall(PetscFree(a->j)); 1199566063dSJacob Faibussowitsch if (a->a) PetscCall(PetscFree(a->a)); 120e4a0ef16SKarl Rupp } 1219566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(a->nz, &a->a, a->nz, &a->j, m + 1, &a->i)); 122e4a0ef16SKarl Rupp 123e4a0ef16SKarl Rupp a->singlemalloc = PETSC_TRUE; 124e4a0ef16SKarl Rupp 125e4a0ef16SKarl Rupp /* Setup row lengths */ 1269566063dSJacob Faibussowitsch PetscCall(PetscFree(a->imax)); 1279566063dSJacob Faibussowitsch PetscCall(PetscFree(a->ilen)); 1289566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &a->imax)); 1299566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &a->ilen)); 130e4a0ef16SKarl Rupp 131e4a0ef16SKarl Rupp /* Copy data back from GPU */ 1329371c9d4SSatish Balay viennacl::backend::typesafe_host_array<unsigned int> row_buffer; 1339371c9d4SSatish Balay row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1); 134e4a0ef16SKarl Rupp 135e4a0ef16SKarl Rupp // copy row array 136e4a0ef16SKarl Rupp viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get()); 137e4a0ef16SKarl Rupp (a->i)[0] = row_buffer[0]; 138e4a0ef16SKarl Rupp for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) { 139e4a0ef16SKarl Rupp (a->i)[i + 1] = row_buffer[i + 1]; 140e4a0ef16SKarl Rupp a->imax[i] = a->ilen[i] = a->i[i + 1] - a->i[i]; //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse 141e4a0ef16SKarl Rupp } 142e4a0ef16SKarl Rupp 143e4a0ef16SKarl Rupp // copy column indices 1449371c9d4SSatish Balay viennacl::backend::typesafe_host_array<unsigned int> col_buffer; 1459371c9d4SSatish Balay col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz()); 146e4a0ef16SKarl Rupp viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get()); 1479371c9d4SSatish Balay for (PetscInt i = 0; i < (PetscInt)Agpu->nnz(); ++i) (a->j)[i] = col_buffer[i]; 148e4a0ef16SKarl Rupp 149e4a0ef16SKarl Rupp // copy nonzero entries directly to destination (no conversion required) 150e4a0ef16SKarl Rupp viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a); 151e4a0ef16SKarl Rupp 1529566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu(row_buffer.raw_size() + col_buffer.raw_size() + (Agpu->nnz() * sizeof(PetscScalar)))); 1534cf1874eSKarl Rupp ViennaCLWaitForGPU(); 154023073b3SKarl Rupp /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */ 155d71ae5a4SJacob Faibussowitsch } catch (std::exception const &ex) { 156d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 157d71ae5a4SJacob Faibussowitsch } 158c70f7ee4SJunchao Zhang } else if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED) { 1593ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 160f38c1e66SStefano Zampini } else { 1613ba16761SJacob Faibussowitsch if (!Agpu && A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(PETSC_SUCCESS); 162e4a0ef16SKarl Rupp 1632c71b3e2SJacob Faibussowitsch PetscCheck(!a->compressedrow.use, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices"); 164f38c1e66SStefano Zampini if (!Agpu) { 165f38c1e66SStefano Zampini viennacl::backend::memory_read(viennaclstruct->mat->handle(), 0, sizeof(PetscScalar) * viennaclstruct->mat->nnz(), a->a); 166f38c1e66SStefano Zampini } else { 167f38c1e66SStefano Zampini viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar) * Agpu->nnz(), a->a); 168f38c1e66SStefano Zampini } 169f38c1e66SStefano Zampini } 170c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 171b8ced49eSKarl Rupp /* This assembly prevents resetting the flag to PETSC_OFFLOAD_CPU and recopying */ 1729566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 1739566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1743ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 175e4a0ef16SKarl Rupp } 176e4a0ef16SKarl Rupp 177d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqAIJViennaCL(Mat A, Vec xx, Vec yy) 178d71ae5a4SJacob Faibussowitsch { 179e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 180e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr; 1810d73d530SKarl Rupp const ViennaCLVector *xgpu = NULL; 1820d73d530SKarl Rupp ViennaCLVector *ygpu = NULL; 183e4a0ef16SKarl Rupp 184e4a0ef16SKarl Rupp PetscFunctionBegin; 185837a59e1SRichard Tran Mills /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1869566063dSJacob Faibussowitsch PetscCall(MatViennaCLCopyToGPU(A)); 187bf1781e8SStefano Zampini if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { 1889566063dSJacob Faibussowitsch PetscCall(VecViennaCLGetArrayRead(xx, &xgpu)); 1899566063dSJacob Faibussowitsch PetscCall(VecViennaCLGetArrayWrite(yy, &ygpu)); 1909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 191e4a0ef16SKarl Rupp try { 192b7832b47SStefano Zampini if (a->compressedrow.use) { 193b7832b47SStefano Zampini *ygpu = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu); 194b7832b47SStefano Zampini } else { 195e4a0ef16SKarl Rupp *ygpu = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 196b7832b47SStefano Zampini } 1974cf1874eSKarl Rupp ViennaCLWaitForGPU(); 198d71ae5a4SJacob Faibussowitsch } catch (std::exception const &ex) { 199d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 200d71ae5a4SJacob Faibussowitsch } 2019566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2029566063dSJacob Faibussowitsch PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu)); 2039566063dSJacob Faibussowitsch PetscCall(VecViennaCLRestoreArrayWrite(yy, &ygpu)); 2049566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 205bf1781e8SStefano Zampini } else { 2069566063dSJacob Faibussowitsch PetscCall(VecSet_SeqViennaCL(yy, 0)); 20767c87b7fSKarl Rupp } 2083ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 209e4a0ef16SKarl Rupp } 210e4a0ef16SKarl Rupp 211d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A, Vec xx, Vec yy, Vec zz) 212d71ae5a4SJacob Faibussowitsch { 213e4a0ef16SKarl Rupp Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 214e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL *)A->spptr; 2150d73d530SKarl Rupp const ViennaCLVector *xgpu = NULL, *ygpu = NULL; 2160d73d530SKarl Rupp ViennaCLVector *zgpu = NULL; 217e4a0ef16SKarl Rupp 218e4a0ef16SKarl Rupp PetscFunctionBegin; 219837a59e1SRichard Tran Mills /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2209566063dSJacob Faibussowitsch PetscCall(MatViennaCLCopyToGPU(A)); 221bf1781e8SStefano Zampini if (A->rmap->n > 0 && A->cmap->n > 0 && a->nz) { 222e4a0ef16SKarl Rupp try { 2239566063dSJacob Faibussowitsch PetscCall(VecViennaCLGetArrayRead(xx, &xgpu)); 2249566063dSJacob Faibussowitsch PetscCall(VecViennaCLGetArrayRead(yy, &ygpu)); 2259566063dSJacob Faibussowitsch PetscCall(VecViennaCLGetArrayWrite(zz, &zgpu)); 2269566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 227f38c1e66SStefano Zampini if (a->compressedrow.use) *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu); 228f38c1e66SStefano Zampini else *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu); 229f38c1e66SStefano Zampini if (zz != yy) *zgpu = *ygpu + *viennaclstruct->tempvec; 230f38c1e66SStefano Zampini else *zgpu += *viennaclstruct->tempvec; 2314cf1874eSKarl Rupp ViennaCLWaitForGPU(); 2329566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2339566063dSJacob Faibussowitsch PetscCall(VecViennaCLRestoreArrayRead(xx, &xgpu)); 2349566063dSJacob Faibussowitsch PetscCall(VecViennaCLRestoreArrayRead(yy, &ygpu)); 2359566063dSJacob Faibussowitsch PetscCall(VecViennaCLRestoreArrayWrite(zz, &zgpu)); 236e4a0ef16SKarl Rupp 237d71ae5a4SJacob Faibussowitsch } catch (std::exception const &ex) { 238d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 239d71ae5a4SJacob Faibussowitsch } 2409566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 241bf1781e8SStefano Zampini } else { 2429566063dSJacob Faibussowitsch PetscCall(VecCopy_SeqViennaCL(yy, zz)); 24367c87b7fSKarl Rupp } 2443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 245e4a0ef16SKarl Rupp } 246e4a0ef16SKarl Rupp 247d71ae5a4SJacob Faibussowitsch PetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A, MatAssemblyType mode) 248d71ae5a4SJacob Faibussowitsch { 249e4a0ef16SKarl Rupp PetscFunctionBegin; 2509566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 2513ba16761SJacob Faibussowitsch if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 2529566063dSJacob Faibussowitsch if (!A->boundtocpu) PetscCall(MatViennaCLCopyToGPU(A)); 2533ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 254e4a0ef16SKarl Rupp } 255e4a0ef16SKarl Rupp 256e4a0ef16SKarl Rupp /* --------------------------------------------------------------------------------*/ 257cab5ea25SPierre Jolivet /*@C 25811a5261eSBarry Smith MatCreateSeqAIJViennaCL - Creates a sparse matrix in `MATSEQAIJVIENNACL` (compressed row) format 25919fddfadSKarl Rupp (the default parallel PETSc format). This matrix will ultimately be pushed down 260e4a0ef16SKarl Rupp to GPUs and use the ViennaCL library for calculations. For good matrix 261e4a0ef16SKarl Rupp assembly performance the user should preallocate the matrix storage by setting 262e4a0ef16SKarl Rupp the parameter nz (or the array nnz). By setting these parameters accurately, 263e4a0ef16SKarl Rupp performance during matrix assembly can be increased substantially. 264e4a0ef16SKarl Rupp 265d083f849SBarry Smith Collective 266e4a0ef16SKarl Rupp 267e4a0ef16SKarl Rupp Input Parameters: 26811a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF` 269e4a0ef16SKarl Rupp . m - number of rows 270e4a0ef16SKarl Rupp . n - number of columns 271e4a0ef16SKarl Rupp . nz - number of nonzeros per row (same for all rows) 272e4a0ef16SKarl Rupp - nnz - array containing the number of nonzeros in the various rows 273e4a0ef16SKarl Rupp (possibly different for each row) or NULL 274e4a0ef16SKarl Rupp 275e4a0ef16SKarl Rupp Output Parameter: 276e4a0ef16SKarl Rupp . A - the matrix 277e4a0ef16SKarl Rupp 27811a5261eSBarry Smith It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 279e4a0ef16SKarl Rupp MatXXXXSetPreallocation() paradigm instead of this routine directly. 28011a5261eSBarry Smith [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 281e4a0ef16SKarl Rupp 282e4a0ef16SKarl Rupp Notes: 283e4a0ef16SKarl Rupp If nnz is given then nz is ignored 284e4a0ef16SKarl Rupp 28511a5261eSBarry Smith The AIJ format, also called 286*2ef1f0ffSBarry Smith compressed row storage, is fully compatible with standard Fortran 287e4a0ef16SKarl Rupp storage. That is, the stored row and column indices can begin at 288e4a0ef16SKarl Rupp either one (as in Fortran) or zero. See the users' manual for details. 289e4a0ef16SKarl Rupp 290e4a0ef16SKarl Rupp Specify the preallocated storage with either nz or nnz (not both). 29111a5261eSBarry Smith Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory 292e4a0ef16SKarl Rupp allocation. For large problems you MUST preallocate memory or you 293e4a0ef16SKarl Rupp will get TERRIBLE performance, see the users' manual chapter on matrices. 294e4a0ef16SKarl Rupp 295e4a0ef16SKarl Rupp Level: intermediate 296e4a0ef16SKarl Rupp 29711a5261eSBarry Smith .seealso: `MATSEQAIJVIENNACL`, `MatCreate()`, `MatCreateAIJ()`, `MatCreateAIJCUSPARSE()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 298e4a0ef16SKarl Rupp @*/ 299d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJViennaCL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 300d71ae5a4SJacob Faibussowitsch { 301e4a0ef16SKarl Rupp PetscFunctionBegin; 3029566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 3039566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, m, n)); 3049566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJVIENNACL)); 3059566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 3063ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 307e4a0ef16SKarl Rupp } 308e4a0ef16SKarl Rupp 309d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A) 310d71ae5a4SJacob Faibussowitsch { 311e4a0ef16SKarl Rupp Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL *)A->spptr; 312e4a0ef16SKarl Rupp 313e4a0ef16SKarl Rupp PetscFunctionBegin; 314e4a0ef16SKarl Rupp try { 3156447cd05SKarl Rupp if (viennaclcontainer) { 3166447cd05SKarl Rupp delete viennaclcontainer->tempvec; 3176447cd05SKarl Rupp delete viennaclcontainer->mat; 3186447cd05SKarl Rupp delete viennaclcontainer->compressed_mat; 319e4a0ef16SKarl Rupp delete viennaclcontainer; 3206447cd05SKarl Rupp } 321c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 322d71ae5a4SJacob Faibussowitsch } catch (std::exception const &ex) { 323d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what()); 324d71ae5a4SJacob Faibussowitsch } 3258713a8baSPatrick Sanan 3269566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", NULL)); 3279566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", NULL)); 3289566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", NULL)); 3298713a8baSPatrick Sanan 330e4a0ef16SKarl Rupp /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 331e4a0ef16SKarl Rupp A->spptr = 0; 3329566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 3333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 334e4a0ef16SKarl Rupp } 335e4a0ef16SKarl Rupp 336d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B) 337d71ae5a4SJacob Faibussowitsch { 338e4a0ef16SKarl Rupp PetscFunctionBegin; 3399566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(B)); 3409566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJViennaCL(B, MATSEQAIJVIENNACL, MAT_INPLACE_MATRIX, &B)); 3413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3428713a8baSPatrick Sanan } 3438713a8baSPatrick Sanan 344b470e4b4SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat, PetscBool); 345d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJViennaCL(Mat A, MatDuplicateOption cpvalues, Mat *B) 346d71ae5a4SJacob Faibussowitsch { 347c3cca76eSKarl Rupp Mat C; 348c3cca76eSKarl Rupp 349c3cca76eSKarl Rupp PetscFunctionBegin; 3509566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B)); 351c3cca76eSKarl Rupp C = *B; 352c3cca76eSKarl Rupp 3539566063dSJacob Faibussowitsch PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE)); 35492f9df4aSRichard Tran Mills C->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL; 355c3cca76eSKarl Rupp 356c3cca76eSKarl Rupp C->spptr = new Mat_SeqAIJViennaCL(); 357c3cca76eSKarl Rupp ((Mat_SeqAIJViennaCL *)C->spptr)->tempvec = NULL; 358c3cca76eSKarl Rupp ((Mat_SeqAIJViennaCL *)C->spptr)->mat = NULL; 359c3cca76eSKarl Rupp ((Mat_SeqAIJViennaCL *)C->spptr)->compressed_mat = NULL; 360c3cca76eSKarl Rupp 3619566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)C, MATSEQAIJVIENNACL)); 362c3cca76eSKarl Rupp 363c70f7ee4SJunchao Zhang C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 364c3cca76eSKarl Rupp 365c3cca76eSKarl Rupp /* If the source matrix is already assembled, copy the destination matrix to the GPU */ 3669566063dSJacob Faibussowitsch if (C->assembled) PetscCall(MatViennaCLCopyToGPU(C)); 3673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 368c3cca76eSKarl Rupp } 369c3cca76eSKarl Rupp 370d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJViennaCL(Mat A, PetscScalar *array[]) 371d71ae5a4SJacob Faibussowitsch { 372f38c1e66SStefano Zampini PetscFunctionBegin; 3739566063dSJacob Faibussowitsch PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL)); 374c1a4f3baSJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 3753ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 376f38c1e66SStefano Zampini } 377f38c1e66SStefano Zampini 378d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJViennaCL(Mat A, PetscScalar *array[]) 379d71ae5a4SJacob Faibussowitsch { 380f38c1e66SStefano Zampini PetscFunctionBegin; 381c1a4f3baSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 382c1a4f3baSJunchao Zhang *array = NULL; 3833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 384c1a4f3baSJunchao Zhang } 385c1a4f3baSJunchao Zhang 386d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[]) 387d71ae5a4SJacob Faibussowitsch { 388c1a4f3baSJunchao Zhang PetscFunctionBegin; 3899566063dSJacob Faibussowitsch PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL)); 390c1a4f3baSJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 3913ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 392c1a4f3baSJunchao Zhang } 393c1a4f3baSJunchao Zhang 394d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJViennaCL(Mat A, const PetscScalar *array[]) 395d71ae5a4SJacob Faibussowitsch { 396c1a4f3baSJunchao Zhang PetscFunctionBegin; 397c1a4f3baSJunchao Zhang *array = NULL; 398c1a4f3baSJunchao Zhang /* No A->offloadmask = PETSC_OFFLOAD_CPU since if A->offloadmask was PETSC_OFFLOAD_BOTH, it is still BOTH */ 3993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 400c1a4f3baSJunchao Zhang } 401c1a4f3baSJunchao Zhang 402d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[]) 403d71ae5a4SJacob Faibussowitsch { 404c1a4f3baSJunchao Zhang PetscFunctionBegin; 405c1a4f3baSJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 4063ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 407c1a4f3baSJunchao Zhang } 408c1a4f3baSJunchao Zhang 409d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL(Mat A, PetscScalar *array[]) 410d71ae5a4SJacob Faibussowitsch { 411c1a4f3baSJunchao Zhang PetscFunctionBegin; 412c1a4f3baSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 413f38c1e66SStefano Zampini *array = NULL; 4143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 415f38c1e66SStefano Zampini } 416f38c1e66SStefano Zampini 417d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJViennaCL(Mat A, PetscBool flg) 418d71ae5a4SJacob Faibussowitsch { 419c1a4f3baSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 420f38c1e66SStefano Zampini 421e7e92044SBarry Smith PetscFunctionBegin; 422b470e4b4SRichard Tran Mills A->boundtocpu = flg; 423c1a4f3baSJunchao Zhang if (flg && a->inode.size) { 424c1a4f3baSJunchao Zhang a->inode.use = PETSC_TRUE; 425ea500dcfSRichard Tran Mills } else { 426c1a4f3baSJunchao Zhang a->inode.use = PETSC_FALSE; 427ea500dcfSRichard Tran Mills } 428e7e92044SBarry Smith if (flg) { 429f38c1e66SStefano Zampini /* make sure we have an up-to-date copy on the CPU */ 4309566063dSJacob Faibussowitsch PetscCall(MatViennaCLCopyFromGPU(A, (const ViennaCLAIJMatrix *)NULL)); 431e7e92044SBarry Smith A->ops->mult = MatMult_SeqAIJ; 432e7e92044SBarry Smith A->ops->multadd = MatMultAdd_SeqAIJ; 433e7e92044SBarry Smith A->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 434e7e92044SBarry Smith A->ops->duplicate = MatDuplicate_SeqAIJ; 4359566063dSJacob Faibussowitsch PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 436e7e92044SBarry Smith } else { 437e7e92044SBarry Smith A->ops->mult = MatMult_SeqAIJViennaCL; 438e7e92044SBarry Smith A->ops->multadd = MatMultAdd_SeqAIJViennaCL; 439e7e92044SBarry Smith A->ops->assemblyend = MatAssemblyEnd_SeqAIJViennaCL; 440e7e92044SBarry Smith A->ops->destroy = MatDestroy_SeqAIJViennaCL; 441e7e92044SBarry Smith A->ops->duplicate = MatDuplicate_SeqAIJViennaCL; 442c1a4f3baSJunchao Zhang 443c1a4f3baSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJViennaCL; 444c1a4f3baSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJViennaCL; 445c1a4f3baSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJViennaCL; 446c1a4f3baSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJViennaCL; 447c1a4f3baSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJViennaCL; 448c1a4f3baSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJViennaCL; 449e7e92044SBarry Smith } 4503ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 451e7e92044SBarry Smith } 452e7e92044SBarry Smith 453d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJViennaCL(Mat A, MatType type, MatReuse reuse, Mat *newmat) 454d71ae5a4SJacob Faibussowitsch { 4558713a8baSPatrick Sanan Mat B; 4568713a8baSPatrick Sanan 4578713a8baSPatrick Sanan PetscFunctionBegin; 4585f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_REUSE_MATRIX, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MAT_REUSE_MATRIX is not supported. Consider using MAT_INPLACE_MATRIX instead"); 4598713a8baSPatrick Sanan 4609566063dSJacob Faibussowitsch if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 4618713a8baSPatrick Sanan 4628713a8baSPatrick Sanan B = *newmat; 4638713a8baSPatrick Sanan 464e4a0ef16SKarl Rupp B->spptr = new Mat_SeqAIJViennaCL(); 465e4a0ef16SKarl Rupp 466a3430c56SKarl Rupp ((Mat_SeqAIJViennaCL *)B->spptr)->tempvec = NULL; 467a3430c56SKarl Rupp ((Mat_SeqAIJViennaCL *)B->spptr)->mat = NULL; 468a3430c56SKarl Rupp ((Mat_SeqAIJViennaCL *)B->spptr)->compressed_mat = NULL; 469e4a0ef16SKarl Rupp 4709566063dSJacob Faibussowitsch PetscCall(MatBindToCPU_SeqAIJViennaCL(A, PETSC_FALSE)); 47192f9df4aSRichard Tran Mills A->ops->bindtocpu = MatBindToCPU_SeqAIJViennaCL; 472e4a0ef16SKarl Rupp 4739566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJVIENNACL)); 4749566063dSJacob Faibussowitsch PetscCall(PetscFree(B->defaultvectype)); 4759566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECVIENNACL, &B->defaultvectype)); 476e4a0ef16SKarl Rupp 4779566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaij_seqaijviennacl_C", MatConvert_SeqAIJ_SeqAIJViennaCL)); 4789566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqdense_C", MatProductSetFromOptions_SeqAIJ_SeqDense)); 4799566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijviennacl_seqaij_C", MatProductSetFromOptions_SeqAIJ)); 4808713a8baSPatrick Sanan 481c70f7ee4SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 4828713a8baSPatrick Sanan 4838713a8baSPatrick Sanan /* If the source matrix is already assembled, copy the destination matrix to the GPU */ 4849566063dSJacob Faibussowitsch if (B->assembled) PetscCall(MatViennaCLCopyToGPU(B)); 4853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 486e4a0ef16SKarl Rupp } 487e4a0ef16SKarl Rupp 4883ca39a21SBarry Smith /*MC 489e4a0ef16SKarl Rupp MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices. 490e4a0ef16SKarl Rupp 491e4a0ef16SKarl Rupp A matrix type type whose data resides on GPUs. These matrices are in CSR format by 492e4a0ef16SKarl Rupp default. All matrix calculations are performed using the ViennaCL library. 493e4a0ef16SKarl Rupp 494e4a0ef16SKarl Rupp Options Database Keys: 49511a5261eSBarry Smith + -mat_type aijviennacl - sets the matrix type to `MATSEQAIJVIENNACL` during a call to `MatSetFromOptions() 49611a5261eSBarry Smith . -mat_viennacl_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`. 49711a5261eSBarry Smith - -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for `MatMult()` during a call to `MatSetFromOptions()`. 498e4a0ef16SKarl Rupp 499e4a0ef16SKarl Rupp Level: beginner 500e4a0ef16SKarl Rupp 501db781477SPatrick Sanan .seealso: `MatCreateSeqAIJViennaCL()`, `MATAIJVIENNACL`, `MatCreateAIJViennaCL()` 502e4a0ef16SKarl Rupp M*/ 503e4a0ef16SKarl Rupp 504d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_ViennaCL(void) 505d71ae5a4SJacob Faibussowitsch { 50672367587SKarl Rupp PetscFunctionBegin; 5079566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_LU, MatGetFactor_seqaij_petsc)); 5089566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaij_petsc)); 5099566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ILU, MatGetFactor_seqaij_petsc)); 5109566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERPETSC, MATSEQAIJVIENNACL, MAT_FACTOR_ICC, MatGetFactor_seqaij_petsc)); 5113ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 51272367587SKarl Rupp } 513