#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(void *data)
{
  MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)data;

  PetscFunctionBegin;
  PetscCall(PetscSFDestroy(&coo->sf));
  PetscCallCUDA(cudaFree(coo->Ajmap1));
  PetscCallCUDA(cudaFree(coo->Aperm1));
  PetscCallCUDA(cudaFree(coo->Bjmap1));
  PetscCallCUDA(cudaFree(coo->Bperm1));
  PetscCallCUDA(cudaFree(coo->Aimap2));
  PetscCallCUDA(cudaFree(coo->Ajmap2));
  PetscCallCUDA(cudaFree(coo->Aperm2));
  PetscCallCUDA(cudaFree(coo->Bimap2));
  PetscCallCUDA(cudaFree(coo->Bjmap2));
  PetscCallCUDA(cudaFree(coo->Bperm2));
  PetscCallCUDA(cudaFree(coo->Cperm1));
  PetscCallCUDA(cudaFree(coo->sendbuf));
  PetscCallCUDA(cudaFree(coo->recvbuf));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

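/* Note added for clarity: the host implementation MatSetPreallocationCOO_MPIAIJ() called
   below analyzes the COO pattern and builds the jmap/perm lookup arrays plus the PetscSF
   used to route off-process entries. The CUDA variant then mirrors those arrays in device
   memory so that MatSetValuesCOO() can later run entirely on the GPU. */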
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h, container_d;
  MatCOOStruct_MPIAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d

  PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));

  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}

__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}

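/* Note added for clarity: MatSetValuesCOO packs off-process values with MatPackCOOValues()
   and ships them through the PetscSF, accumulates locally owned entries into the diagonal
   (A) and off-diagonal (B) blocks with MatAddLocalCOOValues() while that communication is
   in flight, and finally folds the received values in with MatAddRemoteCOOValues(). */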
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat                  A = mpiaij->A, B = mpiaij->B;
  PetscScalar         *Aa, *Ba;
  const PetscScalar   *v1 = v;
  PetscMemType         memtype;
  PetscContainer       container;
  MatCOOStruct_MPIAIJ *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  const auto &Annz   = coo->Annz;
  const auto &Annz2  = coo->Annz2;
  const auto &Bnnz   = coo->Bnnz;
  const auto &Bnnz2  = coo->Bnnz2;
  const auto &vsend  = coo->sendbuf;
  const auto &v2     = coo->recvbuf;
  const auto &Ajmap1 = coo->Ajmap1;
  const auto &Ajmap2 = coo->Ajmap2;
  const auto &Aimap2 = coo->Aimap2;
  const auto &Bjmap1 = coo->Bjmap1;
  const auto &Bjmap2 = coo->Bjmap2;
  const auto &Bimap2 = coo->Bimap2;
  const auto &Aperm1 = coo->Aperm1;
  const auto &Aperm2 = coo->Aperm2;
  const auto &Bperm1 = coo->Bperm1;
  const auto &Bperm2 = coo->Bperm2;
  const auto &Cperm1 = coo->Cperm1;

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
  }

  PetscCall(PetscLogGpuTimeBegin());
  /* Pack entries to be sent to remote */
  if (coo->sendlen) {
    MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  /* Send remote entries to their owner and overlap the communication with local computation */
  PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
  /* Add local entries to A and B */
  if (Annz + Bnnz > 0) {
    MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

  /* Add received remote entries to A and B */
  if (Annz2 + Bnnz2 > 0) {
    MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
  }
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

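/* Note added for clarity: merge the diagonal and off-diagonal blocks into a single
   sequential AIJCUSPARSE matrix on the device; if requested, also return the global
   column indices of the merged matrix (owned columns first, then the ghost columns). */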
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  if (B->hash_active) {
    B->ops[0]      = b->cops;
    B->hash_active = PETSC_FALSE;
  }
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  B->assembled     = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

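/* Note added for clarity: apply the transpose of the off-diagonal block into the local
   work vector and the transpose of the diagonal block into yy, then scatter the work
   vector back to the owning ranks in reverse mode with ADD_VALUES. */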
static PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}

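/* Note added for clarity: delete the CUSPARSE-specific data, detach the composed methods
   installed by MatConvert_MPIAIJ_MPIAIJCUSPARSE(), then fall back to the MPIAIJ destructor. */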
static PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE

static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format). This matrix will ultimately be pushed down
  to NVIDIA GPUs and use the CuSPARSE library for calculations.

  Collective

  Input Parameters:
+ comm  - MPI communicator, set to `PETSC_COMM_SELF`
. m     - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
          This value should be the same as the local size used in creating the
          y vector for the matrix-vector product y = Ax.
. n     - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
          This value should be the same as the local size used in creating the
          x vector for the matrix-vector product y = Ax. For square matrices `n` is almost always `m`.
. M     - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
. N     - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
. d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
          (same value is used for all local rows)
. d_nnz - array containing the number of nonzeros in the various rows of the
          DIAGONAL portion of the local submatrix (possibly different for each row)
          or `NULL`, if `d_nz` is used to specify the nonzero structure.
          The size of this array is equal to the number of local rows, i.e `m`.
          For matrices you plan to factor you must leave room for the diagonal entry and
          put in the entry even if it is zero.
. o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
          submatrix (same value is used for all local rows).
- o_nnz - array containing the number of nonzeros in the various rows of the
          OFF-DIAGONAL portion of the local submatrix (possibly different for
          each row) or `NULL`, if `o_nz` is used to specify the nonzero
          structure. The size of this array is equal to the number
          of local rows, i.e `m`.

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format (also called compressed row storage) is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.
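
  Example Usage:
  The call below is a minimal sketch; the global sizes and the preallocation counts
  (5 diagonal and 2 off-diagonal nonzeros per row) are illustrative only.
.vb
  Mat A;
  PetscCall(MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 100, 5, NULL, 2, NULL, &A));
  /* insert values with MatSetValues() or MatSetValuesCOO(), then assemble */
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatDestroy(&A));
.ve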

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

  A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
  CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
  All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

  This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
  and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators,
  `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
  for communicators controlling multiple processes. It is recommended that you call both of
  the above preallocation routines for simplicity.

  Options Database Keys:
+ -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
. -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
. -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
- -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
  MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/