#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

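/* Thrust functor that assigns the first element of a tuple to the second,
   e.g. when applied with thrust::for_each over a zip iterator */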
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(void *data)
{
  MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)data;

  PetscFunctionBegin;
  PetscCall(PetscSFDestroy(&coo->sf));
  PetscCallCUDA(cudaFree(coo->Ajmap1));
  PetscCallCUDA(cudaFree(coo->Aperm1));
  PetscCallCUDA(cudaFree(coo->Bjmap1));
  PetscCallCUDA(cudaFree(coo->Bperm1));
  PetscCallCUDA(cudaFree(coo->Aimap2));
  PetscCallCUDA(cudaFree(coo->Ajmap2));
  PetscCallCUDA(cudaFree(coo->Aperm2));
  PetscCallCUDA(cudaFree(coo->Bimap2));
  PetscCallCUDA(cudaFree(coo->Bjmap2));
  PetscCallCUDA(cudaFree(coo->Bperm2));
  PetscCallCUDA(cudaFree(coo->Cperm1));
  PetscCallCUDA(cudaFree(coo->sendbuf));
  PetscCallCUDA(cudaFree(coo->recvbuf));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h, container_d;
  MatCOOStruct_MPIAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d

  PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));

  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}

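/* The three kernels below use grid-stride loops: each thread starts at its global
   thread index and advances by the total number of threads in the grid, so any
   launch configuration covers all entries */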
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

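/* In the two accumulation kernels below, jmap[i]..jmap[i+1] delimits the run of
   entries in perm[] whose input values sum into nonzero i, so repeated COO entries
   for the same location are combined; the *2 (remote) variants additionally use
   imap2[] to map the compacted list of nonzeros touched by received entries back
   to positions in Aa[]/Ba[] */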
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat                  A = mpiaij->A, B = mpiaij->B;
  PetscScalar         *Aa, *Ba;
  const PetscScalar   *v1 = v;
  PetscMemType         memtype;
  PetscContainer       container;
  MatCOOStruct_MPIAIJ *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "MatCOOStruct not found on this matrix");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  const auto &Annz   = coo->Annz;
  const auto &Annz2  = coo->Annz2;
  const auto &Bnnz   = coo->Bnnz;
  const auto &Bnnz2  = coo->Bnnz2;
  const auto &vsend  = coo->sendbuf;
  const auto &v2     = coo->recvbuf;
  const auto &Ajmap1 = coo->Ajmap1;
  const auto &Ajmap2 = coo->Ajmap2;
  const auto &Aimap2 = coo->Aimap2;
  const auto &Bjmap1 = coo->Bjmap1;
  const auto &Bjmap2 = coo->Bjmap2;
  const auto &Bimap2 = coo->Bimap2;
  const auto &Aperm1 = coo->Aperm1;
  const auto &Aperm2 = coo->Aperm2;
  const auto &Bperm1 = coo->Bperm1;
  const auto &Bperm2 = coo->Bperm2;
  const auto &Cperm1 = coo->Cperm1;

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* if the user gave v[] on the host, copy it to the device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
  }

  PetscCall(PetscLogGpuTimeBegin());
  /* Pack entries to be sent to remote */
  if (coo->sendlen) {
    MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  /* Send remote entries to their owner and overlap the communication with local computation */
  PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
  /* Add local entries to A and B */
  if (Annz + Bnnz > 0) {
    MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

  /* Add received remote entries to A and B */
  if (Annz2 + Bnnz2 > 0) {
    MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
  }
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
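
/* A minimal sketch of how the COO path above is typically driven from user code
   (the variable names are illustrative, not part of this file):

     PetscCall(MatSetPreallocationCOO(A, n, coo_i, coo_j)); // builds the host and device MatCOOStruct
     PetscCall(MatSetValuesCOO(A, v, ADD_VALUES));          // dispatches here for MATMPIAIJCUSPARSE

   v may live in host or device memory; as handled above, host values are first
   copied to the device */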

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  if (B->hash_active) {
    B->ops[0]      = b->cops;
    B->hash_active = PETSC_FALSE;
  }
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because B will have been resized, we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  B->assembled     = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE

static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format).  This matrix will ultimately be pushed down
  to NVIDIA GPUs and use the cuSPARSE library for calculations.

  Collective

  Input Parameters:
+ comm  - MPI communicator
. m     - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
           This value should be the same as the local size used in creating the
           y vector for the matrix-vector product y = Ax.
. n     - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
           This value should be the same as the local size used in creating the
           x vector for the matrix-vector product y = Ax. For square matrices `n` is almost always `m`.
. M     - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
. N     - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
. d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
           (same value is used for all local rows)
. d_nnz - array containing the number of nonzeros in the various rows of the
           DIAGONAL portion of the local submatrix (possibly different for each row)
           or `NULL`, if `d_nz` is used to specify the nonzero structure.
           The size of this array is equal to the number of local rows, i.e. `m`.
           For matrices you plan to factor you must leave room for the diagonal entry and
           put in the entry even if it is zero.
. o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
           submatrix (same value is used for all local rows).
- o_nnz - array containing the number of nonzeros in the various rows of the
           OFF-DIAGONAL portion of the local submatrix (possibly different for
           each row) or `NULL`, if `o_nz` is used to specify the nonzero
           structure. The size of this array is equal to the number
           of local rows, i.e. `m`.

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format (also called compressed row storage) is fully compatible with standard Fortran
  storage.  That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

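  Example Usage:
  A minimal creation sequence; the local sizes and per-row nonzero counts below are
  illustrative only.
.vb
  Mat A;
  PetscCall(MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, 8, 8, PETSC_DETERMINE, PETSC_DETERMINE, 5, NULL, 2, NULL, &A));
  // insert values with MatSetValues() (or the MatSetValuesCOO() path), then assemble
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
.ve
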
.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid (HYB) format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

   This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
   and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

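   For example, one might select the hybrid format for the off-diagonal block from the
   command line as follows (the executable name is a placeholder):
.vb
   ./my_app -mat_type mpiaijcusparse -mat_cusparse_mult_offdiag_storage_format hyb
.ve
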
  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/