#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

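/* Thrust functor that copies the first element of a tuple into the second; with zip
   iterators (e.g. via thrust::for_each) this assigns one device array to another element-wise */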
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

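/* Container destroy callback for the device-side COO struct: releases the PetscSF and
   frees every device array allocated in MatSetPreallocationCOO_MPIAIJCUSPARSE() below */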
static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(void *data)
{
  MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)data;

  PetscFunctionBegin;
  PetscCall(PetscSFDestroy(&coo->sf));
  PetscCallCUDA(cudaFree(coo->Ajmap1));
  PetscCallCUDA(cudaFree(coo->Aperm1));
  PetscCallCUDA(cudaFree(coo->Bjmap1));
  PetscCallCUDA(cudaFree(coo->Bperm1));
  PetscCallCUDA(cudaFree(coo->Aimap2));
  PetscCallCUDA(cudaFree(coo->Ajmap2));
  PetscCallCUDA(cudaFree(coo->Aperm2));
  PetscCallCUDA(cudaFree(coo->Bimap2));
  PetscCallCUDA(cudaFree(coo->Bjmap2));
  PetscCallCUDA(cudaFree(coo->Bperm2));
  PetscCallCUDA(cudaFree(coo->Cperm1));
  PetscCallCUDA(cudaFree(coo->sendbuf));
  PetscCallCUDA(cudaFree(coo->recvbuf));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h, container_d;
  MatCOOStruct_MPIAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d

  PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));

  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}

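/* A sketch of how the COO assembly path above is typically driven from user code
   (indices and values are illustrative; the value array may live in host or device memory):

     PetscInt    coo_i[] = {0, 0, 1};                         // global row indices
     PetscInt    coo_j[] = {0, 1, 1};                         // global column indices
     PetscScalar coo_v[] = {1.0, 2.0, 3.0};

     PetscCall(MatSetPreallocationCOO(mat, 3, coo_i, coo_j)); // dispatches to the function above
     PetscCall(MatSetValuesCOO(mat, coo_v, ADD_VALUES));      // dispatches to MatSetValuesCOO_MPIAIJCUSPARSE() below
*/

/* Gather kernel: buf[i] = kv[perm[i]]; the grid-stride loop lets any launch size cover all nnz entries */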
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

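/* Local-entry kernel: thread i owns one nonzero of A (i < Annz) or of B (i >= Annz) and sums all
   COO inputs mapped to it through jmap/perm; INSERT_VALUES overwrites, ADD_VALUES accumulates */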
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

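/* Remote-entry kernel: adds values received from other ranks; imap2 maps the compressed
   index i to the location of the destination nonzero in A or B */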
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}

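/* Assemble values given in COO format: pack off-process entries, start the PetscSF reduction,
   add local entries while the communication proceeds, then add the received remote entries */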
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat                  A = mpiaij->A, B = mpiaij->B;
  PetscScalar         *Aa, *Ba;
  const PetscScalar   *v1 = v;
  PetscMemType         memtype;
  PetscContainer       container;
  MatCOOStruct_MPIAIJ *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "MatCOOStruct not found on this matrix");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  const auto &Annz   = coo->Annz;
  const auto &Annz2  = coo->Annz2;
  const auto &Bnnz   = coo->Bnnz;
  const auto &Bnnz2  = coo->Bnnz2;
  const auto &vsend  = coo->sendbuf;
  const auto &v2     = coo->recvbuf;
  const auto &Ajmap1 = coo->Ajmap1;
  const auto &Ajmap2 = coo->Ajmap2;
  const auto &Aimap2 = coo->Aimap2;
  const auto &Bjmap1 = coo->Bjmap1;
  const auto &Bjmap2 = coo->Bjmap2;
  const auto &Bimap2 = coo->Bimap2;
  const auto &Aperm1 = coo->Aperm1;
  const auto &Aperm2 = coo->Aperm2;
  const auto &Bperm1 = coo->Bperm1;
  const auto &Bperm2 = coo->Bperm2;
  const auto &Cperm1 = coo->Cperm1;

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, we need to copy it to the device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
  }

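  /* All kernel launches below use 256-thread blocks and (n + 255) / 256 blocks so each of the n entries gets a thread */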
  /* Pack entries to be sent to remote */
  if (coo->sendlen) {
    MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  /* Send remote entries to their owner and overlap the communication with local computation */
  PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
  /* Add local entries to A and B */
  if (Annz + Bnnz > 0) {
    MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

  /* Add received remote entries to A and B */
  if (Annz2 + Bnnz2 > 0) {
    MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
  }
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  if (B->hash_active) {
    B->ops[0]      = b->cops;
    B->hash_active = PETSC_FALSE;
  }
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because B will have been resized, we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  B->assembled     = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

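/* y = A x: overlap the scatter of off-process x entries into lvec with the diagonal-block
   multiply, then add the off-diagonal-block contribution B * lvec */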
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

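/* y = A^T x: apply both local transpose multiplies first, then reverse-scatter-add the
   off-diagonal contributions accumulated in lvec back to their owning ranks */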
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE

static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the cuSPARSE library for calculations.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
           This value should be the same as the local size used in creating the
           y vector for the matrix-vector product y = Ax.
.  n - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
       This value should be the same as the local size used in creating the
       x vector for the matrix-vector product y = Ax. For square matrices `n` is almost always `m`.
.  M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
.  N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
.  d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
           (same value is used for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           DIAGONAL portion of the local submatrix (possibly different for each row)
           or `NULL`, if `d_nz` is used to specify the nonzero structure.
           The size of this array is equal to the number of local rows, i.e `m`.
           For matrices you plan to factor you must leave room for the diagonal entry and
           put in the entry even if it is zero.
.  o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
           submatrix (same value is used for all local rows).
-  o_nnz - array containing the number of nonzeros in the various rows of the
           OFF-DIAGONAL portion of the local submatrix (possibly different for
           each row) or `NULL`, if `o_nz` is used to specify the nonzero
           structure. The size of this array is equal to the number
           of local rows, i.e `m`.

   Output Parameter:
.  A - the matrix

   Level: intermediate

   Notes:
   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

   The AIJ format (also called compressed row storage) is fully compatible with standard Fortran
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.
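
   A minimal usage sketch (matrix size and nonzero estimates are illustrative):
.vb
   Mat A;
   PetscCall(MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 100, 5, NULL, 2, NULL, &A));
   // ... insert values with MatSetValues(), then assemble ...
   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatDestroy(&A));
.ve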

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

   This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
   and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity, as in the sketch below.
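
   A minimal construction sketch (sizes and nonzero estimates are illustrative):
.vb
   Mat A;
   PetscCall(MatCreate(comm, &A));
   PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100));
   PetscCall(MatSetType(A, MATAIJCUSPARSE));
   PetscCall(MatSeqAIJSetPreallocation(A, 5, NULL));          // takes effect on one process
   PetscCall(MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL)); // takes effect on multiple processes
.ve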

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

   Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/