xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 158ec288e43a668c7d0638fc9121b5c255ad5de4)
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

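/* Functor applied through thrust::for_each over a zip iterator: it assigns the first
   tuple element to the second, turning the traversal into a device-side gather/copy.
   A sketch of the intended use (hypothetical names, not part of this file):

     auto b = thrust::make_zip_iterator(thrust::make_tuple(src.begin(),dst.begin()));
     auto e = thrust::make_zip_iterator(thrust::make_tuple(src.end(),dst.end()));
     thrust::for_each(b,e,VecCUDAEquals()); // dst[i] = src[i]
*/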
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

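/* Release all device-side COO state: the jmap/perm/imap arrays and send/recv buffers
   of the extended COO path (allocated in MatSetPreallocationCOO_MPIAIJCUSPARSE below)
   and the permutation/work arrays coo_p/coo_pw of the basic path. Safe to call
   repeatedly or before any COO preallocation has been set up. */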
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}

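/* Basic COO value insertion. When a device permutation coo_p was recorded at
   preallocation time, v is gathered through it into the work array coo_pw, i.e.
   coo_pw[k] = v[coo_p[k]], so that the first coo_nd entries land in the diagonal
   block a->A and the trailing coo_no entries in the off-diagonal block a->B. */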
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode));
  }
  PetscFunctionReturn(0);
}

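/* Predicates and transforms used below to split COO entries by column ownership:
   IsOffDiag(c) is true when the global column c falls outside the local diagonal
   range [cstart,cend); IsNotOffDiagT is its negation lifted to (i,j,perm) tuples;
   GlobToLoc shifts a global index by the owning range start to make it local. */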
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

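/* Basic COO preallocation: partition the (i,j) pairs on the device so that
   diagonal-block entries come first, remember the induced permutation in coo_p,
   convert indices from global to local numbering, compact the off-diagonal column
   ids into b->garray with a sort+unique, and build the sequential blocks b->A and
   b->B as MATSEQAIJCUSPARSE matrices. */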
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no,&jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j,wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff,&b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt)));
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj));
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
  PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
  PetscCall(MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff));
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}

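/* Dispatch between the basic path above and the general (extended) COO machinery:
   host-resident indices are scanned for negative entries or rows owned by other
   ranks, either of which requires MatSetPreallocationCOO_MPIAIJ plus device mirrors
   of its jmap/perm/imap arrays and send/recv buffers. All ranks must make the same
   choice, hence the MPI_LAND reduction on coo_basic. */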
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij    = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart,rend;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap,&rstart,&rend));
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}

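/* Grid-stride kernel gathering the values destined for other ranks into a contiguous
   send buffer: buf[i] = kv[perm[i]]. The grid-stride loop lets a launch of
   (nnz+255)/256 blocks of 256 threads cover any nnz. */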
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
}

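/* Grid-stride kernel folding locally produced COO values into the CSR value arrays of
   the diagonal (Aa) and off-diagonal (Ba) blocks. For nonzero i, jmap1[i]..jmap1[i+1]
   delimits the (possibly repeated) COO entries that sum into it and perm1[] maps them
   back into the user's array kv; threads with i < Annz handle A, the rest handle B.
   INSERT_VALUES discards the previous value, ADD_VALUES accumulates onto it. */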
__global__ void MatAddLocalCOOValues(const PetscScalar kv[],InsertMode imode,
  PetscCount Annz,const PetscCount Ajmap1[],const PetscCount Aperm1[],PetscScalar Aa[],
  PetscCount Bnnz,const PetscCount Bjmap1[],const PetscCount Bperm1[],PetscScalar Ba[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size  = gridDim.x * blockDim.x;
  for (; i<Annz+Bnnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    if (i<Annz) {
      for (PetscCount k=Ajmap1[i]; k<Ajmap1[i+1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES? 0.0 : Aa[i]) + sum;
    } else {
      const PetscCount j = i - Annz; /* use a local index so the grid-stride loop variable is not perturbed */
      for (PetscCount k=Bjmap1[j]; k<Bjmap1[j+1]; k++) sum += kv[Bperm1[k]];
      Ba[j] = (imode == INSERT_VALUES? 0.0 : Ba[j]) + sum;
    }
  }
}

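/* Grid-stride kernel applying values received from other ranks: imap2[] selects the
   destination nonzero inside A (resp. B), while jmap2[]/perm2[] play the same roles
   as jmap1[]/perm1[] above but index the receive buffer. Remote contributions are
   always added, since the local kernel has already applied the insert mode. */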
__global__ void MatAddRemoteCOOValues(const PetscScalar kv[],
  PetscCount Annz2,const PetscCount Aimap2[],const PetscCount Ajmap2[],const PetscCount Aperm2[],PetscScalar Aa[],
  PetscCount Bnnz2,const PetscCount Bimap2[],const PetscCount Bjmap2[],const PetscCount Bperm2[],PetscScalar Ba[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<Annz2+Bnnz2; i+= grid_size) {
    if (i<Annz2) {
      for (PetscCount k=Ajmap2[i]; k<Ajmap2[i+1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      const PetscCount j = i - Annz2; /* local index, see MatAddLocalCOOValues */
      for (PetscCount k=Bjmap2[j]; k<Bjmap2[j+1]; k++) Ba[Bimap2[j]] += kv[Bperm2[k]];
    }
  }
}

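/* Extended COO value insertion: (1) copy v to the device if it lives on the host,
   (2) pack remote entries with MatPackCOOValues and start a PetscSF reduce shipping
   them to their owners, (3) overlap that communication with MatAddLocalCOOValues,
   (4) complete the reduce and fold in the received values with MatAddRemoteCOOValues. */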
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                 A      = mpiaij->A,B = mpiaij->B;
  PetscCount          Annz   = mpiaij->Annz,Annz2 = mpiaij->Annz2,Bnnz = mpiaij->Bnnz,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba = NULL;
  PetscScalar        *vsend  = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size));
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B,&Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz+Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz+Bnnz+255)/256,256>>>(v1,imode,Annz,Ajmap1,Aperm1,Aa,Bnnz,Bjmap1,Bperm1,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2+Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2+Bnnz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B,&Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

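/* Merge the diagonal and off-diagonal blocks into one sequential matrix on the GPU;
   when requested, also return the global column indices of the merged matrix: the
   locally owned columns first, then the compacted off-process columns from cmap. */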
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat             Ad,Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad,NULL,&dn));
    PetscCall(MatGetLocalSize(Ao,NULL,&on));
    PetscCall(MatGetOwnershipRangeColumn(A,&cst,NULL));
    PetscCall(PetscMalloc1(dn+on,&gidx));
    for (i = 0; i<dn; i++) gidx[i]    = cst + i;
    for (i = 0; i<on; i++) gidx[i+dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob));
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt           i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheck(d_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheck(o_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because B will have been resized, we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
    PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
    PetscCall(MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));
  }
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

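/* y = A*x: start the halo exchange of x into the local work vector lvec, apply the
   diagonal block while the scatter is in flight, then accumulate the off-diagonal
   contribution B*lvec. The MultAdd and MultTranspose variants below follow the same
   overlap pattern. */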
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A,xx,yy));
  PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B,a->lvec,yy,yy));
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A,xx,yy,zz));
  PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B,a->lvec,zz,zz));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B,xx,a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A,xx,yy));
  PetscCall(VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"MPIAIJCUSPARSE options");
  if (A->factortype==MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

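/* After host-side assembly completes, retype the halo vector to CUDA and, if the
   nonzero pattern changed, free any cached device-side split-CSR matrix so that it
   is rebuilt on the next request. */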
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A,mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec,VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Have device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B,MAT_COPY_VALUES,newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B,*newmat,SAME_NONZERO_PATTERN));
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) PetscCall(MatSetType(a->A,MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B,MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec,VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A));
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz (or d_nnz) and o_nz (or o_nnz).  By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
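
   Example usage (a minimal sketch; error checking omitted):
.vb
   Mat A;
   MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,m,n,PETSC_DETERMINE,PETSC_DETERMINE,5,NULL,2,NULL,&A);
   /* ... MatSetValues(), MatAssemblyBegin(), MatAssemblyEnd() ... */
   MatDestroy(&A);
.ve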

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,M,N));
  PetscCallMPI(MPI_Comm_size(comm,&size));
  if (size > 1) {
    PetscCall(MatSetType(*A,MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz));
  } else {
    PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz));
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

7619ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
7629ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
7639ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
7649ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
7659ae82921SPaul Mullowney    the above preallocation routines for simplicity.
7669ae82921SPaul Mullowney 
7679ae82921SPaul Mullowney    Options Database Keys:
768e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
7698468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
7708468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
7718468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
7729ae82921SPaul Mullowney 
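   For example, to select this type and the ELL storage format at run time (the
   executable name is illustrative):
.vb
   ./myapp -mat_type mpiaijcusparse -mat_cusparse_storage_format ell
.ve
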
7739ae82921SPaul Mullowney   Level: beginner
7749ae82921SPaul Mullowney 
7756bb69460SJunchao Zhang .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
7766bb69460SJunchao Zhang M*/
7776bb69460SJunchao Zhang 
7786bb69460SJunchao Zhang /*MC
7796bb69460SJunchao Zhang    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
7806bb69460SJunchao Zhang 
7816bb69460SJunchao Zhang   Level: beginner
7826bb69460SJunchao Zhang 
7836bb69460SJunchao Zhang .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
7849ae82921SPaul Mullowney M*/
7853fa6b06aSMark Adams 
786042217e8SBarry Smith // Get device (GPU) pointers to a stripped-down CSR view of the matrix, for both Seq and MPI Mat.
787042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
7883fa6b06aSMark Adams {
789042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat;
790042217e8SBarry Smith   PetscMPIInt                size;
791042217e8SBarry Smith   int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
792042217e8SBarry Smith   PetscScalar                *aa = NULL,*ba = NULL;
79399551766SMark Adams   Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
794042217e8SBarry Smith   Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
795042217e8SBarry Smith   CsrMatrix                  *matrixA = NULL,*matrixB = NULL;
7963fa6b06aSMark Adams 
7973fa6b06aSMark Adams   PetscFunctionBegin;
79828b400f6SJacob Faibussowitsch   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
799042217e8SBarry Smith   if (A->factortype != MAT_FACTOR_NONE) {
8003fa6b06aSMark Adams     *B = NULL;
8013fa6b06aSMark Adams     PetscFunctionReturn(0);
8023fa6b06aSMark Adams   }
8039566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
80499551766SMark Adams   // get jaca: host CSR metadata for the diagonal block (the matrix itself when sequential)
8053fa6b06aSMark Adams   if (size == 1) {
806042217e8SBarry Smith     PetscBool isseqaij;
807042217e8SBarry Smith 
8089566063dSJacob Faibussowitsch     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij));
809042217e8SBarry Smith     if (isseqaij) {
8103fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
81128b400f6SJacob Faibussowitsch       PetscCheck(jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
812042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
813042217e8SBarry Smith       d_mat = cusparsestructA->deviceMat;
8149566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
8153fa6b06aSMark Adams     } else {
8163fa6b06aSMark Adams       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
81728b400f6SJacob Faibussowitsch       PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
818042217e8SBarry Smith       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8193fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
820042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
821042217e8SBarry Smith       d_mat = spptr->deviceMat;
8229566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
8233fa6b06aSMark Adams     }
824042217e8SBarry Smith     if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
825042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
82628b400f6SJacob Faibussowitsch       PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
827042217e8SBarry Smith       matrixA = (CsrMatrix*)matstruct->mat;
828042217e8SBarry Smith       bi = NULL;
829042217e8SBarry Smith       bj = NULL;
830042217e8SBarry Smith       ba = NULL;
831042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
832042217e8SBarry Smith   } else {
833042217e8SBarry Smith     Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
83428b400f6SJacob Faibussowitsch     PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
835042217e8SBarry Smith     jaca = (Mat_SeqAIJ*)aij->A->data;
83699551766SMark Adams     jacb = (Mat_SeqAIJ*)aij->B->data;
837042217e8SBarry Smith     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8383fa6b06aSMark Adams 
83908401ef6SPierre Jolivet     PetscCheck(A->nooffprocentries || aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
840042217e8SBarry Smith     cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
841042217e8SBarry Smith     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
84208401ef6SPierre Jolivet     PetscCheck(cusparsestructA->format==MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
843042217e8SBarry Smith     if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
8449566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
8459566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
846042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
847042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
84828b400f6SJacob Faibussowitsch       PetscCheck(matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
84928b400f6SJacob Faibussowitsch       PetscCheck(matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
850042217e8SBarry Smith       matrixA = (CsrMatrix*)matstructA->mat;
851042217e8SBarry Smith       matrixB = (CsrMatrix*)matstructB->mat;
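      // when B uses compressed-row storage, its device CSR covers only the nonzero rows,
      // so upload the full host row offsets (jacb->i) once and cache them on the GPU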
8523fa6b06aSMark Adams       if (jacb->compressedrow.use) {
853042217e8SBarry Smith         if (!cusparsestructB->rowoffsets_gpu) {
854042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
855042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
8563fa6b06aSMark Adams         }
857042217e8SBarry Smith         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
858042217e8SBarry Smith       } else {
859042217e8SBarry Smith         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
860042217e8SBarry Smith       }
861042217e8SBarry Smith       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
862042217e8SBarry Smith       ba = thrust::raw_pointer_cast(matrixB->values->data());
863042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
864042217e8SBarry Smith     d_mat = spptr->deviceMat;
865042217e8SBarry Smith   }
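  // same treatment for the diagonal block A: make full row offsets available on the device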
8663fa6b06aSMark Adams   if (jaca->compressedrow.use) {
867042217e8SBarry Smith     if (!cusparsestructA->rowoffsets_gpu) {
868042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
869042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
8703fa6b06aSMark Adams     }
871042217e8SBarry Smith     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
8723fa6b06aSMark Adams   } else {
873042217e8SBarry Smith     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
8743fa6b06aSMark Adams   }
875042217e8SBarry Smith   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
876042217e8SBarry Smith   aa = thrust::raw_pointer_cast(matrixA->values->data());
877042217e8SBarry Smith 
878042217e8SBarry Smith   if (!d_mat) {
879042217e8SBarry Smith     PetscSplitCSRDataStructure h_mat;
880042217e8SBarry Smith 
881042217e8SBarry Smith     // create and populate the struct on the host, then copy it to the device
8829566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A,"Create device matrix\n"));
8839566063dSJacob Faibussowitsch     PetscCall(PetscNew(&h_mat));
8849566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&d_mat,sizeof(*d_mat)));
885042217e8SBarry Smith     if (size > 1) { /* need the colmap array */
886042217e8SBarry Smith       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
88799551766SMark Adams       PetscInt   *colmap;
888042217e8SBarry Smith       PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;
889042217e8SBarry Smith 
89008401ef6SPierre Jolivet       PetscCheck(!n || aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
891042217e8SBarry Smith 
8929566063dSJacob Faibussowitsch       PetscCall(PetscCalloc1(N+1,&colmap));
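      // colmap maps a global column index to (local off-diagonal column + 1);
      // entries left at zero mark global columns with no nonzero in B on this rank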
893042217e8SBarry Smith       for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
894365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
895365b711fSMark Adams       { // device indices are 32-bit; with 64-bit PetscInt, widen them on the host and upload 64-bit copies
89699551766SMark Adams         int        *h_bi32, *h_bj32;
89799551766SMark Adams         PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
8989566063dSJacob Faibussowitsch         PetscCall(PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64));
8999566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost));
90099551766SMark Adams         for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
9019566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost));
90299551766SMark Adams         for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];
90399551766SMark Adams 
9049566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64)));
9059566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice));
9069566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64)));
9079566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice));
90899551766SMark Adams 
90999551766SMark Adams         h_mat->offdiag.i = d_bi64;
91099551766SMark Adams         h_mat->offdiag.j = d_bj64;
91199551766SMark Adams         h_mat->allocated_indices = PETSC_TRUE;
91299551766SMark Adams 
9139566063dSJacob Faibussowitsch         PetscCall(PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64));
914365b711fSMark Adams       }
915365b711fSMark Adams #else
91699551766SMark Adams       h_mat->offdiag.i = (PetscInt*)bi;
91799551766SMark Adams       h_mat->offdiag.j = (PetscInt*)bj;
91899551766SMark Adams       h_mat->allocated_indices = PETSC_FALSE;
919365b711fSMark Adams #endif
920042217e8SBarry Smith       h_mat->offdiag.a = ba;
921042217e8SBarry Smith       h_mat->offdiag.n = A->rmap->n;
922042217e8SBarry Smith 
9239566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap)));
9249566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice));
9259566063dSJacob Faibussowitsch       PetscCall(PetscFree(colmap));
926042217e8SBarry Smith     }
927042217e8SBarry Smith     h_mat->rstart = A->rmap->rstart;
928042217e8SBarry Smith     h_mat->rend   = A->rmap->rend;
929042217e8SBarry Smith     h_mat->cstart = A->cmap->rstart;
930042217e8SBarry Smith     h_mat->cend   = A->cmap->rend;
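    // note: M holds the global column count (A->cmap->N), not the global row count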
93149b994a9SMark Adams     h_mat->M      = A->cmap->N;
932365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
933365b711fSMark Adams     {
93499551766SMark Adams       int        *h_ai32, *h_aj32;
93599551766SMark Adams       PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
93663a3b9bcSJacob Faibussowitsch 
93763a3b9bcSJacob Faibussowitsch       static_assert(sizeof(PetscInt) == 8,"");
9389566063dSJacob Faibussowitsch       PetscCall(PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64));
9399566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost));
94099551766SMark Adams       for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
9419566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost));
94299551766SMark Adams       for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];
94399551766SMark Adams 
9449566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64)));
9459566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice));
9469566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64)));
9479566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice));
94899551766SMark Adams 
94999551766SMark Adams       h_mat->diag.i = d_ai64;
95099551766SMark Adams       h_mat->diag.j = d_aj64;
95199551766SMark Adams       h_mat->allocated_indices = PETSC_TRUE;
95299551766SMark Adams 
9539566063dSJacob Faibussowitsch       PetscCall(PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64));
954365b711fSMark Adams     }
955365b711fSMark Adams #else
95699551766SMark Adams     h_mat->diag.i = (PetscInt*)ai;
95799551766SMark Adams     h_mat->diag.j = (PetscInt*)aj;
95899551766SMark Adams     h_mat->allocated_indices = PETSC_FALSE;
959365b711fSMark Adams #endif
960042217e8SBarry Smith     h_mat->diag.a = aa;
961042217e8SBarry Smith     h_mat->diag.n = A->rmap->n;
962042217e8SBarry Smith     h_mat->rank   = PetscGlobalRank;
963042217e8SBarry Smith     // copy pointers and metadata to device
9649566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice));
9659566063dSJacob Faibussowitsch     PetscCall(PetscFree(h_mat));
966042217e8SBarry Smith   } else {
9679566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A,"Reusing device matrix\n"));
968042217e8SBarry Smith   }
969042217e8SBarry Smith   *B = d_mat;
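  // the caller will write matrix values on the device, so mark the GPU copy as current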
970042217e8SBarry Smith   A->offloadmask = PETSC_OFFLOAD_GPU;
9713fa6b06aSMark Adams   PetscFunctionReturn(0);
9723fa6b06aSMark Adams }
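
/* Illustrative usage sketch (not part of the library). It assumes a user-written CUDA
   kernel that inserts values through the device-side API declared in petscaijdevice.h:

     PetscSplitCSRDataStructure d_mat;
     PetscCall(MatCUSPARSEGetDeviceMatWrite(A,&d_mat));
     assemble_kernel<<<nblocks,nthreads>>>(d_mat,...); // kernel calls MatSetValuesDevice(d_mat,...)
     PetscCallCUDA(cudaDeviceSynchronize());
     PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
*/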
973