xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision dbbe0bcd3f3a8fbab5a45420dc06f8387e5764c6)
1be1d678aSKris Buschelman 
2d50806bdSBarry Smith /*
32499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4d50806bdSBarry Smith           C = A * B
5d50806bdSBarry Smith */
6d50806bdSBarry Smith 
7c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
9c6db04a5SJed Brown #include <petscbt.h>
10af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1107475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
127bab7c10SHong Zhang 
13df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14df97dc6dSFande Kong {
15df97dc6dSFande Kong   PetscFunctionBegin;
16*dbbe0bcdSBarry Smith   if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A,B,C));
17*dbbe0bcdSBarry Smith   else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C));
18df97dc6dSFande Kong   PetscFunctionReturn(0);
19df97dc6dSFande Kong }
20df97dc6dSFande Kong 
214222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */
22e4e71118SStefano Zampini PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
23df97dc6dSFande Kong {
244222ddf1SHong Zhang   PetscInt       ii;
254222ddf1SHong Zhang   Mat_SeqAIJ     *aij;
26cbc6b225SStefano Zampini   PetscBool      isseqaij, osingle, ofree_a, ofree_ij;
275c66b693SKris Buschelman 
285c66b693SKris Buschelman   PetscFunctionBegin;
29aed4548fSBarry Smith   PetscCheck(m <= 0 || !i[0],PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
309566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(mat,m,n,m,n));
314222ddf1SHong Zhang 
32e4e71118SStefano Zampini   if (!mtype) {
339566063dSJacob Faibussowitsch     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij));
349566063dSJacob Faibussowitsch     if (!isseqaij) PetscCall(MatSetType(mat,MATSEQAIJ));
35e4e71118SStefano Zampini   } else {
369566063dSJacob Faibussowitsch     PetscCall(MatSetType(mat,mtype));
37e4e71118SStefano Zampini   }
38cbc6b225SStefano Zampini 
394222ddf1SHong Zhang   aij  = (Mat_SeqAIJ*)(mat)->data;
40cbc6b225SStefano Zampini   osingle = aij->singlemalloc;
41cbc6b225SStefano Zampini   ofree_a = aij->free_a;
42cbc6b225SStefano Zampini   ofree_ij = aij->free_ij;
43cbc6b225SStefano Zampini   /* changes the free flags */
449566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL));
45cbc6b225SStefano Zampini 
469566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->ilen));
479566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->imax));
489566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&aij->imax));
499566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&aij->ilen));
50cbc6b225SStefano Zampini   for (ii=0,aij->nonzerorowcnt=0,aij->rmax = 0; ii<m; ii++) {
51cbc6b225SStefano Zampini     const PetscInt rnz = i[ii+1] - i[ii];
52cbc6b225SStefano Zampini     aij->nonzerorowcnt += !!rnz;
53cbc6b225SStefano Zampini     aij->rmax = PetscMax(aij->rmax,rnz);
54cbc6b225SStefano Zampini     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
55cbc6b225SStefano Zampini   }
56cbc6b225SStefano Zampini   aij->maxnz = i[m];
57cbc6b225SStefano Zampini   aij->nz = i[m];
584222ddf1SHong Zhang 
59cbc6b225SStefano Zampini   if (osingle) {
609566063dSJacob Faibussowitsch     PetscCall(PetscFree3(aij->a,aij->j,aij->i));
61cbc6b225SStefano Zampini   } else {
629566063dSJacob Faibussowitsch     if (ofree_a)  PetscCall(PetscFree(aij->a));
639566063dSJacob Faibussowitsch     if (ofree_ij) PetscCall(PetscFree(aij->j));
649566063dSJacob Faibussowitsch     if (ofree_ij) PetscCall(PetscFree(aij->i));
65cbc6b225SStefano Zampini   }
664222ddf1SHong Zhang   aij->i            = i;
674222ddf1SHong Zhang   aij->j            = j;
684222ddf1SHong Zhang   aij->a            = a;
694222ddf1SHong Zhang   aij->nonew        = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
70cbc6b225SStefano Zampini   /* default to not retain ownership */
71cbc6b225SStefano Zampini   aij->singlemalloc = PETSC_FALSE;
724222ddf1SHong Zhang   aij->free_a       = PETSC_FALSE;
734222ddf1SHong Zhang   aij->free_ij      = PETSC_FALSE;
749566063dSJacob Faibussowitsch   PetscCall(MatCheckCompressedRow(mat,aij->nonzerorowcnt,&aij->compressedrow,aij->i,m,0.6));
755c66b693SKris Buschelman   PetscFunctionReturn(0);
765c66b693SKris Buschelman }
771c24bd37SHong Zhang 
784222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
794222ddf1SHong Zhang {
804222ddf1SHong Zhang   Mat_Product         *product = C->product;
814222ddf1SHong Zhang   MatProductAlgorithm alg;
824222ddf1SHong Zhang   PetscBool           flg;
834222ddf1SHong Zhang 
844222ddf1SHong Zhang   PetscFunctionBegin;
854222ddf1SHong Zhang   if (product) {
864222ddf1SHong Zhang     alg = product->alg;
874222ddf1SHong Zhang   } else {
884222ddf1SHong Zhang     alg = "sorted";
894222ddf1SHong Zhang   }
904222ddf1SHong Zhang   /* sorted */
919566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg,"sorted",&flg));
924222ddf1SHong Zhang   if (flg) {
939566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C));
944222ddf1SHong Zhang     PetscFunctionReturn(0);
954222ddf1SHong Zhang   }
964222ddf1SHong Zhang 
974222ddf1SHong Zhang   /* scalable */
989566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg,"scalable",&flg));
994222ddf1SHong Zhang   if (flg) {
1009566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C));
1014222ddf1SHong Zhang     PetscFunctionReturn(0);
1024222ddf1SHong Zhang   }
1034222ddf1SHong Zhang 
1044222ddf1SHong Zhang   /* scalable_fast */
1059566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg,"scalable_fast",&flg));
1064222ddf1SHong Zhang   if (flg) {
1079566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C));
1084222ddf1SHong Zhang     PetscFunctionReturn(0);
1094222ddf1SHong Zhang   }
1104222ddf1SHong Zhang 
1114222ddf1SHong Zhang   /* heap */
1129566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg,"heap",&flg));
1134222ddf1SHong Zhang   if (flg) {
1149566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C));
1154222ddf1SHong Zhang     PetscFunctionReturn(0);
1164222ddf1SHong Zhang   }
1174222ddf1SHong Zhang 
1184222ddf1SHong Zhang   /* btheap */
1199566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg,"btheap",&flg));
1204222ddf1SHong Zhang   if (flg) {
1219566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C));
1224222ddf1SHong Zhang     PetscFunctionReturn(0);
1234222ddf1SHong Zhang   }
1244222ddf1SHong Zhang 
1254222ddf1SHong Zhang   /* llcondensed */
1269566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg,"llcondensed",&flg));
1274222ddf1SHong Zhang   if (flg) {
1289566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C));
1294222ddf1SHong Zhang     PetscFunctionReturn(0);
1304222ddf1SHong Zhang   }
1314222ddf1SHong Zhang 
1324222ddf1SHong Zhang   /* rowmerge */
1339566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg,"rowmerge",&flg));
1344222ddf1SHong Zhang   if (flg) {
1359566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C));
1364222ddf1SHong Zhang     PetscFunctionReturn(0);
1374222ddf1SHong Zhang   }
1384222ddf1SHong Zhang 
1394222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
1409566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg,"hypre",&flg));
1414222ddf1SHong Zhang   if (flg) {
1429566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C));
1434222ddf1SHong Zhang     PetscFunctionReturn(0);
1444222ddf1SHong Zhang   }
1454222ddf1SHong Zhang #endif
1464222ddf1SHong Zhang 
1474222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1484222ddf1SHong Zhang }
1494222ddf1SHong Zhang 
1504222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
151b561aa9dSHong Zhang {
152b561aa9dSHong Zhang   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
153971236abSHong Zhang   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
154c1ba5575SJed Brown   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
155b561aa9dSHong Zhang   PetscReal          afill;
156eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
157eca6b86aSHong Zhang   PetscTable         ta;
158fb908031SHong Zhang   PetscBT            lnkbt;
1590298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
160b561aa9dSHong Zhang 
161b561aa9dSHong Zhang   PetscFunctionBegin;
162bd958071SHong Zhang   /* Get ci and cj */
163bd958071SHong Zhang   /*---------------*/
164fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
165fb908031SHong Zhang   /* free space for accumulating nonzero column info */
1669566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am+2,&ci));
167fb908031SHong Zhang   ci[0] = 0;
168fb908031SHong Zhang 
169fb908031SHong Zhang   /* create and initialize a linked list */
1709566063dSJacob Faibussowitsch   PetscCall(PetscTableCreate(bn,bn,&ta));
171c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
1729566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta,&Crmax));
1739566063dSJacob Faibussowitsch   PetscCall(PetscTableDestroy(&ta));
174eca6b86aSHong Zhang 
1759566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt));
176fb908031SHong Zhang 
177fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1789566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
1792205254eSKarl Rupp 
180fb908031SHong Zhang   current_space = free_space;
181fb908031SHong Zhang 
182fb908031SHong Zhang   /* Determine ci and cj */
183fb908031SHong Zhang   for (i=0; i<am; i++) {
184fb908031SHong Zhang     anzi = ai[i+1] - ai[i];
185fb908031SHong Zhang     aj   = a->j + ai[i];
186fb908031SHong Zhang     for (j=0; j<anzi; j++) {
187fb908031SHong Zhang       brow = aj[j];
188fb908031SHong Zhang       bnzj = bi[brow+1] - bi[brow];
189fb908031SHong Zhang       bj   = b->j + bi[brow];
190fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
1919566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt));
192fb908031SHong Zhang     }
1938c78258cSHong Zhang     /* add possible missing diagonal entry */
1948c78258cSHong Zhang     if (C->force_diagonals) {
1959566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted(1,&i,lnk,lnkbt));
1968c78258cSHong Zhang     }
197fb908031SHong Zhang     cnzi = lnk[0];
198fb908031SHong Zhang 
199fb908031SHong Zhang     /* If free space is not available, make more free space */
200fb908031SHong Zhang     /* Double the amount of total space in the list */
201fb908031SHong Zhang     if (current_space->local_remaining<cnzi) {
2029566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
203fb908031SHong Zhang       ndouble++;
204fb908031SHong Zhang     }
205fb908031SHong Zhang 
206fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
2079566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt));
2082205254eSKarl Rupp 
209fb908031SHong Zhang     current_space->array           += cnzi;
210fb908031SHong Zhang     current_space->local_used      += cnzi;
211fb908031SHong Zhang     current_space->local_remaining -= cnzi;
2122205254eSKarl Rupp 
213fb908031SHong Zhang     ci[i+1] = ci[i] + cnzi;
214fb908031SHong Zhang   }
215fb908031SHong Zhang 
216fb908031SHong Zhang   /* Column indices are in the list of free space */
217fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
218fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
2199566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am]+1,&cj));
2209566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
2219566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy(lnk,lnkbt));
222b561aa9dSHong Zhang 
22326be0446SHong Zhang   /* put together the new symbolic matrix */
2249566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
2259566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C,A,B));
22658c24d83SHong Zhang 
22758c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
22858c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
2294222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
230aa1aec99SHong Zhang   c->free_a  = PETSC_FALSE;
231e6b907acSBarry Smith   c->free_ij = PETSC_TRUE;
23258c24d83SHong Zhang   c->nonew   = 0;
2334222ddf1SHong Zhang 
2344222ddf1SHong Zhang   /* fast, needs non-scalable O(bn) array 'abdense' */
2354222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
2360b7e3e3dSHong Zhang 
2377212da7cSHong Zhang   /* set MatInfo */
2387212da7cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
239f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
2404222ddf1SHong Zhang   C->info.mallocs           = ndouble;
2414222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
2424222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
2437212da7cSHong Zhang 
2447212da7cSHong Zhang #if defined(PETSC_USE_INFO)
2457212da7cSHong Zhang   if (ci[am]) {
2469566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
2479566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
248f2b054eeSHong Zhang   } else {
2499566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Empty matrix product\n"));
250be0fcf8dSHong Zhang   }
251f2b054eeSHong Zhang #endif
25258c24d83SHong Zhang   PetscFunctionReturn(0);
25358c24d83SHong Zhang }
254d50806bdSBarry Smith 
255df97dc6dSFande Kong PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
256d50806bdSBarry Smith {
257d13dce4bSSatish Balay   PetscLogDouble    flops=0.0;
258d50806bdSBarry Smith   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
259d50806bdSBarry Smith   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
260d50806bdSBarry Smith   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
26138baddfdSBarry Smith   PetscInt          *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
262c58ca2e3SHong Zhang   PetscInt          am   =A->rmap->n,cm=C->rmap->n;
263519eb980SHong Zhang   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
2642e5835c6SStefano Zampini   PetscScalar       *ca,valtmp;
265aa1aec99SHong Zhang   PetscScalar       *ab_dense;
2666718818eSStefano Zampini   PetscContainer    cab_dense;
2672e5835c6SStefano Zampini   const PetscScalar *aa,*ba,*baj;
268d50806bdSBarry Smith 
269d50806bdSBarry Smith   PetscFunctionBegin;
2709566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A,&aa));
2719566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B,&ba));
272aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
2739566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm]+1,&ca));
274aa1aec99SHong Zhang     c->a      = ca;
275aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
2766718818eSStefano Zampini   } else ca = c->a;
2776718818eSStefano Zampini 
2786718818eSStefano Zampini   /* TODO this should be done in the symbolic phase */
2796718818eSStefano Zampini   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
2806718818eSStefano Zampini      that is hard to eradicate) */
2819566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense));
2826718818eSStefano Zampini   if (!cab_dense) {
2839566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(B->cmap->N,&ab_dense));
2849566063dSJacob Faibussowitsch     PetscCall(PetscContainerCreate(PETSC_COMM_SELF,&cab_dense));
2859566063dSJacob Faibussowitsch     PetscCall(PetscContainerSetPointer(cab_dense,ab_dense));
2869566063dSJacob Faibussowitsch     PetscCall(PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault));
2879566063dSJacob Faibussowitsch     PetscCall(PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense));
2889566063dSJacob Faibussowitsch     PetscCall(PetscObjectDereference((PetscObject)cab_dense));
289d90d86d0SMatthew G. Knepley   }
2909566063dSJacob Faibussowitsch   PetscCall(PetscContainerGetPointer(cab_dense,(void**)&ab_dense));
2919566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ab_dense,B->cmap->N));
292aa1aec99SHong Zhang 
293c124e916SHong Zhang   /* clean old values in C */
2949566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca,ci[cm]));
295d50806bdSBarry Smith   /* Traverse A row-wise. */
296d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
297d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
298d50806bdSBarry Smith   for (i=0; i<am; i++) {
299d50806bdSBarry Smith     anzi = ai[i+1] - ai[i];
300d50806bdSBarry Smith     for (j=0; j<anzi; j++) {
301519eb980SHong Zhang       brow = aj[j];
302d50806bdSBarry Smith       bnzi = bi[brow+1] - bi[brow];
303d50806bdSBarry Smith       bjj  = bj + bi[brow];
304d50806bdSBarry Smith       baj  = ba + bi[brow];
305519eb980SHong Zhang       /* perform dense axpy */
30636ec6d2dSHong Zhang       valtmp = aa[j];
307519eb980SHong Zhang       for (k=0; k<bnzi; k++) {
30836ec6d2dSHong Zhang         ab_dense[bjj[k]] += valtmp*baj[k];
309519eb980SHong Zhang       }
310519eb980SHong Zhang       flops += 2*bnzi;
311519eb980SHong Zhang     }
312c58ca2e3SHong Zhang     aj += anzi; aa += anzi;
313c58ca2e3SHong Zhang 
314c58ca2e3SHong Zhang     cnzi = ci[i+1] - ci[i];
315519eb980SHong Zhang     for (k=0; k<cnzi; k++) {
316519eb980SHong Zhang       ca[k]          += ab_dense[cj[k]];
317519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
318519eb980SHong Zhang     }
319519eb980SHong Zhang     flops += cnzi;
320519eb980SHong Zhang     cj    += cnzi; ca += cnzi;
321519eb980SHong Zhang   }
3222e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3232e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3242e5835c6SStefano Zampini #endif
3259566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
3269566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
3279566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3289566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A,&aa));
3299566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B,&ba));
330c58ca2e3SHong Zhang   PetscFunctionReturn(0);
331c58ca2e3SHong Zhang }
332c58ca2e3SHong Zhang 
33325023028SHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
334c58ca2e3SHong Zhang {
335c58ca2e3SHong Zhang   PetscLogDouble    flops=0.0;
336c58ca2e3SHong Zhang   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
337c58ca2e3SHong Zhang   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
338c58ca2e3SHong Zhang   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
339c58ca2e3SHong Zhang   PetscInt          *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
340c58ca2e3SHong Zhang   PetscInt          am   = A->rmap->N,cm=C->rmap->N;
341c58ca2e3SHong Zhang   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
3422e5835c6SStefano Zampini   PetscScalar       *ca=c->a,valtmp;
3432e5835c6SStefano Zampini   const PetscScalar *aa,*ba,*baj;
344c58ca2e3SHong Zhang   PetscInt          nextb;
345c58ca2e3SHong Zhang 
346c58ca2e3SHong Zhang   PetscFunctionBegin;
3479566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A,&aa));
3489566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B,&ba));
349cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
3509566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm]+1,&ca));
351cf742fccSHong Zhang     c->a      = ca;
352cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
353cf742fccSHong Zhang   }
354cf742fccSHong Zhang 
355c58ca2e3SHong Zhang   /* clean old values in C */
3569566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca,ci[cm]));
357c58ca2e3SHong Zhang   /* Traverse A row-wise. */
358c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
359c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
360519eb980SHong Zhang   for (i=0; i<am; i++) {
361519eb980SHong Zhang     anzi = ai[i+1] - ai[i];
362519eb980SHong Zhang     cnzi = ci[i+1] - ci[i];
363519eb980SHong Zhang     for (j=0; j<anzi; j++) {
364519eb980SHong Zhang       brow = aj[j];
365519eb980SHong Zhang       bnzi = bi[brow+1] - bi[brow];
366519eb980SHong Zhang       bjj  = bj + bi[brow];
367519eb980SHong Zhang       baj  = ba + bi[brow];
368519eb980SHong Zhang       /* perform sparse axpy */
36936ec6d2dSHong Zhang       valtmp = aa[j];
370c124e916SHong Zhang       nextb  = 0;
371c124e916SHong Zhang       for (k=0; nextb<bnzi; k++) {
372c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
37336ec6d2dSHong Zhang           ca[k] += valtmp*baj[nextb++];
374c124e916SHong Zhang         }
375d50806bdSBarry Smith       }
376d50806bdSBarry Smith       flops += 2*bnzi;
377d50806bdSBarry Smith     }
378519eb980SHong Zhang     aj += anzi; aa += anzi;
379519eb980SHong Zhang     cj += cnzi; ca += cnzi;
380d50806bdSBarry Smith   }
3812e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3822e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3832e5835c6SStefano Zampini #endif
3849566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
3859566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
3869566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A,&aa));
3889566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B,&ba));
389d50806bdSBarry Smith   PetscFunctionReturn(0);
390d50806bdSBarry Smith }
391bc011b1eSHong Zhang 
3924222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
39325296bd5SBarry Smith {
39425296bd5SBarry Smith   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
39525296bd5SBarry Smith   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
3963c50cad2SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
39725296bd5SBarry Smith   MatScalar          *ca;
39825296bd5SBarry Smith   PetscReal          afill;
399eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
400eca6b86aSHong Zhang   PetscTable         ta;
4010298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
40225296bd5SBarry Smith 
40325296bd5SBarry Smith   PetscFunctionBegin;
4043c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
4053c50cad2SHong Zhang   /*-----------------------------------------------------------------------------------------*/
4063c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
4079566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am+2,&ci));
4083c50cad2SHong Zhang   ci[0] = 0;
4093c50cad2SHong Zhang 
4103c50cad2SHong Zhang   /* create and initialize a linked list */
4119566063dSJacob Faibussowitsch   PetscCall(PetscTableCreate(bn,bn,&ta));
412c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
4139566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta,&Crmax));
4149566063dSJacob Faibussowitsch   PetscCall(PetscTableDestroy(&ta));
415eca6b86aSHong Zhang 
4169566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_fast(Crmax,&lnk));
4173c50cad2SHong Zhang 
4183c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
4199566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
4203c50cad2SHong Zhang   current_space = free_space;
4213c50cad2SHong Zhang 
4223c50cad2SHong Zhang   /* Determine ci and cj */
4233c50cad2SHong Zhang   for (i=0; i<am; i++) {
4243c50cad2SHong Zhang     anzi = ai[i+1] - ai[i];
4253c50cad2SHong Zhang     aj   = a->j + ai[i];
4263c50cad2SHong Zhang     for (j=0; j<anzi; j++) {
4273c50cad2SHong Zhang       brow = aj[j];
4283c50cad2SHong Zhang       bnzj = bi[brow+1] - bi[brow];
4293c50cad2SHong Zhang       bj   = b->j + bi[brow];
4303c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
4319566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_fast(bnzj,bj,lnk));
4323c50cad2SHong Zhang     }
4338c78258cSHong Zhang     /* add possible missing diagonal entry */
4348c78258cSHong Zhang     if (C->force_diagonals) {
4359566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_fast(1,&i,lnk));
4368c78258cSHong Zhang     }
4373c50cad2SHong Zhang     cnzi = lnk[1];
4383c50cad2SHong Zhang 
4393c50cad2SHong Zhang     /* If free space is not available, make more free space */
4403c50cad2SHong Zhang     /* Double the amount of total space in the list */
4413c50cad2SHong Zhang     if (current_space->local_remaining<cnzi) {
4429566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
4433c50cad2SHong Zhang       ndouble++;
4443c50cad2SHong Zhang     }
4453c50cad2SHong Zhang 
4463c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
4479566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_fast(cnzi,current_space->array,lnk));
4482205254eSKarl Rupp 
4493c50cad2SHong Zhang     current_space->array           += cnzi;
4503c50cad2SHong Zhang     current_space->local_used      += cnzi;
4513c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
4522205254eSKarl Rupp 
4533c50cad2SHong Zhang     ci[i+1] = ci[i] + cnzi;
4543c50cad2SHong Zhang   }
4553c50cad2SHong Zhang 
4563c50cad2SHong Zhang   /* Column indices are in the list of free space */
4573c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
4583c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
4599566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am]+1,&cj));
4609566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
4619566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_fast(lnk));
46225296bd5SBarry Smith 
46325296bd5SBarry Smith   /* Allocate space for ca */
4649566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am]+1,&ca));
46525296bd5SBarry Smith 
46625296bd5SBarry Smith   /* put together the new symbolic matrix */
4679566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C));
4689566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C,A,B));
46925296bd5SBarry Smith 
47025296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
47125296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
4724222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
47325296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
47425296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
47525296bd5SBarry Smith   c->nonew   = 0;
4762205254eSKarl Rupp 
4774222ddf1SHong Zhang   /* slower, less memory */
4784222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
47925296bd5SBarry Smith 
48025296bd5SBarry Smith   /* set MatInfo */
48125296bd5SBarry Smith   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
48225296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
4834222ddf1SHong Zhang   C->info.mallocs           = ndouble;
4844222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
4854222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
48625296bd5SBarry Smith 
48725296bd5SBarry Smith #if defined(PETSC_USE_INFO)
48825296bd5SBarry Smith   if (ci[am]) {
4899566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
4909566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
49125296bd5SBarry Smith   } else {
4929566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Empty matrix product\n"));
49325296bd5SBarry Smith   }
49425296bd5SBarry Smith #endif
49525296bd5SBarry Smith   PetscFunctionReturn(0);
49625296bd5SBarry Smith }
49725296bd5SBarry Smith 
4984222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
499e9e4536cSHong Zhang {
500e9e4536cSHong Zhang   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
501bf9555e6SHong Zhang   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
50225c41797SHong Zhang   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
503e9e4536cSHong Zhang   MatScalar          *ca;
504e9e4536cSHong Zhang   PetscReal          afill;
505eca6b86aSHong Zhang   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
506eca6b86aSHong Zhang   PetscTable         ta;
5070298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
508e9e4536cSHong Zhang 
509e9e4536cSHong Zhang   PetscFunctionBegin;
510bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
511bd958071SHong Zhang   /*---------------------------------------------------------------------------------------------*/
512bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
5139566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am+2,&ci));
514bd958071SHong Zhang   ci[0] = 0;
515bd958071SHong Zhang 
516bd958071SHong Zhang   /* create and initialize a linked list */
5179566063dSJacob Faibussowitsch   PetscCall(PetscTableCreate(bn,bn,&ta));
518c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b,bm,ta);
5199566063dSJacob Faibussowitsch   PetscCall(PetscTableGetCount(ta,&Crmax));
5209566063dSJacob Faibussowitsch   PetscCall(PetscTableDestroy(&ta));
5219566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk));
522bd958071SHong Zhang 
523bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
5249566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
525bd958071SHong Zhang   current_space = free_space;
526bd958071SHong Zhang 
527bd958071SHong Zhang   /* Determine ci and cj */
528bd958071SHong Zhang   for (i=0; i<am; i++) {
529bd958071SHong Zhang     anzi = ai[i+1] - ai[i];
530bd958071SHong Zhang     aj   = a->j + ai[i];
531bd958071SHong Zhang     for (j=0; j<anzi; j++) {
532bd958071SHong Zhang       brow = aj[j];
533bd958071SHong Zhang       bnzj = bi[brow+1] - bi[brow];
534bd958071SHong Zhang       bj   = b->j + bi[brow];
535bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
5369566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk));
537bd958071SHong Zhang     }
5388c78258cSHong Zhang     /* add possible missing diagonal entry */
5398c78258cSHong Zhang     if (C->force_diagonals) {
5409566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_Scalable(1,&i,lnk));
5418c78258cSHong Zhang     }
5428c78258cSHong Zhang 
543bd958071SHong Zhang     cnzi = lnk[0];
544bd958071SHong Zhang 
545bd958071SHong Zhang     /* If free space is not available, make more free space */
546bd958071SHong Zhang     /* Double the amount of total space in the list */
547bd958071SHong Zhang     if (current_space->local_remaining<cnzi) {
5489566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
549bd958071SHong Zhang       ndouble++;
550bd958071SHong Zhang     }
551bd958071SHong Zhang 
552bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
5539566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk));
5542205254eSKarl Rupp 
555bd958071SHong Zhang     current_space->array           += cnzi;
556bd958071SHong Zhang     current_space->local_used      += cnzi;
557bd958071SHong Zhang     current_space->local_remaining -= cnzi;
5582205254eSKarl Rupp 
559bd958071SHong Zhang     ci[i+1] = ci[i] + cnzi;
560bd958071SHong Zhang   }
561bd958071SHong Zhang 
562bd958071SHong Zhang   /* Column indices are in the list of free space */
563bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
564bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
5659566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am]+1,&cj));
5669566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
5679566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
568e9e4536cSHong Zhang 
569e9e4536cSHong Zhang   /* Allocate space for ca */
570bd958071SHong Zhang   /*-----------------------*/
5719566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am]+1,&ca));
572e9e4536cSHong Zhang 
573e9e4536cSHong Zhang   /* put together the new symbolic matrix */
5749566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C));
5759566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C,A,B));
576e9e4536cSHong Zhang 
577e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
578e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
5794222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
580e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
581e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
582e9e4536cSHong Zhang   c->nonew   = 0;
5832205254eSKarl Rupp 
5844222ddf1SHong Zhang   /* slower, less memory */
5854222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
586e9e4536cSHong Zhang 
587e9e4536cSHong Zhang   /* set MatInfo */
588e9e4536cSHong Zhang   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
589e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
5904222ddf1SHong Zhang   C->info.mallocs           = ndouble;
5914222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
5924222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
593e9e4536cSHong Zhang 
594e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
595e9e4536cSHong Zhang   if (ci[am]) {
5969566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
5979566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
598e9e4536cSHong Zhang   } else {
5999566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Empty matrix product\n"));
600e9e4536cSHong Zhang   }
601e9e4536cSHong Zhang #endif
602e9e4536cSHong Zhang   PetscFunctionReturn(0);
603e9e4536cSHong Zhang }
604e9e4536cSHong Zhang 
6054222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
6060ced3a2bSJed Brown {
6070ced3a2bSJed Brown   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
6080ced3a2bSJed Brown   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
6090ced3a2bSJed Brown   PetscInt           *ci,*cj,*bb;
6100ced3a2bSJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
6110ced3a2bSJed Brown   PetscReal          afill;
6120ced3a2bSJed Brown   PetscInt           i,j,col,ndouble = 0;
6130298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
6140ced3a2bSJed Brown   PetscHeap          h;
6150ced3a2bSJed Brown 
6160ced3a2bSJed Brown   PetscFunctionBegin;
617cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
6180ced3a2bSJed Brown   /*---------------------------------------------------------------------------------------------*/
6190ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
6209566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am+2,&ci));
6210ced3a2bSJed Brown   ci[0] = 0;
6220ced3a2bSJed Brown 
6230ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
6249566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
6250ced3a2bSJed Brown   current_space = free_space;
6260ced3a2bSJed Brown 
6279566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax,&h));
6289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax,&bb));
6290ced3a2bSJed Brown 
6300ced3a2bSJed Brown   /* Determine ci and cj */
6310ced3a2bSJed Brown   for (i=0; i<am; i++) {
6320ced3a2bSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6330ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
6340ced3a2bSJed Brown     ci[i+1] = ci[i];
6350ced3a2bSJed Brown     /* Populate the min heap */
6360ced3a2bSJed Brown     for (j=0; j<anzi; j++) {
6370ced3a2bSJed Brown       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
6380ced3a2bSJed Brown       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
6399566063dSJacob Faibussowitsch         PetscCall(PetscHeapAdd(h,j,bj[bb[j]++]));
6400ced3a2bSJed Brown       }
6410ced3a2bSJed Brown     }
6420ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
6439566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h,&j,&col));
6440ced3a2bSJed Brown     while (j >= 0) {
6450ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
6469566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space));
6470ced3a2bSJed Brown         ndouble++;
6480ced3a2bSJed Brown       }
6490ced3a2bSJed Brown       *(current_space->array++) = col;
6500ced3a2bSJed Brown       current_space->local_used++;
6510ced3a2bSJed Brown       current_space->local_remaining--;
6520ced3a2bSJed Brown       ci[i+1]++;
6530ced3a2bSJed Brown 
6540ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
6559566063dSJacob Faibussowitsch       if (bb[j] < bi[acol[j]+1]) PetscCall(PetscHeapStash(h,j,bj[bb[j]++]));
6560ced3a2bSJed Brown       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
6570ced3a2bSJed Brown         PetscInt j2,col2;
6589566063dSJacob Faibussowitsch         PetscCall(PetscHeapPeek(h,&j2,&col2));
6590ced3a2bSJed Brown         if (col2 != col) break;
6609566063dSJacob Faibussowitsch         PetscCall(PetscHeapPop(h,&j2,&col2));
6619566063dSJacob Faibussowitsch         if (bb[j2] < bi[acol[j2]+1]) PetscCall(PetscHeapStash(h,j2,bj[bb[j2]++]));
6620ced3a2bSJed Brown       }
6630ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
6649566063dSJacob Faibussowitsch       PetscCall(PetscHeapUnstash(h));
6659566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h,&j,&col));
6660ced3a2bSJed Brown     }
6670ced3a2bSJed Brown   }
6689566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
6699566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
6700ced3a2bSJed Brown 
6710ced3a2bSJed Brown   /* Column indices are in the list of free space */
6720ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
6730ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
6749566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am],&cj));
6759566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
6760ced3a2bSJed Brown 
6770ced3a2bSJed Brown   /* put together the new symbolic matrix */
6789566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
6799566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C,A,B));
6800ced3a2bSJed Brown 
6810ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6820ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
6834222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
6840ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6850ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6860ced3a2bSJed Brown   c->nonew   = 0;
68726fbe8dcSKarl Rupp 
6884222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6890ced3a2bSJed Brown 
6900ced3a2bSJed Brown   /* set MatInfo */
6910ced3a2bSJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
6920ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6934222ddf1SHong Zhang   C->info.mallocs           = ndouble;
6944222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
6954222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
6960ced3a2bSJed Brown 
6970ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6980ced3a2bSJed Brown   if (ci[am]) {
6999566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
7009566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
7010ced3a2bSJed Brown   } else {
7029566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Empty matrix product\n"));
7030ced3a2bSJed Brown   }
7040ced3a2bSJed Brown #endif
7050ced3a2bSJed Brown   PetscFunctionReturn(0);
7060ced3a2bSJed Brown }
707e9e4536cSHong Zhang 
7084222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
7098a07c6f1SJed Brown {
7108a07c6f1SJed Brown   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
7118a07c6f1SJed Brown   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
7128a07c6f1SJed Brown   PetscInt           *ci,*cj,*bb;
7138a07c6f1SJed Brown   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
7148a07c6f1SJed Brown   PetscReal          afill;
7158a07c6f1SJed Brown   PetscInt           i,j,col,ndouble = 0;
7160298fd71SBarry Smith   PetscFreeSpaceList free_space=NULL,current_space=NULL;
7178a07c6f1SJed Brown   PetscHeap          h;
7188a07c6f1SJed Brown   PetscBT            bt;
7198a07c6f1SJed Brown 
7208a07c6f1SJed Brown   PetscFunctionBegin;
721cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
7228a07c6f1SJed Brown   /*---------------------------------------------------------------------------------------------*/
7238a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
7249566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am+2,&ci));
7258a07c6f1SJed Brown   ci[0] = 0;
7268a07c6f1SJed Brown 
7278a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
7289566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
7292205254eSKarl Rupp 
7308a07c6f1SJed Brown   current_space = free_space;
7318a07c6f1SJed Brown 
7329566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax,&h));
7339566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax,&bb));
7349566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(bn,&bt));
7358a07c6f1SJed Brown 
7368a07c6f1SJed Brown   /* Determine ci and cj */
7378a07c6f1SJed Brown   for (i=0; i<am; i++) {
7388a07c6f1SJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
7398a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
7408a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
7418a07c6f1SJed Brown     ci[i+1] = ci[i];
7428a07c6f1SJed Brown     /* Populate the min heap */
7438a07c6f1SJed Brown     for (j=0; j<anzi; j++) {
7448a07c6f1SJed Brown       PetscInt brow = acol[j];
7458a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
7468a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7478a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
7489566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h,j,bcol));
7498a07c6f1SJed Brown           bb[j]++;
7508a07c6f1SJed Brown           break;
7518a07c6f1SJed Brown         }
7528a07c6f1SJed Brown       }
7538a07c6f1SJed Brown     }
7548a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
7559566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h,&j,&col));
7568a07c6f1SJed Brown     while (j >= 0) {
7578a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
7580298fd71SBarry Smith         fptr = NULL;                      /* need PetscBTMemzero */
7599566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space));
7608a07c6f1SJed Brown         ndouble++;
7618a07c6f1SJed Brown       }
7628a07c6f1SJed Brown       *(current_space->array++) = col;
7638a07c6f1SJed Brown       current_space->local_used++;
7648a07c6f1SJed Brown       current_space->local_remaining--;
7658a07c6f1SJed Brown       ci[i+1]++;
7668a07c6f1SJed Brown 
7678a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
7688a07c6f1SJed Brown       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
7698a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7708a07c6f1SJed Brown         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
7719566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h,j,bcol));
7728a07c6f1SJed Brown           bb[j]++;
7738a07c6f1SJed Brown           break;
7748a07c6f1SJed Brown         }
7758a07c6f1SJed Brown       }
7769566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h,&j,&col));
7778a07c6f1SJed Brown     }
7788a07c6f1SJed Brown     if (fptr) {                 /* Clear the bits for this row */
7799566063dSJacob Faibussowitsch       for (; fptr<current_space->array; fptr++) PetscCall(PetscBTClear(bt,*fptr));
7808a07c6f1SJed Brown     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7819566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(bn,bt));
7828a07c6f1SJed Brown     }
7838a07c6f1SJed Brown   }
7849566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
7859566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
7869566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&bt));
7878a07c6f1SJed Brown 
7888a07c6f1SJed Brown   /* Column indices are in the list of free space */
7898a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7908a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
7919566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am],&cj));
7929566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
7938a07c6f1SJed Brown 
7948a07c6f1SJed Brown   /* put together the new symbolic matrix */
7959566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
7969566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C,A,B));
7978a07c6f1SJed Brown 
7988a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7998a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
8004222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
8018a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
8028a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
8038a07c6f1SJed Brown   c->nonew   = 0;
80426fbe8dcSKarl Rupp 
8054222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
8068a07c6f1SJed Brown 
8078a07c6f1SJed Brown   /* set MatInfo */
8088a07c6f1SJed Brown   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
8098a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
8104222ddf1SHong Zhang   C->info.mallocs           = ndouble;
8114222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
8124222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
8138a07c6f1SJed Brown 
8148a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
8158a07c6f1SJed Brown   if (ci[am]) {
8169566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
8179566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
8188a07c6f1SJed Brown   } else {
8199566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Empty matrix product\n"));
8208a07c6f1SJed Brown   }
8218a07c6f1SJed Brown #endif
8228a07c6f1SJed Brown   PetscFunctionReturn(0);
8238a07c6f1SJed Brown }
8248a07c6f1SJed Brown 
8254222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
826d7ed1a4dSandi selinger {
827d7ed1a4dSandi selinger   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
828d7ed1a4dSandi selinger   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
829d7ed1a4dSandi selinger   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
830d7ed1a4dSandi selinger   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
831d7ed1a4dSandi selinger   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
832d7ed1a4dSandi selinger   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
833d7ed1a4dSandi selinger   const PetscInt     *brow_ptr[8],*brow_end[8];
834d7ed1a4dSandi selinger   PetscInt           window[8];
835d7ed1a4dSandi selinger   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
836d7ed1a4dSandi selinger   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
837d7ed1a4dSandi selinger   PetscReal          afill;
838f83700f2Sandi selinger   PetscInt           *workj_L1,*workj_L2,*workj_L3;
8397660c4dbSandi selinger   PetscInt           L1_nnz,L2_nnz;
840d7ed1a4dSandi selinger 
841d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
842d7ed1a4dSandi selinger              Because of the way virtual memory works,
843d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
844d7ed1a4dSandi selinger   PetscFunctionBegin;
8459566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am+1,&ci));
846d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
847d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
848d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
849d7ed1a4dSandi selinger     a_rownnz = 0;
850d7ed1a4dSandi selinger     for (k=0; k<anzi; ++k) {
851d7ed1a4dSandi selinger       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
852d7ed1a4dSandi selinger       if (a_rownnz > bn) {
853d7ed1a4dSandi selinger         a_rownnz = bn;
854d7ed1a4dSandi selinger         break;
855d7ed1a4dSandi selinger       }
856d7ed1a4dSandi selinger     }
857d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
858d7ed1a4dSandi selinger   }
859d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
8609566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L1));
8619566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L2));
8629566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz,&workj_L3));
863d7ed1a4dSandi selinger 
8647660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
8657660c4dbSandi selinger   c_maxmem = 8*(ai[am]+bi[bm]);
866d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
8679566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c_maxmem,&cj));
868d7ed1a4dSandi selinger 
869d7ed1a4dSandi selinger   ci_nnz       = 0;
870d7ed1a4dSandi selinger   ci[0]        = 0;
871d7ed1a4dSandi selinger   worki_L1[0]  = 0;
872d7ed1a4dSandi selinger   worki_L2[0]  = 0;
873d7ed1a4dSandi selinger   for (i=0; i<am; i++) {
874d7ed1a4dSandi selinger     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
875d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
876d7ed1a4dSandi selinger     rowsleft             = anzi;
877d7ed1a4dSandi selinger     inputcol_L1          = acol;
878d7ed1a4dSandi selinger     L2_nnz               = 0;
8797660c4dbSandi selinger     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8807660c4dbSandi selinger     worki_L2[1]          = 0;
88108fe1b3cSKarl Rupp     outputi_nnz          = 0;
882d7ed1a4dSandi selinger 
883d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
884d7ed1a4dSandi selinger     while (ci_nnz+a_maxrownnz > c_maxmem) {
885d7ed1a4dSandi selinger       c_maxmem *= 2;
886d7ed1a4dSandi selinger       ndouble++;
8879566063dSJacob Faibussowitsch       PetscCall(PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj));
888d7ed1a4dSandi selinger     }
889d7ed1a4dSandi selinger 
890d7ed1a4dSandi selinger     while (rowsleft) {
891d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
892d7ed1a4dSandi selinger       L1_nrows    = 0;
8937660c4dbSandi selinger       L1_nnz      = 0;
894d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
895d7ed1a4dSandi selinger       inputi      = bi;
896d7ed1a4dSandi selinger       inputj      = bj;
897d7ed1a4dSandi selinger 
898d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
899d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
900f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
901d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
902d7ed1a4dSandi selinger        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
903d7ed1a4dSandi selinger          window_min  = bn;                                                   \
9047660c4dbSandi selinger          outputi_nnz = 0;                                                    \
9057660c4dbSandi selinger          for (k=0; k<ANNZ; ++k) {                                            \
906d7ed1a4dSandi selinger            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
907d7ed1a4dSandi selinger            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
908d7ed1a4dSandi selinger            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
909d7ed1a4dSandi selinger            window_min  = PetscMin(window[k], window_min);                    \
910d7ed1a4dSandi selinger          }                                                                   \
911d7ed1a4dSandi selinger          while (window_min < bn) {                                           \
912d7ed1a4dSandi selinger            outputj[outputi_nnz++] = window_min;                              \
913d7ed1a4dSandi selinger            /* advance front and compute new minimum */                       \
914d7ed1a4dSandi selinger            old_window_min = window_min;                                      \
915d7ed1a4dSandi selinger            window_min = bn;                                                  \
916d7ed1a4dSandi selinger            for (k=0; k<ANNZ; ++k) {                                          \
917d7ed1a4dSandi selinger              if (window[k] == old_window_min) {                              \
918d7ed1a4dSandi selinger                brow_ptr[k]++;                                                \
919d7ed1a4dSandi selinger                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
920d7ed1a4dSandi selinger              }                                                               \
921d7ed1a4dSandi selinger              window_min = PetscMin(window[k], window_min);                   \
922d7ed1a4dSandi selinger            }                                                                 \
923d7ed1a4dSandi selinger          }
924d7ed1a4dSandi selinger 
925d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
926d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
927d7ed1a4dSandi selinger       while (L1_rowsleft) {
9287660c4dbSandi selinger         outputi_nnz = 0;
9297660c4dbSandi selinger         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
9307660c4dbSandi selinger         else           outputj = cj + ci_nnz;           /* Merge directly to C */
9317660c4dbSandi selinger 
932d7ed1a4dSandi selinger         switch (L1_rowsleft) {
933d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
934d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
935d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
936d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
937d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
938d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
939d7ed1a4dSandi selinger                  break;
940d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
941d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
942d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
943d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
944d7ed1a4dSandi selinger                  break;
945d7ed1a4dSandi selinger         case 3: MatMatMultSymbolic_RowMergeMacro(3);
946d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
947d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
948d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
949d7ed1a4dSandi selinger                  break;
950d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
951d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
952d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
953d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
954d7ed1a4dSandi selinger                  break;
955d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
956d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
957d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
958d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
959d7ed1a4dSandi selinger                  break;
960d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
961d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
962d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
963d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
964d7ed1a4dSandi selinger                  break;
965d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
966d7ed1a4dSandi selinger                  inputcol    += L1_rowsleft;
967d7ed1a4dSandi selinger                  rowsleft    -= L1_rowsleft;
968d7ed1a4dSandi selinger                  L1_rowsleft  = 0;
969d7ed1a4dSandi selinger                  break;
970d7ed1a4dSandi selinger         default: MatMatMultSymbolic_RowMergeMacro(8);
971d7ed1a4dSandi selinger                  inputcol    += 8;
972d7ed1a4dSandi selinger                  rowsleft    -= 8;
973d7ed1a4dSandi selinger                  L1_rowsleft -= 8;
974d7ed1a4dSandi selinger                  break;
975d7ed1a4dSandi selinger         }
976d7ed1a4dSandi selinger         inputcol_L1           = inputcol;
9777660c4dbSandi selinger         L1_nnz               += outputi_nnz;
9787660c4dbSandi selinger         worki_L1[++L1_nrows]  = L1_nnz;
979d7ed1a4dSandi selinger       }
980d7ed1a4dSandi selinger 
981d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
982d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
983d7ed1a4dSandi selinger       if (anzi > 8) {
984d7ed1a4dSandi selinger         inputi      = worki_L1;
985d7ed1a4dSandi selinger         inputj      = workj_L1;
986d7ed1a4dSandi selinger         inputcol    = workcol;
987d7ed1a4dSandi selinger         outputi_nnz = 0;
988d7ed1a4dSandi selinger 
989d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
990d7ed1a4dSandi selinger         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
991d7ed1a4dSandi selinger 
992d7ed1a4dSandi selinger         switch (L1_nrows) {
993d7ed1a4dSandi selinger         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
994d7ed1a4dSandi selinger                  brow_end[0] = inputj + inputi[inputcol[0]+1];
995d7ed1a4dSandi selinger                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
996d7ed1a4dSandi selinger                  break;
997d7ed1a4dSandi selinger         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
998d7ed1a4dSandi selinger         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
999d7ed1a4dSandi selinger         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1000d7ed1a4dSandi selinger         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1001d7ed1a4dSandi selinger         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1002d7ed1a4dSandi selinger         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1003d7ed1a4dSandi selinger         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1004d7ed1a4dSandi selinger         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1005d7ed1a4dSandi selinger         }
1006d7ed1a4dSandi selinger         L2_nnz               += outputi_nnz;
1007d7ed1a4dSandi selinger         worki_L2[++L2_nrows]  = L2_nnz;
1008d7ed1a4dSandi selinger 
10097660c4dbSandi selinger         /************************ L E V E L  3 **********************/
10107660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
1011d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1012d7ed1a4dSandi selinger           inputi      = worki_L2;
1013d7ed1a4dSandi selinger           inputj      = workj_L2;
1014d7ed1a4dSandi selinger           inputcol    = workcol;
1015d7ed1a4dSandi selinger           outputi_nnz = 0;
1016f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
1017d7ed1a4dSandi selinger           else          outputj = cj + ci_nnz;
1018d7ed1a4dSandi selinger           switch (L2_nrows) {
1019d7ed1a4dSandi selinger           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1020d7ed1a4dSandi selinger                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1021d7ed1a4dSandi selinger                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1022d7ed1a4dSandi selinger                    break;
1023d7ed1a4dSandi selinger           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1024d7ed1a4dSandi selinger           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1025d7ed1a4dSandi selinger           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1026d7ed1a4dSandi selinger           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1027d7ed1a4dSandi selinger           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1028d7ed1a4dSandi selinger           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1029d7ed1a4dSandi selinger           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1030d7ed1a4dSandi selinger           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1031d7ed1a4dSandi selinger           }
1032d7ed1a4dSandi selinger           L2_nrows    = 1;
10337660c4dbSandi selinger           L2_nnz      = outputi_nnz;
10347660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
10357660c4dbSandi selinger           /* Copy to workj_L2 */
1036d7ed1a4dSandi selinger           if (rowsleft) {
10377660c4dbSandi selinger             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1038d7ed1a4dSandi selinger           }
1039d7ed1a4dSandi selinger         }
1040d7ed1a4dSandi selinger       }
1041d7ed1a4dSandi selinger     }  /* while (rowsleft) */
1042d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
1043d7ed1a4dSandi selinger 
1044d7ed1a4dSandi selinger     /* terminate current row */
1045d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
1046d7ed1a4dSandi selinger     ci[i+1] = ci_nnz;
1047d7ed1a4dSandi selinger   }
1048d7ed1a4dSandi selinger 
1049d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
10509566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
10519566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C,A,B));
1052d7ed1a4dSandi selinger 
1053d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1054d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
10554222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
1056d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
1057d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
1058d7ed1a4dSandi selinger   c->nonew   = 0;
1059d7ed1a4dSandi selinger 
10604222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1061d7ed1a4dSandi selinger 
1062d7ed1a4dSandi selinger   /* set MatInfo */
1063d7ed1a4dSandi selinger   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1064d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
10654222ddf1SHong Zhang   C->info.mallocs           = ndouble;
10664222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
10674222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1068d7ed1a4dSandi selinger 
1069d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1070d7ed1a4dSandi selinger   if (ci[am]) {
10719566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
10729566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
1073d7ed1a4dSandi selinger   } else {
10749566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Empty matrix product\n"));
1075d7ed1a4dSandi selinger   }
1076d7ed1a4dSandi selinger #endif
1077d7ed1a4dSandi selinger 
1078d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
10799566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L1));
10809566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L2));
10819566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L3));
1082d7ed1a4dSandi selinger   PetscFunctionReturn(0);
1083d7ed1a4dSandi selinger }
1084d7ed1a4dSandi selinger 
1085cd093f1dSJed Brown /* concatenate unique entries and then sort */
10864222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1087cd093f1dSJed Brown {
1088cd093f1dSJed Brown   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1089cd093f1dSJed Brown   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
10908c78258cSHong Zhang   PetscInt       *ci,*cj,bcol;
1091cd093f1dSJed Brown   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1092cd093f1dSJed Brown   PetscReal      afill;
1093cd093f1dSJed Brown   PetscInt       i,j,ndouble = 0;
1094cd093f1dSJed Brown   PetscSegBuffer seg,segrow;
1095cd093f1dSJed Brown   char           *seen;
1096cd093f1dSJed Brown 
1097cd093f1dSJed Brown   PetscFunctionBegin;
10989566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am+1,&ci));
1099cd093f1dSJed Brown   ci[0] = 0;
1100cd093f1dSJed Brown 
1101cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
11029566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg));
11039566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt),100,&segrow));
11049566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(bn,&seen));
1105cd093f1dSJed Brown 
1106cd093f1dSJed Brown   /* Determine ci and cj */
1107cd093f1dSJed Brown   for (i=0; i<am; i++) {
1108cd093f1dSJed Brown     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1109cd093f1dSJed Brown     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1110cd093f1dSJed Brown     PetscInt packlen = 0,*PETSC_RESTRICT crow;
11118c78258cSHong Zhang 
1112cd093f1dSJed Brown     /* Pack segrow */
1113cd093f1dSJed Brown     for (j=0; j<anzi; j++) {
1114cd093f1dSJed Brown       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1115cd093f1dSJed Brown       for (k=bjstart; k<bjend; k++) {
11168c78258cSHong Zhang         bcol = bj[k];
1117cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1118cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
11199566063dSJacob Faibussowitsch           PetscCall(PetscSegBufferGetInts(segrow,1,&slot));
1120cd093f1dSJed Brown           *slot = bcol;
1121cd093f1dSJed Brown           seen[bcol] = 1;
1122cd093f1dSJed Brown           packlen++;
1123cd093f1dSJed Brown         }
1124cd093f1dSJed Brown       }
1125cd093f1dSJed Brown     }
11268c78258cSHong Zhang 
11278c78258cSHong Zhang     /* Check i-th diagonal entry */
11288c78258cSHong Zhang     if (C->force_diagonals && !seen[i]) {
11298c78258cSHong Zhang       PetscInt *PETSC_RESTRICT slot;
11309566063dSJacob Faibussowitsch       PetscCall(PetscSegBufferGetInts(segrow,1,&slot));
11318c78258cSHong Zhang       *slot   = i;
11328c78258cSHong Zhang       seen[i] = 1;
11338c78258cSHong Zhang       packlen++;
11348c78258cSHong Zhang     }
11358c78258cSHong Zhang 
11369566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferGetInts(seg,packlen,&crow));
11379566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferExtractTo(segrow,crow));
11389566063dSJacob Faibussowitsch     PetscCall(PetscSortInt(packlen,crow));
1139cd093f1dSJed Brown     ci[i+1] = ci[i] + packlen;
1140cd093f1dSJed Brown     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1141cd093f1dSJed Brown   }
11429566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&segrow));
11439566063dSJacob Faibussowitsch   PetscCall(PetscFree(seen));
1144cd093f1dSJed Brown 
1145cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
11469566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferExtractAlloc(seg,&cj));
11479566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&seg));
1148cd093f1dSJed Brown 
1149cd093f1dSJed Brown   /* put together the new symbolic matrix */
11509566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
11519566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C,A,B));
1152cd093f1dSJed Brown 
1153cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1154cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
11554222ddf1SHong Zhang   c          = (Mat_SeqAIJ*)(C->data);
1156cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1157cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1158cd093f1dSJed Brown   c->nonew   = 0;
1159cd093f1dSJed Brown 
11604222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1161cd093f1dSJed Brown 
1162cd093f1dSJed Brown   /* set MatInfo */
11632a09556fSStefano Zampini   afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5;
1164cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
11654222ddf1SHong Zhang   C->info.mallocs           = ndouble;
11664222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
11674222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1168cd093f1dSJed Brown 
1169cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1170cd093f1dSJed Brown   if (ci[am]) {
11719566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
11729566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
1173cd093f1dSJed Brown   } else {
11749566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C,"Empty matrix product\n"));
1175cd093f1dSJed Brown   }
1176cd093f1dSJed Brown #endif
1177cd093f1dSJed Brown   PetscFunctionReturn(0);
1178cd093f1dSJed Brown }
1179cd093f1dSJed Brown 
11806718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
11812128a86cSHong Zhang {
11826718818eSStefano Zampini   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;
11832128a86cSHong Zhang 
11842128a86cSHong Zhang   PetscFunctionBegin;
11859566063dSJacob Faibussowitsch   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
11869566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->Bt_den));
11879566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->ABt_den));
11889566063dSJacob Faibussowitsch   PetscCall(PetscFree(abt));
11892128a86cSHong Zhang   PetscFunctionReturn(0);
11902128a86cSHong Zhang }
11912128a86cSHong Zhang 
11924222ddf1SHong Zhang PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1193bc011b1eSHong Zhang {
119481d82fe4SHong Zhang   Mat                 Bt;
119540192850SHong Zhang   Mat_MatMatTransMult *abt;
11964222ddf1SHong Zhang   Mat_Product         *product = C->product;
11976718818eSStefano Zampini   char                *alg;
1198d2853540SHong Zhang 
119981d82fe4SHong Zhang   PetscFunctionBegin;
120028b400f6SJacob Faibussowitsch   PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
120128b400f6SJacob Faibussowitsch   PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
12026718818eSStefano Zampini 
120381d82fe4SHong Zhang   /* create symbolic Bt */
12047fb60732SBarry Smith   PetscCall(MatTransposeSymbolic(B,&Bt));
12059566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs)));
12069566063dSJacob Faibussowitsch   PetscCall(MatSetType(Bt,((PetscObject)A)->type_name));
120781d82fe4SHong Zhang 
120881d82fe4SHong Zhang   /* get symbolic C=A*Bt */
12099566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(product->alg,&alg));
12109566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C,"sorted")); /* set algorithm for C = A*Bt */
12119566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C));
12129566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C,alg)); /* resume original algorithm for ABt product */
12139566063dSJacob Faibussowitsch   PetscCall(PetscFree(alg));
121481d82fe4SHong Zhang 
1215a5b23f4aSJose E. Roman   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
12169566063dSJacob Faibussowitsch   PetscCall(PetscNew(&abt));
12172128a86cSHong Zhang 
12186718818eSStefano Zampini   product->data    = abt;
12196718818eSStefano Zampini   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
12206718818eSStefano Zampini 
12214222ddf1SHong Zhang   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
12222128a86cSHong Zhang 
12234222ddf1SHong Zhang   abt->usecoloring = PETSC_FALSE;
12249566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg,"color",&abt->usecoloring));
122540192850SHong Zhang   if (abt->usecoloring) {
1226b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1227b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1228335efc43SPeter Brune     MatColoring          coloring;
12292128a86cSHong Zhang     ISColoring           iscoloring;
12302128a86cSHong Zhang     Mat                  Bt_dense,C_dense;
1231e8354b3eSHong Zhang 
12324222ddf1SHong Zhang     /* inode causes memory problem */
12339566063dSJacob Faibussowitsch     PetscCall(MatSetOption(C,MAT_USE_INODES,PETSC_FALSE));
12344222ddf1SHong Zhang 
12359566063dSJacob Faibussowitsch     PetscCall(MatColoringCreate(C,&coloring));
12369566063dSJacob Faibussowitsch     PetscCall(MatColoringSetDistance(coloring,2));
12379566063dSJacob Faibussowitsch     PetscCall(MatColoringSetType(coloring,MATCOLORINGSL));
12389566063dSJacob Faibussowitsch     PetscCall(MatColoringSetFromOptions(coloring));
12399566063dSJacob Faibussowitsch     PetscCall(MatColoringApply(coloring,&iscoloring));
12409566063dSJacob Faibussowitsch     PetscCall(MatColoringDestroy(&coloring));
12419566063dSJacob Faibussowitsch     PetscCall(MatTransposeColoringCreate(C,iscoloring,&matcoloring));
12422205254eSKarl Rupp 
124340192850SHong Zhang     abt->matcoloring = matcoloring;
12442205254eSKarl Rupp 
12459566063dSJacob Faibussowitsch     PetscCall(ISColoringDestroy(&iscoloring));
12462128a86cSHong Zhang 
12472128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
12489566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF,&Bt_dense));
12499566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors));
12509566063dSJacob Faibussowitsch     PetscCall(MatSetType(Bt_dense,MATSEQDENSE));
12519566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(Bt_dense,NULL));
12522205254eSKarl Rupp 
12532128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
125440192850SHong Zhang     abt->Bt_den         = Bt_dense;
12552128a86cSHong Zhang 
12569566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF,&C_dense));
12579566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors));
12589566063dSJacob Faibussowitsch     PetscCall(MatSetType(C_dense,MATSEQDENSE));
12599566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(C_dense,NULL));
12602205254eSKarl Rupp 
12612128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
126240192850SHong Zhang     abt->ABt_den  = C_dense;
1263f94ccd6cSHong Zhang 
1264f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12651ce71dffSSatish Balay     {
12664222ddf1SHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
12679566063dSJacob Faibussowitsch       PetscCall(PetscInfo(C,"Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(double)(((PetscReal)(c->nz))/((PetscReal)(A->rmap->n*matcoloring->ncolors)))));
12681ce71dffSSatish Balay     }
1269f94ccd6cSHong Zhang #endif
12702128a86cSHong Zhang   }
1271e99be685SHong Zhang   /* clean up */
12729566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&Bt));
12735df89d91SHong Zhang   PetscFunctionReturn(0);
12745df89d91SHong Zhang }
12755df89d91SHong Zhang 
12766fc122caSHong Zhang PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
12775df89d91SHong Zhang {
12785df89d91SHong Zhang   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1279e2cac8caSJed Brown   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
128081d82fe4SHong Zhang   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
12815df89d91SHong Zhang   PetscLogDouble      flops=0.0;
1282aa1aec99SHong Zhang   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
12836718818eSStefano Zampini   Mat_MatMatTransMult *abt;
12846718818eSStefano Zampini   Mat_Product         *product = C->product;
12855df89d91SHong Zhang 
12865df89d91SHong Zhang   PetscFunctionBegin;
128728b400f6SJacob Faibussowitsch   PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
12886718818eSStefano Zampini   abt = (Mat_MatMatTransMult *)product->data;
128928b400f6SJacob Faibussowitsch   PetscCheck(abt,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
129058ed3ceaSHong Zhang   /* clear old values in C */
129158ed3ceaSHong Zhang   if (!c->a) {
12929566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm]+1,&ca));
129358ed3ceaSHong Zhang     c->a      = ca;
129458ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
129558ed3ceaSHong Zhang   } else {
129658ed3ceaSHong Zhang     ca =  c->a;
12979566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca,ci[cm]+1));
129858ed3ceaSHong Zhang   }
129958ed3ceaSHong Zhang 
130040192850SHong Zhang   if (abt->usecoloring) {
130140192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
130240192850SHong Zhang     Mat                  Bt_dense,C_dense = abt->ABt_den;
1303c8db22f6SHong Zhang 
1304b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
130540192850SHong Zhang     Bt_dense = abt->Bt_den;
13069566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplySpToDen(matcoloring,B,Bt_dense));
1307c8db22f6SHong Zhang 
1308c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
13099566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense));
1310c8db22f6SHong Zhang 
13112128a86cSHong Zhang     /* Recover C from C_dense */
13129566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplyDenToSp(matcoloring,C_dense,C));
1313c8db22f6SHong Zhang     PetscFunctionReturn(0);
1314c8db22f6SHong Zhang   }
1315c8db22f6SHong Zhang 
13161fa1209cSHong Zhang   for (i=0; i<cm; i++) {
131781d82fe4SHong Zhang     anzi = ai[i+1] - ai[i];
131881d82fe4SHong Zhang     acol = aj + ai[i];
13196973769bSHong Zhang     aval = aa + ai[i];
13201fa1209cSHong Zhang     cnzi = ci[i+1] - ci[i];
13211fa1209cSHong Zhang     ccol = cj + ci[i];
13226973769bSHong Zhang     cval = ca + ci[i];
13231fa1209cSHong Zhang     for (j=0; j<cnzi; j++) {
132481d82fe4SHong Zhang       brow = ccol[j];
132581d82fe4SHong Zhang       bnzj = bi[brow+1] - bi[brow];
132681d82fe4SHong Zhang       bcol = bj + bi[brow];
13276973769bSHong Zhang       bval = ba + bi[brow];
13286973769bSHong Zhang 
132981d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
133081d82fe4SHong Zhang       nexta = 0; nextb = 0;
133181d82fe4SHong Zhang       while (nexta<anzi && nextb<bnzj) {
13327b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
133381d82fe4SHong Zhang         if (nexta == anzi) break;
13347b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
133581d82fe4SHong Zhang         if (nextb == bnzj) break;
133681d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
13376973769bSHong Zhang           cval[j] += aval[nexta]*bval[nextb];
133881d82fe4SHong Zhang           nexta++; nextb++;
133981d82fe4SHong Zhang           flops += 2;
13401fa1209cSHong Zhang         }
13411fa1209cSHong Zhang       }
134281d82fe4SHong Zhang     }
134381d82fe4SHong Zhang   }
13449566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
13459566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
13469566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
13475df89d91SHong Zhang   PetscFunctionReturn(0);
13485df89d91SHong Zhang }
13495df89d91SHong Zhang 
13506718818eSStefano Zampini PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
13516d373c3eSHong Zhang {
13526718818eSStefano Zampini   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;
13536d373c3eSHong Zhang 
13546d373c3eSHong Zhang   PetscFunctionBegin;
13559566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&atb->At));
13561baa6e33SBarry Smith   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
13579566063dSJacob Faibussowitsch   PetscCall(PetscFree(atb));
13586d373c3eSHong Zhang   PetscFunctionReturn(0);
13596d373c3eSHong Zhang }
13606d373c3eSHong Zhang 
13614222ddf1SHong Zhang PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
13625df89d91SHong Zhang {
1363089a957eSStefano Zampini   Mat            At = NULL;
13644222ddf1SHong Zhang   Mat_Product    *product = C->product;
1365089a957eSStefano Zampini   PetscBool      flg,def,square;
1366bc011b1eSHong Zhang 
1367bc011b1eSHong Zhang   PetscFunctionBegin;
1368089a957eSStefano Zampini   MatCheckProduct(C,4);
1369b94d7dedSBarry Smith   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
13704222ddf1SHong Zhang   /* outerproduct */
13719566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg,"outerproduct",&flg));
13724222ddf1SHong Zhang   if (flg) {
1373bc011b1eSHong Zhang     /* create symbolic At */
1374089a957eSStefano Zampini     if (!square) {
13757fb60732SBarry Smith       PetscCall(MatTransposeSymbolic(A,&At));
13769566063dSJacob Faibussowitsch       PetscCall(MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs)));
13779566063dSJacob Faibussowitsch       PetscCall(MatSetType(At,((PetscObject)A)->type_name));
1378089a957eSStefano Zampini     }
1379bc011b1eSHong Zhang     /* get symbolic C=At*B */
13809566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,"sorted"));
13819566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C));
1382bc011b1eSHong Zhang 
1383bc011b1eSHong Zhang     /* clean up */
1384089a957eSStefano Zampini     if (!square) {
13859566063dSJacob Faibussowitsch       PetscCall(MatDestroy(&At));
1386089a957eSStefano Zampini     }
13876d373c3eSHong Zhang 
13884222ddf1SHong Zhang     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
13899566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,"outerproduct"));
13904222ddf1SHong Zhang     PetscFunctionReturn(0);
13914222ddf1SHong Zhang   }
13924222ddf1SHong Zhang 
13934222ddf1SHong Zhang   /* matmatmult */
13949566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg,"default",&def));
13959566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg,"at*b",&flg));
1396089a957eSStefano Zampini   if (flg || def) {
13974222ddf1SHong Zhang     Mat_MatTransMatMult *atb;
13984222ddf1SHong Zhang 
139928b400f6SJacob Faibussowitsch     PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
14009566063dSJacob Faibussowitsch     PetscCall(PetscNew(&atb));
1401089a957eSStefano Zampini     if (!square) {
1402acd337a6SBarry Smith       PetscCall(MatTranspose(A,MAT_INITIAL_MATRIX,&At));
1403089a957eSStefano Zampini     }
14049566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,"sorted"));
14059566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C));
14069566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,"at*b"));
14076718818eSStefano Zampini     product->data    = atb;
14086718818eSStefano Zampini     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
14094222ddf1SHong Zhang     atb->At          = At;
14104222ddf1SHong Zhang 
14114222ddf1SHong Zhang     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
14124222ddf1SHong Zhang     PetscFunctionReturn(0);
14134222ddf1SHong Zhang   }
14144222ddf1SHong Zhang 
14154222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1416bc011b1eSHong Zhang }
1417bc011b1eSHong Zhang 
141875648e8dSHong Zhang PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1419bc011b1eSHong Zhang {
14200fbc74f4SHong Zhang   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1421d0f46423SBarry Smith   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1422d0f46423SBarry Smith   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1423d13dce4bSSatish Balay   PetscLogDouble flops=0.0;
1424aa1aec99SHong Zhang   MatScalar      *aa=a->a,*ba,*ca,*caj;
1425bc011b1eSHong Zhang 
1426bc011b1eSHong Zhang   PetscFunctionBegin;
1427aa1aec99SHong Zhang   if (!c->a) {
14289566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm]+1,&ca));
14292205254eSKarl Rupp 
1430aa1aec99SHong Zhang     c->a      = ca;
1431aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1432aa1aec99SHong Zhang   } else {
1433aa1aec99SHong Zhang     ca   = c->a;
14349566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca,ci[cm]));
1435aa1aec99SHong Zhang   }
1436bc011b1eSHong Zhang 
1437bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1438bc011b1eSHong Zhang   for (i=0; i<am; i++) {
1439bc011b1eSHong Zhang     bj   = b->j + bi[i];
1440bc011b1eSHong Zhang     ba   = b->a + bi[i];
1441bc011b1eSHong Zhang     bnzi = bi[i+1] - bi[i];
1442bc011b1eSHong Zhang     anzi = ai[i+1] - ai[i];
1443bc011b1eSHong Zhang     for (j=0; j<anzi; j++) {
1444bc011b1eSHong Zhang       nextb = 0;
14450fbc74f4SHong Zhang       crow  = *aj++;
1446bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1447bc011b1eSHong Zhang       caj   = ca + ci[crow];
1448bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1449bc011b1eSHong Zhang       for (k=0; nextb<bnzi; k++) {
14500fbc74f4SHong Zhang         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
14510fbc74f4SHong Zhang           caj[k] += (*aa)*(*(ba+nextb));
1452bc011b1eSHong Zhang           nextb++;
1453bc011b1eSHong Zhang         }
1454bc011b1eSHong Zhang       }
1455bc011b1eSHong Zhang       flops += 2*bnzi;
14560fbc74f4SHong Zhang       aa++;
1457bc011b1eSHong Zhang     }
1458bc011b1eSHong Zhang   }
1459bc011b1eSHong Zhang 
1460bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
14619566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
14629566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
14639566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
1464bc011b1eSHong Zhang   PetscFunctionReturn(0);
1465bc011b1eSHong Zhang }
14669123193aSHong Zhang 
14674222ddf1SHong Zhang PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
14689123193aSHong Zhang {
14699123193aSHong Zhang   PetscFunctionBegin;
14709566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C));
14714222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
14729123193aSHong Zhang   PetscFunctionReturn(0);
14739123193aSHong Zhang }
14749123193aSHong Zhang 
147593aa15f2SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)
14769123193aSHong Zhang {
1477f32d5d43SBarry Smith   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
14781ca9667aSStefano Zampini   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1479a4af7ca8SStefano Zampini   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1480daf57211SHong Zhang   const PetscInt    *aj;
148175f6d85dSStefano Zampini   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm,am=A->rmap->n;
148275f6d85dSStefano Zampini   PetscInt          clda;
148375f6d85dSStefano Zampini   PetscInt          am4,bm4,col,i,j,n;
14849123193aSHong Zhang 
14859123193aSHong Zhang   PetscFunctionBegin;
1486f32d5d43SBarry Smith   if (!cm || !cn) PetscFunctionReturn(0);
14879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A,&av));
148893aa15f2SStefano Zampini   if (add) {
14899566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArray(C,&c));
149093aa15f2SStefano Zampini   } else {
14919566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArrayWrite(C,&c));
149293aa15f2SStefano Zampini   }
14939566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(B,&b));
14949566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B,&bm));
14959566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(C,&clda));
149675f6d85dSStefano Zampini   am4 = 4*clda;
149775f6d85dSStefano Zampini   bm4 = 4*bm;
1498f32d5d43SBarry Smith   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
14991ca9667aSStefano Zampini   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
15001ca9667aSStefano Zampini   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
15011ca9667aSStefano Zampini     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1502f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1503f32d5d43SBarry Smith       n  = a->i[i+1] - a->i[i];
1504f32d5d43SBarry Smith       aj = a->j + a->i[i];
1505a4af7ca8SStefano Zampini       aa = av + a->i[i];
1506f32d5d43SBarry Smith       for (j=0; j<n; j++) {
15071ca9667aSStefano Zampini         const PetscScalar aatmp = aa[j];
15081ca9667aSStefano Zampini         const PetscInt    ajtmp = aj[j];
1509730858b9SHong Zhang         r1 += aatmp*b1[ajtmp];
1510730858b9SHong Zhang         r2 += aatmp*b2[ajtmp];
1511730858b9SHong Zhang         r3 += aatmp*b3[ajtmp];
1512730858b9SHong Zhang         r4 += aatmp*b4[ajtmp];
15139123193aSHong Zhang       }
151493aa15f2SStefano Zampini       if (add) {
151587753ddeSHong Zhang         c1[i] += r1;
151687753ddeSHong Zhang         c2[i] += r2;
151787753ddeSHong Zhang         c3[i] += r3;
151887753ddeSHong Zhang         c4[i] += r4;
151993aa15f2SStefano Zampini       } else {
152093aa15f2SStefano Zampini         c1[i] = r1;
152193aa15f2SStefano Zampini         c2[i] = r2;
152293aa15f2SStefano Zampini         c3[i] = r3;
152393aa15f2SStefano Zampini         c4[i] = r4;
152493aa15f2SStefano Zampini       }
1525f32d5d43SBarry Smith     }
1526730858b9SHong Zhang     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1527730858b9SHong Zhang     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1528f32d5d43SBarry Smith   }
152993aa15f2SStefano Zampini   /* process remaining columns */
153093aa15f2SStefano Zampini   if (col != cn) {
153193aa15f2SStefano Zampini     PetscInt rc = cn-col;
153293aa15f2SStefano Zampini 
153393aa15f2SStefano Zampini     if (rc == 1) {
153493aa15f2SStefano Zampini       for (i=0; i<am; i++) {
1535f32d5d43SBarry Smith         r1 = 0.0;
1536f32d5d43SBarry Smith         n  = a->i[i+1] - a->i[i];
1537f32d5d43SBarry Smith         aj = a->j + a->i[i];
1538a4af7ca8SStefano Zampini         aa = av + a->i[i];
153993aa15f2SStefano Zampini         for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]];
154093aa15f2SStefano Zampini         if (add) c1[i] += r1;
154193aa15f2SStefano Zampini         else c1[i] = r1;
154293aa15f2SStefano Zampini       }
154393aa15f2SStefano Zampini     } else if (rc == 2) {
154493aa15f2SStefano Zampini       for (i=0; i<am; i++) {
154593aa15f2SStefano Zampini         r1 = r2 = 0.0;
154693aa15f2SStefano Zampini         n  = a->i[i+1] - a->i[i];
154793aa15f2SStefano Zampini         aj = a->j + a->i[i];
154893aa15f2SStefano Zampini         aa = av + a->i[i];
1549f32d5d43SBarry Smith         for (j=0; j<n; j++) {
155093aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
155193aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
155293aa15f2SStefano Zampini           r1 += aatmp*b1[ajtmp];
155393aa15f2SStefano Zampini           r2 += aatmp*b2[ajtmp];
1554f32d5d43SBarry Smith         }
155593aa15f2SStefano Zampini         if (add) {
155687753ddeSHong Zhang           c1[i] += r1;
155793aa15f2SStefano Zampini           c2[i] += r2;
155893aa15f2SStefano Zampini         } else {
155993aa15f2SStefano Zampini           c1[i] = r1;
156093aa15f2SStefano Zampini           c2[i] = r2;
1561f32d5d43SBarry Smith         }
156293aa15f2SStefano Zampini       }
156393aa15f2SStefano Zampini     } else {
156493aa15f2SStefano Zampini       for (i=0; i<am; i++) {
156593aa15f2SStefano Zampini         r1 = r2 = r3 = 0.0;
156693aa15f2SStefano Zampini         n  = a->i[i+1] - a->i[i];
156793aa15f2SStefano Zampini         aj = a->j + a->i[i];
156893aa15f2SStefano Zampini         aa = av + a->i[i];
156993aa15f2SStefano Zampini         for (j=0; j<n; j++) {
157093aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
157193aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
157293aa15f2SStefano Zampini           r1 += aatmp*b1[ajtmp];
157393aa15f2SStefano Zampini           r2 += aatmp*b2[ajtmp];
157493aa15f2SStefano Zampini           r3 += aatmp*b3[ajtmp];
157593aa15f2SStefano Zampini         }
157693aa15f2SStefano Zampini         if (add) {
157793aa15f2SStefano Zampini           c1[i] += r1;
157893aa15f2SStefano Zampini           c2[i] += r2;
157993aa15f2SStefano Zampini           c3[i] += r3;
158093aa15f2SStefano Zampini         } else {
158193aa15f2SStefano Zampini           c1[i] = r1;
158293aa15f2SStefano Zampini           c2[i] = r2;
158393aa15f2SStefano Zampini           c3[i] = r3;
158493aa15f2SStefano Zampini         }
158593aa15f2SStefano Zampini       }
158693aa15f2SStefano Zampini     }
1587f32d5d43SBarry Smith   }
15889566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(cn*(2.0*a->nz)));
158993aa15f2SStefano Zampini   if (add) {
15909566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArray(C,&c));
159193aa15f2SStefano Zampini   } else {
15929566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArrayWrite(C,&c));
159393aa15f2SStefano Zampini   }
15949566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(B,&b));
15959566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A,&av));
1596f32d5d43SBarry Smith   PetscFunctionReturn(0);
1597f32d5d43SBarry Smith }
1598f32d5d43SBarry Smith 
159987753ddeSHong Zhang PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1600f32d5d43SBarry Smith {
1601f32d5d43SBarry Smith   PetscFunctionBegin;
160208401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n);
160308401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n);
160408401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n);
16054324174eSBarry Smith 
16069566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE));
16079123193aSHong Zhang   PetscFunctionReturn(0);
16089123193aSHong Zhang }
1609b1683b59SHong Zhang 
16104222ddf1SHong Zhang /* ------------------------------------------------------- */
16114222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
16124222ddf1SHong Zhang {
16134222ddf1SHong Zhang   PetscFunctionBegin;
16144222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
16154222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16164222ddf1SHong Zhang   PetscFunctionReturn(0);
16174222ddf1SHong Zhang }
16184222ddf1SHong Zhang 
16196718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);
16206718818eSStefano Zampini 
16214222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
16224222ddf1SHong Zhang {
16234222ddf1SHong Zhang   PetscFunctionBegin;
16246718818eSStefano Zampini   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16254222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_AtB;
16266718818eSStefano Zampini   PetscFunctionReturn(0);
16276718818eSStefano Zampini }
16286718818eSStefano Zampini 
16296718818eSStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
16306718818eSStefano Zampini {
16316718818eSStefano Zampini   PetscFunctionBegin;
16326718818eSStefano Zampini   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16336718818eSStefano Zampini   C->ops->productsymbolic          = MatProductSymbolic_ABt;
16344222ddf1SHong Zhang   PetscFunctionReturn(0);
16354222ddf1SHong Zhang }
16364222ddf1SHong Zhang 
16374222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
16384222ddf1SHong Zhang {
16394222ddf1SHong Zhang   Mat_Product    *product = C->product;
16404222ddf1SHong Zhang 
16414222ddf1SHong Zhang   PetscFunctionBegin;
16424222ddf1SHong Zhang   switch (product->type) {
16434222ddf1SHong Zhang   case MATPRODUCT_AB:
16449566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
16454222ddf1SHong Zhang     break;
16464222ddf1SHong Zhang   case MATPRODUCT_AtB:
16479566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
16484222ddf1SHong Zhang     break;
16496718818eSStefano Zampini   case MATPRODUCT_ABt:
16509566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
16514222ddf1SHong Zhang     break;
16524222ddf1SHong Zhang   default:
16536718818eSStefano Zampini     break;
16544222ddf1SHong Zhang   }
16554222ddf1SHong Zhang   PetscFunctionReturn(0);
16564222ddf1SHong Zhang }
16574222ddf1SHong Zhang /* ------------------------------------------------------- */
16584222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
16594222ddf1SHong Zhang {
16604222ddf1SHong Zhang   Mat_Product    *product = C->product;
16614222ddf1SHong Zhang   Mat            A = product->A;
16624222ddf1SHong Zhang   PetscBool      baij;
16634222ddf1SHong Zhang 
16644222ddf1SHong Zhang   PetscFunctionBegin;
16659566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij));
16664222ddf1SHong Zhang   if (!baij) { /* A is seqsbaij */
16674222ddf1SHong Zhang     PetscBool sbaij;
16689566063dSJacob Faibussowitsch     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij));
166928b400f6SJacob Faibussowitsch     PetscCheck(sbaij,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
16704222ddf1SHong Zhang 
16714222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
16724222ddf1SHong Zhang   } else { /* A is seqbaij */
16734222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
16744222ddf1SHong Zhang   }
16754222ddf1SHong Zhang 
16764222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16774222ddf1SHong Zhang   PetscFunctionReturn(0);
16784222ddf1SHong Zhang }
16794222ddf1SHong Zhang 
16804222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
16814222ddf1SHong Zhang {
16824222ddf1SHong Zhang   Mat_Product    *product = C->product;
16834222ddf1SHong Zhang 
16844222ddf1SHong Zhang   PetscFunctionBegin;
16856718818eSStefano Zampini   MatCheckProduct(C,1);
168628b400f6SJacob Faibussowitsch   PetscCheck(product->A,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1687b94d7dedSBarry Smith   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
16884222ddf1SHong Zhang   PetscFunctionReturn(0);
16894222ddf1SHong Zhang }
16906718818eSStefano Zampini 
16914222ddf1SHong Zhang /* ------------------------------------------------------- */
16924222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
16934222ddf1SHong Zhang {
16944222ddf1SHong Zhang   PetscFunctionBegin;
16954222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
16964222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16974222ddf1SHong Zhang   PetscFunctionReturn(0);
16984222ddf1SHong Zhang }
16994222ddf1SHong Zhang 
17004222ddf1SHong Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
17014222ddf1SHong Zhang {
17024222ddf1SHong Zhang   Mat_Product    *product = C->product;
17034222ddf1SHong Zhang 
17044222ddf1SHong Zhang   PetscFunctionBegin;
17054222ddf1SHong Zhang   if (product->type == MATPRODUCT_AB) {
17069566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
17076718818eSStefano Zampini   }
17084222ddf1SHong Zhang   PetscFunctionReturn(0);
17094222ddf1SHong Zhang }
17104222ddf1SHong Zhang /* ------------------------------------------------------- */
17114222ddf1SHong Zhang 
1712b9af6bddSHong Zhang PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1713c8db22f6SHong Zhang {
17142f5f1f90SHong Zhang   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
17152f5f1f90SHong Zhang   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
17162f5f1f90SHong Zhang   PetscInt       *bi      = b->i,*bj=b->j;
17172f5f1f90SHong Zhang   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
17182f5f1f90SHong Zhang   MatScalar      *btval,*btval_den,*ba=b->a;
17192f5f1f90SHong Zhang   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1720c8db22f6SHong Zhang 
1721c8db22f6SHong Zhang   PetscFunctionBegin;
17222f5f1f90SHong Zhang   btval_den=btdense->v;
17239566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(btval_den,m*n));
17242f5f1f90SHong Zhang   for (k=0; k<ncolors; k++) {
17252f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
17262f5f1f90SHong Zhang     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1727d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
17282f5f1f90SHong Zhang       btcol = bj + bi[col];
17292f5f1f90SHong Zhang       btval = ba + bi[col];
17302f5f1f90SHong Zhang       anz   = bi[col+1] - bi[col];
1731d2853540SHong Zhang       for (j=0; j<anz; j++) {
17322f5f1f90SHong Zhang         brow            = btcol[j];
17332f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1734c8db22f6SHong Zhang       }
1735c8db22f6SHong Zhang     }
17362f5f1f90SHong Zhang     btval_den += m;
1737c8db22f6SHong Zhang   }
1738c8db22f6SHong Zhang   PetscFunctionReturn(0);
1739c8db22f6SHong Zhang }
1740c8db22f6SHong Zhang 
1741b9af6bddSHong Zhang PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
17428972f759SHong Zhang {
1743b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
17441683a169SBarry Smith   const PetscScalar *ca_den,*ca_den_ptr;
17451683a169SBarry Smith   PetscScalar       *ca=csp->a;
1746f99a636bSHong Zhang   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1747e88777f2SHong Zhang   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1748077f23c2SHong Zhang   PetscInt          nrows,*row,*idx;
1749077f23c2SHong Zhang   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
17508972f759SHong Zhang 
17518972f759SHong Zhang   PetscFunctionBegin;
17529566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(Cden,&ca_den));
1753f99a636bSHong Zhang 
1754077f23c2SHong Zhang   if (brows > 0) {
1755077f23c2SHong Zhang     PetscInt *lstart,row_end,row_start;
1756980ae229SHong Zhang     lstart = matcoloring->lstart;
17579566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(lstart,ncolors));
1758eeb4fd02SHong Zhang 
1759077f23c2SHong Zhang     row_end = brows;
1760eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1761077f23c2SHong Zhang     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1762077f23c2SHong Zhang       ca_den_ptr = ca_den;
1763980ae229SHong Zhang       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1764eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1765eeb4fd02SHong Zhang         row   = rows  + colorforrow[k];
1766eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1767eeb4fd02SHong Zhang         for (l=lstart[k]; l<nrows; l++) {
1768eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1769eeb4fd02SHong Zhang             lstart[k] = l;
1770eeb4fd02SHong Zhang             break;
1771eeb4fd02SHong Zhang           } else {
1772077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1773eeb4fd02SHong Zhang           }
1774eeb4fd02SHong Zhang         }
1775077f23c2SHong Zhang         ca_den_ptr += m;
1776eeb4fd02SHong Zhang       }
1777077f23c2SHong Zhang       row_end += brows;
1778eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1779eeb4fd02SHong Zhang     }
1780077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1781077f23c2SHong Zhang     ca_den_ptr = ca_den;
1782077f23c2SHong Zhang     for (k=0; k<ncolors; k++) {
1783077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1784077f23c2SHong Zhang       row   = rows  + colorforrow[k];
1785077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1786077f23c2SHong Zhang       for (l=0; l<nrows; l++) {
1787077f23c2SHong Zhang         ca[idx[l]] = ca_den_ptr[row[l]];
1788077f23c2SHong Zhang       }
1789077f23c2SHong Zhang       ca_den_ptr += m;
1790077f23c2SHong Zhang     }
1791f99a636bSHong Zhang   }
1792f99a636bSHong Zhang 
17939566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(Cden,&ca_den));
1794f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1795077f23c2SHong Zhang   if (matcoloring->brows > 0) {
17969566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp,"Loop over %" PetscInt_FMT " row blocks for den2sp\n",brows));
1797e88777f2SHong Zhang   } else {
17989566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1799e88777f2SHong Zhang   }
1800f99a636bSHong Zhang #endif
18018972f759SHong Zhang   PetscFunctionReturn(0);
18028972f759SHong Zhang }
18038972f759SHong Zhang 
1804b9af6bddSHong Zhang PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1805b1683b59SHong Zhang {
1806e88777f2SHong Zhang   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
18071a83f524SJed Brown   const PetscInt *is,*ci,*cj,*row_idx;
1808b28d80bdSHong Zhang   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1809b1683b59SHong Zhang   IS             *isa;
1810b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1811e88777f2SHong Zhang   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1812e88777f2SHong Zhang   PetscInt       *colorforcol,*columns,*columns_i,brows;
1813e88777f2SHong Zhang   PetscBool      flg;
1814b1683b59SHong Zhang 
1815b1683b59SHong Zhang   PetscFunctionBegin;
18169566063dSJacob Faibussowitsch   PetscCall(ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa));
1817e99be685SHong Zhang 
18187ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1819e88777f2SHong Zhang   Nbs       = mat->cmap->N/bs;
1820b1683b59SHong Zhang   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1821e88777f2SHong Zhang   c->N      = Nbs;
1822e88777f2SHong Zhang   c->m      = c->M;
1823b1683b59SHong Zhang   c->rstart = 0;
1824e88777f2SHong Zhang   c->brows  = 100;
1825b1683b59SHong Zhang 
1826b1683b59SHong Zhang   c->ncolors = nis;
18279566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow));
18289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz+1,&rows));
18299566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz+1,&den2sp));
1830e88777f2SHong Zhang 
1831e88777f2SHong Zhang   brows = c->brows;
18329566063dSJacob Faibussowitsch   PetscCall(PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg));
1833e88777f2SHong Zhang   if (flg) c->brows = brows;
1834eeb4fd02SHong Zhang   if (brows > 0) {
18359566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(nis+1,&c->lstart));
1836e88777f2SHong Zhang   }
18372205254eSKarl Rupp 
1838d2853540SHong Zhang   colorforrow[0] = 0;
1839d2853540SHong Zhang   rows_i         = rows;
1840f99a636bSHong Zhang   den2sp_i       = den2sp;
1841b1683b59SHong Zhang 
18429566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nis+1,&colorforcol));
18439566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(Nbs+1,&columns));
18442205254eSKarl Rupp 
1845d2853540SHong Zhang   colorforcol[0] = 0;
1846d2853540SHong Zhang   columns_i      = columns;
1847d2853540SHong Zhang 
1848077f23c2SHong Zhang   /* get column-wise storage of mat */
18499566063dSJacob Faibussowitsch   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL));
1850b1683b59SHong Zhang 
1851b28d80bdSHong Zhang   cm   = c->m;
18529566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm+1,&rowhit));
18539566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm+1,&idxhit));
1854f99a636bSHong Zhang   for (i=0; i<nis; i++) { /* loop over color */
18559566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(isa[i],&n));
18569566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isa[i],&is));
18572205254eSKarl Rupp 
1858b1683b59SHong Zhang     c->ncolumns[i] = n;
18591baa6e33SBarry Smith     if (n) PetscCall(PetscArraycpy(columns_i,is,n));
1860d2853540SHong Zhang     colorforcol[i+1] = colorforcol[i] + n;
1861d2853540SHong Zhang     columns_i       += n;
1862b1683b59SHong Zhang 
1863b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
18649566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(rowhit,cm));
1865e99be685SHong Zhang 
1866b7caf3d6SHong Zhang     for (j=0; j<n; j++) { /* loop over columns*/
1867b1683b59SHong Zhang       col     = is[j];
1868d2853540SHong Zhang       row_idx = cj + ci[col];
1869b1683b59SHong Zhang       m       = ci[col+1] - ci[col];
1870b7caf3d6SHong Zhang       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1871e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1872d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1873b1683b59SHong Zhang       }
1874b1683b59SHong Zhang     }
1875b1683b59SHong Zhang     /* count the number of hits */
1876b1683b59SHong Zhang     nrows = 0;
1877e8354b3eSHong Zhang     for (j=0; j<cm; j++) {
1878b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1879b1683b59SHong Zhang     }
1880b1683b59SHong Zhang     c->nrows[i]      = nrows;
1881d2853540SHong Zhang     colorforrow[i+1] = colorforrow[i] + nrows;
1882d2853540SHong Zhang 
1883b1683b59SHong Zhang     nrows = 0;
1884b7caf3d6SHong Zhang     for (j=0; j<cm; j++) { /* loop over rows */
1885b1683b59SHong Zhang       if (rowhit[j]) {
1886d2853540SHong Zhang         rows_i[nrows]   = j;
188712b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1888b1683b59SHong Zhang         nrows++;
1889b1683b59SHong Zhang       }
1890b1683b59SHong Zhang     }
1891e88777f2SHong Zhang     den2sp_i += nrows;
1892077f23c2SHong Zhang 
18939566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isa[i],&is));
1894f99a636bSHong Zhang     rows_i += nrows;
1895b1683b59SHong Zhang   }
18969566063dSJacob Faibussowitsch   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL));
18979566063dSJacob Faibussowitsch   PetscCall(PetscFree(rowhit));
18989566063dSJacob Faibussowitsch   PetscCall(ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa));
18992c71b3e2SJacob Faibussowitsch   PetscCheck(csp->nz == colorforrow[nis],PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT,csp->nz,colorforrow[nis]);
1900b28d80bdSHong Zhang 
1901d2853540SHong Zhang   c->colorforrow = colorforrow;
1902d2853540SHong Zhang   c->rows        = rows;
1903f99a636bSHong Zhang   c->den2sp      = den2sp;
1904d2853540SHong Zhang   c->colorforcol = colorforcol;
1905d2853540SHong Zhang   c->columns     = columns;
19062205254eSKarl Rupp 
19079566063dSJacob Faibussowitsch   PetscCall(PetscFree(idxhit));
1908b1683b59SHong Zhang   PetscFunctionReturn(0);
1909b1683b59SHong Zhang }
1910d013fc79Sandi selinger 
19114222ddf1SHong Zhang /* --------------------------------------------------------------- */
19124222ddf1SHong Zhang static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1913df97dc6dSFande Kong {
19144222ddf1SHong Zhang   Mat_Product    *product = C->product;
19154222ddf1SHong Zhang   Mat            A=product->A,B=product->B;
19164222ddf1SHong Zhang 
1917df97dc6dSFande Kong   PetscFunctionBegin;
19184222ddf1SHong Zhang   if (C->ops->mattransposemultnumeric) {
19194222ddf1SHong Zhang     /* Alg: "outerproduct" */
19209566063dSJacob Faibussowitsch     PetscCall((*C->ops->mattransposemultnumeric)(A,B,C));
19214222ddf1SHong Zhang   } else {
19224222ddf1SHong Zhang     /* Alg: "matmatmult" -- C = At*B */
19236718818eSStefano Zampini     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
19244222ddf1SHong Zhang 
192528b400f6SJacob Faibussowitsch     PetscCheck(atb,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
19261cdffd5eSHong Zhang     if (atb->At) {
19271cdffd5eSHong Zhang       /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
19281cdffd5eSHong Zhang          user may have called MatProductReplaceMats() to get this A=product->A */
19291cdffd5eSHong Zhang       PetscCall(MatTransposeSetPrecursor(A,atb->At));
19307fb60732SBarry Smith       PetscCall(MatTranspose(A,MAT_REUSE_MATRIX,&atb->At));
19314222ddf1SHong Zhang     }
19327fb60732SBarry Smith     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A,B,C));
19334222ddf1SHong Zhang   }
1934df97dc6dSFande Kong   PetscFunctionReturn(0);
1935df97dc6dSFande Kong }
1936df97dc6dSFande Kong 
19374222ddf1SHong Zhang static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1938d013fc79Sandi selinger {
19394222ddf1SHong Zhang   Mat_Product    *product = C->product;
19404222ddf1SHong Zhang   Mat            A=product->A,B=product->B;
19414222ddf1SHong Zhang   PetscReal      fill=product->fill;
1942d013fc79Sandi selinger 
1943d013fc79Sandi selinger   PetscFunctionBegin;
19449566063dSJacob Faibussowitsch   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C));
19452869b61bSandi selinger 
19464222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
19474222ddf1SHong Zhang   PetscFunctionReturn(0);
19482869b61bSandi selinger }
1949d013fc79Sandi selinger 
19504222ddf1SHong Zhang /* --------------------------------------------------------------- */
19514222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
19524222ddf1SHong Zhang {
19534222ddf1SHong Zhang   Mat_Product    *product = C->product;
19544222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
19554222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
19564222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
19574222ddf1SHong Zhang   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
19584222ddf1SHong Zhang   PetscInt       nalg = 7;
19594222ddf1SHong Zhang #else
19604222ddf1SHong Zhang   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
19614222ddf1SHong Zhang   PetscInt       nalg = 8;
19624222ddf1SHong Zhang #endif
19634222ddf1SHong Zhang 
19644222ddf1SHong Zhang   PetscFunctionBegin;
19654222ddf1SHong Zhang   /* Set default algorithm */
19669566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg,"default",&flg));
19674222ddf1SHong Zhang   if (flg) {
19689566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
1969d013fc79Sandi selinger   }
1970d013fc79Sandi selinger 
19714222ddf1SHong Zhang   /* Get runtime option */
19724222ddf1SHong Zhang   if (product->api_user) {
1973d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");
19749566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg));
1975d0609cedSBarry Smith     PetscOptionsEnd();
19764222ddf1SHong Zhang   } else {
1977d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");
19789566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg));
1979d0609cedSBarry Smith     PetscOptionsEnd();
1980d013fc79Sandi selinger   }
19814222ddf1SHong Zhang   if (flg) {
19829566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
1983d013fc79Sandi selinger   }
1984d013fc79Sandi selinger 
19854222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
19864222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
19874222ddf1SHong Zhang   PetscFunctionReturn(0);
19884222ddf1SHong Zhang }
1989d013fc79Sandi selinger 
19904222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
19914222ddf1SHong Zhang {
19924222ddf1SHong Zhang   Mat_Product    *product = C->product;
19934222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
19944222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
1995089a957eSStefano Zampini   const char     *algTypes[3] = {"default","at*b","outerproduct"};
1996089a957eSStefano Zampini   PetscInt       nalg = 3;
1997d013fc79Sandi selinger 
19984222ddf1SHong Zhang   PetscFunctionBegin;
19994222ddf1SHong Zhang   /* Get runtime option */
20004222ddf1SHong Zhang   if (product->api_user) {
2001d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");
20029566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2003d0609cedSBarry Smith     PetscOptionsEnd();
20044222ddf1SHong Zhang   } else {
2005d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");
20069566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg));
2007d0609cedSBarry Smith     PetscOptionsEnd();
20084222ddf1SHong Zhang   }
20094222ddf1SHong Zhang   if (flg) {
20109566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
20114222ddf1SHong Zhang   }
2012d013fc79Sandi selinger 
20134222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
20144222ddf1SHong Zhang   PetscFunctionReturn(0);
20154222ddf1SHong Zhang }
20164222ddf1SHong Zhang 
20174222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
20184222ddf1SHong Zhang {
20194222ddf1SHong Zhang   Mat_Product    *product = C->product;
20204222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
20214222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20224222ddf1SHong Zhang   const char     *algTypes[2] = {"default","color"};
20234222ddf1SHong Zhang   PetscInt       nalg = 2;
20244222ddf1SHong Zhang 
20254222ddf1SHong Zhang   PetscFunctionBegin;
20264222ddf1SHong Zhang   /* Set default algorithm */
20279566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg,"default",&flg));
20284222ddf1SHong Zhang   if (!flg) {
20294222ddf1SHong Zhang     alg = 1;
20309566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
20314222ddf1SHong Zhang   }
20324222ddf1SHong Zhang 
20334222ddf1SHong Zhang   /* Get runtime option */
20344222ddf1SHong Zhang   if (product->api_user) {
2035d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");
20369566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2037d0609cedSBarry Smith     PetscOptionsEnd();
20384222ddf1SHong Zhang   } else {
2039d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");
20409566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg));
2041d0609cedSBarry Smith     PetscOptionsEnd();
20424222ddf1SHong Zhang   }
20434222ddf1SHong Zhang   if (flg) {
20449566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
20454222ddf1SHong Zhang   }
20464222ddf1SHong Zhang 
20474222ddf1SHong Zhang   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
20484222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_ABt;
20494222ddf1SHong Zhang   PetscFunctionReturn(0);
20504222ddf1SHong Zhang }
20514222ddf1SHong Zhang 
20524222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
20534222ddf1SHong Zhang {
20544222ddf1SHong Zhang   Mat_Product    *product = C->product;
20554222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20564222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
20574222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
20584222ddf1SHong Zhang   const char     *algTypes[2] = {"scalable","rap"};
20594222ddf1SHong Zhang   PetscInt       nalg = 2;
20604222ddf1SHong Zhang #else
20614222ddf1SHong Zhang   const char     *algTypes[3] = {"scalable","rap","hypre"};
20624222ddf1SHong Zhang   PetscInt       nalg = 3;
20634222ddf1SHong Zhang #endif
20644222ddf1SHong Zhang 
20654222ddf1SHong Zhang   PetscFunctionBegin;
20664222ddf1SHong Zhang   /* Set default algorithm */
20679566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg,"default",&flg));
20684222ddf1SHong Zhang   if (flg) {
20699566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
20704222ddf1SHong Zhang   }
20714222ddf1SHong Zhang 
20724222ddf1SHong Zhang   /* Get runtime option */
20734222ddf1SHong Zhang   if (product->api_user) {
2074d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");
20759566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg));
2076d0609cedSBarry Smith     PetscOptionsEnd();
20774222ddf1SHong Zhang   } else {
2078d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");
20799566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg));
2080d0609cedSBarry Smith     PetscOptionsEnd();
20814222ddf1SHong Zhang   }
20824222ddf1SHong Zhang   if (flg) {
20839566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
20844222ddf1SHong Zhang   }
20854222ddf1SHong Zhang 
20864222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
20874222ddf1SHong Zhang   PetscFunctionReturn(0);
20884222ddf1SHong Zhang }
20894222ddf1SHong Zhang 
20904222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
20914222ddf1SHong Zhang {
20924222ddf1SHong Zhang   Mat_Product    *product = C->product;
20934222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
20944222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
20954222ddf1SHong Zhang   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
20964222ddf1SHong Zhang   PetscInt        nalg = 3;
20974222ddf1SHong Zhang 
20984222ddf1SHong Zhang   PetscFunctionBegin;
20994222ddf1SHong Zhang   /* Set default algorithm */
21009566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg,"default",&flg));
21014222ddf1SHong Zhang   if (flg) {
21029566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
21034222ddf1SHong Zhang   }
21044222ddf1SHong Zhang 
21054222ddf1SHong Zhang   /* Get runtime option */
21064222ddf1SHong Zhang   if (product->api_user) {
2107d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");
21089566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg));
2109d0609cedSBarry Smith     PetscOptionsEnd();
21104222ddf1SHong Zhang   } else {
2111d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");
21129566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg));
2113d0609cedSBarry Smith     PetscOptionsEnd();
21144222ddf1SHong Zhang   }
21154222ddf1SHong Zhang   if (flg) {
21169566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
21174222ddf1SHong Zhang   }
21184222ddf1SHong Zhang 
21194222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
21204222ddf1SHong Zhang   PetscFunctionReturn(0);
21214222ddf1SHong Zhang }
21224222ddf1SHong Zhang 
21234222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
21244222ddf1SHong Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
21254222ddf1SHong Zhang {
21264222ddf1SHong Zhang   Mat_Product    *product = C->product;
21274222ddf1SHong Zhang   PetscInt       alg = 0; /* default algorithm */
21284222ddf1SHong Zhang   PetscBool      flg = PETSC_FALSE;
21294222ddf1SHong Zhang   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
21304222ddf1SHong Zhang   PetscInt       nalg = 7;
21314222ddf1SHong Zhang 
21324222ddf1SHong Zhang   PetscFunctionBegin;
21334222ddf1SHong Zhang   /* Set default algorithm */
21349566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg,"default",&flg));
21354222ddf1SHong Zhang   if (flg) {
21369566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
21374222ddf1SHong Zhang   }
21384222ddf1SHong Zhang 
21394222ddf1SHong Zhang   /* Get runtime option */
21404222ddf1SHong Zhang   if (product->api_user) {
2141d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");
21429566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2143d0609cedSBarry Smith     PetscOptionsEnd();
21444222ddf1SHong Zhang   } else {
2145d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");
21469566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg));
2147d0609cedSBarry Smith     PetscOptionsEnd();
21484222ddf1SHong Zhang   }
21494222ddf1SHong Zhang   if (flg) {
21509566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
21514222ddf1SHong Zhang   }
21524222ddf1SHong Zhang 
21534222ddf1SHong Zhang   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
21544222ddf1SHong Zhang   C->ops->productsymbolic    = MatProductSymbolic_ABC;
21554222ddf1SHong Zhang   PetscFunctionReturn(0);
21564222ddf1SHong Zhang }
21574222ddf1SHong Zhang 
21584222ddf1SHong Zhang PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
21594222ddf1SHong Zhang {
21604222ddf1SHong Zhang   Mat_Product    *product = C->product;
21614222ddf1SHong Zhang 
21624222ddf1SHong Zhang   PetscFunctionBegin;
21634222ddf1SHong Zhang   switch (product->type) {
21644222ddf1SHong Zhang   case MATPRODUCT_AB:
21659566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
21664222ddf1SHong Zhang     break;
21674222ddf1SHong Zhang   case MATPRODUCT_AtB:
21689566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
21694222ddf1SHong Zhang     break;
21704222ddf1SHong Zhang   case MATPRODUCT_ABt:
21719566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
21724222ddf1SHong Zhang     break;
21734222ddf1SHong Zhang   case MATPRODUCT_PtAP:
21749566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
21754222ddf1SHong Zhang     break;
21764222ddf1SHong Zhang   case MATPRODUCT_RARt:
21779566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
21784222ddf1SHong Zhang     break;
21794222ddf1SHong Zhang   case MATPRODUCT_ABC:
21809566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
21814222ddf1SHong Zhang     break;
21826718818eSStefano Zampini   default:
21836718818eSStefano Zampini     break;
21844222ddf1SHong Zhang   }
2185d013fc79Sandi selinger   PetscFunctionReturn(0);
2186d013fc79Sandi selinger }
2187