xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 71052fdfa316db9eff6713675d06cb2251c7acfb)
1 
2 /*
3   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
4           C = A * B
5 */
6 
7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
8 #include <../src/mat/utils/freespace.h>
9 #include <petscbt.h>
10 #include <petsc/private/isimpl.h>
11 #include <../src/mat/impls/dense/seq/dense.h>
12 
13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
14 {
15   PetscFunctionBegin;
16   if (C->ops->matmultnumeric) {
17     PetscCall((*C->ops->matmultnumeric)(A,B,C));
18   } else {
19     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A,B,C));
20   }
21   PetscFunctionReturn(0);
22 }
23 
24 /* Modified from MatCreateSeqAIJWithArrays() */
25 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)
26 {
27   PetscInt       ii;
28   Mat_SeqAIJ     *aij;
29   PetscBool      isseqaij, osingle, ofree_a, ofree_ij;
30 
31   PetscFunctionBegin;
32   PetscCheck(m <= 0 || !i[0],PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"i (row indices) must start with 0");
33   PetscCall(MatSetSizes(mat,m,n,m,n));
34 
35   if (!mtype) {
36     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat,MATSEQAIJ,&isseqaij));
37     if (!isseqaij) PetscCall(MatSetType(mat,MATSEQAIJ));
38   } else {
39     PetscCall(MatSetType(mat,mtype));
40   }
41 
42   aij  = (Mat_SeqAIJ*)(mat)->data;
43   osingle = aij->singlemalloc;
44   ofree_a = aij->free_a;
45   ofree_ij = aij->free_ij;
46   /* changes the free flags */
47   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat,MAT_SKIP_ALLOCATION,NULL));
48 
49   PetscCall(PetscFree(aij->ilen));
50   PetscCall(PetscFree(aij->imax));
51   PetscCall(PetscMalloc1(m,&aij->imax));
52   PetscCall(PetscMalloc1(m,&aij->ilen));
53   for (ii=0,aij->nonzerorowcnt=0,aij->rmax = 0; ii<m; ii++) {
54     const PetscInt rnz = i[ii+1] - i[ii];
55     aij->nonzerorowcnt += !!rnz;
56     aij->rmax = PetscMax(aij->rmax,rnz);
57     aij->ilen[ii] = aij->imax[ii] = i[ii+1] - i[ii];
58   }
59   aij->maxnz = i[m];
60   aij->nz = i[m];
61 
62   if (osingle) {
63     PetscCall(PetscFree3(aij->a,aij->j,aij->i));
64   } else {
65     if (ofree_a)  PetscCall(PetscFree(aij->a));
66     if (ofree_ij) PetscCall(PetscFree(aij->j));
67     if (ofree_ij) PetscCall(PetscFree(aij->i));
68   }
69   aij->i            = i;
70   aij->j            = j;
71   aij->a            = a;
72   aij->nonew        = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
73   /* default to not retain ownership */
74   aij->singlemalloc = PETSC_FALSE;
75   aij->free_a       = PETSC_FALSE;
76   aij->free_ij      = PETSC_FALSE;
77   PetscCall(MatCheckCompressedRow(mat,aij->nonzerorowcnt,&aij->compressedrow,aij->i,m,0.6));
78   PetscFunctionReturn(0);
79 }
80 
81 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
82 {
83   Mat_Product         *product = C->product;
84   MatProductAlgorithm alg;
85   PetscBool           flg;
86 
87   PetscFunctionBegin;
88   if (product) {
89     alg = product->alg;
90   } else {
91     alg = "sorted";
92   }
93   /* sorted */
94   PetscCall(PetscStrcmp(alg,"sorted",&flg));
95   if (flg) {
96     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A,B,fill,C));
97     PetscFunctionReturn(0);
98   }
99 
100   /* scalable */
101   PetscCall(PetscStrcmp(alg,"scalable",&flg));
102   if (flg) {
103     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A,B,fill,C));
104     PetscFunctionReturn(0);
105   }
106 
107   /* scalable_fast */
108   PetscCall(PetscStrcmp(alg,"scalable_fast",&flg));
109   if (flg) {
110     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A,B,fill,C));
111     PetscFunctionReturn(0);
112   }
113 
114   /* heap */
115   PetscCall(PetscStrcmp(alg,"heap",&flg));
116   if (flg) {
117     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A,B,fill,C));
118     PetscFunctionReturn(0);
119   }
120 
121   /* btheap */
122   PetscCall(PetscStrcmp(alg,"btheap",&flg));
123   if (flg) {
124     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A,B,fill,C));
125     PetscFunctionReturn(0);
126   }
127 
128   /* llcondensed */
129   PetscCall(PetscStrcmp(alg,"llcondensed",&flg));
130   if (flg) {
131     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A,B,fill,C));
132     PetscFunctionReturn(0);
133   }
134 
135   /* rowmerge */
136   PetscCall(PetscStrcmp(alg,"rowmerge",&flg));
137   if (flg) {
138     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A,B,fill,C));
139     PetscFunctionReturn(0);
140   }
141 
142 #if defined(PETSC_HAVE_HYPRE)
143   PetscCall(PetscStrcmp(alg,"hypre",&flg));
144   if (flg) {
145     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A,B,fill,C));
146     PetscFunctionReturn(0);
147   }
148 #endif
149 
150   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
151 }
152 
153 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)
154 {
155   Mat_SeqAIJ         *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
156   PetscInt           *ai=a->i,*bi=b->i,*ci,*cj;
157   PetscInt           am =A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
158   PetscReal          afill;
159   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
160   PetscTable         ta;
161   PetscBT            lnkbt;
162   PetscFreeSpaceList free_space=NULL,current_space=NULL;
163 
164   PetscFunctionBegin;
165   /* Get ci and cj */
166   /*---------------*/
167   /* Allocate ci array, arrays for fill computation and */
168   /* free space for accumulating nonzero column info */
169   PetscCall(PetscMalloc1(am+2,&ci));
170   ci[0] = 0;
171 
172   /* create and initialize a linked list */
173   PetscCall(PetscTableCreate(bn,bn,&ta));
174   MatRowMergeMax_SeqAIJ(b,bm,ta);
175   PetscCall(PetscTableGetCount(ta,&Crmax));
176   PetscCall(PetscTableDestroy(&ta));
177 
178   PetscCall(PetscLLCondensedCreate(Crmax,bn,&lnk,&lnkbt));
179 
180   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
181   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
182 
183   current_space = free_space;
184 
185   /* Determine ci and cj */
186   for (i=0; i<am; i++) {
187     anzi = ai[i+1] - ai[i];
188     aj   = a->j + ai[i];
189     for (j=0; j<anzi; j++) {
190       brow = aj[j];
191       bnzj = bi[brow+1] - bi[brow];
192       bj   = b->j + bi[brow];
193       /* add non-zero cols of B into the sorted linked list lnk */
194       PetscCall(PetscLLCondensedAddSorted(bnzj,bj,lnk,lnkbt));
195     }
196     /* add possible missing diagonal entry */
197     if (C->force_diagonals) {
198       PetscCall(PetscLLCondensedAddSorted(1,&i,lnk,lnkbt));
199     }
200     cnzi = lnk[0];
201 
202     /* If free space is not available, make more free space */
203     /* Double the amount of total space in the list */
204     if (current_space->local_remaining<cnzi) {
205       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
206       ndouble++;
207     }
208 
209     /* Copy data into free space, then initialize lnk */
210     PetscCall(PetscLLCondensedClean(bn,cnzi,current_space->array,lnk,lnkbt));
211 
212     current_space->array           += cnzi;
213     current_space->local_used      += cnzi;
214     current_space->local_remaining -= cnzi;
215 
216     ci[i+1] = ci[i] + cnzi;
217   }
218 
219   /* Column indices are in the list of free space */
220   /* Allocate space for cj, initialize cj, and */
221   /* destroy list of free space and other temporary array(s) */
222   PetscCall(PetscMalloc1(ci[am]+1,&cj));
223   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
224   PetscCall(PetscLLCondensedDestroy(lnk,lnkbt));
225 
226   /* put together the new symbolic matrix */
227   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
228   PetscCall(MatSetBlockSizesFromMats(C,A,B));
229 
230   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
231   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
232   c          = (Mat_SeqAIJ*)(C->data);
233   c->free_a  = PETSC_FALSE;
234   c->free_ij = PETSC_TRUE;
235   c->nonew   = 0;
236 
237   /* fast, needs non-scalable O(bn) array 'abdense' */
238   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
239 
240   /* set MatInfo */
241   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
242   if (afill < 1.0) afill = 1.0;
243   C->info.mallocs           = ndouble;
244   C->info.fill_ratio_given  = fill;
245   C->info.fill_ratio_needed = afill;
246 
247 #if defined(PETSC_USE_INFO)
248   if (ci[am]) {
249     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
250     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
251   } else {
252     PetscCall(PetscInfo(C,"Empty matrix product\n"));
253   }
254 #endif
255   PetscFunctionReturn(0);
256 }
257 
258 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)
259 {
260   PetscLogDouble    flops=0.0;
261   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
262   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
263   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
264   PetscInt          *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
265   PetscInt          am   =A->rmap->n,cm=C->rmap->n;
266   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
267   PetscScalar       *ca,valtmp;
268   PetscScalar       *ab_dense;
269   PetscContainer    cab_dense;
270   const PetscScalar *aa,*ba,*baj;
271 
272   PetscFunctionBegin;
273   PetscCall(MatSeqAIJGetArrayRead(A,&aa));
274   PetscCall(MatSeqAIJGetArrayRead(B,&ba));
275   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
276     PetscCall(PetscMalloc1(ci[cm]+1,&ca));
277     c->a      = ca;
278     c->free_a = PETSC_TRUE;
279   } else ca = c->a;
280 
281   /* TODO this should be done in the symbolic phase */
282   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
283      that is hard to eradicate) */
284   PetscCall(PetscObjectQuery((PetscObject)C,"__PETSc__ab_dense",(PetscObject*)&cab_dense));
285   if (!cab_dense) {
286     PetscCall(PetscMalloc1(B->cmap->N,&ab_dense));
287     PetscCall(PetscContainerCreate(PETSC_COMM_SELF,&cab_dense));
288     PetscCall(PetscContainerSetPointer(cab_dense,ab_dense));
289     PetscCall(PetscContainerSetUserDestroy(cab_dense,PetscContainerUserDestroyDefault));
290     PetscCall(PetscObjectCompose((PetscObject)C,"__PETSc__ab_dense",(PetscObject)cab_dense));
291     PetscCall(PetscObjectDereference((PetscObject)cab_dense));
292   }
293   PetscCall(PetscContainerGetPointer(cab_dense,(void**)&ab_dense));
294   PetscCall(PetscArrayzero(ab_dense,B->cmap->N));
295 
296   /* clean old values in C */
297   PetscCall(PetscArrayzero(ca,ci[cm]));
298   /* Traverse A row-wise. */
299   /* Build the ith row in C by summing over nonzero columns in A, */
300   /* the rows of B corresponding to nonzeros of A. */
301   for (i=0; i<am; i++) {
302     anzi = ai[i+1] - ai[i];
303     for (j=0; j<anzi; j++) {
304       brow = aj[j];
305       bnzi = bi[brow+1] - bi[brow];
306       bjj  = bj + bi[brow];
307       baj  = ba + bi[brow];
308       /* perform dense axpy */
309       valtmp = aa[j];
310       for (k=0; k<bnzi; k++) {
311         ab_dense[bjj[k]] += valtmp*baj[k];
312       }
313       flops += 2*bnzi;
314     }
315     aj += anzi; aa += anzi;
316 
317     cnzi = ci[i+1] - ci[i];
318     for (k=0; k<cnzi; k++) {
319       ca[k]          += ab_dense[cj[k]];
320       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
321     }
322     flops += cnzi;
323     cj    += cnzi; ca += cnzi;
324   }
325 #if defined(PETSC_HAVE_DEVICE)
326   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
327 #endif
328   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
329   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
330   PetscCall(PetscLogFlops(flops));
331   PetscCall(MatSeqAIJRestoreArrayRead(A,&aa));
332   PetscCall(MatSeqAIJRestoreArrayRead(B,&ba));
333   PetscFunctionReturn(0);
334 }
335 
336 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)
337 {
338   PetscLogDouble    flops=0.0;
339   Mat_SeqAIJ        *a   = (Mat_SeqAIJ*)A->data;
340   Mat_SeqAIJ        *b   = (Mat_SeqAIJ*)B->data;
341   Mat_SeqAIJ        *c   = (Mat_SeqAIJ*)C->data;
342   PetscInt          *ai  = a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bjj,*ci=c->i,*cj=c->j;
343   PetscInt          am   = A->rmap->N,cm=C->rmap->N;
344   PetscInt          i,j,k,anzi,bnzi,cnzi,brow;
345   PetscScalar       *ca=c->a,valtmp;
346   const PetscScalar *aa,*ba,*baj;
347   PetscInt          nextb;
348 
349   PetscFunctionBegin;
350   PetscCall(MatSeqAIJGetArrayRead(A,&aa));
351   PetscCall(MatSeqAIJGetArrayRead(B,&ba));
352   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
353     PetscCall(PetscMalloc1(ci[cm]+1,&ca));
354     c->a      = ca;
355     c->free_a = PETSC_TRUE;
356   }
357 
358   /* clean old values in C */
359   PetscCall(PetscArrayzero(ca,ci[cm]));
360   /* Traverse A row-wise. */
361   /* Build the ith row in C by summing over nonzero columns in A, */
362   /* the rows of B corresponding to nonzeros of A. */
363   for (i=0; i<am; i++) {
364     anzi = ai[i+1] - ai[i];
365     cnzi = ci[i+1] - ci[i];
366     for (j=0; j<anzi; j++) {
367       brow = aj[j];
368       bnzi = bi[brow+1] - bi[brow];
369       bjj  = bj + bi[brow];
370       baj  = ba + bi[brow];
371       /* perform sparse axpy */
372       valtmp = aa[j];
373       nextb  = 0;
374       for (k=0; nextb<bnzi; k++) {
375         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
376           ca[k] += valtmp*baj[nextb++];
377         }
378       }
379       flops += 2*bnzi;
380     }
381     aj += anzi; aa += anzi;
382     cj += cnzi; ca += cnzi;
383   }
384 #if defined(PETSC_HAVE_DEVICE)
385   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
386 #endif
387   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
388   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
389   PetscCall(PetscLogFlops(flops));
390   PetscCall(MatSeqAIJRestoreArrayRead(A,&aa));
391   PetscCall(MatSeqAIJRestoreArrayRead(B,&ba));
392   PetscFunctionReturn(0);
393 }
394 
395 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)
396 {
397   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
398   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
399   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
400   MatScalar          *ca;
401   PetscReal          afill;
402   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
403   PetscTable         ta;
404   PetscFreeSpaceList free_space=NULL,current_space=NULL;
405 
406   PetscFunctionBegin;
407   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
408   /*-----------------------------------------------------------------------------------------*/
409   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
410   PetscCall(PetscMalloc1(am+2,&ci));
411   ci[0] = 0;
412 
413   /* create and initialize a linked list */
414   PetscCall(PetscTableCreate(bn,bn,&ta));
415   MatRowMergeMax_SeqAIJ(b,bm,ta);
416   PetscCall(PetscTableGetCount(ta,&Crmax));
417   PetscCall(PetscTableDestroy(&ta));
418 
419   PetscCall(PetscLLCondensedCreate_fast(Crmax,&lnk));
420 
421   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
422   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
423   current_space = free_space;
424 
425   /* Determine ci and cj */
426   for (i=0; i<am; i++) {
427     anzi = ai[i+1] - ai[i];
428     aj   = a->j + ai[i];
429     for (j=0; j<anzi; j++) {
430       brow = aj[j];
431       bnzj = bi[brow+1] - bi[brow];
432       bj   = b->j + bi[brow];
433       /* add non-zero cols of B into the sorted linked list lnk */
434       PetscCall(PetscLLCondensedAddSorted_fast(bnzj,bj,lnk));
435     }
436     /* add possible missing diagonal entry */
437     if (C->force_diagonals) {
438       PetscCall(PetscLLCondensedAddSorted_fast(1,&i,lnk));
439     }
440     cnzi = lnk[1];
441 
442     /* If free space is not available, make more free space */
443     /* Double the amount of total space in the list */
444     if (current_space->local_remaining<cnzi) {
445       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
446       ndouble++;
447     }
448 
449     /* Copy data into free space, then initialize lnk */
450     PetscCall(PetscLLCondensedClean_fast(cnzi,current_space->array,lnk));
451 
452     current_space->array           += cnzi;
453     current_space->local_used      += cnzi;
454     current_space->local_remaining -= cnzi;
455 
456     ci[i+1] = ci[i] + cnzi;
457   }
458 
459   /* Column indices are in the list of free space */
460   /* Allocate space for cj, initialize cj, and */
461   /* destroy list of free space and other temporary array(s) */
462   PetscCall(PetscMalloc1(ci[am]+1,&cj));
463   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
464   PetscCall(PetscLLCondensedDestroy_fast(lnk));
465 
466   /* Allocate space for ca */
467   PetscCall(PetscCalloc1(ci[am]+1,&ca));
468 
469   /* put together the new symbolic matrix */
470   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C));
471   PetscCall(MatSetBlockSizesFromMats(C,A,B));
472 
473   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
474   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
475   c          = (Mat_SeqAIJ*)(C->data);
476   c->free_a  = PETSC_TRUE;
477   c->free_ij = PETSC_TRUE;
478   c->nonew   = 0;
479 
480   /* slower, less memory */
481   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
482 
483   /* set MatInfo */
484   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
485   if (afill < 1.0) afill = 1.0;
486   C->info.mallocs           = ndouble;
487   C->info.fill_ratio_given  = fill;
488   C->info.fill_ratio_needed = afill;
489 
490 #if defined(PETSC_USE_INFO)
491   if (ci[am]) {
492     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
493     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
494   } else {
495     PetscCall(PetscInfo(C,"Empty matrix product\n"));
496   }
497 #endif
498   PetscFunctionReturn(0);
499 }
500 
501 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)
502 {
503   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
504   PetscInt           *ai = a->i,*bi=b->i,*ci,*cj;
505   PetscInt           am  = A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
506   MatScalar          *ca;
507   PetscReal          afill;
508   PetscInt           i,j,anzi,brow,bnzj,cnzi,*bj,*aj,*lnk,ndouble=0,Crmax;
509   PetscTable         ta;
510   PetscFreeSpaceList free_space=NULL,current_space=NULL;
511 
512   PetscFunctionBegin;
513   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
514   /*---------------------------------------------------------------------------------------------*/
515   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
516   PetscCall(PetscMalloc1(am+2,&ci));
517   ci[0] = 0;
518 
519   /* create and initialize a linked list */
520   PetscCall(PetscTableCreate(bn,bn,&ta));
521   MatRowMergeMax_SeqAIJ(b,bm,ta);
522   PetscCall(PetscTableGetCount(ta,&Crmax));
523   PetscCall(PetscTableDestroy(&ta));
524   PetscCall(PetscLLCondensedCreate_Scalable(Crmax,&lnk));
525 
526   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
527   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
528   current_space = free_space;
529 
530   /* Determine ci and cj */
531   for (i=0; i<am; i++) {
532     anzi = ai[i+1] - ai[i];
533     aj   = a->j + ai[i];
534     for (j=0; j<anzi; j++) {
535       brow = aj[j];
536       bnzj = bi[brow+1] - bi[brow];
537       bj   = b->j + bi[brow];
538       /* add non-zero cols of B into the sorted linked list lnk */
539       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj,bj,lnk));
540     }
541     /* add possible missing diagonal entry */
542     if (C->force_diagonals) {
543       PetscCall(PetscLLCondensedAddSorted_Scalable(1,&i,lnk));
544     }
545 
546     cnzi = lnk[0];
547 
548     /* If free space is not available, make more free space */
549     /* Double the amount of total space in the list */
550     if (current_space->local_remaining<cnzi) {
551       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi,current_space->total_array_size),&current_space));
552       ndouble++;
553     }
554 
555     /* Copy data into free space, then initialize lnk */
556     PetscCall(PetscLLCondensedClean_Scalable(cnzi,current_space->array,lnk));
557 
558     current_space->array           += cnzi;
559     current_space->local_used      += cnzi;
560     current_space->local_remaining -= cnzi;
561 
562     ci[i+1] = ci[i] + cnzi;
563   }
564 
565   /* Column indices are in the list of free space */
566   /* Allocate space for cj, initialize cj, and */
567   /* destroy list of free space and other temporary array(s) */
568   PetscCall(PetscMalloc1(ci[am]+1,&cj));
569   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
570   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
571 
572   /* Allocate space for ca */
573   /*-----------------------*/
574   PetscCall(PetscCalloc1(ci[am]+1,&ca));
575 
576   /* put together the new symbolic matrix */
577   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,ca,((PetscObject)A)->type_name,C));
578   PetscCall(MatSetBlockSizesFromMats(C,A,B));
579 
580   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
581   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
582   c          = (Mat_SeqAIJ*)(C->data);
583   c->free_a  = PETSC_TRUE;
584   c->free_ij = PETSC_TRUE;
585   c->nonew   = 0;
586 
587   /* slower, less memory */
588   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
589 
590   /* set MatInfo */
591   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
592   if (afill < 1.0) afill = 1.0;
593   C->info.mallocs           = ndouble;
594   C->info.fill_ratio_given  = fill;
595   C->info.fill_ratio_needed = afill;
596 
597 #if defined(PETSC_USE_INFO)
598   if (ci[am]) {
599     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
600     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
601   } else {
602     PetscCall(PetscInfo(C,"Empty matrix product\n"));
603   }
604 #endif
605   PetscFunctionReturn(0);
606 }
607 
608 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)
609 {
610   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
611   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j;
612   PetscInt           *ci,*cj,*bb;
613   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
614   PetscReal          afill;
615   PetscInt           i,j,col,ndouble = 0;
616   PetscFreeSpaceList free_space=NULL,current_space=NULL;
617   PetscHeap          h;
618 
619   PetscFunctionBegin;
620   /* Get ci and cj - by merging sorted rows using a heap */
621   /*---------------------------------------------------------------------------------------------*/
622   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
623   PetscCall(PetscMalloc1(am+2,&ci));
624   ci[0] = 0;
625 
626   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
627   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
628   current_space = free_space;
629 
630   PetscCall(PetscHeapCreate(a->rmax,&h));
631   PetscCall(PetscMalloc1(a->rmax,&bb));
632 
633   /* Determine ci and cj */
634   for (i=0; i<am; i++) {
635     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
636     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
637     ci[i+1] = ci[i];
638     /* Populate the min heap */
639     for (j=0; j<anzi; j++) {
640       bb[j] = bi[acol[j]];         /* bb points at the start of the row */
641       if (bb[j] < bi[acol[j]+1]) { /* Add if row is nonempty */
642         PetscCall(PetscHeapAdd(h,j,bj[bb[j]++]));
643       }
644     }
645     /* Pick off the min element, adding it to free space */
646     PetscCall(PetscHeapPop(h,&j,&col));
647     while (j >= 0) {
648       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
649         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space));
650         ndouble++;
651       }
652       *(current_space->array++) = col;
653       current_space->local_used++;
654       current_space->local_remaining--;
655       ci[i+1]++;
656 
657       /* stash if anything else remains in this row of B */
658       if (bb[j] < bi[acol[j]+1]) PetscCall(PetscHeapStash(h,j,bj[bb[j]++]));
659       while (1) {               /* pop and stash any other rows of B that also had an entry in this column */
660         PetscInt j2,col2;
661         PetscCall(PetscHeapPeek(h,&j2,&col2));
662         if (col2 != col) break;
663         PetscCall(PetscHeapPop(h,&j2,&col2));
664         if (bb[j2] < bi[acol[j2]+1]) PetscCall(PetscHeapStash(h,j2,bj[bb[j2]++]));
665       }
666       /* Put any stashed elements back into the min heap */
667       PetscCall(PetscHeapUnstash(h));
668       PetscCall(PetscHeapPop(h,&j,&col));
669     }
670   }
671   PetscCall(PetscFree(bb));
672   PetscCall(PetscHeapDestroy(&h));
673 
674   /* Column indices are in the list of free space */
675   /* Allocate space for cj, initialize cj, and */
676   /* destroy list of free space and other temporary array(s) */
677   PetscCall(PetscMalloc1(ci[am],&cj));
678   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
679 
680   /* put together the new symbolic matrix */
681   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
682   PetscCall(MatSetBlockSizesFromMats(C,A,B));
683 
684   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
685   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
686   c          = (Mat_SeqAIJ*)(C->data);
687   c->free_a  = PETSC_TRUE;
688   c->free_ij = PETSC_TRUE;
689   c->nonew   = 0;
690 
691   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
692 
693   /* set MatInfo */
694   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
695   if (afill < 1.0) afill = 1.0;
696   C->info.mallocs           = ndouble;
697   C->info.fill_ratio_given  = fill;
698   C->info.fill_ratio_needed = afill;
699 
700 #if defined(PETSC_USE_INFO)
701   if (ci[am]) {
702     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
703     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
704   } else {
705     PetscCall(PetscInfo(C,"Empty matrix product\n"));
706   }
707 #endif
708   PetscFunctionReturn(0);
709 }
710 
711 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)
712 {
713   Mat_SeqAIJ         *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
714   const PetscInt     *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
715   PetscInt           *ci,*cj,*bb;
716   PetscInt           am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
717   PetscReal          afill;
718   PetscInt           i,j,col,ndouble = 0;
719   PetscFreeSpaceList free_space=NULL,current_space=NULL;
720   PetscHeap          h;
721   PetscBT            bt;
722 
723   PetscFunctionBegin;
724   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
725   /*---------------------------------------------------------------------------------------------*/
726   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
727   PetscCall(PetscMalloc1(am+2,&ci));
728   ci[0] = 0;
729 
730   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
731   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ai[am],bi[bm])),&free_space));
732 
733   current_space = free_space;
734 
735   PetscCall(PetscHeapCreate(a->rmax,&h));
736   PetscCall(PetscMalloc1(a->rmax,&bb));
737   PetscCall(PetscBTCreate(bn,&bt));
738 
739   /* Determine ci and cj */
740   for (i=0; i<am; i++) {
741     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
742     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
743     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
744     ci[i+1] = ci[i];
745     /* Populate the min heap */
746     for (j=0; j<anzi; j++) {
747       PetscInt brow = acol[j];
748       for (bb[j] = bi[brow]; bb[j] < bi[brow+1]; bb[j]++) {
749         PetscInt bcol = bj[bb[j]];
750         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
751           PetscCall(PetscHeapAdd(h,j,bcol));
752           bb[j]++;
753           break;
754         }
755       }
756     }
757     /* Pick off the min element, adding it to free space */
758     PetscCall(PetscHeapPop(h,&j,&col));
759     while (j >= 0) {
760       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
761         fptr = NULL;                      /* need PetscBTMemzero */
762         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2,current_space->total_array_size),16 << 20),&current_space));
763         ndouble++;
764       }
765       *(current_space->array++) = col;
766       current_space->local_used++;
767       current_space->local_remaining--;
768       ci[i+1]++;
769 
770       /* stash if anything else remains in this row of B */
771       for (; bb[j] < bi[acol[j]+1]; bb[j]++) {
772         PetscInt bcol = bj[bb[j]];
773         if (!PetscBTLookupSet(bt,bcol)) { /* new entry */
774           PetscCall(PetscHeapAdd(h,j,bcol));
775           bb[j]++;
776           break;
777         }
778       }
779       PetscCall(PetscHeapPop(h,&j,&col));
780     }
781     if (fptr) {                 /* Clear the bits for this row */
782       for (; fptr<current_space->array; fptr++) PetscCall(PetscBTClear(bt,*fptr));
783     } else {                    /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
784       PetscCall(PetscBTMemzero(bn,bt));
785     }
786   }
787   PetscCall(PetscFree(bb));
788   PetscCall(PetscHeapDestroy(&h));
789   PetscCall(PetscBTDestroy(&bt));
790 
791   /* Column indices are in the list of free space */
792   /* Allocate space for cj, initialize cj, and */
793   /* destroy list of free space and other temporary array(s) */
794   PetscCall(PetscMalloc1(ci[am],&cj));
795   PetscCall(PetscFreeSpaceContiguous(&free_space,cj));
796 
797   /* put together the new symbolic matrix */
798   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
799   PetscCall(MatSetBlockSizesFromMats(C,A,B));
800 
801   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
802   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
803   c          = (Mat_SeqAIJ*)(C->data);
804   c->free_a  = PETSC_TRUE;
805   c->free_ij = PETSC_TRUE;
806   c->nonew   = 0;
807 
808   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
809 
810   /* set MatInfo */
811   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
812   if (afill < 1.0) afill = 1.0;
813   C->info.mallocs           = ndouble;
814   C->info.fill_ratio_given  = fill;
815   C->info.fill_ratio_needed = afill;
816 
817 #if defined(PETSC_USE_INFO)
818   if (ci[am]) {
819     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
820     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
821   } else {
822     PetscCall(PetscInfo(C,"Empty matrix product\n"));
823   }
824 #endif
825   PetscFunctionReturn(0);
826 }
827 
828 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)
829 {
830   Mat_SeqAIJ         *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
831   const PetscInt     *ai=a->i,*bi=b->i,*aj=a->j,*bj=b->j,*inputi,*inputj,*inputcol,*inputcol_L1;
832   PetscInt           *ci,*cj,*outputj,worki_L1[9],worki_L2[9];
833   PetscInt           c_maxmem,a_maxrownnz=0,a_rownnz;
834   const PetscInt     workcol[8]={0,1,2,3,4,5,6,7};
835   const PetscInt     am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
836   const PetscInt     *brow_ptr[8],*brow_end[8];
837   PetscInt           window[8];
838   PetscInt           window_min,old_window_min,ci_nnz,outputi_nnz=0,L1_nrows,L2_nrows;
839   PetscInt           i,k,ndouble=0,L1_rowsleft,rowsleft;
840   PetscReal          afill;
841   PetscInt           *workj_L1,*workj_L2,*workj_L3;
842   PetscInt           L1_nnz,L2_nnz;
843 
844   /* Step 1: Get upper bound on memory required for allocation.
845              Because of the way virtual memory works,
846              only the memory pages that are actually needed will be physically allocated. */
847   PetscFunctionBegin;
848   PetscCall(PetscMalloc1(am+1,&ci));
849   for (i=0; i<am; i++) {
850     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
851     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
852     a_rownnz = 0;
853     for (k=0; k<anzi; ++k) {
854       a_rownnz += bi[acol[k]+1] - bi[acol[k]];
855       if (a_rownnz > bn) {
856         a_rownnz = bn;
857         break;
858       }
859     }
860     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
861   }
862   /* temporary work areas for merging rows */
863   PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L1));
864   PetscCall(PetscMalloc1(a_maxrownnz*8,&workj_L2));
865   PetscCall(PetscMalloc1(a_maxrownnz,&workj_L3));
866 
867   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
868   c_maxmem = 8*(ai[am]+bi[bm]);
869   /* Step 2: Populate pattern for C */
870   PetscCall(PetscMalloc1(c_maxmem,&cj));
871 
872   ci_nnz       = 0;
873   ci[0]        = 0;
874   worki_L1[0]  = 0;
875   worki_L2[0]  = 0;
876   for (i=0; i<am; i++) {
877     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
878     const PetscInt *acol = aj + ai[i];      /* column indices of nonzero entries in this row */
879     rowsleft             = anzi;
880     inputcol_L1          = acol;
881     L2_nnz               = 0;
882     L2_nrows             = 1;  /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
883     worki_L2[1]          = 0;
884     outputi_nnz          = 0;
885 
886     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
887     while (ci_nnz+a_maxrownnz > c_maxmem) {
888       c_maxmem *= 2;
889       ndouble++;
890       PetscCall(PetscRealloc(sizeof(PetscInt)*c_maxmem,&cj));
891     }
892 
893     while (rowsleft) {
894       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
895       L1_nrows    = 0;
896       L1_nnz      = 0;
897       inputcol    = inputcol_L1;
898       inputi      = bi;
899       inputj      = bj;
900 
901       /* The following macro is used to specialize for small rows in A.
902          This helps with compiler unrolling, improving performance substantially.
903           Input:  inputj   inputi  inputcol  bn
904           Output: outputj  outputi_nnz                       */
905        #define MatMatMultSymbolic_RowMergeMacro(ANNZ)                        \
906          window_min  = bn;                                                   \
907          outputi_nnz = 0;                                                    \
908          for (k=0; k<ANNZ; ++k) {                                            \
909            brow_ptr[k] = inputj + inputi[inputcol[k]];                       \
910            brow_end[k] = inputj + inputi[inputcol[k]+1];                     \
911            window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn;   \
912            window_min  = PetscMin(window[k], window_min);                    \
913          }                                                                   \
914          while (window_min < bn) {                                           \
915            outputj[outputi_nnz++] = window_min;                              \
916            /* advance front and compute new minimum */                       \
917            old_window_min = window_min;                                      \
918            window_min = bn;                                                  \
919            for (k=0; k<ANNZ; ++k) {                                          \
920              if (window[k] == old_window_min) {                              \
921                brow_ptr[k]++;                                                \
922                window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
923              }                                                               \
924              window_min = PetscMin(window[k], window_min);                   \
925            }                                                                 \
926          }
927 
928       /************** L E V E L  1 ***************/
929       /* Merge up to 8 rows of B to L1 work array*/
930       while (L1_rowsleft) {
931         outputi_nnz = 0;
932         if (anzi > 8)  outputj = workj_L1 + L1_nnz;     /* Level 1 rowmerge*/
933         else           outputj = cj + ci_nnz;           /* Merge directly to C */
934 
935         switch (L1_rowsleft) {
936         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
937                  brow_end[0] = inputj + inputi[inputcol[0]+1];
938                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
939                  inputcol    += L1_rowsleft;
940                  rowsleft    -= L1_rowsleft;
941                  L1_rowsleft  = 0;
942                  break;
943         case 2:  MatMatMultSymbolic_RowMergeMacro(2);
944                  inputcol    += L1_rowsleft;
945                  rowsleft    -= L1_rowsleft;
946                  L1_rowsleft  = 0;
947                  break;
948         case 3: MatMatMultSymbolic_RowMergeMacro(3);
949                  inputcol    += L1_rowsleft;
950                  rowsleft    -= L1_rowsleft;
951                  L1_rowsleft  = 0;
952                  break;
953         case 4:  MatMatMultSymbolic_RowMergeMacro(4);
954                  inputcol    += L1_rowsleft;
955                  rowsleft    -= L1_rowsleft;
956                  L1_rowsleft  = 0;
957                  break;
958         case 5:  MatMatMultSymbolic_RowMergeMacro(5);
959                  inputcol    += L1_rowsleft;
960                  rowsleft    -= L1_rowsleft;
961                  L1_rowsleft  = 0;
962                  break;
963         case 6:  MatMatMultSymbolic_RowMergeMacro(6);
964                  inputcol    += L1_rowsleft;
965                  rowsleft    -= L1_rowsleft;
966                  L1_rowsleft  = 0;
967                  break;
968         case 7:  MatMatMultSymbolic_RowMergeMacro(7);
969                  inputcol    += L1_rowsleft;
970                  rowsleft    -= L1_rowsleft;
971                  L1_rowsleft  = 0;
972                  break;
973         default: MatMatMultSymbolic_RowMergeMacro(8);
974                  inputcol    += 8;
975                  rowsleft    -= 8;
976                  L1_rowsleft -= 8;
977                  break;
978         }
979         inputcol_L1           = inputcol;
980         L1_nnz               += outputi_nnz;
981         worki_L1[++L1_nrows]  = L1_nnz;
982       }
983 
984       /********************** L E V E L  2 ************************/
985       /* Merge from L1 work array to either C or to L2 work array */
986       if (anzi > 8) {
987         inputi      = worki_L1;
988         inputj      = workj_L1;
989         inputcol    = workcol;
990         outputi_nnz = 0;
991 
992         if (anzi <= 64) outputj = cj + ci_nnz;        /* Merge from L1 work array to C */
993         else            outputj = workj_L2 + L2_nnz;  /* Merge from L1 work array to L2 work array */
994 
995         switch (L1_nrows) {
996         case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
997                  brow_end[0] = inputj + inputi[inputcol[0]+1];
998                  for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
999                  break;
1000         case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1001         case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1002         case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1003         case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1004         case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1005         case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1006         case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1007         default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1008         }
1009         L2_nnz               += outputi_nnz;
1010         worki_L2[++L2_nrows]  = L2_nnz;
1011 
1012         /************************ L E V E L  3 **********************/
1013         /* Merge from L2 work array to either C or to L2 work array */
1014         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1015           inputi      = worki_L2;
1016           inputj      = workj_L2;
1017           inputcol    = workcol;
1018           outputi_nnz = 0;
1019           if (rowsleft) outputj = workj_L3;
1020           else          outputj = cj + ci_nnz;
1021           switch (L2_nrows) {
1022           case 1:  brow_ptr[0] = inputj + inputi[inputcol[0]];
1023                    brow_end[0] = inputj + inputi[inputcol[0]+1];
1024                    for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1025                    break;
1026           case 2:  MatMatMultSymbolic_RowMergeMacro(2); break;
1027           case 3:  MatMatMultSymbolic_RowMergeMacro(3); break;
1028           case 4:  MatMatMultSymbolic_RowMergeMacro(4); break;
1029           case 5:  MatMatMultSymbolic_RowMergeMacro(5); break;
1030           case 6:  MatMatMultSymbolic_RowMergeMacro(6); break;
1031           case 7:  MatMatMultSymbolic_RowMergeMacro(7); break;
1032           case 8:  MatMatMultSymbolic_RowMergeMacro(8); break;
1033           default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1034           }
1035           L2_nrows    = 1;
1036           L2_nnz      = outputi_nnz;
1037           worki_L2[1] = outputi_nnz;
1038           /* Copy to workj_L2 */
1039           if (rowsleft) {
1040             for (k=0; k<outputi_nnz; ++k)  workj_L2[k] = outputj[k];
1041           }
1042         }
1043       }
1044     }  /* while (rowsleft) */
1045 #undef MatMatMultSymbolic_RowMergeMacro
1046 
1047     /* terminate current row */
1048     ci_nnz += outputi_nnz;
1049     ci[i+1] = ci_nnz;
1050   }
1051 
1052   /* Step 3: Create the new symbolic matrix */
1053   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
1054   PetscCall(MatSetBlockSizesFromMats(C,A,B));
1055 
1056   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1057   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1058   c          = (Mat_SeqAIJ*)(C->data);
1059   c->free_a  = PETSC_TRUE;
1060   c->free_ij = PETSC_TRUE;
1061   c->nonew   = 0;
1062 
1063   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1064 
1065   /* set MatInfo */
1066   afill = (PetscReal)ci[am]/(ai[am]+bi[bm]) + 1.e-5;
1067   if (afill < 1.0) afill = 1.0;
1068   C->info.mallocs           = ndouble;
1069   C->info.fill_ratio_given  = fill;
1070   C->info.fill_ratio_needed = afill;
1071 
1072 #if defined(PETSC_USE_INFO)
1073   if (ci[am]) {
1074     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
1075     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
1076   } else {
1077     PetscCall(PetscInfo(C,"Empty matrix product\n"));
1078   }
1079 #endif
1080 
1081   /* Step 4: Free temporary work areas */
1082   PetscCall(PetscFree(workj_L1));
1083   PetscCall(PetscFree(workj_L2));
1084   PetscCall(PetscFree(workj_L3));
1085   PetscFunctionReturn(0);
1086 }
1087 
1088 /* concatenate unique entries and then sort */
1089 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)
1090 {
1091   Mat_SeqAIJ     *a  = (Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c;
1092   const PetscInt *ai = a->i,*bi=b->i,*aj=a->j,*bj=b->j;
1093   PetscInt       *ci,*cj,bcol;
1094   PetscInt       am=A->rmap->N,bn=B->cmap->N,bm=B->rmap->N;
1095   PetscReal      afill;
1096   PetscInt       i,j,ndouble = 0;
1097   PetscSegBuffer seg,segrow;
1098   char           *seen;
1099 
1100   PetscFunctionBegin;
1101   PetscCall(PetscMalloc1(am+1,&ci));
1102   ci[0] = 0;
1103 
1104   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1105   PetscCall(PetscSegBufferCreate(sizeof(PetscInt),(PetscInt)(fill*(ai[am]+bi[bm])),&seg));
1106   PetscCall(PetscSegBufferCreate(sizeof(PetscInt),100,&segrow));
1107   PetscCall(PetscCalloc1(bn,&seen));
1108 
1109   /* Determine ci and cj */
1110   for (i=0; i<am; i++) {
1111     const PetscInt anzi  = ai[i+1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1112     const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
1113     PetscInt packlen = 0,*PETSC_RESTRICT crow;
1114 
1115     /* Pack segrow */
1116     for (j=0; j<anzi; j++) {
1117       PetscInt brow = acol[j],bjstart = bi[brow],bjend = bi[brow+1],k;
1118       for (k=bjstart; k<bjend; k++) {
1119         bcol = bj[k];
1120         if (!seen[bcol]) { /* new entry */
1121           PetscInt *PETSC_RESTRICT slot;
1122           PetscCall(PetscSegBufferGetInts(segrow,1,&slot));
1123           *slot = bcol;
1124           seen[bcol] = 1;
1125           packlen++;
1126         }
1127       }
1128     }
1129 
1130     /* Check i-th diagonal entry */
1131     if (C->force_diagonals && !seen[i]) {
1132       PetscInt *PETSC_RESTRICT slot;
1133       PetscCall(PetscSegBufferGetInts(segrow,1,&slot));
1134       *slot   = i;
1135       seen[i] = 1;
1136       packlen++;
1137     }
1138 
1139     PetscCall(PetscSegBufferGetInts(seg,packlen,&crow));
1140     PetscCall(PetscSegBufferExtractTo(segrow,crow));
1141     PetscCall(PetscSortInt(packlen,crow));
1142     ci[i+1] = ci[i] + packlen;
1143     for (j=0; j<packlen; j++) seen[crow[j]] = 0;
1144   }
1145   PetscCall(PetscSegBufferDestroy(&segrow));
1146   PetscCall(PetscFree(seen));
1147 
1148   /* Column indices are in the segmented buffer */
1149   PetscCall(PetscSegBufferExtractAlloc(seg,&cj));
1150   PetscCall(PetscSegBufferDestroy(&seg));
1151 
1152   /* put together the new symbolic matrix */
1153   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A),am,bn,ci,cj,NULL,((PetscObject)A)->type_name,C));
1154   PetscCall(MatSetBlockSizesFromMats(C,A,B));
1155 
1156   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1157   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1158   c          = (Mat_SeqAIJ*)(C->data);
1159   c->free_a  = PETSC_TRUE;
1160   c->free_ij = PETSC_TRUE;
1161   c->nonew   = 0;
1162 
1163   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1164 
1165   /* set MatInfo */
1166   afill = (PetscReal)ci[am]/PetscMax(ai[am]+bi[bm],1) + 1.e-5;
1167   if (afill < 1.0) afill = 1.0;
1168   C->info.mallocs           = ndouble;
1169   C->info.fill_ratio_given  = fill;
1170   C->info.fill_ratio_needed = afill;
1171 
1172 #if defined(PETSC_USE_INFO)
1173   if (ci[am]) {
1174     PetscCall(PetscInfo(C,"Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",ndouble,(double)fill,(double)afill));
1175     PetscCall(PetscInfo(C,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill));
1176   } else {
1177     PetscCall(PetscInfo(C,"Empty matrix product\n"));
1178   }
1179 #endif
1180   PetscFunctionReturn(0);
1181 }
1182 
1183 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1184 {
1185   Mat_MatMatTransMult *abt=(Mat_MatMatTransMult *)data;
1186 
1187   PetscFunctionBegin;
1188   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
1189   PetscCall(MatDestroy(&abt->Bt_den));
1190   PetscCall(MatDestroy(&abt->ABt_den));
1191   PetscCall(PetscFree(abt));
1192   PetscFunctionReturn(0);
1193 }
1194 
1195 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1196 {
1197   Mat                 Bt;
1198   Mat_MatMatTransMult *abt;
1199   Mat_Product         *product = C->product;
1200   char                *alg;
1201 
1202   PetscFunctionBegin;
1203   PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1204   PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1205 
1206   /* create symbolic Bt */
1207   PetscCall(MatTransposeSymbolic(B,&Bt));
1208   PetscCall(MatSetBlockSizes(Bt,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs)));
1209   PetscCall(MatSetType(Bt,((PetscObject)A)->type_name));
1210 
1211   /* get symbolic C=A*Bt */
1212   PetscCall(PetscStrallocpy(product->alg,&alg));
1213   PetscCall(MatProductSetAlgorithm(C,"sorted")); /* set algorithm for C = A*Bt */
1214   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A,Bt,fill,C));
1215   PetscCall(MatProductSetAlgorithm(C,alg)); /* resume original algorithm for ABt product */
1216   PetscCall(PetscFree(alg));
1217 
1218   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1219   PetscCall(PetscNew(&abt));
1220 
1221   product->data    = abt;
1222   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
1223 
1224   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
1225 
1226   abt->usecoloring = PETSC_FALSE;
1227   PetscCall(PetscStrcmp(product->alg,"color",&abt->usecoloring));
1228   if (abt->usecoloring) {
1229     /* Create MatTransposeColoring from symbolic C=A*B^T */
1230     MatTransposeColoring matcoloring;
1231     MatColoring          coloring;
1232     ISColoring           iscoloring;
1233     Mat                  Bt_dense,C_dense;
1234 
1235     /* inode causes memory problem */
1236     PetscCall(MatSetOption(C,MAT_USE_INODES,PETSC_FALSE));
1237 
1238     PetscCall(MatColoringCreate(C,&coloring));
1239     PetscCall(MatColoringSetDistance(coloring,2));
1240     PetscCall(MatColoringSetType(coloring,MATCOLORINGSL));
1241     PetscCall(MatColoringSetFromOptions(coloring));
1242     PetscCall(MatColoringApply(coloring,&iscoloring));
1243     PetscCall(MatColoringDestroy(&coloring));
1244     PetscCall(MatTransposeColoringCreate(C,iscoloring,&matcoloring));
1245 
1246     abt->matcoloring = matcoloring;
1247 
1248     PetscCall(ISColoringDestroy(&iscoloring));
1249 
1250     /* Create Bt_dense and C_dense = A*Bt_dense */
1251     PetscCall(MatCreate(PETSC_COMM_SELF,&Bt_dense));
1252     PetscCall(MatSetSizes(Bt_dense,A->cmap->n,matcoloring->ncolors,A->cmap->n,matcoloring->ncolors));
1253     PetscCall(MatSetType(Bt_dense,MATSEQDENSE));
1254     PetscCall(MatSeqDenseSetPreallocation(Bt_dense,NULL));
1255 
1256     Bt_dense->assembled = PETSC_TRUE;
1257     abt->Bt_den         = Bt_dense;
1258 
1259     PetscCall(MatCreate(PETSC_COMM_SELF,&C_dense));
1260     PetscCall(MatSetSizes(C_dense,A->rmap->n,matcoloring->ncolors,A->rmap->n,matcoloring->ncolors));
1261     PetscCall(MatSetType(C_dense,MATSEQDENSE));
1262     PetscCall(MatSeqDenseSetPreallocation(C_dense,NULL));
1263 
1264     Bt_dense->assembled = PETSC_TRUE;
1265     abt->ABt_den  = C_dense;
1266 
1267 #if defined(PETSC_USE_INFO)
1268     {
1269       Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
1270       PetscCall(PetscInfo(C,"Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n",B->cmap->n,B->rmap->n,Bt_dense->rmap->n,Bt_dense->cmap->n,c->nz,A->rmap->n*matcoloring->ncolors,(double)(((PetscReal)(c->nz))/((PetscReal)(A->rmap->n*matcoloring->ncolors)))));
1271     }
1272 #endif
1273   }
1274   /* clean up */
1275   PetscCall(MatDestroy(&Bt));
1276   PetscFunctionReturn(0);
1277 }
1278 
1279 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1280 {
1281   Mat_SeqAIJ          *a   =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1282   PetscInt            *ai  =a->i,*aj=a->j,*bi=b->i,*bj=b->j,anzi,bnzj,nexta,nextb,*acol,*bcol,brow;
1283   PetscInt            cm   =C->rmap->n,*ci=c->i,*cj=c->j,i,j,cnzi,*ccol;
1284   PetscLogDouble      flops=0.0;
1285   MatScalar           *aa  =a->a,*aval,*ba=b->a,*bval,*ca,*cval;
1286   Mat_MatMatTransMult *abt;
1287   Mat_Product         *product = C->product;
1288 
1289   PetscFunctionBegin;
1290   PetscCheck(product,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1291   abt = (Mat_MatMatTransMult *)product->data;
1292   PetscCheck(abt,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1293   /* clear old values in C */
1294   if (!c->a) {
1295     PetscCall(PetscCalloc1(ci[cm]+1,&ca));
1296     c->a      = ca;
1297     c->free_a = PETSC_TRUE;
1298   } else {
1299     ca =  c->a;
1300     PetscCall(PetscArrayzero(ca,ci[cm]+1));
1301   }
1302 
1303   if (abt->usecoloring) {
1304     MatTransposeColoring matcoloring = abt->matcoloring;
1305     Mat                  Bt_dense,C_dense = abt->ABt_den;
1306 
1307     /* Get Bt_dense by Apply MatTransposeColoring to B */
1308     Bt_dense = abt->Bt_den;
1309     PetscCall(MatTransColoringApplySpToDen(matcoloring,B,Bt_dense));
1310 
1311     /* C_dense = A*Bt_dense */
1312     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A,Bt_dense,C_dense));
1313 
1314     /* Recover C from C_dense */
1315     PetscCall(MatTransColoringApplyDenToSp(matcoloring,C_dense,C));
1316     PetscFunctionReturn(0);
1317   }
1318 
1319   for (i=0; i<cm; i++) {
1320     anzi = ai[i+1] - ai[i];
1321     acol = aj + ai[i];
1322     aval = aa + ai[i];
1323     cnzi = ci[i+1] - ci[i];
1324     ccol = cj + ci[i];
1325     cval = ca + ci[i];
1326     for (j=0; j<cnzi; j++) {
1327       brow = ccol[j];
1328       bnzj = bi[brow+1] - bi[brow];
1329       bcol = bj + bi[brow];
1330       bval = ba + bi[brow];
1331 
1332       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1333       nexta = 0; nextb = 0;
1334       while (nexta<anzi && nextb<bnzj) {
1335         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1336         if (nexta == anzi) break;
1337         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1338         if (nextb == bnzj) break;
1339         if (acol[nexta] == bcol[nextb]) {
1340           cval[j] += aval[nexta]*bval[nextb];
1341           nexta++; nextb++;
1342           flops += 2;
1343         }
1344       }
1345     }
1346   }
1347   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
1348   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
1349   PetscCall(PetscLogFlops(flops));
1350   PetscFunctionReturn(0);
1351 }
1352 
1353 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1354 {
1355   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult*)data;
1356 
1357   PetscFunctionBegin;
1358   PetscCall(MatDestroy(&atb->At));
1359   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
1360   PetscCall(PetscFree(atb));
1361   PetscFunctionReturn(0);
1362 }
1363 
1364 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)
1365 {
1366   Mat            At = NULL;
1367   Mat_Product    *product = C->product;
1368   PetscBool      flg,def,square;
1369 
1370   PetscFunctionBegin;
1371   MatCheckProduct(C,4);
1372   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
1373   /* outerproduct */
1374   PetscCall(PetscStrcmp(product->alg,"outerproduct",&flg));
1375   if (flg) {
1376     /* create symbolic At */
1377     if (!square) {
1378       PetscCall(MatTransposeSymbolic(A,&At));
1379       PetscCall(MatSetBlockSizes(At,PetscAbs(A->cmap->bs),PetscAbs(B->cmap->bs)));
1380       PetscCall(MatSetType(At,((PetscObject)A)->type_name));
1381     }
1382     /* get symbolic C=At*B */
1383     PetscCall(MatProductSetAlgorithm(C,"sorted"));
1384     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C));
1385 
1386     /* clean up */
1387     if (!square) {
1388       PetscCall(MatDestroy(&At));
1389     }
1390 
1391     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1392     PetscCall(MatProductSetAlgorithm(C,"outerproduct"));
1393     PetscFunctionReturn(0);
1394   }
1395 
1396   /* matmatmult */
1397   PetscCall(PetscStrcmp(product->alg,"default",&def));
1398   PetscCall(PetscStrcmp(product->alg,"at*b",&flg));
1399   if (flg || def) {
1400     Mat_MatTransMatMult *atb;
1401 
1402     PetscCheck(!product->data,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Extra product struct not empty");
1403     PetscCall(PetscNew(&atb));
1404     if (!square) {
1405       PetscCall(MatTranspose(A,MAT_INITIAL_MATRIX,&At));
1406     }
1407     PetscCall(MatProductSetAlgorithm(C,"sorted"));
1408     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At,B,fill,C));
1409     PetscCall(MatProductSetAlgorithm(C,"at*b"));
1410     product->data    = atb;
1411     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
1412     atb->At          = At;
1413 
1414     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1415     PetscFunctionReturn(0);
1416   }
1417 
1418   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
1419 }
1420 
1421 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)
1422 {
1423   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data,*c=(Mat_SeqAIJ*)C->data;
1424   PetscInt       am=A->rmap->n,anzi,*ai=a->i,*aj=a->j,*bi=b->i,*bj,bnzi,nextb;
1425   PetscInt       cm=C->rmap->n,*ci=c->i,*cj=c->j,crow,*cjj,i,j,k;
1426   PetscLogDouble flops=0.0;
1427   MatScalar      *aa=a->a,*ba,*ca,*caj;
1428 
1429   PetscFunctionBegin;
1430   if (!c->a) {
1431     PetscCall(PetscCalloc1(ci[cm]+1,&ca));
1432 
1433     c->a      = ca;
1434     c->free_a = PETSC_TRUE;
1435   } else {
1436     ca   = c->a;
1437     PetscCall(PetscArrayzero(ca,ci[cm]));
1438   }
1439 
1440   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1441   for (i=0; i<am; i++) {
1442     bj   = b->j + bi[i];
1443     ba   = b->a + bi[i];
1444     bnzi = bi[i+1] - bi[i];
1445     anzi = ai[i+1] - ai[i];
1446     for (j=0; j<anzi; j++) {
1447       nextb = 0;
1448       crow  = *aj++;
1449       cjj   = cj + ci[crow];
1450       caj   = ca + ci[crow];
1451       /* perform sparse axpy operation.  Note cjj includes bj. */
1452       for (k=0; nextb<bnzi; k++) {
1453         if (cjj[k] == *(bj+nextb)) { /* ccol == bcol */
1454           caj[k] += (*aa)*(*(ba+nextb));
1455           nextb++;
1456         }
1457       }
1458       flops += 2*bnzi;
1459       aa++;
1460     }
1461   }
1462 
1463   /* Assemble the final matrix and clean up */
1464   PetscCall(MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY));
1465   PetscCall(MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY));
1466   PetscCall(PetscLogFlops(flops));
1467   PetscFunctionReturn(0);
1468 }
1469 
1470 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1471 {
1472   PetscFunctionBegin;
1473   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C));
1474   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1475   PetscFunctionReturn(0);
1476 }
1477 
1478 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)
1479 {
1480   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
1481   PetscScalar       *c,r1,r2,r3,r4,*c1,*c2,*c3,*c4;
1482   const PetscScalar *aa,*b,*b1,*b2,*b3,*b4,*av;
1483   const PetscInt    *aj;
1484   PetscInt          cm=C->rmap->n,cn=B->cmap->n,bm,am=A->rmap->n;
1485   PetscInt          clda;
1486   PetscInt          am4,bm4,col,i,j,n;
1487 
1488   PetscFunctionBegin;
1489   if (!cm || !cn) PetscFunctionReturn(0);
1490   PetscCall(MatSeqAIJGetArrayRead(A,&av));
1491   if (add) {
1492     PetscCall(MatDenseGetArray(C,&c));
1493   } else {
1494     PetscCall(MatDenseGetArrayWrite(C,&c));
1495   }
1496   PetscCall(MatDenseGetArrayRead(B,&b));
1497   PetscCall(MatDenseGetLDA(B,&bm));
1498   PetscCall(MatDenseGetLDA(C,&clda));
1499   am4 = 4*clda;
1500   bm4 = 4*bm;
1501   b1 = b; b2 = b1 + bm; b3 = b2 + bm; b4 = b3 + bm;
1502   c1 = c; c2 = c1 + clda; c3 = c2 + clda; c4 = c3 + clda;
1503   for (col=0; col<(cn/4)*4; col += 4) {  /* over columns of C */
1504     for (i=0; i<am; i++) {        /* over rows of A in those columns */
1505       r1 = r2 = r3 = r4 = 0.0;
1506       n  = a->i[i+1] - a->i[i];
1507       aj = a->j + a->i[i];
1508       aa = av + a->i[i];
1509       for (j=0; j<n; j++) {
1510         const PetscScalar aatmp = aa[j];
1511         const PetscInt    ajtmp = aj[j];
1512         r1 += aatmp*b1[ajtmp];
1513         r2 += aatmp*b2[ajtmp];
1514         r3 += aatmp*b3[ajtmp];
1515         r4 += aatmp*b4[ajtmp];
1516       }
1517       if (add) {
1518         c1[i] += r1;
1519         c2[i] += r2;
1520         c3[i] += r3;
1521         c4[i] += r4;
1522       } else {
1523         c1[i] = r1;
1524         c2[i] = r2;
1525         c3[i] = r3;
1526         c4[i] = r4;
1527       }
1528     }
1529     b1 += bm4; b2 += bm4; b3 += bm4; b4 += bm4;
1530     c1 += am4; c2 += am4; c3 += am4; c4 += am4;
1531   }
1532   /* process remaining columns */
1533   if (col != cn) {
1534     PetscInt rc = cn-col;
1535 
1536     if (rc == 1) {
1537       for (i=0; i<am; i++) {
1538         r1 = 0.0;
1539         n  = a->i[i+1] - a->i[i];
1540         aj = a->j + a->i[i];
1541         aa = av + a->i[i];
1542         for (j=0; j<n; j++) r1 += aa[j]*b1[aj[j]];
1543         if (add) c1[i] += r1;
1544         else c1[i] = r1;
1545       }
1546     } else if (rc == 2) {
1547       for (i=0; i<am; i++) {
1548         r1 = r2 = 0.0;
1549         n  = a->i[i+1] - a->i[i];
1550         aj = a->j + a->i[i];
1551         aa = av + a->i[i];
1552         for (j=0; j<n; j++) {
1553           const PetscScalar aatmp = aa[j];
1554           const PetscInt    ajtmp = aj[j];
1555           r1 += aatmp*b1[ajtmp];
1556           r2 += aatmp*b2[ajtmp];
1557         }
1558         if (add) {
1559           c1[i] += r1;
1560           c2[i] += r2;
1561         } else {
1562           c1[i] = r1;
1563           c2[i] = r2;
1564         }
1565       }
1566     } else {
1567       for (i=0; i<am; i++) {
1568         r1 = r2 = r3 = 0.0;
1569         n  = a->i[i+1] - a->i[i];
1570         aj = a->j + a->i[i];
1571         aa = av + a->i[i];
1572         for (j=0; j<n; j++) {
1573           const PetscScalar aatmp = aa[j];
1574           const PetscInt    ajtmp = aj[j];
1575           r1 += aatmp*b1[ajtmp];
1576           r2 += aatmp*b2[ajtmp];
1577           r3 += aatmp*b3[ajtmp];
1578         }
1579         if (add) {
1580           c1[i] += r1;
1581           c2[i] += r2;
1582           c3[i] += r3;
1583         } else {
1584           c1[i] = r1;
1585           c2[i] = r2;
1586           c3[i] = r3;
1587         }
1588       }
1589     }
1590   }
1591   PetscCall(PetscLogFlops(cn*(2.0*a->nz)));
1592   if (add) {
1593     PetscCall(MatDenseRestoreArray(C,&c));
1594   } else {
1595     PetscCall(MatDenseRestoreArrayWrite(C,&c));
1596   }
1597   PetscCall(MatDenseRestoreArrayRead(B,&b));
1598   PetscCall(MatSeqAIJRestoreArrayRead(A,&av));
1599   PetscFunctionReturn(0);
1600 }
1601 
1602 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)
1603 {
1604   PetscFunctionBegin;
1605   PetscCheck(B->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT,A->cmap->n,B->rmap->n);
1606   PetscCheck(A->rmap->n == C->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT,C->rmap->n,A->rmap->n);
1607   PetscCheck(B->cmap->n == C->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT,B->cmap->n,C->cmap->n);
1608 
1609   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A,B,C,PETSC_FALSE));
1610   PetscFunctionReturn(0);
1611 }
1612 
1613 /* ------------------------------------------------------- */
1614 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1615 {
1616   PetscFunctionBegin;
1617   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1618   C->ops->productsymbolic = MatProductSymbolic_AB;
1619   PetscFunctionReturn(0);
1620 }
1621 
1622 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat,Mat,PetscReal,Mat);
1623 
1624 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1625 {
1626   PetscFunctionBegin;
1627   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1628   C->ops->productsymbolic          = MatProductSymbolic_AtB;
1629   PetscFunctionReturn(0);
1630 }
1631 
1632 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1633 {
1634   PetscFunctionBegin;
1635   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1636   C->ops->productsymbolic          = MatProductSymbolic_ABt;
1637   PetscFunctionReturn(0);
1638 }
1639 
1640 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1641 {
1642   Mat_Product    *product = C->product;
1643 
1644   PetscFunctionBegin;
1645   switch (product->type) {
1646   case MATPRODUCT_AB:
1647     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1648     break;
1649   case MATPRODUCT_AtB:
1650     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1651     break;
1652   case MATPRODUCT_ABt:
1653     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1654     break;
1655   default:
1656     break;
1657   }
1658   PetscFunctionReturn(0);
1659 }
1660 /* ------------------------------------------------------- */
1661 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1662 {
1663   Mat_Product    *product = C->product;
1664   Mat            A = product->A;
1665   PetscBool      baij;
1666 
1667   PetscFunctionBegin;
1668   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQBAIJ,&baij));
1669   if (!baij) { /* A is seqsbaij */
1670     PetscBool sbaij;
1671     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQSBAIJ,&sbaij));
1672     PetscCheck(sbaij,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONGSTATE,"Mat must be either seqbaij or seqsbaij format");
1673 
1674     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1675   } else { /* A is seqbaij */
1676     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1677   }
1678 
1679   C->ops->productsymbolic = MatProductSymbolic_AB;
1680   PetscFunctionReturn(0);
1681 }
1682 
1683 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1684 {
1685   Mat_Product    *product = C->product;
1686 
1687   PetscFunctionBegin;
1688   MatCheckProduct(C,1);
1689   PetscCheck(product->A,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
1690   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
1691   PetscFunctionReturn(0);
1692 }
1693 
1694 /* ------------------------------------------------------- */
1695 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1696 {
1697   PetscFunctionBegin;
1698   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1699   C->ops->productsymbolic = MatProductSymbolic_AB;
1700   PetscFunctionReturn(0);
1701 }
1702 
1703 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1704 {
1705   Mat_Product    *product = C->product;
1706 
1707   PetscFunctionBegin;
1708   if (product->type == MATPRODUCT_AB) {
1709     PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
1710   }
1711   PetscFunctionReturn(0);
1712 }
1713 /* ------------------------------------------------------- */
1714 
1715 PetscErrorCode  MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)
1716 {
1717   Mat_SeqAIJ     *b       = (Mat_SeqAIJ*)B->data;
1718   Mat_SeqDense   *btdense = (Mat_SeqDense*)Btdense->data;
1719   PetscInt       *bi      = b->i,*bj=b->j;
1720   PetscInt       m        = Btdense->rmap->n,n=Btdense->cmap->n,j,k,l,col,anz,*btcol,brow,ncolumns;
1721   MatScalar      *btval,*btval_den,*ba=b->a;
1722   PetscInt       *columns=coloring->columns,*colorforcol=coloring->colorforcol,ncolors=coloring->ncolors;
1723 
1724   PetscFunctionBegin;
1725   btval_den=btdense->v;
1726   PetscCall(PetscArrayzero(btval_den,m*n));
1727   for (k=0; k<ncolors; k++) {
1728     ncolumns = coloring->ncolumns[k];
1729     for (l=0; l<ncolumns; l++) { /* insert a row of B to a column of Btdense */
1730       col   = *(columns + colorforcol[k] + l);
1731       btcol = bj + bi[col];
1732       btval = ba + bi[col];
1733       anz   = bi[col+1] - bi[col];
1734       for (j=0; j<anz; j++) {
1735         brow            = btcol[j];
1736         btval_den[brow] = btval[j];
1737       }
1738     }
1739     btval_den += m;
1740   }
1741   PetscFunctionReturn(0);
1742 }
1743 
1744 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)
1745 {
1746   Mat_SeqAIJ        *csp = (Mat_SeqAIJ*)Csp->data;
1747   const PetscScalar *ca_den,*ca_den_ptr;
1748   PetscScalar       *ca=csp->a;
1749   PetscInt          k,l,m=Cden->rmap->n,ncolors=matcoloring->ncolors;
1750   PetscInt          brows=matcoloring->brows,*den2sp=matcoloring->den2sp;
1751   PetscInt          nrows,*row,*idx;
1752   PetscInt          *rows=matcoloring->rows,*colorforrow=matcoloring->colorforrow;
1753 
1754   PetscFunctionBegin;
1755   PetscCall(MatDenseGetArrayRead(Cden,&ca_den));
1756 
1757   if (brows > 0) {
1758     PetscInt *lstart,row_end,row_start;
1759     lstart = matcoloring->lstart;
1760     PetscCall(PetscArrayzero(lstart,ncolors));
1761 
1762     row_end = brows;
1763     if (row_end > m) row_end = m;
1764     for (row_start=0; row_start<m; row_start+=brows) { /* loop over row blocks of Csp */
1765       ca_den_ptr = ca_den;
1766       for (k=0; k<ncolors; k++) { /* loop over colors (columns of Cden) */
1767         nrows = matcoloring->nrows[k];
1768         row   = rows  + colorforrow[k];
1769         idx   = den2sp + colorforrow[k];
1770         for (l=lstart[k]; l<nrows; l++) {
1771           if (row[l] >= row_end) {
1772             lstart[k] = l;
1773             break;
1774           } else {
1775             ca[idx[l]] = ca_den_ptr[row[l]];
1776           }
1777         }
1778         ca_den_ptr += m;
1779       }
1780       row_end += brows;
1781       if (row_end > m) row_end = m;
1782     }
1783   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1784     ca_den_ptr = ca_den;
1785     for (k=0; k<ncolors; k++) {
1786       nrows = matcoloring->nrows[k];
1787       row   = rows  + colorforrow[k];
1788       idx   = den2sp + colorforrow[k];
1789       for (l=0; l<nrows; l++) {
1790         ca[idx[l]] = ca_den_ptr[row[l]];
1791       }
1792       ca_den_ptr += m;
1793     }
1794   }
1795 
1796   PetscCall(MatDenseRestoreArrayRead(Cden,&ca_den));
1797 #if defined(PETSC_USE_INFO)
1798   if (matcoloring->brows > 0) {
1799     PetscCall(PetscInfo(Csp,"Loop over %" PetscInt_FMT " row blocks for den2sp\n",brows));
1800   } else {
1801     PetscCall(PetscInfo(Csp,"Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1802   }
1803 #endif
1804   PetscFunctionReturn(0);
1805 }
1806 
1807 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)
1808 {
1809   PetscInt       i,n,nrows,Nbs,j,k,m,ncols,col,cm;
1810   const PetscInt *is,*ci,*cj,*row_idx;
1811   PetscInt       nis = iscoloring->n,*rowhit,bs = 1;
1812   IS             *isa;
1813   Mat_SeqAIJ     *csp = (Mat_SeqAIJ*)mat->data;
1814   PetscInt       *colorforrow,*rows,*rows_i,*idxhit,*spidx,*den2sp,*den2sp_i;
1815   PetscInt       *colorforcol,*columns,*columns_i,brows;
1816   PetscBool      flg;
1817 
1818   PetscFunctionBegin;
1819   PetscCall(ISColoringGetIS(iscoloring,PETSC_USE_POINTER,PETSC_IGNORE,&isa));
1820 
1821   /* bs >1 is not being tested yet! */
1822   Nbs       = mat->cmap->N/bs;
1823   c->M      = mat->rmap->N/bs;  /* set total rows, columns and local rows */
1824   c->N      = Nbs;
1825   c->m      = c->M;
1826   c->rstart = 0;
1827   c->brows  = 100;
1828 
1829   c->ncolors = nis;
1830   PetscCall(PetscMalloc3(nis,&c->ncolumns,nis,&c->nrows,nis+1,&colorforrow));
1831   PetscCall(PetscMalloc1(csp->nz+1,&rows));
1832   PetscCall(PetscMalloc1(csp->nz+1,&den2sp));
1833 
1834   brows = c->brows;
1835   PetscCall(PetscOptionsGetInt(NULL,NULL,"-matden2sp_brows",&brows,&flg));
1836   if (flg) c->brows = brows;
1837   if (brows > 0) {
1838     PetscCall(PetscMalloc1(nis+1,&c->lstart));
1839   }
1840 
1841   colorforrow[0] = 0;
1842   rows_i         = rows;
1843   den2sp_i       = den2sp;
1844 
1845   PetscCall(PetscMalloc1(nis+1,&colorforcol));
1846   PetscCall(PetscMalloc1(Nbs+1,&columns));
1847 
1848   colorforcol[0] = 0;
1849   columns_i      = columns;
1850 
1851   /* get column-wise storage of mat */
1852   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL));
1853 
1854   cm   = c->m;
1855   PetscCall(PetscMalloc1(cm+1,&rowhit));
1856   PetscCall(PetscMalloc1(cm+1,&idxhit));
1857   for (i=0; i<nis; i++) { /* loop over color */
1858     PetscCall(ISGetLocalSize(isa[i],&n));
1859     PetscCall(ISGetIndices(isa[i],&is));
1860 
1861     c->ncolumns[i] = n;
1862     if (n) PetscCall(PetscArraycpy(columns_i,is,n));
1863     colorforcol[i+1] = colorforcol[i] + n;
1864     columns_i       += n;
1865 
1866     /* fast, crude version requires O(N*N) work */
1867     PetscCall(PetscArrayzero(rowhit,cm));
1868 
1869     for (j=0; j<n; j++) { /* loop over columns*/
1870       col     = is[j];
1871       row_idx = cj + ci[col];
1872       m       = ci[col+1] - ci[col];
1873       for (k=0; k<m; k++) { /* loop over columns marking them in rowhit */
1874         idxhit[*row_idx]   = spidx[ci[col] + k];
1875         rowhit[*row_idx++] = col + 1;
1876       }
1877     }
1878     /* count the number of hits */
1879     nrows = 0;
1880     for (j=0; j<cm; j++) {
1881       if (rowhit[j]) nrows++;
1882     }
1883     c->nrows[i]      = nrows;
1884     colorforrow[i+1] = colorforrow[i] + nrows;
1885 
1886     nrows = 0;
1887     for (j=0; j<cm; j++) { /* loop over rows */
1888       if (rowhit[j]) {
1889         rows_i[nrows]   = j;
1890         den2sp_i[nrows] = idxhit[j];
1891         nrows++;
1892       }
1893     }
1894     den2sp_i += nrows;
1895 
1896     PetscCall(ISRestoreIndices(isa[i],&is));
1897     rows_i += nrows;
1898   }
1899   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat,0,PETSC_FALSE,PETSC_FALSE,&ncols,&ci,&cj,&spidx,NULL));
1900   PetscCall(PetscFree(rowhit));
1901   PetscCall(ISColoringRestoreIS(iscoloring,PETSC_USE_POINTER,&isa));
1902   PetscCheck(csp->nz == colorforrow[nis],PETSC_COMM_SELF,PETSC_ERR_PLIB,"csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT,csp->nz,colorforrow[nis]);
1903 
1904   c->colorforrow = colorforrow;
1905   c->rows        = rows;
1906   c->den2sp      = den2sp;
1907   c->colorforcol = colorforcol;
1908   c->columns     = columns;
1909 
1910   PetscCall(PetscFree(idxhit));
1911   PetscFunctionReturn(0);
1912 }
1913 
1914 /* --------------------------------------------------------------- */
1915 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1916 {
1917   Mat_Product    *product = C->product;
1918   Mat            A=product->A,B=product->B;
1919 
1920   PetscFunctionBegin;
1921   if (C->ops->mattransposemultnumeric) {
1922     /* Alg: "outerproduct" */
1923     PetscCall((*C->ops->mattransposemultnumeric)(A,B,C));
1924   } else {
1925     /* Alg: "matmatmult" -- C = At*B */
1926     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
1927 
1928     PetscCheck(atb,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing product struct");
1929     if (atb->At) {
1930       /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
1931          user may have called MatProductReplaceMats() to get this A=product->A */
1932       PetscCall(MatTransposeSetPrecursor(A,atb->At));
1933       PetscCall(MatTranspose(A,MAT_REUSE_MATRIX,&atb->At));
1934     }
1935     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A,B,C));
1936   }
1937   PetscFunctionReturn(0);
1938 }
1939 
1940 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1941 {
1942   Mat_Product    *product = C->product;
1943   Mat            A=product->A,B=product->B;
1944   PetscReal      fill=product->fill;
1945 
1946   PetscFunctionBegin;
1947   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A,B,fill,C));
1948 
1949   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1950   PetscFunctionReturn(0);
1951 }
1952 
1953 /* --------------------------------------------------------------- */
1954 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1955 {
1956   Mat_Product    *product = C->product;
1957   PetscInt       alg = 0; /* default algorithm */
1958   PetscBool      flg = PETSC_FALSE;
1959 #if !defined(PETSC_HAVE_HYPRE)
1960   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
1961   PetscInt       nalg = 7;
1962 #else
1963   const char     *algTypes[8] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge","hypre"};
1964   PetscInt       nalg = 8;
1965 #endif
1966 
1967   PetscFunctionBegin;
1968   /* Set default algorithm */
1969   PetscCall(PetscStrcmp(C->product->alg,"default",&flg));
1970   if (flg) {
1971     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
1972   }
1973 
1974   /* Get runtime option */
1975   if (product->api_user) {
1976     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMult","Mat");
1977     PetscCall(PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,nalg,algTypes[0],&alg,&flg));
1978     PetscOptionsEnd();
1979   } else {
1980     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AB","Mat");
1981     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AB",algTypes,nalg,algTypes[0],&alg,&flg));
1982     PetscOptionsEnd();
1983   }
1984   if (flg) {
1985     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
1986   }
1987 
1988   C->ops->productsymbolic = MatProductSymbolic_AB;
1989   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
1990   PetscFunctionReturn(0);
1991 }
1992 
1993 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
1994 {
1995   Mat_Product    *product = C->product;
1996   PetscInt       alg = 0; /* default algorithm */
1997   PetscBool      flg = PETSC_FALSE;
1998   const char     *algTypes[3] = {"default","at*b","outerproduct"};
1999   PetscInt       nalg = 3;
2000 
2001   PetscFunctionBegin;
2002   /* Get runtime option */
2003   if (product->api_user) {
2004     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatTransposeMatMult","Mat");
2005     PetscCall(PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2006     PetscOptionsEnd();
2007   } else {
2008     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_AtB","Mat");
2009     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_AtB",algTypes,nalg,algTypes[alg],&alg,&flg));
2010     PetscOptionsEnd();
2011   }
2012   if (flg) {
2013     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2014   }
2015 
2016   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
2017   PetscFunctionReturn(0);
2018 }
2019 
2020 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2021 {
2022   Mat_Product    *product = C->product;
2023   PetscInt       alg = 0; /* default algorithm */
2024   PetscBool      flg = PETSC_FALSE;
2025   const char     *algTypes[2] = {"default","color"};
2026   PetscInt       nalg = 2;
2027 
2028   PetscFunctionBegin;
2029   /* Set default algorithm */
2030   PetscCall(PetscStrcmp(C->product->alg,"default",&flg));
2031   if (!flg) {
2032     alg = 1;
2033     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2034   }
2035 
2036   /* Get runtime option */
2037   if (product->api_user) {
2038     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatTransposeMult","Mat");
2039     PetscCall(PetscOptionsEList("-matmattransmult_via","Algorithmic approach","MatMatTransposeMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2040     PetscOptionsEnd();
2041   } else {
2042     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABt","Mat");
2043     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABt",algTypes,nalg,algTypes[alg],&alg,&flg));
2044     PetscOptionsEnd();
2045   }
2046   if (flg) {
2047     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2048   }
2049 
2050   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2051   C->ops->productsymbolic          = MatProductSymbolic_ABt;
2052   PetscFunctionReturn(0);
2053 }
2054 
2055 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2056 {
2057   Mat_Product    *product = C->product;
2058   PetscBool      flg = PETSC_FALSE;
2059   PetscInt       alg = 0; /* default algorithm -- alg=1 should be default!!! */
2060 #if !defined(PETSC_HAVE_HYPRE)
2061   const char     *algTypes[2] = {"scalable","rap"};
2062   PetscInt       nalg = 2;
2063 #else
2064   const char     *algTypes[3] = {"scalable","rap","hypre"};
2065   PetscInt       nalg = 3;
2066 #endif
2067 
2068   PetscFunctionBegin;
2069   /* Set default algorithm */
2070   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2071   if (flg) {
2072     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2073   }
2074 
2075   /* Get runtime option */
2076   if (product->api_user) {
2077     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatPtAP","Mat");
2078     PetscCall(PetscOptionsEList("-matptap_via","Algorithmic approach","MatPtAP",algTypes,nalg,algTypes[0],&alg,&flg));
2079     PetscOptionsEnd();
2080   } else {
2081     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_PtAP","Mat");
2082     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_PtAP",algTypes,nalg,algTypes[0],&alg,&flg));
2083     PetscOptionsEnd();
2084   }
2085   if (flg) {
2086     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2087   }
2088 
2089   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2090   PetscFunctionReturn(0);
2091 }
2092 
2093 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2094 {
2095   Mat_Product    *product = C->product;
2096   PetscBool      flg = PETSC_FALSE;
2097   PetscInt       alg = 0; /* default algorithm */
2098   const char     *algTypes[3] = {"r*a*rt","r*art","coloring_rart"};
2099   PetscInt        nalg = 3;
2100 
2101   PetscFunctionBegin;
2102   /* Set default algorithm */
2103   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2104   if (flg) {
2105     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2106   }
2107 
2108   /* Get runtime option */
2109   if (product->api_user) {
2110     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatRARt","Mat");
2111     PetscCall(PetscOptionsEList("-matrart_via","Algorithmic approach","MatRARt",algTypes,nalg,algTypes[0],&alg,&flg));
2112     PetscOptionsEnd();
2113   } else {
2114     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_RARt","Mat");
2115     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_RARt",algTypes,nalg,algTypes[0],&alg,&flg));
2116     PetscOptionsEnd();
2117   }
2118   if (flg) {
2119     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2120   }
2121 
2122   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2123   PetscFunctionReturn(0);
2124 }
2125 
2126 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2127 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2128 {
2129   Mat_Product    *product = C->product;
2130   PetscInt       alg = 0; /* default algorithm */
2131   PetscBool      flg = PETSC_FALSE;
2132   const char     *algTypes[7] = {"sorted","scalable","scalable_fast","heap","btheap","llcondensed","rowmerge"};
2133   PetscInt       nalg = 7;
2134 
2135   PetscFunctionBegin;
2136   /* Set default algorithm */
2137   PetscCall(PetscStrcmp(product->alg,"default",&flg));
2138   if (flg) {
2139     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2140   }
2141 
2142   /* Get runtime option */
2143   if (product->api_user) {
2144     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatMatMatMult","Mat");
2145     PetscCall(PetscOptionsEList("-matmatmatmult_via","Algorithmic approach","MatMatMatMult",algTypes,nalg,algTypes[alg],&alg,&flg));
2146     PetscOptionsEnd();
2147   } else {
2148     PetscOptionsBegin(PetscObjectComm((PetscObject)C),((PetscObject)C)->prefix,"MatProduct_ABC","Mat");
2149     PetscCall(PetscOptionsEList("-mat_product_algorithm","Algorithmic approach","MatProduct_ABC",algTypes,nalg,algTypes[alg],&alg,&flg));
2150     PetscOptionsEnd();
2151   }
2152   if (flg) {
2153     PetscCall(MatProductSetAlgorithm(C,(MatProductAlgorithm)algTypes[alg]));
2154   }
2155 
2156   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2157   C->ops->productsymbolic    = MatProductSymbolic_ABC;
2158   PetscFunctionReturn(0);
2159 }
2160 
2161 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2162 {
2163   Mat_Product    *product = C->product;
2164 
2165   PetscFunctionBegin;
2166   switch (product->type) {
2167   case MATPRODUCT_AB:
2168     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2169     break;
2170   case MATPRODUCT_AtB:
2171     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2172     break;
2173   case MATPRODUCT_ABt:
2174     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2175     break;
2176   case MATPRODUCT_PtAP:
2177     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2178     break;
2179   case MATPRODUCT_RARt:
2180     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2181     break;
2182   case MATPRODUCT_ABC:
2183     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2184     break;
2185   default:
2186     break;
2187   }
2188   PetscFunctionReturn(0);
2189 }
2190