xref: /petsc/src/mat/impls/aij/seq/matmatmult.c (revision 57508ece14a6b1339c0bbf016ecd72f673a062b0)
1d50806bdSBarry Smith /*
22499ec78SHong Zhang   Defines matrix-matrix product routines for pairs of SeqAIJ matrices
3d50806bdSBarry Smith           C = A * B
4d50806bdSBarry Smith */
5d50806bdSBarry Smith 
6c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
8c6db04a5SJed Brown #include <petscbt.h>
9af0996ceSBarry Smith #include <petsc/private/isimpl.h>
1007475bc1SBarry Smith #include <../src/mat/impls/dense/seq/dense.h>
117bab7c10SHong Zhang 
12d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
13d71ae5a4SJacob Faibussowitsch {
14df97dc6dSFande Kong   PetscFunctionBegin;
15dbbe0bcdSBarry Smith   if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C));
16dbbe0bcdSBarry Smith   else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C));
173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
18df97dc6dSFande Kong }
19df97dc6dSFande Kong 
204222ddf1SHong Zhang /* Modified from MatCreateSeqAIJWithArrays() */
21d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat)
22d71ae5a4SJacob Faibussowitsch {
234222ddf1SHong Zhang   PetscInt    ii;
244222ddf1SHong Zhang   Mat_SeqAIJ *aij;
259f0612e4SBarry Smith   PetscBool   isseqaij, ofree_a, ofree_ij;
265c66b693SKris Buschelman 
275c66b693SKris Buschelman   PetscFunctionBegin;
28aed4548fSBarry Smith   PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
299566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(mat, m, n, m, n));
304222ddf1SHong Zhang 
31e4e71118SStefano Zampini   if (!mtype) {
329566063dSJacob Faibussowitsch     PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij));
339566063dSJacob Faibussowitsch     if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ));
34e4e71118SStefano Zampini   } else {
359566063dSJacob Faibussowitsch     PetscCall(MatSetType(mat, mtype));
36e4e71118SStefano Zampini   }
37cbc6b225SStefano Zampini 
38*57508eceSPierre Jolivet   aij      = (Mat_SeqAIJ *)mat->data;
39cbc6b225SStefano Zampini   ofree_a  = aij->free_a;
40cbc6b225SStefano Zampini   ofree_ij = aij->free_ij;
41cbc6b225SStefano Zampini   /* changes the free flags */
429566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL));
43cbc6b225SStefano Zampini 
449566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->ilen));
459566063dSJacob Faibussowitsch   PetscCall(PetscFree(aij->imax));
469566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aij->imax));
479566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aij->ilen));
48cbc6b225SStefano Zampini   for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) {
49cbc6b225SStefano Zampini     const PetscInt rnz = i[ii + 1] - i[ii];
50cbc6b225SStefano Zampini     aij->nonzerorowcnt += !!rnz;
51cbc6b225SStefano Zampini     aij->rmax     = PetscMax(aij->rmax, rnz);
52cbc6b225SStefano Zampini     aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii];
53cbc6b225SStefano Zampini   }
54cbc6b225SStefano Zampini   aij->maxnz = i[m];
55cbc6b225SStefano Zampini   aij->nz    = i[m];
564222ddf1SHong Zhang 
579f0612e4SBarry Smith   if (ofree_a) PetscCall(PetscShmgetDeallocateArray((void **)&aij->a));
589f0612e4SBarry Smith   if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->j));
599f0612e4SBarry Smith   if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->i));
609f0612e4SBarry Smith 
614222ddf1SHong Zhang   aij->i       = i;
624222ddf1SHong Zhang   aij->j       = j;
634222ddf1SHong Zhang   aij->a       = a;
644222ddf1SHong Zhang   aij->nonew   = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
654222ddf1SHong Zhang   aij->free_a  = PETSC_FALSE;
664222ddf1SHong Zhang   aij->free_ij = PETSC_FALSE;
679566063dSJacob Faibussowitsch   PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6));
686dc08606SJunchao Zhang   // Always build the diag info when i, j are set
696dc08606SJunchao Zhang   PetscCall(MatMarkDiagonal_SeqAIJ(mat));
703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
715c66b693SKris Buschelman }
721c24bd37SHong Zhang 
73d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
74d71ae5a4SJacob Faibussowitsch {
754222ddf1SHong Zhang   Mat_Product        *product = C->product;
764222ddf1SHong Zhang   MatProductAlgorithm alg;
774222ddf1SHong Zhang   PetscBool           flg;
784222ddf1SHong Zhang 
794222ddf1SHong Zhang   PetscFunctionBegin;
804222ddf1SHong Zhang   if (product) {
814222ddf1SHong Zhang     alg = product->alg;
824222ddf1SHong Zhang   } else {
834222ddf1SHong Zhang     alg = "sorted";
844222ddf1SHong Zhang   }
854222ddf1SHong Zhang   /* sorted */
869566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "sorted", &flg));
874222ddf1SHong Zhang   if (flg) {
889566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C));
893ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
904222ddf1SHong Zhang   }
914222ddf1SHong Zhang 
924222ddf1SHong Zhang   /* scalable */
939566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable", &flg));
944222ddf1SHong Zhang   if (flg) {
959566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C));
963ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
974222ddf1SHong Zhang   }
984222ddf1SHong Zhang 
994222ddf1SHong Zhang   /* scalable_fast */
1009566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "scalable_fast", &flg));
1014222ddf1SHong Zhang   if (flg) {
1029566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C));
1033ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1044222ddf1SHong Zhang   }
1054222ddf1SHong Zhang 
1064222ddf1SHong Zhang   /* heap */
1079566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "heap", &flg));
1084222ddf1SHong Zhang   if (flg) {
1099566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C));
1103ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1114222ddf1SHong Zhang   }
1124222ddf1SHong Zhang 
1134222ddf1SHong Zhang   /* btheap */
1149566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "btheap", &flg));
1154222ddf1SHong Zhang   if (flg) {
1169566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C));
1173ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1184222ddf1SHong Zhang   }
1194222ddf1SHong Zhang 
1204222ddf1SHong Zhang   /* llcondensed */
1219566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "llcondensed", &flg));
1224222ddf1SHong Zhang   if (flg) {
1239566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C));
1243ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1254222ddf1SHong Zhang   }
1264222ddf1SHong Zhang 
1274222ddf1SHong Zhang   /* rowmerge */
1289566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "rowmerge", &flg));
1294222ddf1SHong Zhang   if (flg) {
1309566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C));
1313ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1324222ddf1SHong Zhang   }
1334222ddf1SHong Zhang 
1344222ddf1SHong Zhang #if defined(PETSC_HAVE_HYPRE)
1359566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(alg, "hypre", &flg));
1364222ddf1SHong Zhang   if (flg) {
1379566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C));
1383ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1394222ddf1SHong Zhang   }
1404222ddf1SHong Zhang #endif
1414222ddf1SHong Zhang 
1424222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1434222ddf1SHong Zhang }
1444222ddf1SHong Zhang 
145d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C)
146d71ae5a4SJacob Faibussowitsch {
147b561aa9dSHong Zhang   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
148971236abSHong Zhang   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
149c1ba5575SJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
150b561aa9dSHong Zhang   PetscReal          afill;
151eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
152eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
153fb908031SHong Zhang   PetscBT            lnkbt;
1540298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
155b561aa9dSHong Zhang 
156b561aa9dSHong Zhang   PetscFunctionBegin;
157bd958071SHong Zhang   /* Get ci and cj */
158fb908031SHong Zhang   /* Allocate ci array, arrays for fill computation and */
159fb908031SHong Zhang   /* free space for accumulating nonzero column info */
1609566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
161fb908031SHong Zhang   ci[0] = 0;
162fb908031SHong Zhang 
163fb908031SHong Zhang   /* create and initialize a linked list */
164eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
165c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
166eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
167eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
168eca6b86aSHong Zhang 
1699566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt));
170fb908031SHong Zhang 
171fb908031SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1729566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
1732205254eSKarl Rupp 
174fb908031SHong Zhang   current_space = free_space;
175fb908031SHong Zhang 
176fb908031SHong Zhang   /* Determine ci and cj */
177fb908031SHong Zhang   for (i = 0; i < am; i++) {
178fb908031SHong Zhang     anzi = ai[i + 1] - ai[i];
179fb908031SHong Zhang     aj   = a->j + ai[i];
180fb908031SHong Zhang     for (j = 0; j < anzi; j++) {
181fb908031SHong Zhang       brow = aj[j];
182fb908031SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
183fb908031SHong Zhang       bj   = b->j + bi[brow];
184fb908031SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
1859566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt));
186fb908031SHong Zhang     }
1878c78258cSHong Zhang     /* add possible missing diagonal entry */
18848a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt));
189fb908031SHong Zhang     cnzi = lnk[0];
190fb908031SHong Zhang 
191fb908031SHong Zhang     /* If free space is not available, make more free space */
192fb908031SHong Zhang     /* Double the amount of total space in the list */
193fb908031SHong Zhang     if (current_space->local_remaining < cnzi) {
1949566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
195fb908031SHong Zhang       ndouble++;
196fb908031SHong Zhang     }
197fb908031SHong Zhang 
198fb908031SHong Zhang     /* Copy data into free space, then initialize lnk */
1999566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt));
2002205254eSKarl Rupp 
201fb908031SHong Zhang     current_space->array += cnzi;
202fb908031SHong Zhang     current_space->local_used += cnzi;
203fb908031SHong Zhang     current_space->local_remaining -= cnzi;
2042205254eSKarl Rupp 
205fb908031SHong Zhang     ci[i + 1] = ci[i] + cnzi;
206fb908031SHong Zhang   }
207fb908031SHong Zhang 
208fb908031SHong Zhang   /* Column indices are in the list of free space */
209fb908031SHong Zhang   /* Allocate space for cj, initialize cj, and */
210fb908031SHong Zhang   /* destroy list of free space and other temporary array(s) */
2119566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
2129566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
2139566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy(lnk, lnkbt));
214b561aa9dSHong Zhang 
21526be0446SHong Zhang   /* put together the new symbolic matrix */
2169566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
2179566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
21858c24d83SHong Zhang 
21958c24d83SHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
22058c24d83SHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
221f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
222aa1aec99SHong Zhang   c->free_a  = PETSC_FALSE;
223e6b907acSBarry Smith   c->free_ij = PETSC_TRUE;
22458c24d83SHong Zhang   c->nonew   = 0;
2254222ddf1SHong Zhang 
2264222ddf1SHong Zhang   /* fast, needs non-scalable O(bn) array 'abdense' */
2274222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
2280b7e3e3dSHong Zhang 
2297212da7cSHong Zhang   /* set MatInfo */
2307212da7cSHong Zhang   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
231f2b054eeSHong Zhang   if (afill < 1.0) afill = 1.0;
2324222ddf1SHong Zhang   C->info.mallocs           = ndouble;
2334222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
2344222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
2357212da7cSHong Zhang 
2367212da7cSHong Zhang #if defined(PETSC_USE_INFO)
2377212da7cSHong Zhang   if (ci[am]) {
2389566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
2399566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
240f2b054eeSHong Zhang   } else {
2419566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
242be0fcf8dSHong Zhang   }
243f2b054eeSHong Zhang #endif
2443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
24558c24d83SHong Zhang }
246d50806bdSBarry Smith 
247d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C)
248d71ae5a4SJacob Faibussowitsch {
249d13dce4bSSatish Balay   PetscLogDouble     flops = 0.0;
250d50806bdSBarry Smith   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
251d50806bdSBarry Smith   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
252d50806bdSBarry Smith   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
25338baddfdSBarry Smith   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
254c58ca2e3SHong Zhang   PetscInt           am = A->rmap->n, cm = C->rmap->n;
255519eb980SHong Zhang   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
2562e5835c6SStefano Zampini   PetscScalar       *ca, valtmp;
257aa1aec99SHong Zhang   PetscScalar       *ab_dense;
2586718818eSStefano Zampini   PetscContainer     cab_dense;
2592e5835c6SStefano Zampini   const PetscScalar *aa, *ba, *baj;
260d50806bdSBarry Smith 
261d50806bdSBarry Smith   PetscFunctionBegin;
2629566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
2639566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
264aa1aec99SHong Zhang   if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
2659566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
266aa1aec99SHong Zhang     c->a      = ca;
267aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
2686718818eSStefano Zampini   } else ca = c->a;
2696718818eSStefano Zampini 
2706718818eSStefano Zampini   /* TODO this should be done in the symbolic phase */
2716718818eSStefano Zampini   /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
2726718818eSStefano Zampini      that is hard to eradicate) */
2739566063dSJacob Faibussowitsch   PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense));
2746718818eSStefano Zampini   if (!cab_dense) {
2759566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(B->cmap->N, &ab_dense));
2769566063dSJacob Faibussowitsch     PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense));
2779566063dSJacob Faibussowitsch     PetscCall(PetscContainerSetPointer(cab_dense, ab_dense));
2789566063dSJacob Faibussowitsch     PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault));
2799566063dSJacob Faibussowitsch     PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense));
2809566063dSJacob Faibussowitsch     PetscCall(PetscObjectDereference((PetscObject)cab_dense));
281d90d86d0SMatthew G. Knepley   }
2829566063dSJacob Faibussowitsch   PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense));
2839566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ab_dense, B->cmap->N));
284aa1aec99SHong Zhang 
285c124e916SHong Zhang   /* clean old values in C */
2869566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca, ci[cm]));
287d50806bdSBarry Smith   /* Traverse A row-wise. */
288d50806bdSBarry Smith   /* Build the ith row in C by summing over nonzero columns in A, */
289d50806bdSBarry Smith   /* the rows of B corresponding to nonzeros of A. */
290d50806bdSBarry Smith   for (i = 0; i < am; i++) {
291d50806bdSBarry Smith     anzi = ai[i + 1] - ai[i];
292d50806bdSBarry Smith     for (j = 0; j < anzi; j++) {
293519eb980SHong Zhang       brow = aj[j];
294d50806bdSBarry Smith       bnzi = bi[brow + 1] - bi[brow];
2953b51c7ffSStefano Zampini       bjj  = PetscSafePointerPlusOffset(bj, bi[brow]);
2963b51c7ffSStefano Zampini       baj  = PetscSafePointerPlusOffset(ba, bi[brow]);
297519eb980SHong Zhang       /* perform dense axpy */
29836ec6d2dSHong Zhang       valtmp = aa[j];
299ad540459SPierre Jolivet       for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k];
300519eb980SHong Zhang       flops += 2 * bnzi;
301519eb980SHong Zhang     }
3028e3a54c0SPierre Jolivet     aj = PetscSafePointerPlusOffset(aj, anzi);
3038e3a54c0SPierre Jolivet     aa = PetscSafePointerPlusOffset(aa, anzi);
304c58ca2e3SHong Zhang 
305c58ca2e3SHong Zhang     cnzi = ci[i + 1] - ci[i];
306519eb980SHong Zhang     for (k = 0; k < cnzi; k++) {
307519eb980SHong Zhang       ca[k] += ab_dense[cj[k]];
308519eb980SHong Zhang       ab_dense[cj[k]] = 0.0; /* zero ab_dense */
309519eb980SHong Zhang     }
310519eb980SHong Zhang     flops += cnzi;
3118e3a54c0SPierre Jolivet     cj = PetscSafePointerPlusOffset(cj, cnzi);
3129371c9d4SSatish Balay     ca += cnzi;
313519eb980SHong Zhang   }
3142e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3152e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3162e5835c6SStefano Zampini #endif
3179566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3189566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3199566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3209566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
3219566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
3223ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
323c58ca2e3SHong Zhang }
324c58ca2e3SHong Zhang 
325d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C)
326d71ae5a4SJacob Faibussowitsch {
327c58ca2e3SHong Zhang   PetscLogDouble     flops = 0.0;
328c58ca2e3SHong Zhang   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
329c58ca2e3SHong Zhang   Mat_SeqAIJ        *b     = (Mat_SeqAIJ *)B->data;
330c58ca2e3SHong Zhang   Mat_SeqAIJ        *c     = (Mat_SeqAIJ *)C->data;
331c58ca2e3SHong Zhang   PetscInt          *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
332c58ca2e3SHong Zhang   PetscInt           am = A->rmap->N, cm = C->rmap->N;
333c58ca2e3SHong Zhang   PetscInt           i, j, k, anzi, bnzi, cnzi, brow;
3342e5835c6SStefano Zampini   PetscScalar       *ca = c->a, valtmp;
3352e5835c6SStefano Zampini   const PetscScalar *aa, *ba, *baj;
336c58ca2e3SHong Zhang   PetscInt           nextb;
337c58ca2e3SHong Zhang 
338c58ca2e3SHong Zhang   PetscFunctionBegin;
3399566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &aa));
3409566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(B, &ba));
341cf742fccSHong Zhang   if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
3429566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
343cf742fccSHong Zhang     c->a      = ca;
344cf742fccSHong Zhang     c->free_a = PETSC_TRUE;
345cf742fccSHong Zhang   }
346cf742fccSHong Zhang 
347c58ca2e3SHong Zhang   /* clean old values in C */
3489566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(ca, ci[cm]));
349c58ca2e3SHong Zhang   /* Traverse A row-wise. */
350c58ca2e3SHong Zhang   /* Build the ith row in C by summing over nonzero columns in A, */
351c58ca2e3SHong Zhang   /* the rows of B corresponding to nonzeros of A. */
352519eb980SHong Zhang   for (i = 0; i < am; i++) {
353519eb980SHong Zhang     anzi = ai[i + 1] - ai[i];
354519eb980SHong Zhang     cnzi = ci[i + 1] - ci[i];
355519eb980SHong Zhang     for (j = 0; j < anzi; j++) {
356519eb980SHong Zhang       brow = aj[j];
357519eb980SHong Zhang       bnzi = bi[brow + 1] - bi[brow];
358519eb980SHong Zhang       bjj  = bj + bi[brow];
359519eb980SHong Zhang       baj  = ba + bi[brow];
360519eb980SHong Zhang       /* perform sparse axpy */
36136ec6d2dSHong Zhang       valtmp = aa[j];
362c124e916SHong Zhang       nextb  = 0;
363c124e916SHong Zhang       for (k = 0; nextb < bnzi; k++) {
364c124e916SHong Zhang         if (cj[k] == bjj[nextb]) { /* ccol == bcol */
36536ec6d2dSHong Zhang           ca[k] += valtmp * baj[nextb++];
366c124e916SHong Zhang         }
367d50806bdSBarry Smith       }
368d50806bdSBarry Smith       flops += 2 * bnzi;
369d50806bdSBarry Smith     }
3709371c9d4SSatish Balay     aj += anzi;
3719371c9d4SSatish Balay     aa += anzi;
3729371c9d4SSatish Balay     cj += cnzi;
3739371c9d4SSatish Balay     ca += cnzi;
374d50806bdSBarry Smith   }
3752e5835c6SStefano Zampini #if defined(PETSC_HAVE_DEVICE)
3762e5835c6SStefano Zampini   if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
3772e5835c6SStefano Zampini #endif
3789566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3799566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3809566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
3819566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
3829566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
3833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
384d50806bdSBarry Smith }
385bc011b1eSHong Zhang 
386d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C)
387d71ae5a4SJacob Faibussowitsch {
38825296bd5SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
38925296bd5SBarry Smith   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
3903c50cad2SHong Zhang   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
39125296bd5SBarry Smith   MatScalar         *ca;
39225296bd5SBarry Smith   PetscReal          afill;
393eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
394eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
3950298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
39625296bd5SBarry Smith 
39725296bd5SBarry Smith   PetscFunctionBegin;
3983c50cad2SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
3993c50cad2SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
4009566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
4013c50cad2SHong Zhang   ci[0] = 0;
4023c50cad2SHong Zhang 
4033c50cad2SHong Zhang   /* create and initialize a linked list */
404eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
405c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
406eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
407eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
408eca6b86aSHong Zhang 
4099566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk));
4103c50cad2SHong Zhang 
4113c50cad2SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
4129566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
4133c50cad2SHong Zhang   current_space = free_space;
4143c50cad2SHong Zhang 
4153c50cad2SHong Zhang   /* Determine ci and cj */
4163c50cad2SHong Zhang   for (i = 0; i < am; i++) {
4173c50cad2SHong Zhang     anzi = ai[i + 1] - ai[i];
4183c50cad2SHong Zhang     aj   = a->j + ai[i];
4193c50cad2SHong Zhang     for (j = 0; j < anzi; j++) {
4203c50cad2SHong Zhang       brow = aj[j];
4213c50cad2SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
4223c50cad2SHong Zhang       bj   = b->j + bi[brow];
4233c50cad2SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
4249566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk));
4253c50cad2SHong Zhang     }
4268c78258cSHong Zhang     /* add possible missing diagonal entry */
42748a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk));
4283c50cad2SHong Zhang     cnzi = lnk[1];
4293c50cad2SHong Zhang 
4303c50cad2SHong Zhang     /* If free space is not available, make more free space */
4313c50cad2SHong Zhang     /* Double the amount of total space in the list */
4323c50cad2SHong Zhang     if (current_space->local_remaining < cnzi) {
4339566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
4343c50cad2SHong Zhang       ndouble++;
4353c50cad2SHong Zhang     }
4363c50cad2SHong Zhang 
4373c50cad2SHong Zhang     /* Copy data into free space, then initialize lnk */
4389566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk));
4392205254eSKarl Rupp 
4403c50cad2SHong Zhang     current_space->array += cnzi;
4413c50cad2SHong Zhang     current_space->local_used += cnzi;
4423c50cad2SHong Zhang     current_space->local_remaining -= cnzi;
4432205254eSKarl Rupp 
4443c50cad2SHong Zhang     ci[i + 1] = ci[i] + cnzi;
4453c50cad2SHong Zhang   }
4463c50cad2SHong Zhang 
4473c50cad2SHong Zhang   /* Column indices are in the list of free space */
4483c50cad2SHong Zhang   /* Allocate space for cj, initialize cj, and */
4493c50cad2SHong Zhang   /* destroy list of free space and other temporary array(s) */
4509566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
4519566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
4529566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_fast(lnk));
45325296bd5SBarry Smith 
45425296bd5SBarry Smith   /* Allocate space for ca */
4559566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
45625296bd5SBarry Smith 
45725296bd5SBarry Smith   /* put together the new symbolic matrix */
4589566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
4599566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
46025296bd5SBarry Smith 
46125296bd5SBarry Smith   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
46225296bd5SBarry Smith   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
463f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
46425296bd5SBarry Smith   c->free_a  = PETSC_TRUE;
46525296bd5SBarry Smith   c->free_ij = PETSC_TRUE;
46625296bd5SBarry Smith   c->nonew   = 0;
4672205254eSKarl Rupp 
4684222ddf1SHong Zhang   /* slower, less memory */
4694222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
47025296bd5SBarry Smith 
47125296bd5SBarry Smith   /* set MatInfo */
47225296bd5SBarry Smith   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
47325296bd5SBarry Smith   if (afill < 1.0) afill = 1.0;
4744222ddf1SHong Zhang   C->info.mallocs           = ndouble;
4754222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
4764222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
47725296bd5SBarry Smith 
47825296bd5SBarry Smith #if defined(PETSC_USE_INFO)
47925296bd5SBarry Smith   if (ci[am]) {
4809566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
4819566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
48225296bd5SBarry Smith   } else {
4839566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
48425296bd5SBarry Smith   }
48525296bd5SBarry Smith #endif
4863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
48725296bd5SBarry Smith }
48825296bd5SBarry Smith 
489d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C)
490d71ae5a4SJacob Faibussowitsch {
491e9e4536cSHong Zhang   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
492bf9555e6SHong Zhang   PetscInt          *ai = a->i, *bi = b->i, *ci, *cj;
49325c41797SHong Zhang   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
494e9e4536cSHong Zhang   MatScalar         *ca;
495e9e4536cSHong Zhang   PetscReal          afill;
496eca6b86aSHong Zhang   PetscInt           i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
497eec179cfSJacob Faibussowitsch   PetscHMapI         ta;
4980298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
499e9e4536cSHong Zhang 
500e9e4536cSHong Zhang   PetscFunctionBegin;
501bd958071SHong Zhang   /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
502bd958071SHong Zhang   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
5039566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
504bd958071SHong Zhang   ci[0] = 0;
505bd958071SHong Zhang 
506bd958071SHong Zhang   /* create and initialize a linked list */
507eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapICreateWithSize(bn, &ta));
508c373ccc6SHong Zhang   MatRowMergeMax_SeqAIJ(b, bm, ta);
509eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIGetSize(ta, &Crmax));
510eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&ta));
5119566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
512bd958071SHong Zhang 
513bd958071SHong Zhang   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
5149566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
515bd958071SHong Zhang   current_space = free_space;
516bd958071SHong Zhang 
517bd958071SHong Zhang   /* Determine ci and cj */
518bd958071SHong Zhang   for (i = 0; i < am; i++) {
519bd958071SHong Zhang     anzi = ai[i + 1] - ai[i];
520bd958071SHong Zhang     aj   = a->j + ai[i];
521bd958071SHong Zhang     for (j = 0; j < anzi; j++) {
522bd958071SHong Zhang       brow = aj[j];
523bd958071SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
524bd958071SHong Zhang       bj   = b->j + bi[brow];
525bd958071SHong Zhang       /* add non-zero cols of B into the sorted linked list lnk */
5269566063dSJacob Faibussowitsch       PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk));
527bd958071SHong Zhang     }
5288c78258cSHong Zhang     /* add possible missing diagonal entry */
52948a46eb9SPierre Jolivet     if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk));
5308c78258cSHong Zhang 
531bd958071SHong Zhang     cnzi = lnk[0];
532bd958071SHong Zhang 
533bd958071SHong Zhang     /* If free space is not available, make more free space */
534bd958071SHong Zhang     /* Double the amount of total space in the list */
535bd958071SHong Zhang     if (current_space->local_remaining < cnzi) {
5369566063dSJacob Faibussowitsch       PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), &current_space));
537bd958071SHong Zhang       ndouble++;
538bd958071SHong Zhang     }
539bd958071SHong Zhang 
540bd958071SHong Zhang     /* Copy data into free space, then initialize lnk */
5419566063dSJacob Faibussowitsch     PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk));
5422205254eSKarl Rupp 
543bd958071SHong Zhang     current_space->array += cnzi;
544bd958071SHong Zhang     current_space->local_used += cnzi;
545bd958071SHong Zhang     current_space->local_remaining -= cnzi;
5462205254eSKarl Rupp 
547bd958071SHong Zhang     ci[i + 1] = ci[i] + cnzi;
548bd958071SHong Zhang   }
549bd958071SHong Zhang 
550bd958071SHong Zhang   /* Column indices are in the list of free space */
551bd958071SHong Zhang   /* Allocate space for cj, initialize cj, and */
552bd958071SHong Zhang   /* destroy list of free space and other temporary array(s) */
5539566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am] + 1, &cj));
5549566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
5559566063dSJacob Faibussowitsch   PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
556e9e4536cSHong Zhang 
557e9e4536cSHong Zhang   /* Allocate space for ca */
5589566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(ci[am] + 1, &ca));
559e9e4536cSHong Zhang 
560e9e4536cSHong Zhang   /* put together the new symbolic matrix */
5619566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
5629566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
563e9e4536cSHong Zhang 
564e9e4536cSHong Zhang   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
565e9e4536cSHong Zhang   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
566f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
567e9e4536cSHong Zhang   c->free_a  = PETSC_TRUE;
568e9e4536cSHong Zhang   c->free_ij = PETSC_TRUE;
569e9e4536cSHong Zhang   c->nonew   = 0;
5702205254eSKarl Rupp 
5714222ddf1SHong Zhang   /* slower, less memory */
5724222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
573e9e4536cSHong Zhang 
574e9e4536cSHong Zhang   /* set MatInfo */
575e9e4536cSHong Zhang   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
576e9e4536cSHong Zhang   if (afill < 1.0) afill = 1.0;
5774222ddf1SHong Zhang   C->info.mallocs           = ndouble;
5784222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
5794222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
580e9e4536cSHong Zhang 
581e9e4536cSHong Zhang #if defined(PETSC_USE_INFO)
582e9e4536cSHong Zhang   if (ci[am]) {
5839566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
5849566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
585e9e4536cSHong Zhang   } else {
5869566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
587e9e4536cSHong Zhang   }
588e9e4536cSHong Zhang #endif
5893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
590e9e4536cSHong Zhang }
591e9e4536cSHong Zhang 
592d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C)
593d71ae5a4SJacob Faibussowitsch {
5940ced3a2bSJed Brown   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
5950ced3a2bSJed Brown   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
5960ced3a2bSJed Brown   PetscInt          *ci, *cj, *bb;
5970ced3a2bSJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
5980ced3a2bSJed Brown   PetscReal          afill;
5990ced3a2bSJed Brown   PetscInt           i, j, col, ndouble = 0;
6000298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
6010ced3a2bSJed Brown   PetscHeap          h;
6020ced3a2bSJed Brown 
6030ced3a2bSJed Brown   PetscFunctionBegin;
604cd093f1dSJed Brown   /* Get ci and cj - by merging sorted rows using a heap */
6050ced3a2bSJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
6069566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
6070ced3a2bSJed Brown   ci[0] = 0;
6080ced3a2bSJed Brown 
6090ced3a2bSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
6109566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
6110ced3a2bSJed Brown   current_space = free_space;
6120ced3a2bSJed Brown 
6139566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax, &h));
6149566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax, &bb));
6150ced3a2bSJed Brown 
6160ced3a2bSJed Brown   /* Determine ci and cj */
6170ced3a2bSJed Brown   for (i = 0; i < am; i++) {
6180ced3a2bSJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
6190ced3a2bSJed Brown     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
6200ced3a2bSJed Brown     ci[i + 1]            = ci[i];
6210ced3a2bSJed Brown     /* Populate the min heap */
6220ced3a2bSJed Brown     for (j = 0; j < anzi; j++) {
6230ced3a2bSJed Brown       bb[j] = bi[acol[j]];           /* bb points at the start of the row */
6240ced3a2bSJed Brown       if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */
6259566063dSJacob Faibussowitsch         PetscCall(PetscHeapAdd(h, j, bj[bb[j]++]));
6260ced3a2bSJed Brown       }
6270ced3a2bSJed Brown     }
6280ced3a2bSJed Brown     /* Pick off the min element, adding it to free space */
6299566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h, &j, &col));
6300ced3a2bSJed Brown     while (j >= 0) {
6310ced3a2bSJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
6329566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
6330ced3a2bSJed Brown         ndouble++;
6340ced3a2bSJed Brown       }
6350ced3a2bSJed Brown       *(current_space->array++) = col;
6360ced3a2bSJed Brown       current_space->local_used++;
6370ced3a2bSJed Brown       current_space->local_remaining--;
6380ced3a2bSJed Brown       ci[i + 1]++;
6390ced3a2bSJed Brown 
6400ced3a2bSJed Brown       /* stash if anything else remains in this row of B */
6419566063dSJacob Faibussowitsch       if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++]));
6420ced3a2bSJed Brown       while (1) { /* pop and stash any other rows of B that also had an entry in this column */
6430ced3a2bSJed Brown         PetscInt j2, col2;
6449566063dSJacob Faibussowitsch         PetscCall(PetscHeapPeek(h, &j2, &col2));
6450ced3a2bSJed Brown         if (col2 != col) break;
6469566063dSJacob Faibussowitsch         PetscCall(PetscHeapPop(h, &j2, &col2));
6479566063dSJacob Faibussowitsch         if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++]));
6480ced3a2bSJed Brown       }
6490ced3a2bSJed Brown       /* Put any stashed elements back into the min heap */
6509566063dSJacob Faibussowitsch       PetscCall(PetscHeapUnstash(h));
6519566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h, &j, &col));
6520ced3a2bSJed Brown     }
6530ced3a2bSJed Brown   }
6549566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
6559566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
6560ced3a2bSJed Brown 
6570ced3a2bSJed Brown   /* Column indices are in the list of free space */
6580ced3a2bSJed Brown   /* Allocate space for cj, initialize cj, and */
6590ced3a2bSJed Brown   /* destroy list of free space and other temporary array(s) */
6609566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am], &cj));
6619566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
6620ced3a2bSJed Brown 
6630ced3a2bSJed Brown   /* put together the new symbolic matrix */
6649566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
6659566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6660ced3a2bSJed Brown 
6670ced3a2bSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6680ced3a2bSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
669f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
6700ced3a2bSJed Brown   c->free_a  = PETSC_TRUE;
6710ced3a2bSJed Brown   c->free_ij = PETSC_TRUE;
6720ced3a2bSJed Brown   c->nonew   = 0;
67326fbe8dcSKarl Rupp 
6744222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
6750ced3a2bSJed Brown 
6760ced3a2bSJed Brown   /* set MatInfo */
6770ced3a2bSJed Brown   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
6780ced3a2bSJed Brown   if (afill < 1.0) afill = 1.0;
6794222ddf1SHong Zhang   C->info.mallocs           = ndouble;
6804222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
6814222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
6820ced3a2bSJed Brown 
6830ced3a2bSJed Brown #if defined(PETSC_USE_INFO)
6840ced3a2bSJed Brown   if (ci[am]) {
6859566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
6869566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
6870ced3a2bSJed Brown   } else {
6889566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
6890ced3a2bSJed Brown   }
6900ced3a2bSJed Brown #endif
6913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6920ced3a2bSJed Brown }
693e9e4536cSHong Zhang 
694d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C)
695d71ae5a4SJacob Faibussowitsch {
6968a07c6f1SJed Brown   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
6978a07c6f1SJed Brown   const PetscInt    *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
6988a07c6f1SJed Brown   PetscInt          *ci, *cj, *bb;
6998a07c6f1SJed Brown   PetscInt           am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
7008a07c6f1SJed Brown   PetscReal          afill;
7018a07c6f1SJed Brown   PetscInt           i, j, col, ndouble = 0;
7020298fd71SBarry Smith   PetscFreeSpaceList free_space = NULL, current_space = NULL;
7038a07c6f1SJed Brown   PetscHeap          h;
7048a07c6f1SJed Brown   PetscBT            bt;
7058a07c6f1SJed Brown 
7068a07c6f1SJed Brown   PetscFunctionBegin;
707cd093f1dSJed Brown   /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
7088a07c6f1SJed Brown   /* Allocate arrays for fill computation and free space for accumulating nonzero column */
7099566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 2, &ci));
7108a07c6f1SJed Brown   ci[0] = 0;
7118a07c6f1SJed Brown 
7128a07c6f1SJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
7139566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
7142205254eSKarl Rupp 
7158a07c6f1SJed Brown   current_space = free_space;
7168a07c6f1SJed Brown 
7179566063dSJacob Faibussowitsch   PetscCall(PetscHeapCreate(a->rmax, &h));
7189566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a->rmax, &bb));
7199566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(bn, &bt));
7208a07c6f1SJed Brown 
7218a07c6f1SJed Brown   /* Determine ci and cj */
7228a07c6f1SJed Brown   for (i = 0; i < am; i++) {
7238a07c6f1SJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i];    /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
7248a07c6f1SJed Brown     const PetscInt *acol = aj + ai[i];           /* column indices of nonzero entries in this row */
7258a07c6f1SJed Brown     const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
7268a07c6f1SJed Brown     ci[i + 1]            = ci[i];
7278a07c6f1SJed Brown     /* Populate the min heap */
7288a07c6f1SJed Brown     for (j = 0; j < anzi; j++) {
7298a07c6f1SJed Brown       PetscInt brow = acol[j];
7308a07c6f1SJed Brown       for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) {
7318a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7328a07c6f1SJed Brown         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
7339566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h, j, bcol));
7348a07c6f1SJed Brown           bb[j]++;
7358a07c6f1SJed Brown           break;
7368a07c6f1SJed Brown         }
7378a07c6f1SJed Brown       }
7388a07c6f1SJed Brown     }
7398a07c6f1SJed Brown     /* Pick off the min element, adding it to free space */
7409566063dSJacob Faibussowitsch     PetscCall(PetscHeapPop(h, &j, &col));
7418a07c6f1SJed Brown     while (j >= 0) {
7428a07c6f1SJed Brown       if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
7430298fd71SBarry Smith         fptr = NULL;                            /* need PetscBTMemzero */
7449566063dSJacob Faibussowitsch         PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), &current_space));
7458a07c6f1SJed Brown         ndouble++;
7468a07c6f1SJed Brown       }
7478a07c6f1SJed Brown       *(current_space->array++) = col;
7488a07c6f1SJed Brown       current_space->local_used++;
7498a07c6f1SJed Brown       current_space->local_remaining--;
7508a07c6f1SJed Brown       ci[i + 1]++;
7518a07c6f1SJed Brown 
7528a07c6f1SJed Brown       /* stash if anything else remains in this row of B */
7538a07c6f1SJed Brown       for (; bb[j] < bi[acol[j] + 1]; bb[j]++) {
7548a07c6f1SJed Brown         PetscInt bcol = bj[bb[j]];
7558a07c6f1SJed Brown         if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
7569566063dSJacob Faibussowitsch           PetscCall(PetscHeapAdd(h, j, bcol));
7578a07c6f1SJed Brown           bb[j]++;
7588a07c6f1SJed Brown           break;
7598a07c6f1SJed Brown         }
7608a07c6f1SJed Brown       }
7619566063dSJacob Faibussowitsch       PetscCall(PetscHeapPop(h, &j, &col));
7628a07c6f1SJed Brown     }
7638a07c6f1SJed Brown     if (fptr) { /* Clear the bits for this row */
7649566063dSJacob Faibussowitsch       for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr));
7658a07c6f1SJed Brown     } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
7669566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(bn, bt));
7678a07c6f1SJed Brown     }
7688a07c6f1SJed Brown   }
7699566063dSJacob Faibussowitsch   PetscCall(PetscFree(bb));
7709566063dSJacob Faibussowitsch   PetscCall(PetscHeapDestroy(&h));
7719566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&bt));
7728a07c6f1SJed Brown 
7738a07c6f1SJed Brown   /* Column indices are in the list of free space */
7748a07c6f1SJed Brown   /* Allocate space for cj, initialize cj, and */
7758a07c6f1SJed Brown   /* destroy list of free space and other temporary array(s) */
7769566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(ci[am], &cj));
7779566063dSJacob Faibussowitsch   PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
7788a07c6f1SJed Brown 
7798a07c6f1SJed Brown   /* put together the new symbolic matrix */
7809566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
7819566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
7828a07c6f1SJed Brown 
7838a07c6f1SJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
7848a07c6f1SJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
785f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
7868a07c6f1SJed Brown   c->free_a  = PETSC_TRUE;
7878a07c6f1SJed Brown   c->free_ij = PETSC_TRUE;
7888a07c6f1SJed Brown   c->nonew   = 0;
78926fbe8dcSKarl Rupp 
7904222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
7918a07c6f1SJed Brown 
7928a07c6f1SJed Brown   /* set MatInfo */
7938a07c6f1SJed Brown   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
7948a07c6f1SJed Brown   if (afill < 1.0) afill = 1.0;
7954222ddf1SHong Zhang   C->info.mallocs           = ndouble;
7964222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
7974222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
7988a07c6f1SJed Brown 
7998a07c6f1SJed Brown #if defined(PETSC_USE_INFO)
8008a07c6f1SJed Brown   if (ci[am]) {
8019566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
8029566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
8038a07c6f1SJed Brown   } else {
8049566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
8058a07c6f1SJed Brown   }
8068a07c6f1SJed Brown #endif
8073ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
8088a07c6f1SJed Brown }
8098a07c6f1SJed Brown 
810d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C)
811d71ae5a4SJacob Faibussowitsch {
812d7ed1a4dSandi selinger   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
813d7ed1a4dSandi selinger   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1;
814d7ed1a4dSandi selinger   PetscInt       *ci, *cj, *outputj, worki_L1[9], worki_L2[9];
815d7ed1a4dSandi selinger   PetscInt        c_maxmem, a_maxrownnz = 0, a_rownnz;
816d7ed1a4dSandi selinger   const PetscInt  workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7};
817d7ed1a4dSandi selinger   const PetscInt  am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
818d7ed1a4dSandi selinger   const PetscInt *brow_ptr[8], *brow_end[8];
819d7ed1a4dSandi selinger   PetscInt        window[8];
820d7ed1a4dSandi selinger   PetscInt        window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows;
821d7ed1a4dSandi selinger   PetscInt        i, k, ndouble = 0, L1_rowsleft, rowsleft;
822d7ed1a4dSandi selinger   PetscReal       afill;
823f83700f2Sandi selinger   PetscInt       *workj_L1, *workj_L2, *workj_L3;
8247660c4dbSandi selinger   PetscInt        L1_nnz, L2_nnz;
825d7ed1a4dSandi selinger 
826d7ed1a4dSandi selinger   /* Step 1: Get upper bound on memory required for allocation.
827d7ed1a4dSandi selinger              Because of the way virtual memory works,
828d7ed1a4dSandi selinger              only the memory pages that are actually needed will be physically allocated. */
829d7ed1a4dSandi selinger   PetscFunctionBegin;
8309566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &ci));
831d7ed1a4dSandi selinger   for (i = 0; i < am; i++) {
832d7ed1a4dSandi selinger     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
833d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
834d7ed1a4dSandi selinger     a_rownnz             = 0;
835d7ed1a4dSandi selinger     for (k = 0; k < anzi; ++k) {
836d7ed1a4dSandi selinger       a_rownnz += bi[acol[k] + 1] - bi[acol[k]];
837d7ed1a4dSandi selinger       if (a_rownnz > bn) {
838d7ed1a4dSandi selinger         a_rownnz = bn;
839d7ed1a4dSandi selinger         break;
840d7ed1a4dSandi selinger       }
841d7ed1a4dSandi selinger     }
842d7ed1a4dSandi selinger     a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
843d7ed1a4dSandi selinger   }
844d7ed1a4dSandi selinger   /* temporary work areas for merging rows */
8459566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1));
8469566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2));
8479566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3));
848d7ed1a4dSandi selinger 
8497660c4dbSandi selinger   /* This should be enough for almost all matrices. If not, memory is reallocated later. */
8507660c4dbSandi selinger   c_maxmem = 8 * (ai[am] + bi[bm]);
851d7ed1a4dSandi selinger   /* Step 2: Populate pattern for C */
8529566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c_maxmem, &cj));
853d7ed1a4dSandi selinger 
854d7ed1a4dSandi selinger   ci_nnz      = 0;
855d7ed1a4dSandi selinger   ci[0]       = 0;
856d7ed1a4dSandi selinger   worki_L1[0] = 0;
857d7ed1a4dSandi selinger   worki_L2[0] = 0;
858d7ed1a4dSandi selinger   for (i = 0; i < am; i++) {
859d7ed1a4dSandi selinger     const PetscInt  anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
860d7ed1a4dSandi selinger     const PetscInt *acol = aj + ai[i];        /* column indices of nonzero entries in this row */
861d7ed1a4dSandi selinger     rowsleft             = anzi;
862d7ed1a4dSandi selinger     inputcol_L1          = acol;
863d7ed1a4dSandi selinger     L2_nnz               = 0;
8647660c4dbSandi selinger     L2_nrows             = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1   */
8657660c4dbSandi selinger     worki_L2[1]          = 0;
86608fe1b3cSKarl Rupp     outputi_nnz          = 0;
867d7ed1a4dSandi selinger 
868d7ed1a4dSandi selinger     /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem  -> allocate more memory */
869d7ed1a4dSandi selinger     while (ci_nnz + a_maxrownnz > c_maxmem) {
870d7ed1a4dSandi selinger       c_maxmem *= 2;
871d7ed1a4dSandi selinger       ndouble++;
8729566063dSJacob Faibussowitsch       PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj));
873d7ed1a4dSandi selinger     }
874d7ed1a4dSandi selinger 
875d7ed1a4dSandi selinger     while (rowsleft) {
876d7ed1a4dSandi selinger       L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
877d7ed1a4dSandi selinger       L1_nrows    = 0;
8787660c4dbSandi selinger       L1_nnz      = 0;
879d7ed1a4dSandi selinger       inputcol    = inputcol_L1;
880d7ed1a4dSandi selinger       inputi      = bi;
881d7ed1a4dSandi selinger       inputj      = bj;
882d7ed1a4dSandi selinger 
883d7ed1a4dSandi selinger       /* The following macro is used to specialize for small rows in A.
884d7ed1a4dSandi selinger          This helps with compiler unrolling, improving performance substantially.
885f83700f2Sandi selinger           Input:  inputj   inputi  inputcol  bn
886d7ed1a4dSandi selinger           Output: outputj  outputi_nnz                       */
887d7ed1a4dSandi selinger #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
888a8f51744SPierre Jolivet   do { \
889d7ed1a4dSandi selinger     window_min  = bn; \
8907660c4dbSandi selinger     outputi_nnz = 0; \
8917660c4dbSandi selinger     for (k = 0; k < ANNZ; ++k) { \
892d7ed1a4dSandi selinger       brow_ptr[k] = inputj + inputi[inputcol[k]]; \
893d7ed1a4dSandi selinger       brow_end[k] = inputj + inputi[inputcol[k] + 1]; \
894d7ed1a4dSandi selinger       window[k]   = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
895d7ed1a4dSandi selinger       window_min  = PetscMin(window[k], window_min); \
896d7ed1a4dSandi selinger     } \
897d7ed1a4dSandi selinger     while (window_min < bn) { \
898d7ed1a4dSandi selinger       outputj[outputi_nnz++] = window_min; \
899d7ed1a4dSandi selinger       /* advance front and compute new minimum */ \
900d7ed1a4dSandi selinger       old_window_min = window_min; \
901d7ed1a4dSandi selinger       window_min     = bn; \
902d7ed1a4dSandi selinger       for (k = 0; k < ANNZ; ++k) { \
903d7ed1a4dSandi selinger         if (window[k] == old_window_min) { \
904d7ed1a4dSandi selinger           brow_ptr[k]++; \
905d7ed1a4dSandi selinger           window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
906d7ed1a4dSandi selinger         } \
907d7ed1a4dSandi selinger         window_min = PetscMin(window[k], window_min); \
908d7ed1a4dSandi selinger       } \
909a8f51744SPierre Jolivet     } \
910a8f51744SPierre Jolivet   } while (0)
911d7ed1a4dSandi selinger 
912d7ed1a4dSandi selinger       /************** L E V E L  1 ***************/
913d7ed1a4dSandi selinger       /* Merge up to 8 rows of B to L1 work array*/
914d7ed1a4dSandi selinger       while (L1_rowsleft) {
9157660c4dbSandi selinger         outputi_nnz = 0;
9167660c4dbSandi selinger         if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
9177660c4dbSandi selinger         else outputj = cj + ci_nnz;                /* Merge directly to C */
9187660c4dbSandi selinger 
919d7ed1a4dSandi selinger         switch (L1_rowsleft) {
9209371c9d4SSatish Balay         case 1:
9219371c9d4SSatish Balay           brow_ptr[0] = inputj + inputi[inputcol[0]];
922d7ed1a4dSandi selinger           brow_end[0] = inputj + inputi[inputcol[0] + 1];
923d7ed1a4dSandi selinger           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
924d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
925d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
926d7ed1a4dSandi selinger           L1_rowsleft = 0;
927d7ed1a4dSandi selinger           break;
9289371c9d4SSatish Balay         case 2:
9299371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(2);
930d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
931d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
932d7ed1a4dSandi selinger           L1_rowsleft = 0;
933d7ed1a4dSandi selinger           break;
9349371c9d4SSatish Balay         case 3:
9359371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(3);
936d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
937d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
938d7ed1a4dSandi selinger           L1_rowsleft = 0;
939d7ed1a4dSandi selinger           break;
9409371c9d4SSatish Balay         case 4:
9419371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(4);
942d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
943d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
944d7ed1a4dSandi selinger           L1_rowsleft = 0;
945d7ed1a4dSandi selinger           break;
9469371c9d4SSatish Balay         case 5:
9479371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(5);
948d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
949d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
950d7ed1a4dSandi selinger           L1_rowsleft = 0;
951d7ed1a4dSandi selinger           break;
9529371c9d4SSatish Balay         case 6:
9539371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(6);
954d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
955d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
956d7ed1a4dSandi selinger           L1_rowsleft = 0;
957d7ed1a4dSandi selinger           break;
9589371c9d4SSatish Balay         case 7:
9599371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(7);
960d7ed1a4dSandi selinger           inputcol += L1_rowsleft;
961d7ed1a4dSandi selinger           rowsleft -= L1_rowsleft;
962d7ed1a4dSandi selinger           L1_rowsleft = 0;
963d7ed1a4dSandi selinger           break;
9649371c9d4SSatish Balay         default:
9659371c9d4SSatish Balay           MatMatMultSymbolic_RowMergeMacro(8);
966d7ed1a4dSandi selinger           inputcol += 8;
967d7ed1a4dSandi selinger           rowsleft -= 8;
968d7ed1a4dSandi selinger           L1_rowsleft -= 8;
969d7ed1a4dSandi selinger           break;
970d7ed1a4dSandi selinger         }
971d7ed1a4dSandi selinger         inputcol_L1 = inputcol;
9727660c4dbSandi selinger         L1_nnz += outputi_nnz;
9737660c4dbSandi selinger         worki_L1[++L1_nrows] = L1_nnz;
974d7ed1a4dSandi selinger       }
975d7ed1a4dSandi selinger 
976d7ed1a4dSandi selinger       /********************** L E V E L  2 ************************/
977d7ed1a4dSandi selinger       /* Merge from L1 work array to either C or to L2 work array */
978d7ed1a4dSandi selinger       if (anzi > 8) {
979d7ed1a4dSandi selinger         inputi      = worki_L1;
980d7ed1a4dSandi selinger         inputj      = workj_L1;
981d7ed1a4dSandi selinger         inputcol    = workcol;
982d7ed1a4dSandi selinger         outputi_nnz = 0;
983d7ed1a4dSandi selinger 
984d7ed1a4dSandi selinger         if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
985d7ed1a4dSandi selinger         else outputj = workj_L2 + L2_nnz;      /* Merge from L1 work array to L2 work array */
986d7ed1a4dSandi selinger 
987d7ed1a4dSandi selinger         switch (L1_nrows) {
9889371c9d4SSatish Balay         case 1:
9899371c9d4SSatish Balay           brow_ptr[0] = inputj + inputi[inputcol[0]];
990d7ed1a4dSandi selinger           brow_end[0] = inputj + inputi[inputcol[0] + 1];
991d7ed1a4dSandi selinger           for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
992d7ed1a4dSandi selinger           break;
993d71ae5a4SJacob Faibussowitsch         case 2:
994d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(2);
995d71ae5a4SJacob Faibussowitsch           break;
996d71ae5a4SJacob Faibussowitsch         case 3:
997d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(3);
998d71ae5a4SJacob Faibussowitsch           break;
999d71ae5a4SJacob Faibussowitsch         case 4:
1000d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(4);
1001d71ae5a4SJacob Faibussowitsch           break;
1002d71ae5a4SJacob Faibussowitsch         case 5:
1003d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(5);
1004d71ae5a4SJacob Faibussowitsch           break;
1005d71ae5a4SJacob Faibussowitsch         case 6:
1006d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(6);
1007d71ae5a4SJacob Faibussowitsch           break;
1008d71ae5a4SJacob Faibussowitsch         case 7:
1009d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(7);
1010d71ae5a4SJacob Faibussowitsch           break;
1011d71ae5a4SJacob Faibussowitsch         case 8:
1012d71ae5a4SJacob Faibussowitsch           MatMatMultSymbolic_RowMergeMacro(8);
1013d71ae5a4SJacob Faibussowitsch           break;
1014d71ae5a4SJacob Faibussowitsch         default:
1015d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1016d7ed1a4dSandi selinger         }
1017d7ed1a4dSandi selinger         L2_nnz += outputi_nnz;
1018d7ed1a4dSandi selinger         worki_L2[++L2_nrows] = L2_nnz;
1019d7ed1a4dSandi selinger 
10207660c4dbSandi selinger         /************************ L E V E L  3 **********************/
10217660c4dbSandi selinger         /* Merge from L2 work array to either C or to L2 work array */
1022d7ed1a4dSandi selinger         if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1023d7ed1a4dSandi selinger           inputi      = worki_L2;
1024d7ed1a4dSandi selinger           inputj      = workj_L2;
1025d7ed1a4dSandi selinger           inputcol    = workcol;
1026d7ed1a4dSandi selinger           outputi_nnz = 0;
1027f83700f2Sandi selinger           if (rowsleft) outputj = workj_L3;
1028d7ed1a4dSandi selinger           else outputj = cj + ci_nnz;
1029d7ed1a4dSandi selinger           switch (L2_nrows) {
10309371c9d4SSatish Balay           case 1:
10319371c9d4SSatish Balay             brow_ptr[0] = inputj + inputi[inputcol[0]];
1032d7ed1a4dSandi selinger             brow_end[0] = inputj + inputi[inputcol[0] + 1];
1033d7ed1a4dSandi selinger             for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1034d7ed1a4dSandi selinger             break;
1035d71ae5a4SJacob Faibussowitsch           case 2:
1036d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(2);
1037d71ae5a4SJacob Faibussowitsch             break;
1038d71ae5a4SJacob Faibussowitsch           case 3:
1039d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(3);
1040d71ae5a4SJacob Faibussowitsch             break;
1041d71ae5a4SJacob Faibussowitsch           case 4:
1042d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(4);
1043d71ae5a4SJacob Faibussowitsch             break;
1044d71ae5a4SJacob Faibussowitsch           case 5:
1045d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(5);
1046d71ae5a4SJacob Faibussowitsch             break;
1047d71ae5a4SJacob Faibussowitsch           case 6:
1048d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(6);
1049d71ae5a4SJacob Faibussowitsch             break;
1050d71ae5a4SJacob Faibussowitsch           case 7:
1051d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(7);
1052d71ae5a4SJacob Faibussowitsch             break;
1053d71ae5a4SJacob Faibussowitsch           case 8:
1054d71ae5a4SJacob Faibussowitsch             MatMatMultSymbolic_RowMergeMacro(8);
1055d71ae5a4SJacob Faibussowitsch             break;
1056d71ae5a4SJacob Faibussowitsch           default:
1057d71ae5a4SJacob Faibussowitsch             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1058d7ed1a4dSandi selinger           }
1059d7ed1a4dSandi selinger           L2_nrows    = 1;
10607660c4dbSandi selinger           L2_nnz      = outputi_nnz;
10617660c4dbSandi selinger           worki_L2[1] = outputi_nnz;
10627660c4dbSandi selinger           /* Copy to workj_L2 */
1063d7ed1a4dSandi selinger           if (rowsleft) {
10647660c4dbSandi selinger             for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k];
1065d7ed1a4dSandi selinger           }
1066d7ed1a4dSandi selinger         }
1067d7ed1a4dSandi selinger       }
1068d7ed1a4dSandi selinger     } /* while (rowsleft) */
1069d7ed1a4dSandi selinger #undef MatMatMultSymbolic_RowMergeMacro
1070d7ed1a4dSandi selinger 
1071d7ed1a4dSandi selinger     /* terminate current row */
1072d7ed1a4dSandi selinger     ci_nnz += outputi_nnz;
1073d7ed1a4dSandi selinger     ci[i + 1] = ci_nnz;
1074d7ed1a4dSandi selinger   }
1075d7ed1a4dSandi selinger 
1076d7ed1a4dSandi selinger   /* Step 3: Create the new symbolic matrix */
10779566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
10789566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1079d7ed1a4dSandi selinger 
1080d7ed1a4dSandi selinger   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1081d7ed1a4dSandi selinger   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1082f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
1083d7ed1a4dSandi selinger   c->free_a  = PETSC_TRUE;
1084d7ed1a4dSandi selinger   c->free_ij = PETSC_TRUE;
1085d7ed1a4dSandi selinger   c->nonew   = 0;
1086d7ed1a4dSandi selinger 
10874222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1088d7ed1a4dSandi selinger 
1089d7ed1a4dSandi selinger   /* set MatInfo */
1090d7ed1a4dSandi selinger   afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
1091d7ed1a4dSandi selinger   if (afill < 1.0) afill = 1.0;
10924222ddf1SHong Zhang   C->info.mallocs           = ndouble;
10934222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
10944222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1095d7ed1a4dSandi selinger 
1096d7ed1a4dSandi selinger #if defined(PETSC_USE_INFO)
1097d7ed1a4dSandi selinger   if (ci[am]) {
10989566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
10999566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1100d7ed1a4dSandi selinger   } else {
11019566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1102d7ed1a4dSandi selinger   }
1103d7ed1a4dSandi selinger #endif
1104d7ed1a4dSandi selinger 
1105d7ed1a4dSandi selinger   /* Step 4: Free temporary work areas */
11069566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L1));
11079566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L2));
11089566063dSJacob Faibussowitsch   PetscCall(PetscFree(workj_L3));
11093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1110d7ed1a4dSandi selinger }
1111d7ed1a4dSandi selinger 
1112cd093f1dSJed Brown /* concatenate unique entries and then sort */
1113d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C)
1114d71ae5a4SJacob Faibussowitsch {
1115cd093f1dSJed Brown   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
1116cd093f1dSJed Brown   const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
11178c78258cSHong Zhang   PetscInt       *ci, *cj, bcol;
1118cd093f1dSJed Brown   PetscInt        am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1119cd093f1dSJed Brown   PetscReal       afill;
1120cd093f1dSJed Brown   PetscInt        i, j, ndouble = 0;
1121cd093f1dSJed Brown   PetscSegBuffer  seg, segrow;
1122cd093f1dSJed Brown   char           *seen;
1123cd093f1dSJed Brown 
1124cd093f1dSJed Brown   PetscFunctionBegin;
11259566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(am + 1, &ci));
1126cd093f1dSJed Brown   ci[0] = 0;
1127cd093f1dSJed Brown 
1128cd093f1dSJed Brown   /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
11299566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg));
11309566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow));
11319566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(bn, &seen));
1132cd093f1dSJed Brown 
1133cd093f1dSJed Brown   /* Determine ci and cj */
1134cd093f1dSJed Brown   for (i = 0; i < am; i++) {
1135cd093f1dSJed Brown     const PetscInt  anzi = ai[i + 1] - ai[i];                     /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
11368e3a54c0SPierre Jolivet     const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */
1137cd093f1dSJed Brown     PetscInt packlen     = 0, *PETSC_RESTRICT crow;
11388c78258cSHong Zhang 
1139cd093f1dSJed Brown     /* Pack segrow */
1140cd093f1dSJed Brown     for (j = 0; j < anzi; j++) {
1141cd093f1dSJed Brown       PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k;
1142cd093f1dSJed Brown       for (k = bjstart; k < bjend; k++) {
11438c78258cSHong Zhang         bcol = bj[k];
1144cd093f1dSJed Brown         if (!seen[bcol]) { /* new entry */
1145cd093f1dSJed Brown           PetscInt *PETSC_RESTRICT slot;
11469566063dSJacob Faibussowitsch           PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1147cd093f1dSJed Brown           *slot      = bcol;
1148cd093f1dSJed Brown           seen[bcol] = 1;
1149cd093f1dSJed Brown           packlen++;
1150cd093f1dSJed Brown         }
1151cd093f1dSJed Brown       }
1152cd093f1dSJed Brown     }
11538c78258cSHong Zhang 
11548c78258cSHong Zhang     /* Check i-th diagonal entry */
11558c78258cSHong Zhang     if (C->force_diagonals && !seen[i]) {
11568c78258cSHong Zhang       PetscInt *PETSC_RESTRICT slot;
11579566063dSJacob Faibussowitsch       PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
11588c78258cSHong Zhang       *slot   = i;
11598c78258cSHong Zhang       seen[i] = 1;
11608c78258cSHong Zhang       packlen++;
11618c78258cSHong Zhang     }
11628c78258cSHong Zhang 
11639566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferGetInts(seg, packlen, &crow));
11649566063dSJacob Faibussowitsch     PetscCall(PetscSegBufferExtractTo(segrow, crow));
11659566063dSJacob Faibussowitsch     PetscCall(PetscSortInt(packlen, crow));
1166cd093f1dSJed Brown     ci[i + 1] = ci[i] + packlen;
1167cd093f1dSJed Brown     for (j = 0; j < packlen; j++) seen[crow[j]] = 0;
1168cd093f1dSJed Brown   }
11699566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&segrow));
11709566063dSJacob Faibussowitsch   PetscCall(PetscFree(seen));
1171cd093f1dSJed Brown 
1172cd093f1dSJed Brown   /* Column indices are in the segmented buffer */
11739566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferExtractAlloc(seg, &cj));
11749566063dSJacob Faibussowitsch   PetscCall(PetscSegBufferDestroy(&seg));
1175cd093f1dSJed Brown 
1176cd093f1dSJed Brown   /* put together the new symbolic matrix */
11779566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
11789566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizesFromMats(C, A, B));
1179cd093f1dSJed Brown 
1180cd093f1dSJed Brown   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1181cd093f1dSJed Brown   /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1182f4f49eeaSPierre Jolivet   c          = (Mat_SeqAIJ *)C->data;
1183cd093f1dSJed Brown   c->free_a  = PETSC_TRUE;
1184cd093f1dSJed Brown   c->free_ij = PETSC_TRUE;
1185cd093f1dSJed Brown   c->nonew   = 0;
1186cd093f1dSJed Brown 
11874222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1188cd093f1dSJed Brown 
1189cd093f1dSJed Brown   /* set MatInfo */
11902a09556fSStefano Zampini   afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5;
1191cd093f1dSJed Brown   if (afill < 1.0) afill = 1.0;
11924222ddf1SHong Zhang   C->info.mallocs           = ndouble;
11934222ddf1SHong Zhang   C->info.fill_ratio_given  = fill;
11944222ddf1SHong Zhang   C->info.fill_ratio_needed = afill;
1195cd093f1dSJed Brown 
1196cd093f1dSJed Brown #if defined(PETSC_USE_INFO)
1197cd093f1dSJed Brown   if (ci[am]) {
11989566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
11999566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1200cd093f1dSJed Brown   } else {
12019566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Empty matrix product\n"));
1202cd093f1dSJed Brown   }
1203cd093f1dSJed Brown #endif
12043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1205cd093f1dSJed Brown }
1206cd093f1dSJed Brown 
120766976f2fSJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data)
1208d71ae5a4SJacob Faibussowitsch {
12096718818eSStefano Zampini   Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data;
12102128a86cSHong Zhang 
12112128a86cSHong Zhang   PetscFunctionBegin;
12129566063dSJacob Faibussowitsch   PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
12139566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->Bt_den));
12149566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&abt->ABt_den));
12159566063dSJacob Faibussowitsch   PetscCall(PetscFree(abt));
12163ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
12172128a86cSHong Zhang }
12182128a86cSHong Zhang 
1219d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1220d71ae5a4SJacob Faibussowitsch {
122181d82fe4SHong Zhang   Mat                  Bt;
122240192850SHong Zhang   Mat_MatMatTransMult *abt;
12234222ddf1SHong Zhang   Mat_Product         *product = C->product;
12246718818eSStefano Zampini   char                *alg;
1225d2853540SHong Zhang 
122681d82fe4SHong Zhang   PetscFunctionBegin;
122728b400f6SJacob Faibussowitsch   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
122828b400f6SJacob Faibussowitsch   PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
12296718818eSStefano Zampini 
123081d82fe4SHong Zhang   /* create symbolic Bt */
12317fb60732SBarry Smith   PetscCall(MatTransposeSymbolic(B, &Bt));
12329566063dSJacob Faibussowitsch   PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
12339566063dSJacob Faibussowitsch   PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name));
123481d82fe4SHong Zhang 
123581d82fe4SHong Zhang   /* get symbolic C=A*Bt */
12369566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(product->alg, &alg));
12379566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */
12389566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C));
12399566063dSJacob Faibussowitsch   PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */
12409566063dSJacob Faibussowitsch   PetscCall(PetscFree(alg));
124181d82fe4SHong Zhang 
1242a5b23f4aSJose E. Roman   /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
12439566063dSJacob Faibussowitsch   PetscCall(PetscNew(&abt));
12442128a86cSHong Zhang 
12456718818eSStefano Zampini   product->data    = abt;
12466718818eSStefano Zampini   product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans;
12476718818eSStefano Zampini 
12484222ddf1SHong Zhang   C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
12492128a86cSHong Zhang 
12504222ddf1SHong Zhang   abt->usecoloring = PETSC_FALSE;
12519566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring));
125240192850SHong Zhang   if (abt->usecoloring) {
1253b9af6bddSHong Zhang     /* Create MatTransposeColoring from symbolic C=A*B^T */
1254b9af6bddSHong Zhang     MatTransposeColoring matcoloring;
1255335efc43SPeter Brune     MatColoring          coloring;
12562128a86cSHong Zhang     ISColoring           iscoloring;
12572128a86cSHong Zhang     Mat                  Bt_dense, C_dense;
1258e8354b3eSHong Zhang 
12594222ddf1SHong Zhang     /* inode causes memory problem */
12609566063dSJacob Faibussowitsch     PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE));
12614222ddf1SHong Zhang 
12629566063dSJacob Faibussowitsch     PetscCall(MatColoringCreate(C, &coloring));
12639566063dSJacob Faibussowitsch     PetscCall(MatColoringSetDistance(coloring, 2));
12649566063dSJacob Faibussowitsch     PetscCall(MatColoringSetType(coloring, MATCOLORINGSL));
12659566063dSJacob Faibussowitsch     PetscCall(MatColoringSetFromOptions(coloring));
12669566063dSJacob Faibussowitsch     PetscCall(MatColoringApply(coloring, &iscoloring));
12679566063dSJacob Faibussowitsch     PetscCall(MatColoringDestroy(&coloring));
12689566063dSJacob Faibussowitsch     PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring));
12692205254eSKarl Rupp 
127040192850SHong Zhang     abt->matcoloring = matcoloring;
12712205254eSKarl Rupp 
12729566063dSJacob Faibussowitsch     PetscCall(ISColoringDestroy(&iscoloring));
12732128a86cSHong Zhang 
12742128a86cSHong Zhang     /* Create Bt_dense and C_dense = A*Bt_dense */
12759566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense));
12769566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors));
12779566063dSJacob Faibussowitsch     PetscCall(MatSetType(Bt_dense, MATSEQDENSE));
12789566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL));
12792205254eSKarl Rupp 
12802128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
128140192850SHong Zhang     abt->Bt_den         = Bt_dense;
12822128a86cSHong Zhang 
12839566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense));
12849566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors));
12859566063dSJacob Faibussowitsch     PetscCall(MatSetType(C_dense, MATSEQDENSE));
12869566063dSJacob Faibussowitsch     PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL));
12872205254eSKarl Rupp 
12882128a86cSHong Zhang     Bt_dense->assembled = PETSC_TRUE;
128940192850SHong Zhang     abt->ABt_den        = C_dense;
1290f94ccd6cSHong Zhang 
1291f94ccd6cSHong Zhang #if defined(PETSC_USE_INFO)
12921ce71dffSSatish Balay     {
12934222ddf1SHong Zhang       Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
12949371c9d4SSatish Balay       PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n,
1295f4f49eeaSPierre Jolivet                           Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)c->nz) / ((PetscReal)(A->rmap->n * matcoloring->ncolors)))));
12961ce71dffSSatish Balay     }
1297f94ccd6cSHong Zhang #endif
12982128a86cSHong Zhang   }
1299e99be685SHong Zhang   /* clean up */
13009566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&Bt));
13013ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13025df89d91SHong Zhang }
13035df89d91SHong Zhang 
1304d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1305d71ae5a4SJacob Faibussowitsch {
13065df89d91SHong Zhang   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1307e2cac8caSJed Brown   PetscInt            *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow;
130881d82fe4SHong Zhang   PetscInt             cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol;
13095df89d91SHong Zhang   PetscLogDouble       flops = 0.0;
1310aa1aec99SHong Zhang   MatScalar           *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval;
13116718818eSStefano Zampini   Mat_MatMatTransMult *abt;
13126718818eSStefano Zampini   Mat_Product         *product = C->product;
13135df89d91SHong Zhang 
13145df89d91SHong Zhang   PetscFunctionBegin;
131528b400f6SJacob Faibussowitsch   PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
13166718818eSStefano Zampini   abt = (Mat_MatMatTransMult *)product->data;
131728b400f6SJacob Faibussowitsch   PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
131858ed3ceaSHong Zhang   /* clear old values in C */
131958ed3ceaSHong Zhang   if (!c->a) {
13209566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
132158ed3ceaSHong Zhang     c->a      = ca;
132258ed3ceaSHong Zhang     c->free_a = PETSC_TRUE;
132358ed3ceaSHong Zhang   } else {
132458ed3ceaSHong Zhang     ca = c->a;
13259566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca, ci[cm] + 1));
132658ed3ceaSHong Zhang   }
132758ed3ceaSHong Zhang 
132840192850SHong Zhang   if (abt->usecoloring) {
132940192850SHong Zhang     MatTransposeColoring matcoloring = abt->matcoloring;
133040192850SHong Zhang     Mat                  Bt_dense, C_dense = abt->ABt_den;
1331c8db22f6SHong Zhang 
1332b9af6bddSHong Zhang     /* Get Bt_dense by Apply MatTransposeColoring to B */
133340192850SHong Zhang     Bt_dense = abt->Bt_den;
13349566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense));
1335c8db22f6SHong Zhang 
1336c8db22f6SHong Zhang     /* C_dense = A*Bt_dense */
13379566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense));
1338c8db22f6SHong Zhang 
13392128a86cSHong Zhang     /* Recover C from C_dense */
13409566063dSJacob Faibussowitsch     PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C));
13413ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
1342c8db22f6SHong Zhang   }
1343c8db22f6SHong Zhang 
13441fa1209cSHong Zhang   for (i = 0; i < cm; i++) {
134581d82fe4SHong Zhang     anzi = ai[i + 1] - ai[i];
13468e3a54c0SPierre Jolivet     acol = PetscSafePointerPlusOffset(aj, ai[i]);
13478e3a54c0SPierre Jolivet     aval = PetscSafePointerPlusOffset(aa, ai[i]);
13481fa1209cSHong Zhang     cnzi = ci[i + 1] - ci[i];
13498e3a54c0SPierre Jolivet     ccol = PetscSafePointerPlusOffset(cj, ci[i]);
13506973769bSHong Zhang     cval = ca + ci[i];
13511fa1209cSHong Zhang     for (j = 0; j < cnzi; j++) {
135281d82fe4SHong Zhang       brow = ccol[j];
135381d82fe4SHong Zhang       bnzj = bi[brow + 1] - bi[brow];
135481d82fe4SHong Zhang       bcol = bj + bi[brow];
13556973769bSHong Zhang       bval = ba + bi[brow];
13566973769bSHong Zhang 
135781d82fe4SHong Zhang       /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
13589371c9d4SSatish Balay       nexta = 0;
13599371c9d4SSatish Balay       nextb = 0;
136081d82fe4SHong Zhang       while (nexta < anzi && nextb < bnzj) {
13617b6d5e96SMark Adams         while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
136281d82fe4SHong Zhang         if (nexta == anzi) break;
13637b6d5e96SMark Adams         while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
136481d82fe4SHong Zhang         if (nextb == bnzj) break;
136581d82fe4SHong Zhang         if (acol[nexta] == bcol[nextb]) {
13666973769bSHong Zhang           cval[j] += aval[nexta] * bval[nextb];
13679371c9d4SSatish Balay           nexta++;
13689371c9d4SSatish Balay           nextb++;
136981d82fe4SHong Zhang           flops += 2;
13701fa1209cSHong Zhang         }
13711fa1209cSHong Zhang       }
137281d82fe4SHong Zhang     }
137381d82fe4SHong Zhang   }
13749566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
13759566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
13769566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
13773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13785df89d91SHong Zhang }
13795df89d91SHong Zhang 
1380d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data)
1381d71ae5a4SJacob Faibussowitsch {
13826718818eSStefano Zampini   Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data;
13836d373c3eSHong Zhang 
13846d373c3eSHong Zhang   PetscFunctionBegin;
13859566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&atb->At));
13861baa6e33SBarry Smith   if (atb->destroy) PetscCall((*atb->destroy)(atb->data));
13879566063dSJacob Faibussowitsch   PetscCall(PetscFree(atb));
13883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13896d373c3eSHong Zhang }
13906d373c3eSHong Zhang 
1391d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1392d71ae5a4SJacob Faibussowitsch {
1393089a957eSStefano Zampini   Mat          At      = NULL;
13944222ddf1SHong Zhang   Mat_Product *product = C->product;
1395089a957eSStefano Zampini   PetscBool    flg, def, square;
1396bc011b1eSHong Zhang 
1397bc011b1eSHong Zhang   PetscFunctionBegin;
1398089a957eSStefano Zampini   MatCheckProduct(C, 4);
1399b94d7dedSBarry Smith   square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
14004222ddf1SHong Zhang   /* outerproduct */
14019566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg));
14024222ddf1SHong Zhang   if (flg) {
1403bc011b1eSHong Zhang     /* create symbolic At */
1404089a957eSStefano Zampini     if (!square) {
14057fb60732SBarry Smith       PetscCall(MatTransposeSymbolic(A, &At));
14069566063dSJacob Faibussowitsch       PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs)));
14079566063dSJacob Faibussowitsch       PetscCall(MatSetType(At, ((PetscObject)A)->type_name));
1408089a957eSStefano Zampini     }
1409bc011b1eSHong Zhang     /* get symbolic C=At*B */
14109566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "sorted"));
14119566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1412bc011b1eSHong Zhang 
1413bc011b1eSHong Zhang     /* clean up */
141448a46eb9SPierre Jolivet     if (!square) PetscCall(MatDestroy(&At));
14156d373c3eSHong Zhang 
14164222ddf1SHong Zhang     C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
14179566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "outerproduct"));
14183ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
14194222ddf1SHong Zhang   }
14204222ddf1SHong Zhang 
14214222ddf1SHong Zhang   /* matmatmult */
14229566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &def));
14239566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
1424089a957eSStefano Zampini   if (flg || def) {
14254222ddf1SHong Zhang     Mat_MatTransMatMult *atb;
14264222ddf1SHong Zhang 
142728b400f6SJacob Faibussowitsch     PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
14289566063dSJacob Faibussowitsch     PetscCall(PetscNew(&atb));
142948a46eb9SPierre Jolivet     if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
14309566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "sorted"));
14319566063dSJacob Faibussowitsch     PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
14329566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, "at*b"));
14336718818eSStefano Zampini     product->data    = atb;
14346718818eSStefano Zampini     product->destroy = MatDestroy_SeqAIJ_MatTransMatMult;
14354222ddf1SHong Zhang     atb->At          = At;
14364222ddf1SHong Zhang 
14374222ddf1SHong Zhang     C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
14383ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
14394222ddf1SHong Zhang   }
14404222ddf1SHong Zhang 
14414222ddf1SHong Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1442bc011b1eSHong Zhang }
1443bc011b1eSHong Zhang 
1444d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1445d71ae5a4SJacob Faibussowitsch {
14460fbc74f4SHong Zhang   Mat_SeqAIJ    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1447d0f46423SBarry Smith   PetscInt       am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb;
1448d0f46423SBarry Smith   PetscInt       cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k;
1449d13dce4bSSatish Balay   PetscLogDouble flops = 0.0;
1450aa1aec99SHong Zhang   MatScalar     *aa    = a->a, *ba, *ca, *caj;
1451bc011b1eSHong Zhang 
1452bc011b1eSHong Zhang   PetscFunctionBegin;
1453aa1aec99SHong Zhang   if (!c->a) {
14549566063dSJacob Faibussowitsch     PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
14552205254eSKarl Rupp 
1456aa1aec99SHong Zhang     c->a      = ca;
1457aa1aec99SHong Zhang     c->free_a = PETSC_TRUE;
1458aa1aec99SHong Zhang   } else {
1459aa1aec99SHong Zhang     ca = c->a;
14609566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(ca, ci[cm]));
1461aa1aec99SHong Zhang   }
1462bc011b1eSHong Zhang 
1463bc011b1eSHong Zhang   /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1464bc011b1eSHong Zhang   for (i = 0; i < am; i++) {
1465bc011b1eSHong Zhang     bj   = b->j + bi[i];
1466bc011b1eSHong Zhang     ba   = b->a + bi[i];
1467bc011b1eSHong Zhang     bnzi = bi[i + 1] - bi[i];
1468bc011b1eSHong Zhang     anzi = ai[i + 1] - ai[i];
1469bc011b1eSHong Zhang     for (j = 0; j < anzi; j++) {
1470bc011b1eSHong Zhang       nextb = 0;
14710fbc74f4SHong Zhang       crow  = *aj++;
1472bc011b1eSHong Zhang       cjj   = cj + ci[crow];
1473bc011b1eSHong Zhang       caj   = ca + ci[crow];
1474bc011b1eSHong Zhang       /* perform sparse axpy operation.  Note cjj includes bj. */
1475bc011b1eSHong Zhang       for (k = 0; nextb < bnzi; k++) {
14760fbc74f4SHong Zhang         if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */
14770fbc74f4SHong Zhang           caj[k] += (*aa) * (*(ba + nextb));
1478bc011b1eSHong Zhang           nextb++;
1479bc011b1eSHong Zhang         }
1480bc011b1eSHong Zhang       }
1481bc011b1eSHong Zhang       flops += 2 * bnzi;
14820fbc74f4SHong Zhang       aa++;
1483bc011b1eSHong Zhang     }
1484bc011b1eSHong Zhang   }
1485bc011b1eSHong Zhang 
1486bc011b1eSHong Zhang   /* Assemble the final matrix and clean up */
14879566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
14889566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
14899566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(flops));
14903ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1491bc011b1eSHong Zhang }
14929123193aSHong Zhang 
1493d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1494d71ae5a4SJacob Faibussowitsch {
14959123193aSHong Zhang   PetscFunctionBegin;
14969566063dSJacob Faibussowitsch   PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
14974222ddf1SHong Zhang   C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
14983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
14999123193aSHong Zhang }
15009123193aSHong Zhang 
1501d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add)
1502d71ae5a4SJacob Faibussowitsch {
1503f32d5d43SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
15041ca9667aSStefano Zampini   PetscScalar       *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4;
1505a4af7ca8SStefano Zampini   const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av;
1506daf57211SHong Zhang   const PetscInt    *aj;
150775f6d85dSStefano Zampini   PetscInt           cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n;
150875f6d85dSStefano Zampini   PetscInt           clda;
150975f6d85dSStefano Zampini   PetscInt           am4, bm4, col, i, j, n;
15109123193aSHong Zhang 
15119123193aSHong Zhang   PetscFunctionBegin;
15123ba16761SJacob Faibussowitsch   if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
15139566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJGetArrayRead(A, &av));
151493aa15f2SStefano Zampini   if (add) {
15159566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArray(C, &c));
151693aa15f2SStefano Zampini   } else {
15179566063dSJacob Faibussowitsch     PetscCall(MatDenseGetArrayWrite(C, &c));
151893aa15f2SStefano Zampini   }
15199566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(B, &b));
15209566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &bm));
15219566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(C, &clda));
152275f6d85dSStefano Zampini   am4 = 4 * clda;
152375f6d85dSStefano Zampini   bm4 = 4 * bm;
15245c0db29aSPierre Jolivet   if (b) {
15259371c9d4SSatish Balay     b1 = b;
15269371c9d4SSatish Balay     b2 = b1 + bm;
15279371c9d4SSatish Balay     b3 = b2 + bm;
15289371c9d4SSatish Balay     b4 = b3 + bm;
15295c0db29aSPierre Jolivet   } else b1 = b2 = b3 = b4 = NULL;
15309371c9d4SSatish Balay   c1 = c;
15319371c9d4SSatish Balay   c2 = c1 + clda;
15329371c9d4SSatish Balay   c3 = c2 + clda;
15339371c9d4SSatish Balay   c4 = c3 + clda;
15341ca9667aSStefano Zampini   for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */
15351ca9667aSStefano Zampini     for (i = 0; i < am; i++) {                  /* over rows of A in those columns */
1536f32d5d43SBarry Smith       r1 = r2 = r3 = r4 = 0.0;
1537f32d5d43SBarry Smith       n                 = a->i[i + 1] - a->i[i];
15388e3a54c0SPierre Jolivet       aj                = PetscSafePointerPlusOffset(a->j, a->i[i]);
15398e3a54c0SPierre Jolivet       aa                = PetscSafePointerPlusOffset(av, a->i[i]);
1540f32d5d43SBarry Smith       for (j = 0; j < n; j++) {
15411ca9667aSStefano Zampini         const PetscScalar aatmp = aa[j];
15421ca9667aSStefano Zampini         const PetscInt    ajtmp = aj[j];
1543730858b9SHong Zhang         r1 += aatmp * b1[ajtmp];
1544730858b9SHong Zhang         r2 += aatmp * b2[ajtmp];
1545730858b9SHong Zhang         r3 += aatmp * b3[ajtmp];
1546730858b9SHong Zhang         r4 += aatmp * b4[ajtmp];
15479123193aSHong Zhang       }
154893aa15f2SStefano Zampini       if (add) {
154987753ddeSHong Zhang         c1[i] += r1;
155087753ddeSHong Zhang         c2[i] += r2;
155187753ddeSHong Zhang         c3[i] += r3;
155287753ddeSHong Zhang         c4[i] += r4;
155393aa15f2SStefano Zampini       } else {
155493aa15f2SStefano Zampini         c1[i] = r1;
155593aa15f2SStefano Zampini         c2[i] = r2;
155693aa15f2SStefano Zampini         c3[i] = r3;
155793aa15f2SStefano Zampini         c4[i] = r4;
155893aa15f2SStefano Zampini       }
1559f32d5d43SBarry Smith     }
15605c0db29aSPierre Jolivet     if (b) {
15619371c9d4SSatish Balay       b1 += bm4;
15629371c9d4SSatish Balay       b2 += bm4;
15639371c9d4SSatish Balay       b3 += bm4;
15649371c9d4SSatish Balay       b4 += bm4;
15655c0db29aSPierre Jolivet     }
15669371c9d4SSatish Balay     c1 += am4;
15679371c9d4SSatish Balay     c2 += am4;
15689371c9d4SSatish Balay     c3 += am4;
15699371c9d4SSatish Balay     c4 += am4;
1570f32d5d43SBarry Smith   }
157193aa15f2SStefano Zampini   /* process remaining columns */
157293aa15f2SStefano Zampini   if (col != cn) {
157393aa15f2SStefano Zampini     PetscInt rc = cn - col;
157493aa15f2SStefano Zampini 
157593aa15f2SStefano Zampini     if (rc == 1) {
157693aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
1577f32d5d43SBarry Smith         r1 = 0.0;
1578f32d5d43SBarry Smith         n  = a->i[i + 1] - a->i[i];
15798e3a54c0SPierre Jolivet         aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
15808e3a54c0SPierre Jolivet         aa = PetscSafePointerPlusOffset(av, a->i[i]);
158193aa15f2SStefano Zampini         for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]];
158293aa15f2SStefano Zampini         if (add) c1[i] += r1;
158393aa15f2SStefano Zampini         else c1[i] = r1;
158493aa15f2SStefano Zampini       }
158593aa15f2SStefano Zampini     } else if (rc == 2) {
158693aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
158793aa15f2SStefano Zampini         r1 = r2 = 0.0;
158893aa15f2SStefano Zampini         n       = a->i[i + 1] - a->i[i];
15898e3a54c0SPierre Jolivet         aj      = PetscSafePointerPlusOffset(a->j, a->i[i]);
15908e3a54c0SPierre Jolivet         aa      = PetscSafePointerPlusOffset(av, a->i[i]);
1591f32d5d43SBarry Smith         for (j = 0; j < n; j++) {
159293aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
159393aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
159493aa15f2SStefano Zampini           r1 += aatmp * b1[ajtmp];
159593aa15f2SStefano Zampini           r2 += aatmp * b2[ajtmp];
1596f32d5d43SBarry Smith         }
159793aa15f2SStefano Zampini         if (add) {
159887753ddeSHong Zhang           c1[i] += r1;
159993aa15f2SStefano Zampini           c2[i] += r2;
160093aa15f2SStefano Zampini         } else {
160193aa15f2SStefano Zampini           c1[i] = r1;
160293aa15f2SStefano Zampini           c2[i] = r2;
1603f32d5d43SBarry Smith         }
160493aa15f2SStefano Zampini       }
160593aa15f2SStefano Zampini     } else {
160693aa15f2SStefano Zampini       for (i = 0; i < am; i++) {
160793aa15f2SStefano Zampini         r1 = r2 = r3 = 0.0;
160893aa15f2SStefano Zampini         n            = a->i[i + 1] - a->i[i];
16098e3a54c0SPierre Jolivet         aj           = PetscSafePointerPlusOffset(a->j, a->i[i]);
16108e3a54c0SPierre Jolivet         aa           = PetscSafePointerPlusOffset(av, a->i[i]);
161193aa15f2SStefano Zampini         for (j = 0; j < n; j++) {
161293aa15f2SStefano Zampini           const PetscScalar aatmp = aa[j];
161393aa15f2SStefano Zampini           const PetscInt    ajtmp = aj[j];
161493aa15f2SStefano Zampini           r1 += aatmp * b1[ajtmp];
161593aa15f2SStefano Zampini           r2 += aatmp * b2[ajtmp];
161693aa15f2SStefano Zampini           r3 += aatmp * b3[ajtmp];
161793aa15f2SStefano Zampini         }
161893aa15f2SStefano Zampini         if (add) {
161993aa15f2SStefano Zampini           c1[i] += r1;
162093aa15f2SStefano Zampini           c2[i] += r2;
162193aa15f2SStefano Zampini           c3[i] += r3;
162293aa15f2SStefano Zampini         } else {
162393aa15f2SStefano Zampini           c1[i] = r1;
162493aa15f2SStefano Zampini           c2[i] = r2;
162593aa15f2SStefano Zampini           c3[i] = r3;
162693aa15f2SStefano Zampini         }
162793aa15f2SStefano Zampini       }
162893aa15f2SStefano Zampini     }
1629f32d5d43SBarry Smith   }
16309566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(cn * (2.0 * a->nz)));
163193aa15f2SStefano Zampini   if (add) {
16329566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArray(C, &c));
163393aa15f2SStefano Zampini   } else {
16349566063dSJacob Faibussowitsch     PetscCall(MatDenseRestoreArrayWrite(C, &c));
163593aa15f2SStefano Zampini   }
16369566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(B, &b));
16379566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJRestoreArrayRead(A, &av));
16383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1639f32d5d43SBarry Smith }
1640f32d5d43SBarry Smith 
1641d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C)
1642d71ae5a4SJacob Faibussowitsch {
1643f32d5d43SBarry Smith   PetscFunctionBegin;
164408401ef6SPierre Jolivet   PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
164508401ef6SPierre Jolivet   PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
164608401ef6SPierre Jolivet   PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
16474324174eSBarry Smith 
16489566063dSJacob Faibussowitsch   PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE));
16493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16509123193aSHong Zhang }
1651b1683b59SHong Zhang 
1652d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1653d71ae5a4SJacob Faibussowitsch {
16544222ddf1SHong Zhang   PetscFunctionBegin;
16554222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
16564222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
16573ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16584222ddf1SHong Zhang }
16594222ddf1SHong Zhang 
16606718818eSStefano Zampini PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat);
16616718818eSStefano Zampini 
1662d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1663d71ae5a4SJacob Faibussowitsch {
16644222ddf1SHong Zhang   PetscFunctionBegin;
16656718818eSStefano Zampini   C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16664222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_AtB;
16673ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16686718818eSStefano Zampini }
16696718818eSStefano Zampini 
1670d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1671d71ae5a4SJacob Faibussowitsch {
16726718818eSStefano Zampini   PetscFunctionBegin;
16736718818eSStefano Zampini   C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
16746718818eSStefano Zampini   C->ops->productsymbolic          = MatProductSymbolic_ABt;
16753ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16764222ddf1SHong Zhang }
16774222ddf1SHong Zhang 
1678d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1679d71ae5a4SJacob Faibussowitsch {
16804222ddf1SHong Zhang   Mat_Product *product = C->product;
16814222ddf1SHong Zhang 
16824222ddf1SHong Zhang   PetscFunctionBegin;
16834222ddf1SHong Zhang   switch (product->type) {
1684d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AB:
1685d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1686d71ae5a4SJacob Faibussowitsch     break;
1687d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AtB:
1688d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1689d71ae5a4SJacob Faibussowitsch     break;
1690d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABt:
1691d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1692d71ae5a4SJacob Faibussowitsch     break;
1693d71ae5a4SJacob Faibussowitsch   default:
1694d71ae5a4SJacob Faibussowitsch     break;
16954222ddf1SHong Zhang   }
16963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16974222ddf1SHong Zhang }
16982ef1f0ffSBarry Smith 
1699d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1700d71ae5a4SJacob Faibussowitsch {
17014222ddf1SHong Zhang   Mat_Product *product = C->product;
17024222ddf1SHong Zhang   Mat          A       = product->A;
17034222ddf1SHong Zhang   PetscBool    baij;
17044222ddf1SHong Zhang 
17054222ddf1SHong Zhang   PetscFunctionBegin;
17069566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij));
17074222ddf1SHong Zhang   if (!baij) { /* A is seqsbaij */
17084222ddf1SHong Zhang     PetscBool sbaij;
17099566063dSJacob Faibussowitsch     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij));
171028b400f6SJacob Faibussowitsch     PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format");
17114222ddf1SHong Zhang 
17124222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
17134222ddf1SHong Zhang   } else { /* A is seqbaij */
17144222ddf1SHong Zhang     C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
17154222ddf1SHong Zhang   }
17164222ddf1SHong Zhang 
17174222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17194222ddf1SHong Zhang }
17204222ddf1SHong Zhang 
1721d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1722d71ae5a4SJacob Faibussowitsch {
17234222ddf1SHong Zhang   Mat_Product *product = C->product;
17244222ddf1SHong Zhang 
17254222ddf1SHong Zhang   PetscFunctionBegin;
17266718818eSStefano Zampini   MatCheckProduct(C, 1);
172728b400f6SJacob Faibussowitsch   PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A");
1728b94d7dedSBarry Smith   if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
17293ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17304222ddf1SHong Zhang }
17316718818eSStefano Zampini 
1732d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1733d71ae5a4SJacob Faibussowitsch {
17344222ddf1SHong Zhang   PetscFunctionBegin;
17354222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
17364222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
17373ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17384222ddf1SHong Zhang }
17394222ddf1SHong Zhang 
1740d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1741d71ae5a4SJacob Faibussowitsch {
17424222ddf1SHong Zhang   Mat_Product *product = C->product;
17434222ddf1SHong Zhang 
17444222ddf1SHong Zhang   PetscFunctionBegin;
174548a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
17463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17474222ddf1SHong Zhang }
17484222ddf1SHong Zhang 
1749d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense)
1750d71ae5a4SJacob Faibussowitsch {
17512f5f1f90SHong Zhang   Mat_SeqAIJ   *b       = (Mat_SeqAIJ *)B->data;
17522f5f1f90SHong Zhang   Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data;
17532f5f1f90SHong Zhang   PetscInt     *bi = b->i, *bj = b->j;
17542f5f1f90SHong Zhang   PetscInt      m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns;
17552f5f1f90SHong Zhang   MatScalar    *btval, *btval_den, *ba = b->a;
17562f5f1f90SHong Zhang   PetscInt     *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors;
1757c8db22f6SHong Zhang 
1758c8db22f6SHong Zhang   PetscFunctionBegin;
17592f5f1f90SHong Zhang   btval_den = btdense->v;
17609566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(btval_den, m * n));
17612f5f1f90SHong Zhang   for (k = 0; k < ncolors; k++) {
17622f5f1f90SHong Zhang     ncolumns = coloring->ncolumns[k];
17632f5f1f90SHong Zhang     for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */
1764d2853540SHong Zhang       col   = *(columns + colorforcol[k] + l);
17652f5f1f90SHong Zhang       btcol = bj + bi[col];
17662f5f1f90SHong Zhang       btval = ba + bi[col];
17672f5f1f90SHong Zhang       anz   = bi[col + 1] - bi[col];
1768d2853540SHong Zhang       for (j = 0; j < anz; j++) {
17692f5f1f90SHong Zhang         brow            = btcol[j];
17702f5f1f90SHong Zhang         btval_den[brow] = btval[j];
1771c8db22f6SHong Zhang       }
1772c8db22f6SHong Zhang     }
17732f5f1f90SHong Zhang     btval_den += m;
1774c8db22f6SHong Zhang   }
17753ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1776c8db22f6SHong Zhang }
1777c8db22f6SHong Zhang 
1778d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp)
1779d71ae5a4SJacob Faibussowitsch {
1780b2d2b04fSHong Zhang   Mat_SeqAIJ        *csp = (Mat_SeqAIJ *)Csp->data;
17811683a169SBarry Smith   const PetscScalar *ca_den, *ca_den_ptr;
17821683a169SBarry Smith   PetscScalar       *ca = csp->a;
1783f99a636bSHong Zhang   PetscInt           k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors;
1784e88777f2SHong Zhang   PetscInt           brows = matcoloring->brows, *den2sp = matcoloring->den2sp;
1785077f23c2SHong Zhang   PetscInt           nrows, *row, *idx;
1786077f23c2SHong Zhang   PetscInt          *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow;
17878972f759SHong Zhang 
17888972f759SHong Zhang   PetscFunctionBegin;
17899566063dSJacob Faibussowitsch   PetscCall(MatDenseGetArrayRead(Cden, &ca_den));
1790f99a636bSHong Zhang 
1791077f23c2SHong Zhang   if (brows > 0) {
1792077f23c2SHong Zhang     PetscInt *lstart, row_end, row_start;
1793980ae229SHong Zhang     lstart = matcoloring->lstart;
17949566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(lstart, ncolors));
1795eeb4fd02SHong Zhang 
1796077f23c2SHong Zhang     row_end = brows;
1797eeb4fd02SHong Zhang     if (row_end > m) row_end = m;
1798077f23c2SHong Zhang     for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */
1799077f23c2SHong Zhang       ca_den_ptr = ca_den;
1800980ae229SHong Zhang       for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */
1801eeb4fd02SHong Zhang         nrows = matcoloring->nrows[k];
1802eeb4fd02SHong Zhang         row   = rows + colorforrow[k];
1803eeb4fd02SHong Zhang         idx   = den2sp + colorforrow[k];
1804eeb4fd02SHong Zhang         for (l = lstart[k]; l < nrows; l++) {
1805eeb4fd02SHong Zhang           if (row[l] >= row_end) {
1806eeb4fd02SHong Zhang             lstart[k] = l;
1807eeb4fd02SHong Zhang             break;
1808eeb4fd02SHong Zhang           } else {
1809077f23c2SHong Zhang             ca[idx[l]] = ca_den_ptr[row[l]];
1810eeb4fd02SHong Zhang           }
1811eeb4fd02SHong Zhang         }
1812077f23c2SHong Zhang         ca_den_ptr += m;
1813eeb4fd02SHong Zhang       }
1814077f23c2SHong Zhang       row_end += brows;
1815eeb4fd02SHong Zhang       if (row_end > m) row_end = m;
1816eeb4fd02SHong Zhang     }
1817077f23c2SHong Zhang   } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1818077f23c2SHong Zhang     ca_den_ptr = ca_den;
1819077f23c2SHong Zhang     for (k = 0; k < ncolors; k++) {
1820077f23c2SHong Zhang       nrows = matcoloring->nrows[k];
1821077f23c2SHong Zhang       row   = rows + colorforrow[k];
1822077f23c2SHong Zhang       idx   = den2sp + colorforrow[k];
1823ad540459SPierre Jolivet       for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]];
1824077f23c2SHong Zhang       ca_den_ptr += m;
1825077f23c2SHong Zhang     }
1826f99a636bSHong Zhang   }
1827f99a636bSHong Zhang 
18289566063dSJacob Faibussowitsch   PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den));
1829f99a636bSHong Zhang #if defined(PETSC_USE_INFO)
1830077f23c2SHong Zhang   if (matcoloring->brows > 0) {
18319566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows));
1832e88777f2SHong Zhang   } else {
18339566063dSJacob Faibussowitsch     PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1834e88777f2SHong Zhang   }
1835f99a636bSHong Zhang #endif
18363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
18378972f759SHong Zhang }
18388972f759SHong Zhang 
1839d71ae5a4SJacob Faibussowitsch PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c)
1840d71ae5a4SJacob Faibussowitsch {
1841e88777f2SHong Zhang   PetscInt        i, n, nrows, Nbs, j, k, m, ncols, col, cm;
18421a83f524SJed Brown   const PetscInt *is, *ci, *cj, *row_idx;
1843b28d80bdSHong Zhang   PetscInt        nis = iscoloring->n, *rowhit, bs = 1;
1844b1683b59SHong Zhang   IS             *isa;
1845b28d80bdSHong Zhang   Mat_SeqAIJ     *csp = (Mat_SeqAIJ *)mat->data;
1846e88777f2SHong Zhang   PetscInt       *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i;
1847e88777f2SHong Zhang   PetscInt       *colorforcol, *columns, *columns_i, brows;
1848e88777f2SHong Zhang   PetscBool       flg;
1849b1683b59SHong Zhang 
1850b1683b59SHong Zhang   PetscFunctionBegin;
18519566063dSJacob Faibussowitsch   PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa));
1852e99be685SHong Zhang 
18537ecccc15SHong Zhang   /* bs >1 is not being tested yet! */
1854e88777f2SHong Zhang   Nbs       = mat->cmap->N / bs;
1855b1683b59SHong Zhang   c->M      = mat->rmap->N / bs; /* set total rows, columns and local rows */
1856e88777f2SHong Zhang   c->N      = Nbs;
1857e88777f2SHong Zhang   c->m      = c->M;
1858b1683b59SHong Zhang   c->rstart = 0;
1859e88777f2SHong Zhang   c->brows  = 100;
1860b1683b59SHong Zhang 
1861b1683b59SHong Zhang   c->ncolors = nis;
18629566063dSJacob Faibussowitsch   PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow));
18639566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz + 1, &rows));
18649566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(csp->nz + 1, &den2sp));
1865e88777f2SHong Zhang 
1866e88777f2SHong Zhang   brows = c->brows;
18679566063dSJacob Faibussowitsch   PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg));
1868e88777f2SHong Zhang   if (flg) c->brows = brows;
186948a46eb9SPierre Jolivet   if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart));
18702205254eSKarl Rupp 
1871d2853540SHong Zhang   colorforrow[0] = 0;
1872d2853540SHong Zhang   rows_i         = rows;
1873f99a636bSHong Zhang   den2sp_i       = den2sp;
1874b1683b59SHong Zhang 
18759566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nis + 1, &colorforcol));
18769566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(Nbs + 1, &columns));
18772205254eSKarl Rupp 
1878d2853540SHong Zhang   colorforcol[0] = 0;
1879d2853540SHong Zhang   columns_i      = columns;
1880d2853540SHong Zhang 
1881077f23c2SHong Zhang   /* get column-wise storage of mat */
18829566063dSJacob Faibussowitsch   PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1883b1683b59SHong Zhang 
1884b28d80bdSHong Zhang   cm = c->m;
18859566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm + 1, &rowhit));
18869566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cm + 1, &idxhit));
1887f99a636bSHong Zhang   for (i = 0; i < nis; i++) { /* loop over color */
18889566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(isa[i], &n));
18899566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isa[i], &is));
18902205254eSKarl Rupp 
1891b1683b59SHong Zhang     c->ncolumns[i] = n;
18921baa6e33SBarry Smith     if (n) PetscCall(PetscArraycpy(columns_i, is, n));
1893d2853540SHong Zhang     colorforcol[i + 1] = colorforcol[i] + n;
1894d2853540SHong Zhang     columns_i += n;
1895b1683b59SHong Zhang 
1896b1683b59SHong Zhang     /* fast, crude version requires O(N*N) work */
18979566063dSJacob Faibussowitsch     PetscCall(PetscArrayzero(rowhit, cm));
1898e99be685SHong Zhang 
1899b7caf3d6SHong Zhang     for (j = 0; j < n; j++) { /* loop over columns*/
1900b1683b59SHong Zhang       col     = is[j];
1901d2853540SHong Zhang       row_idx = cj + ci[col];
1902b1683b59SHong Zhang       m       = ci[col + 1] - ci[col];
1903b7caf3d6SHong Zhang       for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */
1904e99be685SHong Zhang         idxhit[*row_idx]   = spidx[ci[col] + k];
1905d2853540SHong Zhang         rowhit[*row_idx++] = col + 1;
1906b1683b59SHong Zhang       }
1907b1683b59SHong Zhang     }
1908b1683b59SHong Zhang     /* count the number of hits */
1909b1683b59SHong Zhang     nrows = 0;
1910e8354b3eSHong Zhang     for (j = 0; j < cm; j++) {
1911b1683b59SHong Zhang       if (rowhit[j]) nrows++;
1912b1683b59SHong Zhang     }
1913b1683b59SHong Zhang     c->nrows[i]        = nrows;
1914d2853540SHong Zhang     colorforrow[i + 1] = colorforrow[i] + nrows;
1915d2853540SHong Zhang 
1916b1683b59SHong Zhang     nrows = 0;
1917b7caf3d6SHong Zhang     for (j = 0; j < cm; j++) { /* loop over rows */
1918b1683b59SHong Zhang       if (rowhit[j]) {
1919d2853540SHong Zhang         rows_i[nrows]   = j;
192012b89a43SHong Zhang         den2sp_i[nrows] = idxhit[j];
1921b1683b59SHong Zhang         nrows++;
1922b1683b59SHong Zhang       }
1923b1683b59SHong Zhang     }
1924e88777f2SHong Zhang     den2sp_i += nrows;
1925077f23c2SHong Zhang 
19269566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isa[i], &is));
1927f99a636bSHong Zhang     rows_i += nrows;
1928b1683b59SHong Zhang   }
19299566063dSJacob Faibussowitsch   PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
19309566063dSJacob Faibussowitsch   PetscCall(PetscFree(rowhit));
19319566063dSJacob Faibussowitsch   PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa));
19322c71b3e2SJacob Faibussowitsch   PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]);
1933b28d80bdSHong Zhang 
1934d2853540SHong Zhang   c->colorforrow = colorforrow;
1935d2853540SHong Zhang   c->rows        = rows;
1936f99a636bSHong Zhang   c->den2sp      = den2sp;
1937d2853540SHong Zhang   c->colorforcol = colorforcol;
1938d2853540SHong Zhang   c->columns     = columns;
19392205254eSKarl Rupp 
19409566063dSJacob Faibussowitsch   PetscCall(PetscFree(idxhit));
19413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1942b1683b59SHong Zhang }
1943d013fc79Sandi selinger 
1944d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1945d71ae5a4SJacob Faibussowitsch {
19464222ddf1SHong Zhang   Mat_Product *product = C->product;
19474222ddf1SHong Zhang   Mat          A = product->A, B = product->B;
19484222ddf1SHong Zhang 
1949df97dc6dSFande Kong   PetscFunctionBegin;
19504222ddf1SHong Zhang   if (C->ops->mattransposemultnumeric) {
19514222ddf1SHong Zhang     /* Alg: "outerproduct" */
19529566063dSJacob Faibussowitsch     PetscCall((*C->ops->mattransposemultnumeric)(A, B, C));
19534222ddf1SHong Zhang   } else {
19544222ddf1SHong Zhang     /* Alg: "matmatmult" -- C = At*B */
19556718818eSStefano Zampini     Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data;
19564222ddf1SHong Zhang 
195728b400f6SJacob Faibussowitsch     PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
19581cdffd5eSHong Zhang     if (atb->At) {
19591cdffd5eSHong Zhang       /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
19601cdffd5eSHong Zhang          user may have called MatProductReplaceMats() to get this A=product->A */
19611cdffd5eSHong Zhang       PetscCall(MatTransposeSetPrecursor(A, atb->At));
19627fb60732SBarry Smith       PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At));
19634222ddf1SHong Zhang     }
19647fb60732SBarry Smith     PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C));
19654222ddf1SHong Zhang   }
19663ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1967df97dc6dSFande Kong }
1968df97dc6dSFande Kong 
1969d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1970d71ae5a4SJacob Faibussowitsch {
19714222ddf1SHong Zhang   Mat_Product *product = C->product;
19724222ddf1SHong Zhang   Mat          A = product->A, B = product->B;
19734222ddf1SHong Zhang   PetscReal    fill = product->fill;
1974d013fc79Sandi selinger 
1975d013fc79Sandi selinger   PetscFunctionBegin;
19769566063dSJacob Faibussowitsch   PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C));
19772869b61bSandi selinger 
19784222ddf1SHong Zhang   C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
19793ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
19802869b61bSandi selinger }
1981d013fc79Sandi selinger 
1982d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1983d71ae5a4SJacob Faibussowitsch {
19844222ddf1SHong Zhang   Mat_Product *product = C->product;
19854222ddf1SHong Zhang   PetscInt     alg     = 0; /* default algorithm */
19864222ddf1SHong Zhang   PetscBool    flg     = PETSC_FALSE;
19874222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
19884222ddf1SHong Zhang   const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
19894222ddf1SHong Zhang   PetscInt    nalg        = 7;
19904222ddf1SHong Zhang #else
19914222ddf1SHong Zhang   const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"};
19924222ddf1SHong Zhang   PetscInt    nalg        = 8;
19934222ddf1SHong Zhang #endif
19944222ddf1SHong Zhang 
19954222ddf1SHong Zhang   PetscFunctionBegin;
19964222ddf1SHong Zhang   /* Set default algorithm */
19979566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
199848a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
1999d013fc79Sandi selinger 
20004222ddf1SHong Zhang   /* Get runtime option */
20014222ddf1SHong Zhang   if (product->api_user) {
2002d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
20039566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg));
2004d0609cedSBarry Smith     PetscOptionsEnd();
20054222ddf1SHong Zhang   } else {
2006d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
20079566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg));
2008d0609cedSBarry Smith     PetscOptionsEnd();
2009d013fc79Sandi selinger   }
201048a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2011d013fc79Sandi selinger 
20124222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AB;
20134222ddf1SHong Zhang   C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
20143ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20154222ddf1SHong Zhang }
2016d013fc79Sandi selinger 
2017d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2018d71ae5a4SJacob Faibussowitsch {
20194222ddf1SHong Zhang   Mat_Product *product     = C->product;
20204222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
20214222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
2022089a957eSStefano Zampini   const char  *algTypes[3] = {"default", "at*b", "outerproduct"};
2023089a957eSStefano Zampini   PetscInt     nalg        = 3;
2024d013fc79Sandi selinger 
20254222ddf1SHong Zhang   PetscFunctionBegin;
20264222ddf1SHong Zhang   /* Get runtime option */
20274222ddf1SHong Zhang   if (product->api_user) {
2028d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
20299566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2030d0609cedSBarry Smith     PetscOptionsEnd();
20314222ddf1SHong Zhang   } else {
2032d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
20339566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg));
2034d0609cedSBarry Smith     PetscOptionsEnd();
20354222ddf1SHong Zhang   }
203648a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
2037d013fc79Sandi selinger 
20384222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
20393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20404222ddf1SHong Zhang }
20414222ddf1SHong Zhang 
2042d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2043d71ae5a4SJacob Faibussowitsch {
20444222ddf1SHong Zhang   Mat_Product *product     = C->product;
20454222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
20464222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
20474222ddf1SHong Zhang   const char  *algTypes[2] = {"default", "color"};
20484222ddf1SHong Zhang   PetscInt     nalg        = 2;
20494222ddf1SHong Zhang 
20504222ddf1SHong Zhang   PetscFunctionBegin;
20514222ddf1SHong Zhang   /* Set default algorithm */
20529566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
20534222ddf1SHong Zhang   if (!flg) {
20544222ddf1SHong Zhang     alg = 1;
20559566063dSJacob Faibussowitsch     PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20564222ddf1SHong Zhang   }
20574222ddf1SHong Zhang 
20584222ddf1SHong Zhang   /* Get runtime option */
20594222ddf1SHong Zhang   if (product->api_user) {
2060d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
20619566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2062d0609cedSBarry Smith     PetscOptionsEnd();
20634222ddf1SHong Zhang   } else {
2064d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
20659566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
2066d0609cedSBarry Smith     PetscOptionsEnd();
20674222ddf1SHong Zhang   }
206848a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20694222ddf1SHong Zhang 
20704222ddf1SHong Zhang   C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
20714222ddf1SHong Zhang   C->ops->productsymbolic          = MatProductSymbolic_ABt;
20723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20734222ddf1SHong Zhang }
20744222ddf1SHong Zhang 
2075d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2076d71ae5a4SJacob Faibussowitsch {
20774222ddf1SHong Zhang   Mat_Product *product = C->product;
20784222ddf1SHong Zhang   PetscBool    flg     = PETSC_FALSE;
20794222ddf1SHong Zhang   PetscInt     alg     = 0; /* default algorithm -- alg=1 should be default!!! */
20804222ddf1SHong Zhang #if !defined(PETSC_HAVE_HYPRE)
20814222ddf1SHong Zhang   const char *algTypes[2] = {"scalable", "rap"};
20824222ddf1SHong Zhang   PetscInt    nalg        = 2;
20834222ddf1SHong Zhang #else
20844222ddf1SHong Zhang   const char *algTypes[3] = {"scalable", "rap", "hypre"};
20854222ddf1SHong Zhang   PetscInt    nalg        = 3;
20864222ddf1SHong Zhang #endif
20874222ddf1SHong Zhang 
20884222ddf1SHong Zhang   PetscFunctionBegin;
20894222ddf1SHong Zhang   /* Set default algorithm */
20909566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
209148a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
20924222ddf1SHong Zhang 
20934222ddf1SHong Zhang   /* Get runtime option */
20944222ddf1SHong Zhang   if (product->api_user) {
2095d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
20969566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2097d0609cedSBarry Smith     PetscOptionsEnd();
20984222ddf1SHong Zhang   } else {
2099d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
21009566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2101d0609cedSBarry Smith     PetscOptionsEnd();
21024222ddf1SHong Zhang   }
210348a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21044222ddf1SHong Zhang 
21054222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
21063ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21074222ddf1SHong Zhang }
21084222ddf1SHong Zhang 
2109d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2110d71ae5a4SJacob Faibussowitsch {
21114222ddf1SHong Zhang   Mat_Product *product     = C->product;
21124222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
21134222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
21144222ddf1SHong Zhang   const char  *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"};
21154222ddf1SHong Zhang   PetscInt     nalg        = 3;
21164222ddf1SHong Zhang 
21174222ddf1SHong Zhang   PetscFunctionBegin;
21184222ddf1SHong Zhang   /* Set default algorithm */
21199566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
212048a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21214222ddf1SHong Zhang 
21224222ddf1SHong Zhang   /* Get runtime option */
21234222ddf1SHong Zhang   if (product->api_user) {
2124d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat");
21259566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg));
2126d0609cedSBarry Smith     PetscOptionsEnd();
21274222ddf1SHong Zhang   } else {
2128d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat");
21299566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg));
2130d0609cedSBarry Smith     PetscOptionsEnd();
21314222ddf1SHong Zhang   }
213248a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21334222ddf1SHong Zhang 
21344222ddf1SHong Zhang   C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
21353ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21364222ddf1SHong Zhang }
21374222ddf1SHong Zhang 
21384222ddf1SHong Zhang /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
2139d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2140d71ae5a4SJacob Faibussowitsch {
21414222ddf1SHong Zhang   Mat_Product *product     = C->product;
21424222ddf1SHong Zhang   PetscInt     alg         = 0; /* default algorithm */
21434222ddf1SHong Zhang   PetscBool    flg         = PETSC_FALSE;
21444222ddf1SHong Zhang   const char  *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
21454222ddf1SHong Zhang   PetscInt     nalg        = 7;
21464222ddf1SHong Zhang 
21474222ddf1SHong Zhang   PetscFunctionBegin;
21484222ddf1SHong Zhang   /* Set default algorithm */
21499566063dSJacob Faibussowitsch   PetscCall(PetscStrcmp(product->alg, "default", &flg));
215048a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21514222ddf1SHong Zhang 
21524222ddf1SHong Zhang   /* Get runtime option */
21534222ddf1SHong Zhang   if (product->api_user) {
2154d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
21559566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2156d0609cedSBarry Smith     PetscOptionsEnd();
21574222ddf1SHong Zhang   } else {
2158d0609cedSBarry Smith     PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
21599566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
2160d0609cedSBarry Smith     PetscOptionsEnd();
21614222ddf1SHong Zhang   }
216248a46eb9SPierre Jolivet   if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
21634222ddf1SHong Zhang 
21644222ddf1SHong Zhang   C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
21654222ddf1SHong Zhang   C->ops->productsymbolic    = MatProductSymbolic_ABC;
21663ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
21674222ddf1SHong Zhang }
21684222ddf1SHong Zhang 
2169d71ae5a4SJacob Faibussowitsch PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2170d71ae5a4SJacob Faibussowitsch {
21714222ddf1SHong Zhang   Mat_Product *product = C->product;
21724222ddf1SHong Zhang 
21734222ddf1SHong Zhang   PetscFunctionBegin;
21744222ddf1SHong Zhang   switch (product->type) {
2175d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AB:
2176d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2177d71ae5a4SJacob Faibussowitsch     break;
2178d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_AtB:
2179d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2180d71ae5a4SJacob Faibussowitsch     break;
2181d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABt:
2182d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2183d71ae5a4SJacob Faibussowitsch     break;
2184d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_PtAP:
2185d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2186d71ae5a4SJacob Faibussowitsch     break;
2187d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_RARt:
2188d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2189d71ae5a4SJacob Faibussowitsch     break;
2190d71ae5a4SJacob Faibussowitsch   case MATPRODUCT_ABC:
2191d71ae5a4SJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2192d71ae5a4SJacob Faibussowitsch     break;
2193d71ae5a4SJacob Faibussowitsch   default:
2194d71ae5a4SJacob Faibussowitsch     break;
21954222ddf1SHong Zhang   }
21963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2197d013fc79Sandi selinger }
2198